Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py +1500 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/collocations_app.py +438 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/concordance_app.py +709 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py +163 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py +1052 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py +937 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py +36 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py +997 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py +34 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py +358 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py +480 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/combinator.py +339 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/lexicon.py +338 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/ccg/logic.py +60 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/__init__.py +48 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/eliza.py +337 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/iesha.py +160 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/rude.py +125 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/suntsu.py +140 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/util.py +124 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chat/zen.py +329 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chunk/__init__.py +197 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chunk/api.py +56 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py +352 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chunk/regexp.py +1475 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py +643 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/__init__.py +101 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py +195 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/decisiontree.py +349 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/maxent.py +1569 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py +184 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py +260 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py +180 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/util.py +346 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/classify/weka.py +377 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/cluster/__init__.py +92 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/cluster/api.py +74 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/cluster/em.py +219 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py +170 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/cluster/kmeans.py +231 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py +529 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/europarl_raw.py +56 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/childes.py +630 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chunked.py +273 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/cmudict.py +88 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py +309 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py +579 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py +106 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py +115 -0
- .eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/framenet.py +0 -0
.eggs/nltk-3.8-py3.10.egg/nltk/app/chunkparser_app.py
ADDED
|
@@ -0,0 +1,1500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Regexp Chunk Parser Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A graphical tool for exploring the regular expression based chunk
|
| 10 |
+
parser ``nltk.chunk.RegexpChunkParser``.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
# Todo: Add a way to select the development set from the menubar. This
|
| 14 |
+
# might just need to be a selection box (conll vs treebank etc) plus
|
| 15 |
+
# configuration parameters to select what's being chunked (eg VP vs NP)
|
| 16 |
+
# and what part of the data is being used as the development set.
|
| 17 |
+
|
| 18 |
+
import random
|
| 19 |
+
import re
|
| 20 |
+
import textwrap
|
| 21 |
+
import time
|
| 22 |
+
from tkinter import (
|
| 23 |
+
Button,
|
| 24 |
+
Canvas,
|
| 25 |
+
Checkbutton,
|
| 26 |
+
Frame,
|
| 27 |
+
IntVar,
|
| 28 |
+
Label,
|
| 29 |
+
Menu,
|
| 30 |
+
Scrollbar,
|
| 31 |
+
Text,
|
| 32 |
+
Tk,
|
| 33 |
+
)
|
| 34 |
+
from tkinter.filedialog import askopenfilename, asksaveasfilename
|
| 35 |
+
from tkinter.font import Font
|
| 36 |
+
|
| 37 |
+
from nltk.chunk import ChunkScore, RegexpChunkParser
|
| 38 |
+
from nltk.chunk.regexp import RegexpChunkRule
|
| 39 |
+
from nltk.corpus import conll2000, treebank_chunk
|
| 40 |
+
from nltk.draw.util import ShowText
|
| 41 |
+
from nltk.tree import Tree
|
| 42 |
+
from nltk.util import in_idle
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class RegexpChunkApp:
|
| 46 |
+
"""
|
| 47 |
+
A graphical tool for exploring the regular expression based chunk
|
| 48 |
+
parser ``nltk.chunk.RegexpChunkParser``.
|
| 49 |
+
|
| 50 |
+
See ``HELP`` for instructional text.
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
##/////////////////////////////////////////////////////////////////
|
| 54 |
+
## Help Text
|
| 55 |
+
##/////////////////////////////////////////////////////////////////
|
| 56 |
+
|
| 57 |
+
#: A dictionary mapping from part of speech tags to descriptions,
|
| 58 |
+
#: which is used in the help text. (This should probably live with
|
| 59 |
+
#: the conll and/or treebank corpus instead.)
|
| 60 |
+
TAGSET = {
|
| 61 |
+
"CC": "Coordinating conjunction",
|
| 62 |
+
"PRP$": "Possessive pronoun",
|
| 63 |
+
"CD": "Cardinal number",
|
| 64 |
+
"RB": "Adverb",
|
| 65 |
+
"DT": "Determiner",
|
| 66 |
+
"RBR": "Adverb, comparative",
|
| 67 |
+
"EX": "Existential there",
|
| 68 |
+
"RBS": "Adverb, superlative",
|
| 69 |
+
"FW": "Foreign word",
|
| 70 |
+
"RP": "Particle",
|
| 71 |
+
"JJ": "Adjective",
|
| 72 |
+
"TO": "to",
|
| 73 |
+
"JJR": "Adjective, comparative",
|
| 74 |
+
"UH": "Interjection",
|
| 75 |
+
"JJS": "Adjective, superlative",
|
| 76 |
+
"VB": "Verb, base form",
|
| 77 |
+
"LS": "List item marker",
|
| 78 |
+
"VBD": "Verb, past tense",
|
| 79 |
+
"MD": "Modal",
|
| 80 |
+
"NNS": "Noun, plural",
|
| 81 |
+
"NN": "Noun, singular or masps",
|
| 82 |
+
"VBN": "Verb, past participle",
|
| 83 |
+
"VBZ": "Verb,3rd ps. sing. present",
|
| 84 |
+
"NNP": "Proper noun, singular",
|
| 85 |
+
"NNPS": "Proper noun plural",
|
| 86 |
+
"WDT": "wh-determiner",
|
| 87 |
+
"PDT": "Predeterminer",
|
| 88 |
+
"WP": "wh-pronoun",
|
| 89 |
+
"POS": "Possessive ending",
|
| 90 |
+
"WP$": "Possessive wh-pronoun",
|
| 91 |
+
"PRP": "Personal pronoun",
|
| 92 |
+
"WRB": "wh-adverb",
|
| 93 |
+
"(": "open parenthesis",
|
| 94 |
+
")": "close parenthesis",
|
| 95 |
+
"``": "open quote",
|
| 96 |
+
",": "comma",
|
| 97 |
+
"''": "close quote",
|
| 98 |
+
".": "period",
|
| 99 |
+
"#": "pound sign (currency marker)",
|
| 100 |
+
"$": "dollar sign (currency marker)",
|
| 101 |
+
"IN": "Preposition/subord. conjunction",
|
| 102 |
+
"SYM": "Symbol (mathematical or scientific)",
|
| 103 |
+
"VBG": "Verb, gerund/present participle",
|
| 104 |
+
"VBP": "Verb, non-3rd ps. sing. present",
|
| 105 |
+
":": "colon",
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
#: Contents for the help box. This is a list of tuples, one for
|
| 109 |
+
#: each help page, where each tuple has four elements:
|
| 110 |
+
#: - A title (displayed as a tab)
|
| 111 |
+
#: - A string description of tabstops (see Tkinter.Text for details)
|
| 112 |
+
#: - The text contents for the help page. You can use expressions
|
| 113 |
+
#: like <red>...</red> to colorize the text; see ``HELP_AUTOTAG``
|
| 114 |
+
#: for a list of tags you can use for colorizing.
|
| 115 |
+
HELP = [
|
| 116 |
+
(
|
| 117 |
+
"Help",
|
| 118 |
+
"20",
|
| 119 |
+
"Welcome to the regular expression chunk-parser grammar editor. "
|
| 120 |
+
"You can use this editor to develop and test chunk parser grammars "
|
| 121 |
+
"based on NLTK's RegexpChunkParser class.\n\n"
|
| 122 |
+
# Help box.
|
| 123 |
+
"Use this box ('Help') to learn more about the editor; click on the "
|
| 124 |
+
"tabs for help on specific topics:"
|
| 125 |
+
"<indent>\n"
|
| 126 |
+
"Rules: grammar rule types\n"
|
| 127 |
+
"Regexps: regular expression syntax\n"
|
| 128 |
+
"Tags: part of speech tags\n</indent>\n"
|
| 129 |
+
# Grammar.
|
| 130 |
+
"Use the upper-left box ('Grammar') to edit your grammar. "
|
| 131 |
+
"Each line of your grammar specifies a single 'rule', "
|
| 132 |
+
"which performs an action such as creating a chunk or merging "
|
| 133 |
+
"two chunks.\n\n"
|
| 134 |
+
# Dev set.
|
| 135 |
+
"The lower-left box ('Development Set') runs your grammar on the "
|
| 136 |
+
"development set, and displays the results. "
|
| 137 |
+
"Your grammar's chunks are <highlight>highlighted</highlight>, and "
|
| 138 |
+
"the correct (gold standard) chunks are "
|
| 139 |
+
"<underline>underlined</underline>. If they "
|
| 140 |
+
"match, they are displayed in <green>green</green>; otherwise, "
|
| 141 |
+
"they are displayed in <red>red</red>. The box displays a single "
|
| 142 |
+
"sentence from the development set at a time; use the scrollbar or "
|
| 143 |
+
"the next/previous buttons view additional sentences.\n\n"
|
| 144 |
+
# Performance
|
| 145 |
+
"The lower-right box ('Evaluation') tracks the performance of "
|
| 146 |
+
"your grammar on the development set. The 'precision' axis "
|
| 147 |
+
"indicates how many of your grammar's chunks are correct; and "
|
| 148 |
+
"the 'recall' axis indicates how many of the gold standard "
|
| 149 |
+
"chunks your system generated. Typically, you should try to "
|
| 150 |
+
"design a grammar that scores high on both metrics. The "
|
| 151 |
+
"exact precision and recall of the current grammar, as well "
|
| 152 |
+
"as their harmonic mean (the 'f-score'), are displayed in "
|
| 153 |
+
"the status bar at the bottom of the window.",
|
| 154 |
+
),
|
| 155 |
+
(
|
| 156 |
+
"Rules",
|
| 157 |
+
"10",
|
| 158 |
+
"<h1>{...regexp...}</h1>"
|
| 159 |
+
"<indent>\nChunk rule: creates new chunks from words matching "
|
| 160 |
+
"regexp.</indent>\n\n"
|
| 161 |
+
"<h1>}...regexp...{</h1>"
|
| 162 |
+
"<indent>\nStrip rule: removes words matching regexp from existing "
|
| 163 |
+
"chunks.</indent>\n\n"
|
| 164 |
+
"<h1>...regexp1...}{...regexp2...</h1>"
|
| 165 |
+
"<indent>\nSplit rule: splits chunks that match regexp1 followed by "
|
| 166 |
+
"regexp2 in two.</indent>\n\n"
|
| 167 |
+
"<h1>...regexp...{}...regexp...</h1>"
|
| 168 |
+
"<indent>\nMerge rule: joins consecutive chunks that match regexp1 "
|
| 169 |
+
"and regexp2</indent>\n",
|
| 170 |
+
),
|
| 171 |
+
(
|
| 172 |
+
"Regexps",
|
| 173 |
+
"10 60",
|
| 174 |
+
# "Regular Expression Syntax Summary:\n\n"
|
| 175 |
+
"<h1>Pattern\t\tMatches...</h1>\n"
|
| 176 |
+
"<hangindent>"
|
| 177 |
+
"\t<<var>T</var>>\ta word with tag <var>T</var> "
|
| 178 |
+
"(where <var>T</var> may be a regexp).\n"
|
| 179 |
+
"\t<var>x</var>?\tan optional <var>x</var>\n"
|
| 180 |
+
"\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n"
|
| 181 |
+
"\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n"
|
| 182 |
+
"\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n"
|
| 183 |
+
"\t.\tmatches any character\n"
|
| 184 |
+
"\t(<var>x</var>)\tTreats <var>x</var> as a group\n"
|
| 185 |
+
"\t# <var>x...</var>\tTreats <var>x...</var> "
|
| 186 |
+
"(to the end of the line) as a comment\n"
|
| 187 |
+
"\t\\<var>C</var>\tmatches character <var>C</var> "
|
| 188 |
+
"(useful when <var>C</var> is a special character "
|
| 189 |
+
"like + or #)\n"
|
| 190 |
+
"</hangindent>"
|
| 191 |
+
"\n<h1>Examples:</h1>\n"
|
| 192 |
+
"<hangindent>"
|
| 193 |
+
"\t<regexp><NN></regexp>\n"
|
| 194 |
+
'\t\tMatches <match>"cow/NN"</match>\n'
|
| 195 |
+
'\t\tMatches <match>"green/NN"</match>\n'
|
| 196 |
+
"\t<regexp><VB.*></regexp>\n"
|
| 197 |
+
'\t\tMatches <match>"eating/VBG"</match>\n'
|
| 198 |
+
'\t\tMatches <match>"ate/VBD"</match>\n'
|
| 199 |
+
"\t<regexp><IN><DT><NN></regexp>\n"
|
| 200 |
+
'\t\tMatches <match>"on/IN the/DT car/NN"</match>\n'
|
| 201 |
+
"\t<regexp><RB>?<VBD></regexp>\n"
|
| 202 |
+
'\t\tMatches <match>"ran/VBD"</match>\n'
|
| 203 |
+
'\t\tMatches <match>"slowly/RB ate/VBD"</match>\n'
|
| 204 |
+
r"\t<regexp><\#><CD> # This is a comment...</regexp>\n"
|
| 205 |
+
'\t\tMatches <match>"#/# 100/CD"</match>\n'
|
| 206 |
+
"</hangindent>",
|
| 207 |
+
),
|
| 208 |
+
(
|
| 209 |
+
"Tags",
|
| 210 |
+
"10 60",
|
| 211 |
+
"<h1>Part of Speech Tags:</h1>\n"
|
| 212 |
+
+ "<hangindent>"
|
| 213 |
+
+ "<<TAGSET>>"
|
| 214 |
+
+ "</hangindent>\n", # this gets auto-substituted w/ self.TAGSET
|
| 215 |
+
),
|
| 216 |
+
]
|
| 217 |
+
|
| 218 |
+
HELP_AUTOTAG = [
|
| 219 |
+
("red", dict(foreground="#a00")),
|
| 220 |
+
("green", dict(foreground="#080")),
|
| 221 |
+
("highlight", dict(background="#ddd")),
|
| 222 |
+
("underline", dict(underline=True)),
|
| 223 |
+
("h1", dict(underline=True)),
|
| 224 |
+
("indent", dict(lmargin1=20, lmargin2=20)),
|
| 225 |
+
("hangindent", dict(lmargin1=0, lmargin2=60)),
|
| 226 |
+
("var", dict(foreground="#88f")),
|
| 227 |
+
("regexp", dict(foreground="#ba7")),
|
| 228 |
+
("match", dict(foreground="#6a6")),
|
| 229 |
+
]
|
| 230 |
+
|
| 231 |
+
##/////////////////////////////////////////////////////////////////
|
| 232 |
+
## Config Parameters
|
| 233 |
+
##/////////////////////////////////////////////////////////////////
|
| 234 |
+
|
| 235 |
+
_EVAL_DELAY = 1
|
| 236 |
+
"""If the user has not pressed any key for this amount of time (in
|
| 237 |
+
seconds), and the current grammar has not been evaluated, then
|
| 238 |
+
the eval demon will evaluate it."""
|
| 239 |
+
|
| 240 |
+
_EVAL_CHUNK = 15
|
| 241 |
+
"""The number of sentences that should be evaluated by the eval
|
| 242 |
+
demon each time it runs."""
|
| 243 |
+
_EVAL_FREQ = 0.2
|
| 244 |
+
"""The frequency (in seconds) at which the eval demon is run"""
|
| 245 |
+
_EVAL_DEMON_MIN = 0.02
|
| 246 |
+
"""The minimum amount of time that the eval demon should take each time
|
| 247 |
+
it runs -- if it takes less than this time, _EVAL_CHUNK will be
|
| 248 |
+
modified upwards."""
|
| 249 |
+
_EVAL_DEMON_MAX = 0.04
|
| 250 |
+
"""The maximum amount of time that the eval demon should take each time
|
| 251 |
+
it runs -- if it takes more than this time, _EVAL_CHUNK will be
|
| 252 |
+
modified downwards."""
|
| 253 |
+
|
| 254 |
+
_GRAMMARBOX_PARAMS = dict(
|
| 255 |
+
width=40,
|
| 256 |
+
height=12,
|
| 257 |
+
background="#efe",
|
| 258 |
+
highlightbackground="#efe",
|
| 259 |
+
highlightthickness=1,
|
| 260 |
+
relief="groove",
|
| 261 |
+
border=2,
|
| 262 |
+
wrap="word",
|
| 263 |
+
)
|
| 264 |
+
_HELPBOX_PARAMS = dict(
|
| 265 |
+
width=15,
|
| 266 |
+
height=15,
|
| 267 |
+
background="#efe",
|
| 268 |
+
highlightbackground="#efe",
|
| 269 |
+
foreground="#555",
|
| 270 |
+
highlightthickness=1,
|
| 271 |
+
relief="groove",
|
| 272 |
+
border=2,
|
| 273 |
+
wrap="word",
|
| 274 |
+
)
|
| 275 |
+
_DEVSETBOX_PARAMS = dict(
|
| 276 |
+
width=70,
|
| 277 |
+
height=10,
|
| 278 |
+
background="#eef",
|
| 279 |
+
highlightbackground="#eef",
|
| 280 |
+
highlightthickness=1,
|
| 281 |
+
relief="groove",
|
| 282 |
+
border=2,
|
| 283 |
+
wrap="word",
|
| 284 |
+
tabs=(30,),
|
| 285 |
+
)
|
| 286 |
+
_STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2)
|
| 287 |
+
_FONT_PARAMS = dict(family="helvetica", size=-20)
|
| 288 |
+
_FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3)
|
| 289 |
+
_EVALBOX_PARAMS = dict(
|
| 290 |
+
background="#eef",
|
| 291 |
+
highlightbackground="#eef",
|
| 292 |
+
highlightthickness=1,
|
| 293 |
+
relief="groove",
|
| 294 |
+
border=2,
|
| 295 |
+
width=300,
|
| 296 |
+
height=280,
|
| 297 |
+
)
|
| 298 |
+
_BUTTON_PARAMS = dict(
|
| 299 |
+
background="#777", activebackground="#777", highlightbackground="#777"
|
| 300 |
+
)
|
| 301 |
+
_HELPTAB_BG_COLOR = "#aba"
|
| 302 |
+
_HELPTAB_FG_COLOR = "#efe"
|
| 303 |
+
|
| 304 |
+
_HELPTAB_FG_PARAMS = dict(background="#efe")
|
| 305 |
+
_HELPTAB_BG_PARAMS = dict(background="#aba")
|
| 306 |
+
_HELPTAB_SPACER = 6
|
| 307 |
+
|
| 308 |
+
def normalize_grammar(self, grammar):
|
| 309 |
+
# Strip comments
|
| 310 |
+
grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar)
|
| 311 |
+
# Normalize whitespace
|
| 312 |
+
grammar = re.sub(" +", " ", grammar)
|
| 313 |
+
grammar = re.sub(r"\n\s+", r"\n", grammar)
|
| 314 |
+
grammar = grammar.strip()
|
| 315 |
+
# [xx] Hack: automatically backslash $!
|
| 316 |
+
grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar)
|
| 317 |
+
return grammar
|
| 318 |
+
|
| 319 |
+
def __init__(
|
| 320 |
+
self,
|
| 321 |
+
devset_name="conll2000",
|
| 322 |
+
devset=None,
|
| 323 |
+
grammar="",
|
| 324 |
+
chunk_label="NP",
|
| 325 |
+
tagset=None,
|
| 326 |
+
):
|
| 327 |
+
"""
|
| 328 |
+
:param devset_name: The name of the development set; used for
|
| 329 |
+
display & for save files. If either the name 'treebank'
|
| 330 |
+
or the name 'conll2000' is used, and devset is None, then
|
| 331 |
+
devset will be set automatically.
|
| 332 |
+
:param devset: A list of chunked sentences
|
| 333 |
+
:param grammar: The initial grammar to display.
|
| 334 |
+
:param tagset: Dictionary from tags to string descriptions, used
|
| 335 |
+
for the help page. Defaults to ``self.TAGSET``.
|
| 336 |
+
"""
|
| 337 |
+
self._chunk_label = chunk_label
|
| 338 |
+
|
| 339 |
+
if tagset is None:
|
| 340 |
+
tagset = self.TAGSET
|
| 341 |
+
self.tagset = tagset
|
| 342 |
+
|
| 343 |
+
# Named development sets:
|
| 344 |
+
if devset is None:
|
| 345 |
+
if devset_name == "conll2000":
|
| 346 |
+
devset = conll2000.chunked_sents("train.txt") # [:100]
|
| 347 |
+
elif devset == "treebank":
|
| 348 |
+
devset = treebank_chunk.chunked_sents() # [:100]
|
| 349 |
+
else:
|
| 350 |
+
raise ValueError("Unknown development set %s" % devset_name)
|
| 351 |
+
|
| 352 |
+
self.chunker = None
|
| 353 |
+
"""The chunker built from the grammar string"""
|
| 354 |
+
|
| 355 |
+
self.grammar = grammar
|
| 356 |
+
"""The unparsed grammar string"""
|
| 357 |
+
|
| 358 |
+
self.normalized_grammar = None
|
| 359 |
+
"""A normalized version of ``self.grammar``."""
|
| 360 |
+
|
| 361 |
+
self.grammar_changed = 0
|
| 362 |
+
"""The last time() that the grammar was changed."""
|
| 363 |
+
|
| 364 |
+
self.devset = devset
|
| 365 |
+
"""The development set -- a list of chunked sentences."""
|
| 366 |
+
|
| 367 |
+
self.devset_name = devset_name
|
| 368 |
+
"""The name of the development set (for save files)."""
|
| 369 |
+
|
| 370 |
+
self.devset_index = -1
|
| 371 |
+
"""The index into the development set of the first instance
|
| 372 |
+
that's currently being viewed."""
|
| 373 |
+
|
| 374 |
+
self._last_keypress = 0
|
| 375 |
+
"""The time() when a key was most recently pressed"""
|
| 376 |
+
|
| 377 |
+
self._history = []
|
| 378 |
+
"""A list of (grammar, precision, recall, fscore) tuples for
|
| 379 |
+
grammars that the user has already tried."""
|
| 380 |
+
|
| 381 |
+
self._history_index = 0
|
| 382 |
+
"""When the user is scrolling through previous grammars, this
|
| 383 |
+
is used to keep track of which grammar they're looking at."""
|
| 384 |
+
|
| 385 |
+
self._eval_grammar = None
|
| 386 |
+
"""The grammar that is being currently evaluated by the eval
|
| 387 |
+
demon."""
|
| 388 |
+
|
| 389 |
+
self._eval_normalized_grammar = None
|
| 390 |
+
"""A normalized copy of ``_eval_grammar``."""
|
| 391 |
+
|
| 392 |
+
self._eval_index = 0
|
| 393 |
+
"""The index of the next sentence in the development set that
|
| 394 |
+
should be looked at by the eval demon."""
|
| 395 |
+
|
| 396 |
+
self._eval_score = ChunkScore(chunk_label=chunk_label)
|
| 397 |
+
"""The ``ChunkScore`` object that's used to keep track of the score
|
| 398 |
+
of the current grammar on the development set."""
|
| 399 |
+
|
| 400 |
+
# Set up the main window.
|
| 401 |
+
top = self.top = Tk()
|
| 402 |
+
top.geometry("+50+50")
|
| 403 |
+
top.title("Regexp Chunk Parser App")
|
| 404 |
+
top.bind("<Control-q>", self.destroy)
|
| 405 |
+
|
| 406 |
+
# Variable that restricts how much of the devset we look at.
|
| 407 |
+
self._devset_size = IntVar(top)
|
| 408 |
+
self._devset_size.set(100)
|
| 409 |
+
|
| 410 |
+
# Set up all the tkinter widgets
|
| 411 |
+
self._init_fonts(top)
|
| 412 |
+
self._init_widgets(top)
|
| 413 |
+
self._init_bindings(top)
|
| 414 |
+
self._init_menubar(top)
|
| 415 |
+
self.grammarbox.focus()
|
| 416 |
+
|
| 417 |
+
# If a grammar was given, then display it.
|
| 418 |
+
if grammar:
|
| 419 |
+
self.grammarbox.insert("end", grammar + "\n")
|
| 420 |
+
self.grammarbox.mark_set("insert", "1.0")
|
| 421 |
+
|
| 422 |
+
# Display the first item in the development set
|
| 423 |
+
self.show_devset(0)
|
| 424 |
+
self.update()
|
| 425 |
+
|
| 426 |
+
def _init_bindings(self, top):
|
| 427 |
+
top.bind("<Control-n>", self._devset_next)
|
| 428 |
+
top.bind("<Control-p>", self._devset_prev)
|
| 429 |
+
top.bind("<Control-t>", self.toggle_show_trace)
|
| 430 |
+
top.bind("<KeyPress>", self.update)
|
| 431 |
+
top.bind("<Control-s>", lambda e: self.save_grammar())
|
| 432 |
+
top.bind("<Control-o>", lambda e: self.load_grammar())
|
| 433 |
+
self.grammarbox.bind("<Control-t>", self.toggle_show_trace)
|
| 434 |
+
self.grammarbox.bind("<Control-n>", self._devset_next)
|
| 435 |
+
self.grammarbox.bind("<Control-p>", self._devset_prev)
|
| 436 |
+
|
| 437 |
+
# Redraw the eval graph when the window size changes
|
| 438 |
+
self.evalbox.bind("<Configure>", self._eval_plot)
|
| 439 |
+
|
| 440 |
+
def _init_fonts(self, top):
|
| 441 |
+
# TWhat's our font size (default=same as sysfont)
|
| 442 |
+
self._size = IntVar(top)
|
| 443 |
+
self._size.set(20)
|
| 444 |
+
self._font = Font(family="helvetica", size=-self._size.get())
|
| 445 |
+
self._smallfont = Font(
|
| 446 |
+
family="helvetica", size=-(int(self._size.get() * 14 // 20))
|
| 447 |
+
)
|
| 448 |
+
|
| 449 |
+
def _init_menubar(self, parent):
|
| 450 |
+
menubar = Menu(parent)
|
| 451 |
+
|
| 452 |
+
filemenu = Menu(menubar, tearoff=0)
|
| 453 |
+
filemenu.add_command(label="Reset Application", underline=0, command=self.reset)
|
| 454 |
+
filemenu.add_command(
|
| 455 |
+
label="Save Current Grammar",
|
| 456 |
+
underline=0,
|
| 457 |
+
accelerator="Ctrl-s",
|
| 458 |
+
command=self.save_grammar,
|
| 459 |
+
)
|
| 460 |
+
filemenu.add_command(
|
| 461 |
+
label="Load Grammar",
|
| 462 |
+
underline=0,
|
| 463 |
+
accelerator="Ctrl-o",
|
| 464 |
+
command=self.load_grammar,
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
filemenu.add_command(
|
| 468 |
+
label="Save Grammar History", underline=13, command=self.save_history
|
| 469 |
+
)
|
| 470 |
+
|
| 471 |
+
filemenu.add_command(
|
| 472 |
+
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
|
| 473 |
+
)
|
| 474 |
+
menubar.add_cascade(label="File", underline=0, menu=filemenu)
|
| 475 |
+
|
| 476 |
+
viewmenu = Menu(menubar, tearoff=0)
|
| 477 |
+
viewmenu.add_radiobutton(
|
| 478 |
+
label="Tiny",
|
| 479 |
+
variable=self._size,
|
| 480 |
+
underline=0,
|
| 481 |
+
value=10,
|
| 482 |
+
command=self.resize,
|
| 483 |
+
)
|
| 484 |
+
viewmenu.add_radiobutton(
|
| 485 |
+
label="Small",
|
| 486 |
+
variable=self._size,
|
| 487 |
+
underline=0,
|
| 488 |
+
value=16,
|
| 489 |
+
command=self.resize,
|
| 490 |
+
)
|
| 491 |
+
viewmenu.add_radiobutton(
|
| 492 |
+
label="Medium",
|
| 493 |
+
variable=self._size,
|
| 494 |
+
underline=0,
|
| 495 |
+
value=20,
|
| 496 |
+
command=self.resize,
|
| 497 |
+
)
|
| 498 |
+
viewmenu.add_radiobutton(
|
| 499 |
+
label="Large",
|
| 500 |
+
variable=self._size,
|
| 501 |
+
underline=0,
|
| 502 |
+
value=24,
|
| 503 |
+
command=self.resize,
|
| 504 |
+
)
|
| 505 |
+
viewmenu.add_radiobutton(
|
| 506 |
+
label="Huge",
|
| 507 |
+
variable=self._size,
|
| 508 |
+
underline=0,
|
| 509 |
+
value=34,
|
| 510 |
+
command=self.resize,
|
| 511 |
+
)
|
| 512 |
+
menubar.add_cascade(label="View", underline=0, menu=viewmenu)
|
| 513 |
+
|
| 514 |
+
devsetmenu = Menu(menubar, tearoff=0)
|
| 515 |
+
devsetmenu.add_radiobutton(
|
| 516 |
+
label="50 sentences",
|
| 517 |
+
variable=self._devset_size,
|
| 518 |
+
value=50,
|
| 519 |
+
command=self.set_devset_size,
|
| 520 |
+
)
|
| 521 |
+
devsetmenu.add_radiobutton(
|
| 522 |
+
label="100 sentences",
|
| 523 |
+
variable=self._devset_size,
|
| 524 |
+
value=100,
|
| 525 |
+
command=self.set_devset_size,
|
| 526 |
+
)
|
| 527 |
+
devsetmenu.add_radiobutton(
|
| 528 |
+
label="200 sentences",
|
| 529 |
+
variable=self._devset_size,
|
| 530 |
+
value=200,
|
| 531 |
+
command=self.set_devset_size,
|
| 532 |
+
)
|
| 533 |
+
devsetmenu.add_radiobutton(
|
| 534 |
+
label="500 sentences",
|
| 535 |
+
variable=self._devset_size,
|
| 536 |
+
value=500,
|
| 537 |
+
command=self.set_devset_size,
|
| 538 |
+
)
|
| 539 |
+
menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu)
|
| 540 |
+
|
| 541 |
+
helpmenu = Menu(menubar, tearoff=0)
|
| 542 |
+
helpmenu.add_command(label="About", underline=0, command=self.about)
|
| 543 |
+
menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
|
| 544 |
+
|
| 545 |
+
parent.config(menu=menubar)
|
| 546 |
+
|
| 547 |
+
def toggle_show_trace(self, *e):
|
| 548 |
+
if self._showing_trace:
|
| 549 |
+
self.show_devset()
|
| 550 |
+
else:
|
| 551 |
+
self.show_trace()
|
| 552 |
+
return "break"
|
| 553 |
+
|
| 554 |
+
_SCALE_N = 5 # center on the last 5 examples.
|
| 555 |
+
_DRAW_LINES = False
|
| 556 |
+
|
| 557 |
+
def _eval_plot(self, *e, **config):
|
| 558 |
+
width = config.get("width", self.evalbox.winfo_width())
|
| 559 |
+
height = config.get("height", self.evalbox.winfo_height())
|
| 560 |
+
|
| 561 |
+
# Clear the canvas
|
| 562 |
+
self.evalbox.delete("all")
|
| 563 |
+
|
| 564 |
+
# Draw the precision & recall labels.
|
| 565 |
+
tag = self.evalbox.create_text(
|
| 566 |
+
10, height // 2 - 10, justify="left", anchor="w", text="Precision"
|
| 567 |
+
)
|
| 568 |
+
left, right = self.evalbox.bbox(tag)[2] + 5, width - 10
|
| 569 |
+
tag = self.evalbox.create_text(
|
| 570 |
+
left + (width - left) // 2,
|
| 571 |
+
height - 10,
|
| 572 |
+
anchor="s",
|
| 573 |
+
text="Recall",
|
| 574 |
+
justify="center",
|
| 575 |
+
)
|
| 576 |
+
top, bot = 10, self.evalbox.bbox(tag)[1] - 10
|
| 577 |
+
|
| 578 |
+
# Draw masks for clipping the plot.
|
| 579 |
+
bg = self._EVALBOX_PARAMS["background"]
|
| 580 |
+
self.evalbox.lower(
|
| 581 |
+
self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg)
|
| 582 |
+
)
|
| 583 |
+
self.evalbox.lower(
|
| 584 |
+
self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg)
|
| 585 |
+
)
|
| 586 |
+
|
| 587 |
+
# Calculate the plot's scale.
|
| 588 |
+
if self._autoscale.get() and len(self._history) > 1:
|
| 589 |
+
max_precision = max_recall = 0
|
| 590 |
+
min_precision = min_recall = 1
|
| 591 |
+
for i in range(1, min(len(self._history), self._SCALE_N + 1)):
|
| 592 |
+
grammar, precision, recall, fmeasure = self._history[-i]
|
| 593 |
+
min_precision = min(precision, min_precision)
|
| 594 |
+
min_recall = min(recall, min_recall)
|
| 595 |
+
max_precision = max(precision, max_precision)
|
| 596 |
+
max_recall = max(recall, max_recall)
|
| 597 |
+
# if max_precision-min_precision > max_recall-min_recall:
|
| 598 |
+
# min_recall -= (max_precision-min_precision)/2
|
| 599 |
+
# max_recall += (max_precision-min_precision)/2
|
| 600 |
+
# else:
|
| 601 |
+
# min_precision -= (max_recall-min_recall)/2
|
| 602 |
+
# max_precision += (max_recall-min_recall)/2
|
| 603 |
+
# if min_recall < 0:
|
| 604 |
+
# max_recall -= min_recall
|
| 605 |
+
# min_recall = 0
|
| 606 |
+
# if min_precision < 0:
|
| 607 |
+
# max_precision -= min_precision
|
| 608 |
+
# min_precision = 0
|
| 609 |
+
min_precision = max(min_precision - 0.01, 0)
|
| 610 |
+
min_recall = max(min_recall - 0.01, 0)
|
| 611 |
+
max_precision = min(max_precision + 0.01, 1)
|
| 612 |
+
max_recall = min(max_recall + 0.01, 1)
|
| 613 |
+
else:
|
| 614 |
+
min_precision = min_recall = 0
|
| 615 |
+
max_precision = max_recall = 1
|
| 616 |
+
|
| 617 |
+
# Draw the axis lines & grid lines
|
| 618 |
+
for i in range(11):
|
| 619 |
+
x = left + (right - left) * (
|
| 620 |
+
(i / 10.0 - min_recall) / (max_recall - min_recall)
|
| 621 |
+
)
|
| 622 |
+
y = bot - (bot - top) * (
|
| 623 |
+
(i / 10.0 - min_precision) / (max_precision - min_precision)
|
| 624 |
+
)
|
| 625 |
+
if left < x < right:
|
| 626 |
+
self.evalbox.create_line(x, top, x, bot, fill="#888")
|
| 627 |
+
if top < y < bot:
|
| 628 |
+
self.evalbox.create_line(left, y, right, y, fill="#888")
|
| 629 |
+
self.evalbox.create_line(left, top, left, bot)
|
| 630 |
+
self.evalbox.create_line(left, bot, right, bot)
|
| 631 |
+
|
| 632 |
+
# Display the plot's scale
|
| 633 |
+
self.evalbox.create_text(
|
| 634 |
+
left - 3,
|
| 635 |
+
bot,
|
| 636 |
+
justify="right",
|
| 637 |
+
anchor="se",
|
| 638 |
+
text="%d%%" % (100 * min_precision),
|
| 639 |
+
)
|
| 640 |
+
self.evalbox.create_text(
|
| 641 |
+
left - 3,
|
| 642 |
+
top,
|
| 643 |
+
justify="right",
|
| 644 |
+
anchor="ne",
|
| 645 |
+
text="%d%%" % (100 * max_precision),
|
| 646 |
+
)
|
| 647 |
+
self.evalbox.create_text(
|
| 648 |
+
left,
|
| 649 |
+
bot + 3,
|
| 650 |
+
justify="center",
|
| 651 |
+
anchor="nw",
|
| 652 |
+
text="%d%%" % (100 * min_recall),
|
| 653 |
+
)
|
| 654 |
+
self.evalbox.create_text(
|
| 655 |
+
right,
|
| 656 |
+
bot + 3,
|
| 657 |
+
justify="center",
|
| 658 |
+
anchor="ne",
|
| 659 |
+
text="%d%%" % (100 * max_recall),
|
| 660 |
+
)
|
| 661 |
+
|
| 662 |
+
# Display the scores.
|
| 663 |
+
prev_x = prev_y = None
|
| 664 |
+
for i, (_, precision, recall, fscore) in enumerate(self._history):
|
| 665 |
+
x = left + (right - left) * (
|
| 666 |
+
(recall - min_recall) / (max_recall - min_recall)
|
| 667 |
+
)
|
| 668 |
+
y = bot - (bot - top) * (
|
| 669 |
+
(precision - min_precision) / (max_precision - min_precision)
|
| 670 |
+
)
|
| 671 |
+
if i == self._history_index:
|
| 672 |
+
self.evalbox.create_oval(
|
| 673 |
+
x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000"
|
| 674 |
+
)
|
| 675 |
+
self.status["text"] = (
|
| 676 |
+
"Precision: %.2f%%\t" % (precision * 100)
|
| 677 |
+
+ "Recall: %.2f%%\t" % (recall * 100)
|
| 678 |
+
+ "F-score: %.2f%%" % (fscore * 100)
|
| 679 |
+
)
|
| 680 |
+
else:
|
| 681 |
+
self.evalbox.lower(
|
| 682 |
+
self.evalbox.create_oval(
|
| 683 |
+
x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8"
|
| 684 |
+
)
|
| 685 |
+
)
|
| 686 |
+
if prev_x is not None and self._eval_lines.get():
|
| 687 |
+
self.evalbox.lower(
|
| 688 |
+
self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8")
|
| 689 |
+
)
|
| 690 |
+
prev_x, prev_y = x, y
|
| 691 |
+
|
| 692 |
+
_eval_demon_running = False
|
| 693 |
+
|
| 694 |
+
def _eval_demon(self):
|
| 695 |
+
if self.top is None:
|
| 696 |
+
return
|
| 697 |
+
if self.chunker is None:
|
| 698 |
+
self._eval_demon_running = False
|
| 699 |
+
return
|
| 700 |
+
|
| 701 |
+
# Note our starting time.
|
| 702 |
+
t0 = time.time()
|
| 703 |
+
|
| 704 |
+
# If are still typing, then wait for them to finish.
|
| 705 |
+
if (
|
| 706 |
+
time.time() - self._last_keypress < self._EVAL_DELAY
|
| 707 |
+
and self.normalized_grammar != self._eval_normalized_grammar
|
| 708 |
+
):
|
| 709 |
+
self._eval_demon_running = True
|
| 710 |
+
return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
|
| 711 |
+
|
| 712 |
+
# If the grammar changed, restart the evaluation.
|
| 713 |
+
if self.normalized_grammar != self._eval_normalized_grammar:
|
| 714 |
+
# Check if we've seen this grammar already. If so, then
|
| 715 |
+
# just use the old evaluation values.
|
| 716 |
+
for (g, p, r, f) in self._history:
|
| 717 |
+
if self.normalized_grammar == self.normalize_grammar(g):
|
| 718 |
+
self._history.append((g, p, r, f))
|
| 719 |
+
self._history_index = len(self._history) - 1
|
| 720 |
+
self._eval_plot()
|
| 721 |
+
self._eval_demon_running = False
|
| 722 |
+
self._eval_normalized_grammar = None
|
| 723 |
+
return
|
| 724 |
+
self._eval_index = 0
|
| 725 |
+
self._eval_score = ChunkScore(chunk_label=self._chunk_label)
|
| 726 |
+
self._eval_grammar = self.grammar
|
| 727 |
+
self._eval_normalized_grammar = self.normalized_grammar
|
| 728 |
+
|
| 729 |
+
# If the grammar is empty, the don't bother evaluating it, or
|
| 730 |
+
# recording it in history -- the score will just be 0.
|
| 731 |
+
if self.normalized_grammar.strip() == "":
|
| 732 |
+
# self._eval_index = self._devset_size.get()
|
| 733 |
+
self._eval_demon_running = False
|
| 734 |
+
return
|
| 735 |
+
|
| 736 |
+
# Score the next set of examples
|
| 737 |
+
for gold in self.devset[
|
| 738 |
+
self._eval_index : min(
|
| 739 |
+
self._eval_index + self._EVAL_CHUNK, self._devset_size.get()
|
| 740 |
+
)
|
| 741 |
+
]:
|
| 742 |
+
guess = self._chunkparse(gold.leaves())
|
| 743 |
+
self._eval_score.score(gold, guess)
|
| 744 |
+
|
| 745 |
+
# update our index in the devset.
|
| 746 |
+
self._eval_index += self._EVAL_CHUNK
|
| 747 |
+
|
| 748 |
+
# Check if we're done
|
| 749 |
+
if self._eval_index >= self._devset_size.get():
|
| 750 |
+
self._history.append(
|
| 751 |
+
(
|
| 752 |
+
self._eval_grammar,
|
| 753 |
+
self._eval_score.precision(),
|
| 754 |
+
self._eval_score.recall(),
|
| 755 |
+
self._eval_score.f_measure(),
|
| 756 |
+
)
|
| 757 |
+
)
|
| 758 |
+
self._history_index = len(self._history) - 1
|
| 759 |
+
self._eval_plot()
|
| 760 |
+
self._eval_demon_running = False
|
| 761 |
+
self._eval_normalized_grammar = None
|
| 762 |
+
else:
|
| 763 |
+
progress = 100 * self._eval_index / self._devset_size.get()
|
| 764 |
+
self.status["text"] = "Evaluating on Development Set (%d%%)" % progress
|
| 765 |
+
self._eval_demon_running = True
|
| 766 |
+
self._adaptively_modify_eval_chunk(time.time() - t0)
|
| 767 |
+
self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon)
|
| 768 |
+
|
| 769 |
+
def _adaptively_modify_eval_chunk(self, t):
|
| 770 |
+
"""
|
| 771 |
+
Modify _EVAL_CHUNK to try to keep the amount of time that the
|
| 772 |
+
eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX.
|
| 773 |
+
|
| 774 |
+
:param t: The amount of time that the eval demon took.
|
| 775 |
+
"""
|
| 776 |
+
if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5:
|
| 777 |
+
self._EVAL_CHUNK = min(
|
| 778 |
+
self._EVAL_CHUNK - 1,
|
| 779 |
+
max(
|
| 780 |
+
int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)),
|
| 781 |
+
self._EVAL_CHUNK - 10,
|
| 782 |
+
),
|
| 783 |
+
)
|
| 784 |
+
elif t < self._EVAL_DEMON_MIN:
|
| 785 |
+
self._EVAL_CHUNK = max(
|
| 786 |
+
self._EVAL_CHUNK + 1,
|
| 787 |
+
min(
|
| 788 |
+
int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)),
|
| 789 |
+
self._EVAL_CHUNK + 10,
|
| 790 |
+
),
|
| 791 |
+
)
|
| 792 |
+
|
| 793 |
+
def _init_widgets(self, top):
|
| 794 |
+
frame0 = Frame(top, **self._FRAME_PARAMS)
|
| 795 |
+
frame0.grid_columnconfigure(0, weight=4)
|
| 796 |
+
frame0.grid_columnconfigure(3, weight=2)
|
| 797 |
+
frame0.grid_rowconfigure(1, weight=1)
|
| 798 |
+
frame0.grid_rowconfigure(5, weight=1)
|
| 799 |
+
|
| 800 |
+
# The grammar
|
| 801 |
+
self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS)
|
| 802 |
+
self.grammarlabel = Label(
|
| 803 |
+
frame0,
|
| 804 |
+
font=self._font,
|
| 805 |
+
text="Grammar:",
|
| 806 |
+
highlightcolor="black",
|
| 807 |
+
background=self._GRAMMARBOX_PARAMS["background"],
|
| 808 |
+
)
|
| 809 |
+
self.grammarlabel.grid(column=0, row=0, sticky="SW")
|
| 810 |
+
self.grammarbox.grid(column=0, row=1, sticky="NEWS")
|
| 811 |
+
|
| 812 |
+
# Scroll bar for grammar
|
| 813 |
+
grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview)
|
| 814 |
+
grammar_scrollbar.grid(column=1, row=1, sticky="NWS")
|
| 815 |
+
self.grammarbox.config(yscrollcommand=grammar_scrollbar.set)
|
| 816 |
+
|
| 817 |
+
# grammar buttons
|
| 818 |
+
bg = self._FRAME_PARAMS["background"]
|
| 819 |
+
frame3 = Frame(frame0, background=bg)
|
| 820 |
+
frame3.grid(column=0, row=2, sticky="EW")
|
| 821 |
+
Button(
|
| 822 |
+
frame3,
|
| 823 |
+
text="Prev Grammar",
|
| 824 |
+
command=self._history_prev,
|
| 825 |
+
**self._BUTTON_PARAMS,
|
| 826 |
+
).pack(side="left")
|
| 827 |
+
Button(
|
| 828 |
+
frame3,
|
| 829 |
+
text="Next Grammar",
|
| 830 |
+
command=self._history_next,
|
| 831 |
+
**self._BUTTON_PARAMS,
|
| 832 |
+
).pack(side="left")
|
| 833 |
+
|
| 834 |
+
# Help box
|
| 835 |
+
self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS)
|
| 836 |
+
self.helpbox.grid(column=3, row=1, sticky="NEWS")
|
| 837 |
+
self.helptabs = {}
|
| 838 |
+
bg = self._FRAME_PARAMS["background"]
|
| 839 |
+
helptab_frame = Frame(frame0, background=bg)
|
| 840 |
+
helptab_frame.grid(column=3, row=0, sticky="SW")
|
| 841 |
+
for i, (tab, tabstops, text) in enumerate(self.HELP):
|
| 842 |
+
label = Label(helptab_frame, text=tab, font=self._smallfont)
|
| 843 |
+
label.grid(column=i * 2, row=0, sticky="S")
|
| 844 |
+
# help_frame.grid_columnconfigure(i, weight=1)
|
| 845 |
+
# label.pack(side='left')
|
| 846 |
+
label.bind("<ButtonPress>", lambda e, tab=tab: self.show_help(tab))
|
| 847 |
+
self.helptabs[tab] = label
|
| 848 |
+
Frame(
|
| 849 |
+
helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg
|
| 850 |
+
).grid(column=i * 2 + 1, row=0)
|
| 851 |
+
self.helptabs[self.HELP[0][0]].configure(font=self._font)
|
| 852 |
+
self.helpbox.tag_config("elide", elide=True)
|
| 853 |
+
for (tag, params) in self.HELP_AUTOTAG:
|
| 854 |
+
self.helpbox.tag_config("tag-%s" % tag, **params)
|
| 855 |
+
self.show_help(self.HELP[0][0])
|
| 856 |
+
|
| 857 |
+
# Scroll bar for helpbox
|
| 858 |
+
help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview)
|
| 859 |
+
self.helpbox.config(yscrollcommand=help_scrollbar.set)
|
| 860 |
+
help_scrollbar.grid(column=4, row=1, sticky="NWS")
|
| 861 |
+
|
| 862 |
+
# The dev set
|
| 863 |
+
frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"])
|
| 864 |
+
self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS)
|
| 865 |
+
self.devsetbox.pack(expand=True, fill="both")
|
| 866 |
+
self.devsetlabel = Label(
|
| 867 |
+
frame0,
|
| 868 |
+
font=self._font,
|
| 869 |
+
text="Development Set:",
|
| 870 |
+
justify="right",
|
| 871 |
+
background=self._DEVSETBOX_PARAMS["background"],
|
| 872 |
+
)
|
| 873 |
+
self.devsetlabel.grid(column=0, row=4, sticky="SW")
|
| 874 |
+
frame4.grid(column=0, row=5, sticky="NEWS")
|
| 875 |
+
|
| 876 |
+
# dev set scrollbars
|
| 877 |
+
self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll)
|
| 878 |
+
self.devset_scroll.grid(column=1, row=5, sticky="NWS")
|
| 879 |
+
self.devset_xscroll = Scrollbar(
|
| 880 |
+
frame4, command=self.devsetbox.xview, orient="horiz"
|
| 881 |
+
)
|
| 882 |
+
self.devsetbox["xscrollcommand"] = self.devset_xscroll.set
|
| 883 |
+
self.devset_xscroll.pack(side="bottom", fill="x")
|
| 884 |
+
|
| 885 |
+
# dev set buttons
|
| 886 |
+
bg = self._FRAME_PARAMS["background"]
|
| 887 |
+
frame1 = Frame(frame0, background=bg)
|
| 888 |
+
frame1.grid(column=0, row=7, sticky="EW")
|
| 889 |
+
Button(
|
| 890 |
+
frame1,
|
| 891 |
+
text="Prev Example (Ctrl-p)",
|
| 892 |
+
command=self._devset_prev,
|
| 893 |
+
**self._BUTTON_PARAMS,
|
| 894 |
+
).pack(side="left")
|
| 895 |
+
Button(
|
| 896 |
+
frame1,
|
| 897 |
+
text="Next Example (Ctrl-n)",
|
| 898 |
+
command=self._devset_next,
|
| 899 |
+
**self._BUTTON_PARAMS,
|
| 900 |
+
).pack(side="left")
|
| 901 |
+
self.devset_button = Button(
|
| 902 |
+
frame1,
|
| 903 |
+
text="Show example",
|
| 904 |
+
command=self.show_devset,
|
| 905 |
+
state="disabled",
|
| 906 |
+
**self._BUTTON_PARAMS,
|
| 907 |
+
)
|
| 908 |
+
self.devset_button.pack(side="right")
|
| 909 |
+
self.trace_button = Button(
|
| 910 |
+
frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS
|
| 911 |
+
)
|
| 912 |
+
self.trace_button.pack(side="right")
|
| 913 |
+
|
| 914 |
+
# evaluation box
|
| 915 |
+
self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS)
|
| 916 |
+
label = Label(
|
| 917 |
+
frame0,
|
| 918 |
+
font=self._font,
|
| 919 |
+
text="Evaluation:",
|
| 920 |
+
justify="right",
|
| 921 |
+
background=self._EVALBOX_PARAMS["background"],
|
| 922 |
+
)
|
| 923 |
+
label.grid(column=3, row=4, sticky="SW")
|
| 924 |
+
self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2)
|
| 925 |
+
|
| 926 |
+
# evaluation box buttons
|
| 927 |
+
bg = self._FRAME_PARAMS["background"]
|
| 928 |
+
frame2 = Frame(frame0, background=bg)
|
| 929 |
+
frame2.grid(column=3, row=7, sticky="EW")
|
| 930 |
+
self._autoscale = IntVar(self.top)
|
| 931 |
+
self._autoscale.set(False)
|
| 932 |
+
Checkbutton(
|
| 933 |
+
frame2,
|
| 934 |
+
variable=self._autoscale,
|
| 935 |
+
command=self._eval_plot,
|
| 936 |
+
text="Zoom",
|
| 937 |
+
**self._BUTTON_PARAMS,
|
| 938 |
+
).pack(side="left")
|
| 939 |
+
self._eval_lines = IntVar(self.top)
|
| 940 |
+
self._eval_lines.set(False)
|
| 941 |
+
Checkbutton(
|
| 942 |
+
frame2,
|
| 943 |
+
variable=self._eval_lines,
|
| 944 |
+
command=self._eval_plot,
|
| 945 |
+
text="Lines",
|
| 946 |
+
**self._BUTTON_PARAMS,
|
| 947 |
+
).pack(side="left")
|
| 948 |
+
Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right")
|
| 949 |
+
|
| 950 |
+
# The status label
|
| 951 |
+
self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS)
|
| 952 |
+
self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5)
|
| 953 |
+
|
| 954 |
+
# Help box & devset box can't be edited.
|
| 955 |
+
self.helpbox["state"] = "disabled"
|
| 956 |
+
self.devsetbox["state"] = "disabled"
|
| 957 |
+
|
| 958 |
+
# Spacers
|
| 959 |
+
bg = self._FRAME_PARAMS["background"]
|
| 960 |
+
Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3)
|
| 961 |
+
Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0)
|
| 962 |
+
Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8)
|
| 963 |
+
|
| 964 |
+
# pack the frame.
|
| 965 |
+
frame0.pack(fill="both", expand=True)
|
| 966 |
+
|
| 967 |
+
# Set up colors for the devset box
|
| 968 |
+
self.devsetbox.tag_config("true-pos", background="#afa", underline="True")
|
| 969 |
+
self.devsetbox.tag_config("false-neg", underline="True", foreground="#800")
|
| 970 |
+
self.devsetbox.tag_config("false-pos", background="#faa")
|
| 971 |
+
self.devsetbox.tag_config("trace", foreground="#666", wrap="none")
|
| 972 |
+
self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none")
|
| 973 |
+
self.devsetbox.tag_config("error", foreground="#800")
|
| 974 |
+
|
| 975 |
+
# And for the grammarbox
|
| 976 |
+
self.grammarbox.tag_config("error", background="#fec")
|
| 977 |
+
self.grammarbox.tag_config("comment", foreground="#840")
|
| 978 |
+
self.grammarbox.tag_config("angle", foreground="#00f")
|
| 979 |
+
self.grammarbox.tag_config("brace", foreground="#0a0")
|
| 980 |
+
self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40)
|
| 981 |
+
|
| 982 |
+
_showing_trace = False
|
| 983 |
+
|
| 984 |
+
def show_trace(self, *e):
|
| 985 |
+
self._showing_trace = True
|
| 986 |
+
self.trace_button["state"] = "disabled"
|
| 987 |
+
self.devset_button["state"] = "normal"
|
| 988 |
+
|
| 989 |
+
self.devsetbox["state"] = "normal"
|
| 990 |
+
# self.devsetbox['wrap'] = 'none'
|
| 991 |
+
self.devsetbox.delete("1.0", "end")
|
| 992 |
+
self.devsetlabel["text"] = "Development Set (%d/%d)" % (
|
| 993 |
+
(self.devset_index + 1, self._devset_size.get())
|
| 994 |
+
)
|
| 995 |
+
|
| 996 |
+
if self.chunker is None:
|
| 997 |
+
self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.")
|
| 998 |
+
self.devsetbox.tag_add("error", "1.0", "end")
|
| 999 |
+
return # can't do anything more
|
| 1000 |
+
|
| 1001 |
+
gold_tree = self.devset[self.devset_index]
|
| 1002 |
+
rules = self.chunker.rules()
|
| 1003 |
+
|
| 1004 |
+
# Calculate the tag sequence
|
| 1005 |
+
tagseq = "\t"
|
| 1006 |
+
charnum = [1]
|
| 1007 |
+
for wordnum, (word, pos) in enumerate(gold_tree.leaves()):
|
| 1008 |
+
tagseq += "%s " % pos
|
| 1009 |
+
charnum.append(len(tagseq))
|
| 1010 |
+
self.charnum = {
|
| 1011 |
+
(i, j): charnum[j]
|
| 1012 |
+
for i in range(len(rules) + 1)
|
| 1013 |
+
for j in range(len(charnum))
|
| 1014 |
+
}
|
| 1015 |
+
self.linenum = {i: i * 2 + 2 for i in range(len(rules) + 1)}
|
| 1016 |
+
|
| 1017 |
+
for i in range(len(rules) + 1):
|
| 1018 |
+
if i == 0:
|
| 1019 |
+
self.devsetbox.insert("end", "Start:\n")
|
| 1020 |
+
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
| 1021 |
+
else:
|
| 1022 |
+
self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1])
|
| 1023 |
+
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
| 1024 |
+
# Display the tag sequence.
|
| 1025 |
+
self.devsetbox.insert("end", tagseq + "\n")
|
| 1026 |
+
self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c")
|
| 1027 |
+
# Run a partial parser, and extract gold & test chunks
|
| 1028 |
+
chunker = RegexpChunkParser(rules[:i])
|
| 1029 |
+
test_tree = self._chunkparse(gold_tree.leaves())
|
| 1030 |
+
gold_chunks = self._chunks(gold_tree)
|
| 1031 |
+
test_chunks = self._chunks(test_tree)
|
| 1032 |
+
# Compare them.
|
| 1033 |
+
for chunk in gold_chunks.intersection(test_chunks):
|
| 1034 |
+
self._color_chunk(i, chunk, "true-pos")
|
| 1035 |
+
for chunk in gold_chunks - test_chunks:
|
| 1036 |
+
self._color_chunk(i, chunk, "false-neg")
|
| 1037 |
+
for chunk in test_chunks - gold_chunks:
|
| 1038 |
+
self._color_chunk(i, chunk, "false-pos")
|
| 1039 |
+
self.devsetbox.insert("end", "Finished.\n")
|
| 1040 |
+
self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c")
|
| 1041 |
+
|
| 1042 |
+
# This is a hack, because the x-scrollbar isn't updating its
|
| 1043 |
+
# position right -- I'm not sure what the underlying cause is
|
| 1044 |
+
# though. (This is on OS X w/ python 2.5)
|
| 1045 |
+
self.top.after(100, self.devset_xscroll.set, 0, 0.3)
|
| 1046 |
+
|
| 1047 |
+
def show_help(self, tab):
|
| 1048 |
+
self.helpbox["state"] = "normal"
|
| 1049 |
+
self.helpbox.delete("1.0", "end")
|
| 1050 |
+
for (name, tabstops, text) in self.HELP:
|
| 1051 |
+
if name == tab:
|
| 1052 |
+
text = text.replace(
|
| 1053 |
+
"<<TAGSET>>",
|
| 1054 |
+
"\n".join(
|
| 1055 |
+
"\t%s\t%s" % item
|
| 1056 |
+
for item in sorted(
|
| 1057 |
+
list(self.tagset.items()),
|
| 1058 |
+
key=lambda t_w: re.match(r"\w+", t_w[0])
|
| 1059 |
+
and (0, t_w[0])
|
| 1060 |
+
or (1, t_w[0]),
|
| 1061 |
+
)
|
| 1062 |
+
),
|
| 1063 |
+
)
|
| 1064 |
+
|
| 1065 |
+
self.helptabs[name].config(**self._HELPTAB_FG_PARAMS)
|
| 1066 |
+
self.helpbox.config(tabs=tabstops)
|
| 1067 |
+
self.helpbox.insert("1.0", text + "\n" * 20)
|
| 1068 |
+
C = "1.0 + %d chars"
|
| 1069 |
+
for (tag, params) in self.HELP_AUTOTAG:
|
| 1070 |
+
pattern = f"(?s)(<{tag}>)(.*?)(</{tag}>)"
|
| 1071 |
+
for m in re.finditer(pattern, text):
|
| 1072 |
+
self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1))
|
| 1073 |
+
self.helpbox.tag_add(
|
| 1074 |
+
"tag-%s" % tag, C % m.start(2), C % m.end(2)
|
| 1075 |
+
)
|
| 1076 |
+
self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3))
|
| 1077 |
+
else:
|
| 1078 |
+
self.helptabs[name].config(**self._HELPTAB_BG_PARAMS)
|
| 1079 |
+
self.helpbox["state"] = "disabled"
|
| 1080 |
+
|
| 1081 |
+
def _history_prev(self, *e):
|
| 1082 |
+
self._view_history(self._history_index - 1)
|
| 1083 |
+
return "break"
|
| 1084 |
+
|
| 1085 |
+
def _history_next(self, *e):
|
| 1086 |
+
self._view_history(self._history_index + 1)
|
| 1087 |
+
return "break"
|
| 1088 |
+
|
| 1089 |
+
def _view_history(self, index):
|
| 1090 |
+
# Bounds & sanity checking:
|
| 1091 |
+
index = max(0, min(len(self._history) - 1, index))
|
| 1092 |
+
if not self._history:
|
| 1093 |
+
return
|
| 1094 |
+
# Already viewing the requested history item?
|
| 1095 |
+
if index == self._history_index:
|
| 1096 |
+
return
|
| 1097 |
+
# Show the requested grammar. It will get added to _history
|
| 1098 |
+
# only if they edit it (causing self.update() to get run.)
|
| 1099 |
+
self.grammarbox["state"] = "normal"
|
| 1100 |
+
self.grammarbox.delete("1.0", "end")
|
| 1101 |
+
self.grammarbox.insert("end", self._history[index][0])
|
| 1102 |
+
self.grammarbox.mark_set("insert", "1.0")
|
| 1103 |
+
self._history_index = index
|
| 1104 |
+
self._syntax_highlight_grammar(self._history[index][0])
|
| 1105 |
+
# Record the normalized grammar & regenerate the chunker.
|
| 1106 |
+
self.normalized_grammar = self.normalize_grammar(self._history[index][0])
|
| 1107 |
+
if self.normalized_grammar:
|
| 1108 |
+
rules = [
|
| 1109 |
+
RegexpChunkRule.fromstring(line)
|
| 1110 |
+
for line in self.normalized_grammar.split("\n")
|
| 1111 |
+
]
|
| 1112 |
+
else:
|
| 1113 |
+
rules = []
|
| 1114 |
+
self.chunker = RegexpChunkParser(rules)
|
| 1115 |
+
# Show the score.
|
| 1116 |
+
self._eval_plot()
|
| 1117 |
+
# Update the devset box
|
| 1118 |
+
self._highlight_devset()
|
| 1119 |
+
if self._showing_trace:
|
| 1120 |
+
self.show_trace()
|
| 1121 |
+
# Update the grammar label
|
| 1122 |
+
if self._history_index < len(self._history) - 1:
|
| 1123 |
+
self.grammarlabel["text"] = "Grammar {}/{}:".format(
|
| 1124 |
+
self._history_index + 1,
|
| 1125 |
+
len(self._history),
|
| 1126 |
+
)
|
| 1127 |
+
else:
|
| 1128 |
+
self.grammarlabel["text"] = "Grammar:"
|
| 1129 |
+
|
| 1130 |
+
def _devset_next(self, *e):
|
| 1131 |
+
self._devset_scroll("scroll", 1, "page")
|
| 1132 |
+
return "break"
|
| 1133 |
+
|
| 1134 |
+
def _devset_prev(self, *e):
|
| 1135 |
+
self._devset_scroll("scroll", -1, "page")
|
| 1136 |
+
return "break"
|
| 1137 |
+
|
| 1138 |
+
def destroy(self, *e):
|
| 1139 |
+
if self.top is None:
|
| 1140 |
+
return
|
| 1141 |
+
self.top.destroy()
|
| 1142 |
+
self.top = None
|
| 1143 |
+
|
| 1144 |
+
def _devset_scroll(self, command, *args):
|
| 1145 |
+
N = 1 # size of a page -- one sentence.
|
| 1146 |
+
showing_trace = self._showing_trace
|
| 1147 |
+
if command == "scroll" and args[1].startswith("unit"):
|
| 1148 |
+
self.show_devset(self.devset_index + int(args[0]))
|
| 1149 |
+
elif command == "scroll" and args[1].startswith("page"):
|
| 1150 |
+
self.show_devset(self.devset_index + N * int(args[0]))
|
| 1151 |
+
elif command == "moveto":
|
| 1152 |
+
self.show_devset(int(float(args[0]) * self._devset_size.get()))
|
| 1153 |
+
else:
|
| 1154 |
+
assert 0, f"bad scroll command {command} {args}"
|
| 1155 |
+
if showing_trace:
|
| 1156 |
+
self.show_trace()
|
| 1157 |
+
|
| 1158 |
+
def show_devset(self, index=None):
|
| 1159 |
+
if index is None:
|
| 1160 |
+
index = self.devset_index
|
| 1161 |
+
|
| 1162 |
+
# Bounds checking
|
| 1163 |
+
index = min(max(0, index), self._devset_size.get() - 1)
|
| 1164 |
+
|
| 1165 |
+
if index == self.devset_index and not self._showing_trace:
|
| 1166 |
+
return
|
| 1167 |
+
self.devset_index = index
|
| 1168 |
+
|
| 1169 |
+
self._showing_trace = False
|
| 1170 |
+
self.trace_button["state"] = "normal"
|
| 1171 |
+
self.devset_button["state"] = "disabled"
|
| 1172 |
+
|
| 1173 |
+
# Clear the text box.
|
| 1174 |
+
self.devsetbox["state"] = "normal"
|
| 1175 |
+
self.devsetbox["wrap"] = "word"
|
| 1176 |
+
self.devsetbox.delete("1.0", "end")
|
| 1177 |
+
self.devsetlabel["text"] = "Development Set (%d/%d)" % (
|
| 1178 |
+
(self.devset_index + 1, self._devset_size.get())
|
| 1179 |
+
)
|
| 1180 |
+
|
| 1181 |
+
# Add the sentences
|
| 1182 |
+
sample = self.devset[self.devset_index : self.devset_index + 1]
|
| 1183 |
+
self.charnum = {}
|
| 1184 |
+
self.linenum = {0: 1}
|
| 1185 |
+
for sentnum, sent in enumerate(sample):
|
| 1186 |
+
linestr = ""
|
| 1187 |
+
for wordnum, (word, pos) in enumerate(sent.leaves()):
|
| 1188 |
+
self.charnum[sentnum, wordnum] = len(linestr)
|
| 1189 |
+
linestr += f"{word}/{pos} "
|
| 1190 |
+
self.charnum[sentnum, wordnum + 1] = len(linestr)
|
| 1191 |
+
self.devsetbox.insert("end", linestr[:-1] + "\n\n")
|
| 1192 |
+
|
| 1193 |
+
# Highlight chunks in the dev set
|
| 1194 |
+
if self.chunker is not None:
|
| 1195 |
+
self._highlight_devset()
|
| 1196 |
+
self.devsetbox["state"] = "disabled"
|
| 1197 |
+
|
| 1198 |
+
# Update the scrollbar
|
| 1199 |
+
first = self.devset_index / self._devset_size.get()
|
| 1200 |
+
last = (self.devset_index + 2) / self._devset_size.get()
|
| 1201 |
+
self.devset_scroll.set(first, last)
|
| 1202 |
+
|
| 1203 |
+
def _chunks(self, tree):
|
| 1204 |
+
chunks = set()
|
| 1205 |
+
wordnum = 0
|
| 1206 |
+
for child in tree:
|
| 1207 |
+
if isinstance(child, Tree):
|
| 1208 |
+
if child.label() == self._chunk_label:
|
| 1209 |
+
chunks.add((wordnum, wordnum + len(child)))
|
| 1210 |
+
wordnum += len(child)
|
| 1211 |
+
else:
|
| 1212 |
+
wordnum += 1
|
| 1213 |
+
return chunks
|
| 1214 |
+
|
| 1215 |
+
def _syntax_highlight_grammar(self, grammar):
|
| 1216 |
+
if self.top is None:
|
| 1217 |
+
return
|
| 1218 |
+
self.grammarbox.tag_remove("comment", "1.0", "end")
|
| 1219 |
+
self.grammarbox.tag_remove("angle", "1.0", "end")
|
| 1220 |
+
self.grammarbox.tag_remove("brace", "1.0", "end")
|
| 1221 |
+
self.grammarbox.tag_add("hangindent", "1.0", "end")
|
| 1222 |
+
for lineno, line in enumerate(grammar.split("\n")):
|
| 1223 |
+
if not line.strip():
|
| 1224 |
+
continue
|
| 1225 |
+
m = re.match(r"(\\.|[^#])*(#.*)?", line)
|
| 1226 |
+
comment_start = None
|
| 1227 |
+
if m.group(2):
|
| 1228 |
+
comment_start = m.start(2)
|
| 1229 |
+
s = "%d.%d" % (lineno + 1, m.start(2))
|
| 1230 |
+
e = "%d.%d" % (lineno + 1, m.end(2))
|
| 1231 |
+
self.grammarbox.tag_add("comment", s, e)
|
| 1232 |
+
for m in re.finditer("[<>{}]", line):
|
| 1233 |
+
if comment_start is not None and m.start() >= comment_start:
|
| 1234 |
+
break
|
| 1235 |
+
s = "%d.%d" % (lineno + 1, m.start())
|
| 1236 |
+
e = "%d.%d" % (lineno + 1, m.end())
|
| 1237 |
+
if m.group() in "<>":
|
| 1238 |
+
self.grammarbox.tag_add("angle", s, e)
|
| 1239 |
+
else:
|
| 1240 |
+
self.grammarbox.tag_add("brace", s, e)
|
| 1241 |
+
|
| 1242 |
+
def _grammarcheck(self, grammar):
    """Try to parse every non-comment line of *grammar* and mark each
    line that fails with the "error" tag in the grammar text box."""
    if self.top is None:
        return
    self.grammarbox.tag_remove("error", "1.0", "end")
    self._grammarcheck_errs = []
    for lineno, raw in enumerate(grammar.split("\n"), start=1):
        # Strip the (unescaped) comment, then surrounding whitespace.
        rule_text = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", raw).strip()
        if not rule_text:
            continue
        try:
            RegexpChunkRule.fromstring(rule_text)
        except ValueError:
            self.grammarbox.tag_add(
                "error", "%s.0" % lineno, "%s.0 lineend" % lineno
            )
    self.status["text"] = ""
|
| 1258 |
+
|
| 1259 |
+
def update(self, *event):
    """Re-read the grammar text box and refresh the display.

    Called directly and as a Tk event callback.  Does nothing when the
    normalized grammar text is unchanged; otherwise rebuilds
    ``self.chunker`` and re-highlights either the trace or the dev set.
    """
    # Record when update was called (used by the grammar-check demon).
    if event:
        self._last_keypress = time.time()

    # Read the grammar from the Text box.
    self.grammar = grammar = self.grammarbox.get("1.0", "end")

    # If the grammar hasn't changed (modulo normalization), do nothing.
    normalized = self.normalize_grammar(grammar)
    if normalized == self.normalized_grammar:
        return
    self.normalized_grammar = normalized

    # Editing the grammar while browsing history stops history browsing.
    if self._history_index < len(self._history) - 1:
        self.grammarlabel["text"] = "Grammar:"

    self._syntax_highlight_grammar(grammar)

    # Try parsing the new grammar; on failure, just flag the bad lines.
    try:
        # Note: the normalized grammar has no blank lines.
        rules = (
            [RegexpChunkRule.fromstring(line) for line in normalized.split("\n")]
            if normalized
            else []
        )
    except ValueError:
        # Use the un-normalized grammar for error highlighting.
        self._grammarcheck(grammar)
        self.chunker = None
        return

    self.chunker = RegexpChunkParser(rules)
    self.grammarbox.tag_remove("error", "1.0", "end")
    self.grammar_changed = time.time()
    # Display the results.
    if self._showing_trace:
        self.show_trace()
    else:
        self._highlight_devset()
    # Start the eval demon if it's not already running.
    if not self._eval_demon_running:
        self._eval_demon()
|
| 1309 |
+
|
| 1310 |
+
def _highlight_devset(self, sample=None):
    """Run the current chunker over *sample* (default: the single
    sentence at ``self.devset_index``) and tag every chunk span as a
    true positive, false negative, or false positive."""
    if sample is None:
        sample = self.devset[self.devset_index : self.devset_index + 1]

    for tag in ("true-pos", "false-neg", "false-pos"):
        self.devsetbox.tag_remove(tag, "1.0", "end")

    # Run the grammar on each test sentence and compare chunk sets.
    for sentnum, gold_tree in enumerate(sample):
        test_tree = self._chunkparse(gold_tree.leaves())
        gold_chunks = self._chunks(gold_tree)
        test_chunks = self._chunks(test_tree)
        for span in gold_chunks & test_chunks:
            self._color_chunk(sentnum, span, "true-pos")
        for span in gold_chunks - test_chunks:
            self._color_chunk(sentnum, span, "false-neg")
        for span in test_chunks - gold_chunks:
            self._color_chunk(sentnum, span, "false-pos")
|
| 1332 |
+
|
| 1333 |
+
def _chunkparse(self, words):
|
| 1334 |
+
try:
|
| 1335 |
+
return self.chunker.parse(words)
|
| 1336 |
+
except (ValueError, IndexError) as e:
|
| 1337 |
+
# There's an error somewhere in the grammar, but we're not sure
|
| 1338 |
+
# exactly where, so just mark the whole grammar as bad.
|
| 1339 |
+
# E.g., this is caused by: "({<NN>})"
|
| 1340 |
+
self.grammarbox.tag_add("error", "1.0", "end")
|
| 1341 |
+
# Treat it as tagging nothing:
|
| 1342 |
+
return words
|
| 1343 |
+
|
| 1344 |
+
def _color_chunk(self, sentnum, chunk, tag):
|
| 1345 |
+
start, end = chunk
|
| 1346 |
+
self.devsetbox.tag_add(
|
| 1347 |
+
tag,
|
| 1348 |
+
f"{self.linenum[sentnum]}.{self.charnum[sentnum, start]}",
|
| 1349 |
+
f"{self.linenum[sentnum]}.{self.charnum[sentnum, end] - 1}",
|
| 1350 |
+
)
|
| 1351 |
+
|
| 1352 |
+
def reset(self):
    """Discard the grammar, chunker, and edit history, and restore the
    initial on-screen state."""
    # Clear model state.
    self.chunker = self.grammar = self.normalized_grammar = None
    self.grammar_changed = 0
    self._history = []
    self._history_index = 0
    # Refresh the on-screen display.
    self.grammarbox.delete("1.0", "end")
    self.show_devset(0)
    self.update()
    # self._eval_plot()
|
| 1365 |
+
|
| 1366 |
+
# Header template used by save_grammar(); %-style fields are filled in
# with the save date, dev-set name, evaluation scores, and grammar text.
SAVE_GRAMMAR_TEMPLATE = "\n".join(
    [
        "# Regexp Chunk Parsing Grammar",
        "# Saved %(date)s",
        "#",
        "# Development set: %(devset)s",
        "# Precision: %(precision)s",
        "# Recall: %(recall)s",
        "# F-score: %(fscore)s",
        "",
        "%(grammar)s",
    ]
) + "\n"
|
| 1376 |
+
|
| 1377 |
+
def save_grammar(self, filename=None):
    """Write the current grammar to *filename*, with a header recording
    the save date, dev-set name, and evaluation scores.

    When *filename* is omitted, prompt with a save dialog; silently
    return if the user cancels.
    """
    if not filename:
        # Fixed typo in the file-type label: "Gramamr" -> "Grammar".
        ftypes = [("Chunk Grammar", ".chunk"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk")
        if not filename:
            return
    if self._history and self.normalized_grammar == self.normalize_grammar(
        self._history[-1][0]
    ):
        # Current grammar is the most recently evaluated one: use its scores.
        precision, recall, fscore = (
            "%.2f%%" % (100 * v) for v in self._history[-1][1:]
        )
    elif self.chunker is None:
        precision = recall = fscore = "Grammar not well formed"
    else:
        precision = recall = fscore = "Not finished evaluation yet"

    with open(filename, "w") as outfile:
        outfile.write(
            self.SAVE_GRAMMAR_TEMPLATE
            % dict(
                date=time.ctime(),
                devset=self.devset_name,
                precision=precision,
                recall=recall,
                fscore=fscore,
                grammar=self.grammar.strip(),
            )
        )
|
| 1406 |
+
|
| 1407 |
+
def load_grammar(self, filename=None):
    """Load a grammar from *filename* into the grammar text box,
    stripping any header previously written by ``save_grammar()``.

    When *filename* is omitted, prompt with an open dialog; silently
    return if the user cancels.
    """
    if not filename:
        # Fixed typo in the file-type label: "Gramamr" -> "Grammar".
        ftypes = [("Chunk Grammar", ".chunk"), ("All files", "*")]
        filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk")
        if not filename:
            return
    self.grammarbox.delete("1.0", "end")
    self.update()
    with open(filename) as infile:
        grammar = infile.read()
    # Drop the "# Regexp Chunk Parsing Grammar ... F-score: ..." header.
    grammar = re.sub(
        r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar
    ).lstrip()
    self.grammarbox.insert("1.0", grammar)
    self.update()
|
| 1422 |
+
|
| 1423 |
+
def save_history(self, filename=None):
    """Write every grammar in the edit history, with its evaluation
    scores, to *filename*; append the current (unevaluated or
    ill-formed) grammar when it differs from the last evaluated one.

    When *filename* is omitted, prompt with a save dialog; silently
    return if the user cancels.
    """
    if not filename:
        # Fixed typo in the file-type label: "Gramamr" -> "Grammar".
        ftypes = [("Chunk Grammar History", ".txt"), ("All files", "*")]
        filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt")
        if not filename:
            return

    with open(filename, "w") as outfile:
        outfile.write("# Regexp Chunk Parsing Grammar History\n")
        outfile.write("# Saved %s\n" % time.ctime())
        outfile.write("# Development set: %s\n" % self.devset_name)
        for i, (g, p, r, f) in enumerate(self._history):
            hdr = (
                "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, "
                "fscore=%.2f%%)"
                % (i + 1, len(self._history), p * 100, r * 100, f * 100)
            )
            outfile.write("\n%s\n" % hdr)
            # BUG FIX: split on newlines, not arbitrary whitespace --
            # .split() broke each grammar rule (which contains spaces)
            # into one token per output line.
            outfile.write(
                "".join(" %s\n" % line for line in g.strip().split("\n"))
            )

        if not (
            self._history
            and self.normalized_grammar
            == self.normalize_grammar(self._history[-1][0])
        ):
            if self.chunker is None:
                outfile.write("\nCurrent Grammar (not well-formed)\n")
            else:
                outfile.write("\nCurrent Grammar (not evaluated)\n")
            outfile.write(
                "".join(
                    " %s\n" % line for line in self.grammar.strip().split("\n")
                )
            )
|
| 1455 |
+
|
| 1456 |
+
def about(self, *e):
    """Pop up a small "about" dialog for the application."""
    ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper"
    TITLE = "About: Regular Expression Chunk Parser Application"
    try:
        from tkinter.messagebox import Message

        Message(message=ABOUT, title=TITLE).show()
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are no longer swallowed; fall back to a plain text window when
    # the messagebox is unavailable.
    except Exception:
        ShowText(self.top, TITLE, ABOUT)
|
| 1465 |
+
|
| 1466 |
+
def set_devset_size(self, size=None):
    """Set how many dev-set sentences are used (clipped to the dev-set
    length), then redraw the dev-set view."""
    if size is not None:
        self._devset_size.set(size)
    self._devset_size.set(min(len(self.devset), self._devset_size.get()))
    # Force a redraw by paging forward and back.
    self.show_devset(1)
    self.show_devset(0)
    # what about history?  Evaluated at diff dev set sizes!
|
| 1473 |
+
|
| 1474 |
+
def resize(self, size=None):
    """Set the base font size; the small font is scaled to 14/20 of it
    but never larger than 10 points.  Negative values follow the Tk
    convention for pixel-sized fonts."""
    if size is not None:
        self._size.set(size)
    points = abs(self._size.get())
    self._font.configure(size=-points)
    self._smallfont.configure(size=min(-10, -points * 14 // 20))
|
| 1480 |
+
|
| 1481 |
+
def mainloop(self, *args, **kwargs):
    """Enter the Tkinter mainloop.

    This must be called when the demo is created from a non-interactive
    program (e.g. from a script); otherwise the demo closes as soon as
    the script completes.  A no-op inside IDLE, which already runs a
    mainloop of its own.
    """
    if in_idle():
        return
    self.top.mainloop(*args, **kwargs)
|
| 1491 |
+
|
| 1492 |
+
|
| 1493 |
+
def app():
    """Launch the regexp chunk parser demo and block until it closes."""
    RegexpChunkApp().mainloop()
|
| 1495 |
+
|
| 1496 |
+
|
| 1497 |
+
# Run the demo when executed as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/collocations_app.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Collocations Application
|
| 2 |
+
# Much of the GUI code is imported from concordance.py; We intend to merge these tools together
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
#
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
import queue as q
|
| 11 |
+
import threading
|
| 12 |
+
from tkinter import (
|
| 13 |
+
END,
|
| 14 |
+
LEFT,
|
| 15 |
+
SUNKEN,
|
| 16 |
+
Button,
|
| 17 |
+
Frame,
|
| 18 |
+
IntVar,
|
| 19 |
+
Label,
|
| 20 |
+
Menu,
|
| 21 |
+
OptionMenu,
|
| 22 |
+
Scrollbar,
|
| 23 |
+
StringVar,
|
| 24 |
+
Text,
|
| 25 |
+
Tk,
|
| 26 |
+
)
|
| 27 |
+
from tkinter.font import Font
|
| 28 |
+
|
| 29 |
+
from nltk.corpus import (
|
| 30 |
+
alpino,
|
| 31 |
+
brown,
|
| 32 |
+
cess_cat,
|
| 33 |
+
cess_esp,
|
| 34 |
+
floresta,
|
| 35 |
+
indian,
|
| 36 |
+
mac_morpho,
|
| 37 |
+
machado,
|
| 38 |
+
nps_chat,
|
| 39 |
+
sinica_treebank,
|
| 40 |
+
treebank,
|
| 41 |
+
)
|
| 42 |
+
from nltk.probability import FreqDist
|
| 43 |
+
from nltk.util import in_idle
|
| 44 |
+
|
| 45 |
+
# Events posted on the view's queue by the corpus-loading worker thread.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
# How often (ms) the view polls its queue for worker events.
POLL_INTERVAL = 100

# NB: every corpus is wrapped in a lambda so that nothing is loaded at
# module import time.
_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class CollocationsView:
    """Tkinter window listing the strongest bigram collocations of a
    selectable corpus, with Previous/Next paging through the results.

    Corpus loading and scoring happen on a worker thread owned by
    ``CollocationsModel``; completion/failure events arrive on a queue
    that this view polls every ``POLL_INTERVAL`` milliseconds.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Start polling the worker queue; handle is kept for cancellation.
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        # Window geometry, title, and close bindings.
        top.geometry("550x650+50+50")
        top.title("NLTK Collocations List")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(550, 650)

    def _init_widgets(self, parent):
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_corpus_select(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        # BUG FIX: removed the dead assignment
        #   other_corpora = list(self.model.CORPORA.keys()).remove(...)
        # list.remove() returns None, so the variable was always None
        # and was never used.
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_menubar(self):
        self._result_size = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        # "Result Count" submenu: page sizes of 20 / 50 / 100.
        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        for count in (20, 50, 100):
            rescntmenu.add_radiobutton(
                label=str(count),
                variable=self._result_size,
                underline=0,
                value=count,
                command=self.set_result_size,
            )
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)

        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Propagate the menu's page-size selection to the model."""
        self.model.result_count = self._result_size.get()

    def _init_results_box(self, parent):
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        self.reset_current_page()

    def reset_current_page(self):
        # -1 means "no page shown yet".
        self.current_page = -1

    def _poll(self):
        """Drain one worker event from the queue, then re-arm the poll."""
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_results_box()
        self.freeze_editable()
        self.reset_current_page()

    def handle_corpus_loaded(self, event):
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_results_box()
        self.reset_current_page()
        # self.next()
        collocations = self.model.next(self.current_page + 1)
        self.write_results(collocations)
        self.current_page += 1

    def corpus_selected(self, *args):
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def previous(self):
        """Show the previous page of collocations."""
        self.freeze_editable()
        collocations = self.model.prev(self.current_page - 1)
        self.current_page = self.current_page - 1
        self.clear_results_box()
        self.write_results(collocations)
        self.unfreeze_editable()

    def __next__(self):
        """Show the next page of collocations."""
        self.freeze_editable()
        collocations = self.model.next(self.current_page + 1)
        self.clear_results_box()
        self.write_results(collocations)
        self.current_page += 1
        self.unfreeze_editable()

    def load_corpus(self, selection):
        """Kick off a background load of *selection* unless it is
        already the selected corpus."""
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def freeze_editable(self):
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def clear_results_box(self):
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def destroy(self, *e):
        if self.top is None:
            return
        self.top.after_cancel(self.after)
        self.top.destroy()
        self.top = None

    def mainloop(self, *args, **kwargs):
        """Enter the Tk mainloop (no-op inside IDLE)."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)

    def unfreeze_editable(self):
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable/disable the paging buttons for the current page."""
        if self.current_page == -1 or self.current_page == 0:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.is_last_page(self.current_page):
            self.next["state"] = "disabled"
        else:
            self.next["state"] = "normal"

    def write_results(self, results):
        """Append one bigram per line to the (read-only) results box."""
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n")
            row += 1
        self.results_box["state"] = "disabled"
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
class CollocationsModel:
    """Holds the scored collocation list for the selected corpus and
    serves it out to the view in fixed-size pages."""

    def __init__(self, queue):
        self.result_count = None       # page size; set by the view's menu
        self.selected_corpus = None
        self.collocations = None       # scored bigrams, best first
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.queue = queue
        self.reset_results()

    def reset_results(self):
        """Forget any previously computed pages."""
        self.result_pages = []
        self.results_returned = 0

    def load_corpus(self, name):
        """Start loading corpus *name* on a worker thread and clear any
        stale pages; completion is signalled through the queue."""
        self.selected_corpus = name
        self.collocations = None
        self.LoadCorpus(name, self).start()
        self.reset_results()

    def non_default_corpora(self):
        """Return every corpus name except the default, sorted."""
        names = list(self.CORPORA.keys())
        names.remove(self.DEFAULT_CORPUS)
        return sorted(names)

    def is_last_page(self, number):
        """True when page *number* is (or would be) the final page."""
        if number < len(self.result_pages):
            return False
        pages_ahead = number - len(self.result_pages)
        return (
            self.results_returned + pages_ahead * self.result_count
            >= len(self.collocations)
        )

    def next(self, page):
        """Return page *page*, building any not-yet-built pages first."""
        while len(self.result_pages) - 1 < page:
            lo = self.results_returned
            self.result_pages.append(self.collocations[lo : lo + self.result_count])
            self.results_returned += self.result_count
        return self.result_pages[page]

    def prev(self, page):
        """Return a previously built page; page -1 is the empty page."""
        if page == -1:
            return []
        return self.result_pages[page]

    class LoadCorpus(threading.Thread):
        """Worker thread: load the corpus words, score bigram
        collocations, and post a success/failure event on the queue."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                from operator import itemgetter

                words = self.model.CORPORA[self.name]()
                # Ignore very short tokens (punctuation, etc.).
                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                # Cube of the bigram count over the product of the
                # unigram counts, highest score first.
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                # Best-effort: report the failure to the view and move on.
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
# def collocations():
|
| 427 |
+
# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]]
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
def app():
    """Open the collocations viewer and block until the window closes."""
    CollocationsView().mainloop()
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
# Run the viewer when executed as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/concordance_app.py
ADDED
|
@@ -0,0 +1,709 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Concordance Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import queue as q
|
| 9 |
+
import re
|
| 10 |
+
import threading
|
| 11 |
+
from tkinter import (
|
| 12 |
+
END,
|
| 13 |
+
LEFT,
|
| 14 |
+
SUNKEN,
|
| 15 |
+
Button,
|
| 16 |
+
Entry,
|
| 17 |
+
Frame,
|
| 18 |
+
IntVar,
|
| 19 |
+
Label,
|
| 20 |
+
Menu,
|
| 21 |
+
OptionMenu,
|
| 22 |
+
Scrollbar,
|
| 23 |
+
StringVar,
|
| 24 |
+
Text,
|
| 25 |
+
Tk,
|
| 26 |
+
)
|
| 27 |
+
from tkinter.font import Font
|
| 28 |
+
|
| 29 |
+
from nltk.corpus import (
|
| 30 |
+
alpino,
|
| 31 |
+
brown,
|
| 32 |
+
cess_cat,
|
| 33 |
+
cess_esp,
|
| 34 |
+
floresta,
|
| 35 |
+
indian,
|
| 36 |
+
mac_morpho,
|
| 37 |
+
nps_chat,
|
| 38 |
+
sinica_treebank,
|
| 39 |
+
treebank,
|
| 40 |
+
)
|
| 41 |
+
from nltk.draw.util import ShowText
|
| 42 |
+
from nltk.util import in_idle
|
| 43 |
+
|
| 44 |
+
# Regex fragment matching one "word" or one "tag" inside a word/tag token:
# any run of characters that is neither "/" nor a space.
WORD_OR_TAG = "[^/ ]+"
# Word-boundary anchor used when assembling the search pattern.
BOUNDARY = r"\b"

# Sentinel event strings. Worker threads put these on the shared queue;
# ConcordanceSearchView._poll consumes them on the Tk mainloop thread.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"

# Milliseconds between polls of the worker-thread event queue.
POLL_INTERVAL = 50

# NB All corpora must be specified in a lambda expression so as not to be
# loaded when the module is imported.

# Key into _CORPORA selected by default at startup.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
# Display name -> zero-argument loader returning tagged sentences/posts.
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(
        tagset="universal"
    ),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="universal"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
        categories="religion", tagset="universal"
    ),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
        categories="learned", tagset="universal"
    ),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="universal"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
        categories="romance", tagset="universal"
    ),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
        categories="humor", tagset="universal"
    ),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
        tagset="universal"
    ),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
        tagset="universal"
    ),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class ConcordanceSearchView:
    """Tk GUI for searching tagged corpora for word/tag patterns.

    The view owns a :class:`ConcordanceSearchModel` and a shared queue.
    Model worker threads put event strings (``CORPUS_LOADED_EVENT`` etc.)
    on the queue; :meth:`_poll`, scheduled every ``POLL_INTERVAL`` ms on
    the Tk mainloop thread, drains the queue and dispatches to the
    ``handle_*`` methods, so all widget updates happen on the GUI thread.
    """

    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"

    _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0"  # dark grey
    _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG"

    # Percentage of text left of the scrollbar position
    _FRACTION_LEFT_TEXT = 0.30

    def __init__(self):
        # Queue shared with the model's worker threads (see _poll).
        self.queue = q.Queue()
        self.model = ConcordanceSearchModel(self.queue)
        self.top = Tk()
        self._init_top(self.top)
        self._init_menubar()
        self._init_widgets(self.top)
        # Kick off loading of the default corpus on a background thread.
        self.load_corpus(self.model.DEFAULT_CORPUS)
        # Handle kept so destroy() can cancel the pending poll callback.
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def _init_top(self, top):
        """Configure the toplevel window: size, title, close bindings."""
        top.geometry("950x680+50+50")
        top.title("NLTK Concordance Search")
        top.bind("<Control-q>", self.destroy)
        top.protocol("WM_DELETE_WINDOW", self.destroy)
        top.minsize(950, 680)

    def _init_widgets(self, parent):
        """Build the main frame and all child widget groups, top to bottom."""
        self.main_frame = Frame(
            parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)
        )
        self._init_corpus_select(self.main_frame)
        self._init_query_box(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill="both", expand=True)

    def _init_menubar(self):
        """Build the File and Edit menus (result count, context lengths).

        ``invoke(1)`` on each radiobutton menu selects its second entry as
        the default, which also runs the associated command and so
        initialises ``result_count`` / ``_char_before`` / ``_char_after``.
        """
        self._result_size = IntVar(self.top)
        self._cntx_bf_len = IntVar(self.top)
        self._cntx_af_len = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(
            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q"
        )
        menubar.add_cascade(label="File", underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)
        rescntmenu = Menu(editmenu, tearoff=0)
        rescntmenu.add_radiobutton(
            label="20",
            variable=self._result_size,
            underline=0,
            value=20,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="50",
            variable=self._result_size,
            underline=0,
            value=50,
            command=self.set_result_size,
        )
        rescntmenu.add_radiobutton(
            label="100",
            variable=self._result_size,
            underline=0,
            value=100,
            command=self.set_result_size,
        )
        rescntmenu.invoke(1)
        editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu)

        cntxmenu = Menu(editmenu, tearoff=0)
        cntxbfmenu = Menu(cntxmenu, tearoff=0)
        cntxbfmenu.add_radiobutton(
            label="60 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=60,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.add_radiobutton(
            label="80 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=80,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.add_radiobutton(
            label="100 characters",
            variable=self._cntx_bf_len,
            underline=0,
            value=100,
            command=self.set_cntx_bf_len,
        )
        cntxbfmenu.invoke(1)
        cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu)

        cntxafmenu = Menu(cntxmenu, tearoff=0)
        cntxafmenu.add_radiobutton(
            label="70 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=70,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.add_radiobutton(
            label="90 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=90,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.add_radiobutton(
            label="110 characters",
            variable=self._cntx_af_len,
            underline=0,
            value=110,
            command=self.set_cntx_af_len,
        )
        cntxafmenu.invoke(1)
        cntxmenu.add_cascade(label="After", underline=0, menu=cntxafmenu)

        editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu)

        menubar.add_cascade(label="Edit", underline=0, menu=editmenu)

        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Menu callback: copy the selected page size into the model."""
        self.model.result_count = self._result_size.get()

    def set_cntx_af_len(self, **kwargs):
        """Menu callback: characters of context shown after a match."""
        self._char_after = self._cntx_af_len.get()

    def set_cntx_bf_len(self, **kwargs):
        """Menu callback: characters of context shown before a match."""
        self._char_before = self._cntx_bf_len.get()

    def _init_corpus_select(self, parent):
        """Build the corpus drop-down (OptionMenu) row."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(
            innerframe,
            justify=LEFT,
            text=" Corpus: ",
            background=self._BACKGROUND_COLOUR,
            padx=2,
            pady=1,
            border=0,
        ).pack(side="left")

        # NOTE(review): dead code — list.remove() returns None, and
        # other_corpora is never used; the OptionMenu below gets its
        # entries from self.model.non_default_corpora() instead.
        other_corpora = list(self.model.CORPORA.keys()).remove(
            self.model.DEFAULT_CORPUS
        )
        om = OptionMenu(
            innerframe,
            self.var,
            self.model.DEFAULT_CORPUS,
            command=self.corpus_selected,
            *self.model.non_default_corpora()
        )
        om["borderwidth"] = 0
        om["highlightthickness"] = 1
        om.pack(side="left")
        innerframe.pack(side="top", fill="x", anchor="n")

    def _init_status(self, parent):
        """Build the status bar label used for progress/error messages."""
        self.status = Label(
            parent,
            justify=LEFT,
            relief=SUNKEN,
            background=self._BACKGROUND_COLOUR,
            border=0,
            padx=1,
            pady=0,
        )
        self.status.pack(side="top", anchor="sw")

    def _init_query_box(self, parent):
        """Build the query entry field and Search button."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
        self.query_box = Entry(another, width=60)
        self.query_box.pack(side="left", fill="x", pady=25, anchor="center")
        self.search_button = Button(
            another,
            text="Search",
            command=self.search,
            borderwidth=1,
            highlightthickness=1,
        )
        self.search_button.pack(side="left", fill="x", pady=25, anchor="center")
        # Pressing Return in the entry triggers a search too.
        self.query_box.bind("<KeyPress-Return>", self.search_enter_keypress_handler)
        another.pack()
        innerframe.pack(side="top", fill="x", anchor="n")

    def search_enter_keypress_handler(self, *event):
        """Key binding adapter: forward Return presses to search()."""
        self.search()

    def _init_results_box(self, parent):
        """Build the read-only results Text widget with both scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz")
        self.results_box = Text(
            i1,
            font=Font(family="courier", size="16"),
            state="disabled",  # read-only; toggled to "normal" only while writing
            borderwidth=1,
            yscrollcommand=vscrollbar.set,
            xscrollcommand=hscrollbar.set,
            wrap="none",
            width="40",
            height="20",
            exportselection=1,
        )
        self.results_box.pack(side="left", fill="both", expand=True)
        self.results_box.tag_config(
            self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR
        )
        self.results_box.tag_config(
            self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR
        )
        vscrollbar.pack(side="left", fill="y", anchor="e")
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side="left", fill="x", expand=True, anchor="w")
        hscrollbar.config(command=self.results_box.xview)
        # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!!
        Label(i2, text="   ", background=self._BACKGROUND_COLOUR).pack(
            side="left", anchor="e"
        )
        i1.pack(side="top", fill="both", expand=True, anchor="n")
        i2.pack(side="bottom", fill="x", anchor="s")
        innerframe.pack(side="top", fill="both", expand=True)

    def _init_paging(self, parent):
        """Build the Previous/Next paging buttons (disabled initially)."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev = Button(
            innerframe,
            text="Previous",
            command=self.previous,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        prev.pack(side="left", anchor="center")
        self.next = next = Button(
            innerframe,
            text="Next",
            command=self.__next__,
            width="10",
            borderwidth=1,
            highlightthickness=1,
            state="disabled",
        )
        next.pack(side="right", anchor="center")
        innerframe.pack(side="top", fill="y")
        # 0 means "no search performed yet"; pages are 1-based thereafter.
        self.current_page = 0

    def previous(self):
        """Show the previous page of results (button callback)."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.prev(self.current_page - 1)

    def __next__(self):
        """Show the next page of results (button callback)."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.next(self.current_page + 1)

    def about(self, *e):
        """Display a simple About dialog, falling back to ShowText."""
        ABOUT = "NLTK Concordance Search Demo\n"
        TITLE = "About: NLTK Concordance Search Demo"
        try:
            from tkinter.messagebox import Message

            Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
        except:
            ShowText(self.top, TITLE, ABOUT)

    def _bind_event_handlers(self):
        """Bind virtual Tk events to their handlers.

        NOTE(review): nothing in this class calls this method; dispatch
        happens through queue polling in _poll instead — confirm whether
        this is vestigial.
        """
        self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
        self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
        self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
        self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus)

    def _poll(self):
        """Drain one event from the worker queue and dispatch it.

        Runs on the Tk mainloop thread and reschedules itself every
        POLL_INTERVAL ms.
        """
        try:
            event = self.queue.get(block=False)
        except q.Empty:
            pass
        else:
            if event == CORPUS_LOADED_EVENT:
                self.handle_corpus_loaded(event)
            elif event == SEARCH_TERMINATED_EVENT:
                self.handle_search_terminated(event)
            elif event == SEARCH_ERROR_EVENT:
                self.handle_search_error(event)
            elif event == ERROR_LOADING_CORPUS_EVENT:
                self.handle_error_loading_corpus(event)
        self.after = self.top.after(POLL_INTERVAL, self._poll)

    def handle_error_loading_corpus(self, event):
        """Report a corpus-load failure and leave the UI frozen."""
        self.status["text"] = "Error in loading " + self.var.get()
        self.unfreeze_editable()
        self.clear_all()
        self.freeze_editable()

    def handle_corpus_loaded(self, event):
        """Re-enable the UI once the selected corpus has loaded."""
        self.status["text"] = self.var.get() + " is loaded"
        self.unfreeze_editable()
        self.clear_all()
        self.query_box.focus_set()

    def handle_search_terminated(self, event):
        # todo: refactor the model such that it is less state sensitive
        results = self.model.get_results()
        self.write_results(results)
        self.status["text"] = ""
        if len(results) == 0:
            self.status["text"] = "No results found for " + self.model.query
        else:
            self.current_page = self.model.last_requested_page
        self.unfreeze_editable()
        # Scroll horizontally so the match column is visible.
        self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)

    def handle_search_error(self, event):
        """Report a malformed query (regex error) in the status bar."""
        self.status["text"] = "Error in query " + self.model.query
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        """OptionMenu callback: load whatever corpus is now selected."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def load_corpus(self, selection):
        """Start a background load of *selection* unless already current."""
        if self.model.selected_corpus != selection:
            self.status["text"] = "Loading " + selection + "..."
            self.freeze_editable()
            self.model.load_corpus(selection)

    def search(self):
        """Run a fresh search for the query currently in the entry box."""
        self.current_page = 0
        self.clear_results_box()
        self.model.reset_results()
        query = self.query_box.get()
        if len(query.strip()) == 0:
            return
        self.status["text"] = "Searching for " + query
        self.freeze_editable()
        self.model.search(query, self.current_page + 1)

    def write_results(self, results):
        """Render one page of (sentence, start, end) triples into the box.

        Each sentence is left-padded (see pad) so the match starts at
        column _char_before, then trimmed to the configured context
        window; word and tag spans are highlighted via text tags.
        """
        self.results_box["state"] = "normal"
        row = 1
        for each in results:
            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
            if len(sent) != 0:
                if pos1 < self._char_before:
                    sent, pos1, pos2 = self.pad(sent, pos1, pos2)
                sentence = sent[pos1 - self._char_before : pos1 + self._char_after]
                if not row == len(results):
                    sentence += "\n"
                self.results_box.insert(str(row) + ".0", sentence)
                word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
                for marker in word_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_WORD_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                for marker in label_markers:
                    self.results_box.tag_add(
                        self._HIGHLIGHT_LABEL_TAG,
                        str(row) + "." + str(marker[0]),
                        str(row) + "." + str(marker[1]),
                    )
                row += 1
        self.results_box["state"] = "disabled"

    def words_and_labels(self, sentence, pos1, pos2):
        """Split the matched span into (start, end) column pairs.

        Returns two lists of character offsets relative to the displayed
        row: one for words and one for their "/tag" labels, computed by
        walking the space-separated word/tag tokens in sentence[pos1:pos2].
        """
        search_exp = sentence[pos1:pos2]
        words, labels = [], []
        labeled_words = search_exp.split(" ")
        index = 0
        for each in labeled_words:
            if each == "":
                index += 1
            else:
                word, label = each.split("/")
                words.append(
                    (self._char_before + index, self._char_before + index + len(word))
                )
                index += len(word) + 1  # +1 skips the "/" separator
                labels.append(
                    (self._char_before + index, self._char_before + index + len(label))
                )
                index += len(label)
                index += 1  # skip the trailing space
        return words, labels

    def pad(self, sent, hstart, hend):
        """Left-pad *sent* with spaces so the match can show a full
        _char_before characters of context; shifts offsets accordingly."""
        if hstart >= self._char_before:
            return sent, hstart, hend
        d = self._char_before - hstart
        sent = "".join([" "] * d) + sent
        return sent, hstart + d, hend + d

    def destroy(self, *e):
        """Tear down the window; idempotent (safe to call twice)."""
        if self.top is None:
            return
        self.top.after_cancel(self.after)  # stop the polling loop
        self.top.destroy()
        self.top = None

    def clear_all(self):
        """Clear the query box, the model's query, and the results box."""
        self.query_box.delete(0, END)
        self.model.reset_query()
        self.clear_results_box()

    def clear_results_box(self):
        """Empty the (read-only) results widget."""
        self.results_box["state"] = "normal"
        self.results_box.delete("1.0", END)
        self.results_box["state"] = "disabled"

    def freeze_editable(self):
        """Disable all interactive widgets while a worker is busy."""
        self.query_box["state"] = "disabled"
        self.search_button["state"] = "disabled"
        self.prev["state"] = "disabled"
        self.next["state"] = "disabled"

    def unfreeze_editable(self):
        """Re-enable input widgets and recompute paging-button states."""
        self.query_box["state"] = "normal"
        self.search_button["state"] = "normal"
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable Previous/Next according to current page and model state."""
        if self.current_page == 0 or self.current_page == 1:
            self.prev["state"] = "disabled"
        else:
            self.prev["state"] = "normal"
        if self.model.has_more_pages(self.current_page):
            self.next["state"] = "normal"
        else:
            self.next["state"] = "disabled"

    def fire_event(self, event):
        # Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when="tail")

    def mainloop(self, *args, **kwargs):
        """Enter the Tk mainloop (no-op when running inside IDLE)."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
class ConcordanceSearchModel:
    """State holder for corpus loading and paged concordance searching.

    Heavy work runs on background threads (the nested ``LoadCorpus`` and
    ``SearchCorpus``), which report completion by putting event strings
    on ``self.queue`` for the view's poller to consume.
    """

    def __init__(self, queue):
        # Queue shared with the owning view; worker threads write to it.
        self.queue = queue
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        self.reset_results()
        # Page size; populated later by the view's "Result Count" menu.
        self.result_count = None
        # Index of the first sentence not yet searched (enables paging
        # to resume where the previous page's scan stopped).
        self.last_sent_searched = 0

    def non_default_corpora(self):
        """Return all corpus names except the default, sorted."""
        copy = []
        copy.extend(list(self.CORPORA.keys()))
        copy.remove(self.DEFAULT_CORPUS)
        copy.sort()
        return copy

    def load_corpus(self, name):
        """Start loading corpus *name* on a background thread."""
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        """Start a background search for *query*, targeting *page*."""
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        """Advance to *page*, searching further only if not cached."""
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            # Page already cached; just tell the view to redraw.
            self.queue.put(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        """Go back to *page*; earlier pages are always cached."""
        self.last_requested_page = page
        self.queue.put(SEARCH_TERMINATED_EVENT)

    def reset_results(self):
        """Forget all cached result pages and restart the sentence scan."""
        self.last_sent_searched = 0
        self.results = []
        # None means "unknown"; set when a search exhausts the corpus.
        self.last_page = None

    def reset_query(self):
        self.query = None

    def set_results(self, page, resultset):
        """Cache *resultset* as page *page* (1-based)."""
        self.results.insert(page - 1, resultset)

    def get_results(self):
        """Return the result page most recently requested by the view."""
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        """True if a page after *page* may exist."""
        if self.results == [] or self.results[0] == []:
            return False
        if self.last_page is None:
            return True  # corpus not exhausted yet
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        """Worker thread: load a corpus and pre-render its sentences."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            self.model, self.name = model, name

        def run(self):
            try:
                ts = self.model.CORPORA[self.name]()
                # Render each tagged sentence as one "word/tag word/tag ..."
                # string so searching can be a plain regex scan.
                self.model.tagged_sents = [
                    " ".join(w + "/" + t for (w, t) in sent) for sent in ts
                ]
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)

    class SearchCorpus(threading.Thread):
        """Worker thread: scan sentences for the query, one page worth."""

        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            # NOTE(review): local name q shadows the module-level
            # "import queue as q" alias within this method.
            q = self.processed_query()
            sent_pos, i, sent_count = [], 0, 0
            # Resume from where the previous page's scan stopped.
            for sent in self.model.tagged_sents[self.model.last_sent_searched :]:
                try:
                    m = re.search(q, sent)
                except re.error:
                    self.model.reset_results()
                    self.model.queue.put(SEARCH_ERROR_EVENT)
                    return
                if m:
                    sent_pos.append((sent, m.start(), m.end()))
                    i += 1
                    # Collect one extra match beyond the page size so we
                    # can tell whether another page exists.
                    if i > self.count:
                        self.model.last_sent_searched += sent_count - 1
                        break
                sent_count += 1
            if self.count >= len(sent_pos):
                # Corpus exhausted: this is the final page.
                self.model.last_sent_searched += sent_count - 1
                self.model.last_page = self.page
                self.model.set_results(self.page, sent_pos)
            else:
                # Drop the sentinel extra match from the stored page.
                self.model.set_results(self.page, sent_pos[:-1])
            self.model.queue.put(SEARCH_TERMINATED_EVENT)

        def processed_query(self):
            """Translate the user query into a regex over word/tag text.

            Per whitespace-separated term: "." becomes a word/tag-safe
            wildcard; an all-caps term is treated as a tag (matches any
            word with that tag); a term containing "/" is used verbatim
            as word/tag; anything else is a word (matches any tag).
            """
            new = []
            for term in self.model.query.split():
                term = re.sub(r"\.", r"[^/ ]", term)
                if re.match("[A-Z]+$", term):
                    new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
                elif "/" in term:
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
            return " ".join(new)
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
def app():
    """Create the concordance search window and run its event loop."""
    ConcordanceSearchView().mainloop()
|
| 704 |
+
|
| 705 |
+
|
| 706 |
+
# Launch the GUI when run as a script.
if __name__ == "__main__":
    app()

# Public API of this module.
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/nemo_app.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06
|
| 2 |
+
# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Finding (and Replacing) Nemo
|
| 6 |
+
|
| 7 |
+
Instant Regular Expressions
|
| 8 |
+
Created by Aristide Grange
|
| 9 |
+
"""
|
| 10 |
+
import itertools
|
| 11 |
+
import re
|
| 12 |
+
from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk
|
| 13 |
+
|
| 14 |
+
windowTitle = "Finding (and Replacing) Nemo"
|
| 15 |
+
initialFind = r"n(.*?)e(.*?)m(.*?)o"
|
| 16 |
+
initialRepl = r"M\1A\2K\3I"
|
| 17 |
+
initialText = """\
|
| 18 |
+
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
| 19 |
+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
| 20 |
+
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
| 21 |
+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
| 22 |
+
"""
|
| 23 |
+
images = {
|
| 24 |
+
"FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6c
C6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=",
|
| 25 |
+
"find": "R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7",
|
| 26 |
+
"REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt
5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7",
|
| 27 |
+
"repl": "R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=",
|
| 28 |
+
}
|
| 29 |
+
# Background colours cycled over to highlight successive matches in the text.
colors = ["#FF7B39", "#80F121"]
# Foreground colours used to emphasise the selected sub-pattern within a match.
emphColors = ["#DAFC33", "#F42548"]
# Tk Text options for the two small regexp-entry fields.
fieldParams = {
    "height": 3,
    "width": 70,
    "font": ("monaco", 14),
    "highlightthickness": 0,
    "borderwidth": 0,
    "background": "white",
}
# Tk Text options for the two large sample-text areas.
textParams = {
    "bg": "#F7E0D4",
    "fg": "#2321F1",
    "highlightthickness": 0,
    "width": 1,
    "height": 10,
    "font": ("verdana", 16),
    "wrap": "word",
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class Zone:
    """One input field ("zone") of the demo: a status icon, a regexp entry
    field, and a scrollable text area, all packed into the global ``root``
    window.

    Subclasses implement ``substitute``, which ``refresh`` calls; a
    ``re.error`` raised there dims the icon to signal a malformed regexp.
    """

    def __init__(self, image, initialField, initialText):
        # Top row: icon + regexp entry field.
        frm = Frame(root)
        frm.config(background="white")
        # Bright icon for a valid regexp, dimmed icon for a malformed one
        # (the two variants are keyed by upper/lower case in `images`).
        self.image = PhotoImage(format="gif", data=images[image.upper()])
        self.imageDimmed = PhotoImage(format="gif", data=images[image])
        self.img = Label(frm)
        self.img.config(borderwidth=0)
        self.img.pack(side="left")
        self.fld = Text(frm, **fieldParams)
        self.initScrollText(frm, self.fld, initialField)
        # Second row: the large text area.
        frm = Frame(root)
        self.txt = Text(frm, **textParams)
        self.initScrollText(frm, self.txt, initialText)
        # Register one background tag and one emphasis tag per cycle colour.
        for i in range(2):
            self.txt.tag_config(colors[i], background=colors[i])
            self.txt.tag_config("emph" + colors[i], foreground=emphColors[i])

    def initScrollText(self, frm, txt, contents):
        """Attach a vertical scrollbar to `txt`, fill it with `contents`,
        and pack the containing frame followed by a thin separator ridge."""
        scl = Scrollbar(frm)
        scl.config(command=txt.yview)
        scl.pack(side="right", fill="y")
        txt.pack(side="left", expand=True, fill="x")
        txt.config(yscrollcommand=scl.set)
        txt.insert("1.0", contents)
        frm.pack(fill="x")
        Frame(height=2, bd=1, relief="ridge").pack(fill="x")

    def refresh(self):
        """Re-run the substitution and update the status icon.

        A ``re.error`` from ``substitute`` (malformed regexp) dims the icon
        instead of propagating.
        """
        # Restart the colour rotation so highlighting is stable per refresh.
        self.colorCycle = itertools.cycle(colors)
        try:
            self.substitute()
            self.img.config(image=self.image)
        except re.error:
            self.img.config(image=self.imageDimmed)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class FindZone(Zone):
    """Zone that highlights every match of the find-pattern in its text.

    If a sub-span of the pattern is selected in the entry field, that span is
    wrapped in a ``(?P<emph>...)`` group and rendered with the emphasis
    colour.
    """

    def addTags(self, m):
        """Tag one match `m`: background for the whole match, emphasis
        foreground for the optional ``emph`` group."""
        color = next(self.colorCycle)
        self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end())
        try:
            self.txt.tag_add(
                "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph")
            )
        # The pattern may have no "emph" group, or the group may not have
        # participated in this match; either way there is nothing to
        # emphasise.  (Was a bare `except:`, which would also have swallowed
        # KeyboardInterrupt/SystemExit.)
        except Exception:
            pass

    def substitute(self, *args):
        """Recompile the pattern from the entry field and re-tag the text.

        Raises ``re.error`` for a malformed pattern; ``Zone.refresh`` catches
        it and dims the status icon.
        """
        # Clear all highlighting from the previous run.
        for color in colors:
            self.txt.tag_remove(color, "1.0", "end")
            self.txt.tag_remove("emph" + color, "1.0", "end")
        self.rex = re.compile("")  # default value in case of malformed regexp
        # Strip the trailing newline Tk appends to the widget contents.
        self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE)
        try:
            # Validate the selected sub-pattern on its own before embedding it.
            re.compile("(?P<emph>%s)" % self.fld.get(SEL_FIRST, SEL_LAST))
            self.rexSel = re.compile(
                "%s(?P<emph>%s)%s"
                % (
                    self.fld.get("1.0", SEL_FIRST),
                    self.fld.get(SEL_FIRST, SEL_LAST),
                    self.fld.get(SEL_LAST, "end")[:-1],
                ),
                re.MULTILINE,
            )
        # No selection in the field (Tk raises TclError) or the selected span
        # is not a valid sub-pattern: fall back to the plain pattern.
        # (Was a bare `except:`; narrowed to `except Exception:`.)
        except Exception:
            self.rexSel = self.rex
        self.rexSel.sub(self.addTags, self.txt.get("1.0", "end"))
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class ReplaceZone(Zone):
    """Zone that shows the find-zone's text with the replacement applied."""

    def addTags(self, m):
        """Replace one match `m` in this zone's text and colour the result.

        `self.diff` is the cumulative length change from earlier replacements
        in this pass; it converts the match offsets (which refer to the
        original text) into offsets in the already-edited widget contents.
        """
        s = sz.rex.sub(self.repl, m.group())
        self.txt.delete(
            "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff)
        )
        self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle))
        self.diff += len(s) - (m.end() - m.start())

    def substitute(self):
        """Copy the find-zone's text and apply the replacement to each match."""
        self.txt.delete("1.0", "end")
        # `[:-1]` strips the trailing newline Tk appends to widget contents.
        self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1])
        self.diff = 0
        # rex0 rewrites un-escaped numeric backrefs (\1) to \g<1> form so they
        # survive being passed through re.sub as a replacement template.
        self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1])
        sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1])
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def launchRefresh(_):
    """Event handler: schedule both zones to refresh once Tk is idle."""
    for zone in (sz, rz):
        zone.fld.after_idle(zone.refresh)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def app():
    """Build the Finding-Nemo window and run the Tk event loop.

    Creates the find zone (`sz`), the replace zone (`rz`), and the backref
    rewriting pattern (`rex0`) as module globals, since the zone classes and
    `launchRefresh` refer to them by name.
    """
    global root, sz, rz, rex0
    root = Tk()
    root.resizable(height=False, width=True)
    root.title(windowTitle)
    root.minsize(width=250, height=0)
    sz = FindZone("find", initialFind, initialText)
    # Refresh on any mouse interaction with the find field (click, drag,
    # release), so selection-based emphasis updates live.
    sz.fld.bind("<Button-1>", launchRefresh)
    sz.fld.bind("<ButtonRelease-1>", launchRefresh)
    sz.fld.bind("<B1-Motion>", launchRefresh)
    sz.rexSel = re.compile("")
    rz = ReplaceZone("repl", initialRepl, "")
    # Matches a numeric backreference (\1) not preceded by a backslash.
    rex0 = re.compile(r"(?<!\\)\\([0-9]+)")
    root.bind_all("<Key>", launchRefresh)
    launchRefresh(None)
    root.mainloop()
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
if __name__ == "__main__":
|
| 161 |
+
app()
|
| 162 |
+
|
| 163 |
+
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/rdparser_app.py
ADDED
|
@@ -0,0 +1,1052 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Recursive Descent Parser Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A graphical tool for exploring the recursive descent parser.
|
| 10 |
+
|
| 11 |
+
The recursive descent parser maintains a tree, which records the
|
| 12 |
+
structure of the portion of the text that has been parsed. It uses
|
| 13 |
+
CFG productions to expand the fringe of the tree, and matches its
|
| 14 |
+
leaves against the text. Initially, the tree contains the start
|
| 15 |
+
symbol ("S"). It is shown in the main canvas, to the right of the
|
| 16 |
+
list of available expansions.
|
| 17 |
+
|
| 18 |
+
The parser builds up a tree structure for the text using three
|
| 19 |
+
operations:
|
| 20 |
+
|
| 21 |
+
- "expand" uses a CFG production to add children to a node on the
|
| 22 |
+
fringe of the tree.
|
| 23 |
+
- "match" compares a leaf in the tree to a text token.
|
| 24 |
+
- "backtrack" returns the tree to its state before the most recent
|
| 25 |
+
expand or match operation.
|
| 26 |
+
|
| 27 |
+
The parser maintains a list of tree locations called a "frontier" to
|
| 28 |
+
remember which nodes have not yet been expanded and which leaves have
|
| 29 |
+
not yet been matched against the text. The leftmost frontier node is
|
| 30 |
+
shown in green, and the other frontier nodes are shown in blue. The
|
| 31 |
+
parser always performs expand and match operations on the leftmost
|
| 32 |
+
element of the frontier.
|
| 33 |
+
|
| 34 |
+
You can control the parser's operation by using the "expand," "match,"
|
| 35 |
+
and "backtrack" buttons; or you can use the "step" button to let the
|
| 36 |
+
parser automatically decide which operation to apply. The parser uses
|
| 37 |
+
the following rules to decide which operation to apply:
|
| 38 |
+
|
| 39 |
+
- If the leftmost frontier element is a token, try matching it.
|
| 40 |
+
- If the leftmost frontier element is a node, try expanding it with
|
| 41 |
+
the first untried expansion.
|
| 42 |
+
- Otherwise, backtrack.
|
| 43 |
+
|
| 44 |
+
The "expand" button applies the untried expansion whose CFG production
|
| 45 |
+
is listed earliest in the grammar. To manually choose which expansion
|
| 46 |
+
to apply, click on a CFG production from the list of available
|
| 47 |
+
expansions, on the left side of the main window.
|
| 48 |
+
|
| 49 |
+
The "autostep" button will let the parser continue applying
|
| 50 |
+
applications to the tree until it reaches a complete parse. You can
|
| 51 |
+
cancel an autostep in progress at any time by clicking on the
|
| 52 |
+
"autostep" button again.
|
| 53 |
+
|
| 54 |
+
Keyboard Shortcuts::
|
| 55 |
+
[Space]\t Perform the next expand, match, or backtrack operation
|
| 56 |
+
[a]\t Step through operations until the next complete parse
|
| 57 |
+
[e]\t Perform an expand operation
|
| 58 |
+
[m]\t Perform a match operation
|
| 59 |
+
[b]\t Perform a backtrack operation
|
| 60 |
+
[Delete]\t Reset the parser
|
| 61 |
+
[g]\t Show/hide available expansions list
|
| 62 |
+
[h]\t Help
|
| 63 |
+
[Ctrl-p]\t Print
|
| 64 |
+
[q]\t Quit
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
|
| 68 |
+
from tkinter.font import Font
|
| 69 |
+
|
| 70 |
+
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
|
| 71 |
+
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
|
| 72 |
+
from nltk.parse import SteppingRecursiveDescentParser
|
| 73 |
+
from nltk.tree import Tree
|
| 74 |
+
from nltk.util import in_idle
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class RecursiveDescentApp:
|
| 78 |
+
"""
|
| 79 |
+
A graphical tool for exploring the recursive descent parser. The tool
|
| 80 |
+
displays the parser's tree and the remaining text, and allows the
|
| 81 |
+
user to control the parser's operation. In particular, the user
|
| 82 |
+
can expand subtrees on the frontier, match tokens on the frontier
|
| 83 |
+
against the text, and backtrack. A "step" button simply steps
|
| 84 |
+
through the parsing process, performing the operations that
|
| 85 |
+
``RecursiveDescentParser`` would use.
|
| 86 |
+
"""
|
| 87 |
+
|
| 88 |
+
    def __init__(self, grammar, sent, trace=0):
        """Create the application window for parsing `sent` with `grammar`.

        :param grammar: CFG handed to ``SteppingRecursiveDescentParser``.
        :param sent: the token list to parse.
        :param trace: trace level passed through to the parser.
        """
        self._sent = sent
        self._parser = SteppingRecursiveDescentParser(grammar, trace)

        # Set up the main window.
        self._top = Tk()
        self._top.title("Recursive Descent Parser Application")

        # Set up key bindings.
        self._init_bindings()

        # Initialize the fonts.
        self._init_fonts(self._top)

        # Animations. animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animation_frames = IntVar(self._top)
        self._animation_frames.set(5)
        self._animating_lock = 0
        self._autostep = 0

        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)

        # Create the basic frames.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)

        # Initialize the parser.
        self._parser.initialize(self._sent)

        # Resize callback
        self._canvas.bind("<Configure>", self._configure)
|
| 125 |
+
|
| 126 |
+
#########################################
|
| 127 |
+
## Initialization Helpers
|
| 128 |
+
#########################################
|
| 129 |
+
|
| 130 |
+
def _init_fonts(self, root):
|
| 131 |
+
# See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
|
| 132 |
+
self._sysfont = Font(font=Button()["font"])
|
| 133 |
+
root.option_add("*Font", self._sysfont)
|
| 134 |
+
|
| 135 |
+
# TWhat's our font size (default=same as sysfont)
|
| 136 |
+
self._size = IntVar(root)
|
| 137 |
+
self._size.set(self._sysfont.cget("size"))
|
| 138 |
+
|
| 139 |
+
self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
|
| 140 |
+
self._font = Font(family="helvetica", size=self._size.get())
|
| 141 |
+
if self._size.get() < 0:
|
| 142 |
+
big = self._size.get() - 2
|
| 143 |
+
else:
|
| 144 |
+
big = self._size.get() + 2
|
| 145 |
+
self._bigfont = Font(family="helvetica", weight="bold", size=big)
|
| 146 |
+
|
| 147 |
+
    def _init_grammar(self, parent):
        """Build the "Available Expansions" listbox showing the grammar's
        productions, packed down the left side of `parent`."""
        # Grammar view.
        self._prodframe = listframe = Frame(parent)
        self._prodframe.pack(fill="both", side="left", padx=2)
        self._prodlist_label = Label(
            self._prodframe, font=self._boldfont, text="Available Expansions"
        )
        self._prodlist_label.pack()
        self._prodlist = Listbox(
            self._prodframe,
            selectmode="single",
            relief="groove",
            background="white",
            foreground="#909090",
            font=self._font,
            selectforeground="#004040",
            selectbackground="#c0f0c0",
        )

        self._prodlist.pack(side="right", fill="both", expand=1)

        self._productions = list(self._parser.grammar().productions())
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
        # Size the list to its contents, capped at 25 visible rows.
        self._prodlist.config(height=min(len(self._productions), 25))

        # Add a scrollbar if there are more than 25 productions.
        if len(self._productions) > 25:
            listscroll = Scrollbar(self._prodframe, orient="vertical")
            self._prodlist.config(yscrollcommand=listscroll.set)
            listscroll.config(command=self._prodlist.yview)
            listscroll.pack(side="left", fill="y")

        # If they select a production, apply it.
        self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
|
| 182 |
+
|
| 183 |
+
    def _init_bindings(self):
        """Install the keyboard shortcuts documented in the module docstring."""
        # Key bindings are a good thing.
        self._top.bind("<Control-q>", self.destroy)
        self._top.bind("<Control-x>", self.destroy)
        self._top.bind("<Escape>", self.destroy)
        self._top.bind("e", self.expand)
        # self._top.bind('<Alt-e>', self.expand)
        # self._top.bind('<Control-e>', self.expand)
        self._top.bind("m", self.match)
        self._top.bind("<Alt-m>", self.match)
        self._top.bind("<Control-m>", self.match)
        self._top.bind("b", self.backtrack)
        self._top.bind("<Alt-b>", self.backtrack)
        self._top.bind("<Control-b>", self.backtrack)
        self._top.bind("<Control-z>", self.backtrack)
        self._top.bind("<BackSpace>", self.backtrack)
        self._top.bind("a", self.autostep)
        # self._top.bind('<Control-a>', self.autostep)
        self._top.bind("<Control-space>", self.autostep)
        self._top.bind("<Control-c>", self.cancel_autostep)
        self._top.bind("<space>", self.step)
        self._top.bind("<Delete>", self.reset)
        self._top.bind("<Control-p>", self.postscript)
        # self._top.bind('<h>', self.help)
        # self._top.bind('<Alt-h>', self.help)
        self._top.bind("<Control-h>", self.help)
        self._top.bind("<F1>", self.help)
        # self._top.bind('<g>', self.toggle_grammar)
        # self._top.bind('<Alt-g>', self.toggle_grammar)
        # self._top.bind('<Control-g>', self.toggle_grammar)
        self._top.bind("<Control-g>", self.edit_grammar)
        self._top.bind("<Control-t>", self.edit_sentence)
|
| 215 |
+
|
| 216 |
+
    def _init_buttons(self, parent):
        """Create the Step/Autostep/Expand/Match/Backtrack button row along
        the bottom of `parent`."""
        # Set up the frames.
        self._buttonframe = buttonframe = Frame(parent)
        buttonframe.pack(fill="none", side="bottom", padx=3, pady=2)
        Button(
            buttonframe,
            text="Step",
            background="#90c0d0",
            foreground="black",
            command=self.step,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Autostep",
            background="#90c0d0",
            foreground="black",
            command=self.autostep,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Expand",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.expand,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Match",
            underline=0,
            background="#90f090",
            foreground="black",
            command=self.match,
        ).pack(side="left")
        Button(
            buttonframe,
            text="Backtrack",
            underline=0,
            background="#f0a0a0",
            foreground="black",
            command=self.backtrack,
        ).pack(side="left")
        # Replace autostep...

        # self._autostep_button = Button(buttonframe, text='Autostep',
        #                               underline=0, command=self.autostep)
        # self._autostep_button.pack(side='left')
|
| 263 |
+
|
| 264 |
+
    def _configure(self, event):
        """Handle a window resize: cancel any autostep, fit the canvas
        scrollregion to the new height, and redraw."""
        self._autostep = 0
        (x1, y1, x2, y2) = self._cframe.scrollregion()
        y2 = event.height - 6
        self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2)
        self._redraw()
|
| 270 |
+
|
| 271 |
+
    def _init_feedback(self, parent):
        """Create the "Last Operation:" status strip along the bottom of
        `parent` (two labels inside a sunken frame)."""
        self._feedbackframe = feedbackframe = Frame(parent)
        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
        self._lastoper_label = Label(
            feedbackframe, text="Last Operation:", font=self._font
        )
        self._lastoper_label.pack(side="left")
        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
        # _lastoper1 holds the operation name, _lastoper2 its details.
        self._lastoper1 = Label(
            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
        )
        self._lastoper2 = Label(
            lastoperframe,
            anchor="w",
            width=30,
            foreground="#004040",
            background="#f0f0f0",
            font=self._font,
        )
        self._lastoper1.pack(side="left")
        self._lastoper2.pack(side="left", fill="x", expand=1)
|
| 293 |
+
|
| 294 |
+
    def _init_canvas(self, parent):
        """Create the scrollable canvas frame that will display the parse
        tree and the sentence text."""
        self._cframe = CanvasFrame(
            parent,
            background="white",
            # width=525, height=250,
            closeenough=10,
            border=2,
            relief="sunken",
        )
        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
        canvas = self._canvas = self._cframe.canvas()

        # Initially, there's no tree or text
        self._tree = None
        self._textwidgets = []
        self._textline = None
|
| 310 |
+
|
| 311 |
+
def _init_menubar(self, parent):
|
| 312 |
+
menubar = Menu(parent)
|
| 313 |
+
|
| 314 |
+
filemenu = Menu(menubar, tearoff=0)
|
| 315 |
+
filemenu.add_command(
|
| 316 |
+
label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
|
| 317 |
+
)
|
| 318 |
+
filemenu.add_command(
|
| 319 |
+
label="Print to Postscript",
|
| 320 |
+
underline=0,
|
| 321 |
+
command=self.postscript,
|
| 322 |
+
accelerator="Ctrl-p",
|
| 323 |
+
)
|
| 324 |
+
filemenu.add_command(
|
| 325 |
+
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
|
| 326 |
+
)
|
| 327 |
+
menubar.add_cascade(label="File", underline=0, menu=filemenu)
|
| 328 |
+
|
| 329 |
+
editmenu = Menu(menubar, tearoff=0)
|
| 330 |
+
editmenu.add_command(
|
| 331 |
+
label="Edit Grammar",
|
| 332 |
+
underline=5,
|
| 333 |
+
command=self.edit_grammar,
|
| 334 |
+
accelerator="Ctrl-g",
|
| 335 |
+
)
|
| 336 |
+
editmenu.add_command(
|
| 337 |
+
label="Edit Text",
|
| 338 |
+
underline=5,
|
| 339 |
+
command=self.edit_sentence,
|
| 340 |
+
accelerator="Ctrl-t",
|
| 341 |
+
)
|
| 342 |
+
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
|
| 343 |
+
|
| 344 |
+
rulemenu = Menu(menubar, tearoff=0)
|
| 345 |
+
rulemenu.add_command(
|
| 346 |
+
label="Step", underline=1, command=self.step, accelerator="Space"
|
| 347 |
+
)
|
| 348 |
+
rulemenu.add_separator()
|
| 349 |
+
rulemenu.add_command(
|
| 350 |
+
label="Match", underline=0, command=self.match, accelerator="Ctrl-m"
|
| 351 |
+
)
|
| 352 |
+
rulemenu.add_command(
|
| 353 |
+
label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e"
|
| 354 |
+
)
|
| 355 |
+
rulemenu.add_separator()
|
| 356 |
+
rulemenu.add_command(
|
| 357 |
+
label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b"
|
| 358 |
+
)
|
| 359 |
+
menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
|
| 360 |
+
|
| 361 |
+
viewmenu = Menu(menubar, tearoff=0)
|
| 362 |
+
viewmenu.add_checkbutton(
|
| 363 |
+
label="Show Grammar",
|
| 364 |
+
underline=0,
|
| 365 |
+
variable=self._show_grammar,
|
| 366 |
+
command=self._toggle_grammar,
|
| 367 |
+
)
|
| 368 |
+
viewmenu.add_separator()
|
| 369 |
+
viewmenu.add_radiobutton(
|
| 370 |
+
label="Tiny",
|
| 371 |
+
variable=self._size,
|
| 372 |
+
underline=0,
|
| 373 |
+
value=10,
|
| 374 |
+
command=self.resize,
|
| 375 |
+
)
|
| 376 |
+
viewmenu.add_radiobutton(
|
| 377 |
+
label="Small",
|
| 378 |
+
variable=self._size,
|
| 379 |
+
underline=0,
|
| 380 |
+
value=12,
|
| 381 |
+
command=self.resize,
|
| 382 |
+
)
|
| 383 |
+
viewmenu.add_radiobutton(
|
| 384 |
+
label="Medium",
|
| 385 |
+
variable=self._size,
|
| 386 |
+
underline=0,
|
| 387 |
+
value=14,
|
| 388 |
+
command=self.resize,
|
| 389 |
+
)
|
| 390 |
+
viewmenu.add_radiobutton(
|
| 391 |
+
label="Large",
|
| 392 |
+
variable=self._size,
|
| 393 |
+
underline=0,
|
| 394 |
+
value=18,
|
| 395 |
+
command=self.resize,
|
| 396 |
+
)
|
| 397 |
+
viewmenu.add_radiobutton(
|
| 398 |
+
label="Huge",
|
| 399 |
+
variable=self._size,
|
| 400 |
+
underline=0,
|
| 401 |
+
value=24,
|
| 402 |
+
command=self.resize,
|
| 403 |
+
)
|
| 404 |
+
menubar.add_cascade(label="View", underline=0, menu=viewmenu)
|
| 405 |
+
|
| 406 |
+
animatemenu = Menu(menubar, tearoff=0)
|
| 407 |
+
animatemenu.add_radiobutton(
|
| 408 |
+
label="No Animation", underline=0, variable=self._animation_frames, value=0
|
| 409 |
+
)
|
| 410 |
+
animatemenu.add_radiobutton(
|
| 411 |
+
label="Slow Animation",
|
| 412 |
+
underline=0,
|
| 413 |
+
variable=self._animation_frames,
|
| 414 |
+
value=10,
|
| 415 |
+
accelerator="-",
|
| 416 |
+
)
|
| 417 |
+
animatemenu.add_radiobutton(
|
| 418 |
+
label="Normal Animation",
|
| 419 |
+
underline=0,
|
| 420 |
+
variable=self._animation_frames,
|
| 421 |
+
value=5,
|
| 422 |
+
accelerator="=",
|
| 423 |
+
)
|
| 424 |
+
animatemenu.add_radiobutton(
|
| 425 |
+
label="Fast Animation",
|
| 426 |
+
underline=0,
|
| 427 |
+
variable=self._animation_frames,
|
| 428 |
+
value=2,
|
| 429 |
+
accelerator="+",
|
| 430 |
+
)
|
| 431 |
+
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
|
| 432 |
+
|
| 433 |
+
helpmenu = Menu(menubar, tearoff=0)
|
| 434 |
+
helpmenu.add_command(label="About", underline=0, command=self.about)
|
| 435 |
+
helpmenu.add_command(
|
| 436 |
+
label="Instructions", underline=0, command=self.help, accelerator="F1"
|
| 437 |
+
)
|
| 438 |
+
menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
|
| 439 |
+
|
| 440 |
+
parent.config(menu=menubar)
|
| 441 |
+
|
| 442 |
+
#########################################
|
| 443 |
+
## Helper
|
| 444 |
+
#########################################
|
| 445 |
+
|
| 446 |
+
def _get(self, widget, treeloc):
|
| 447 |
+
for i in treeloc:
|
| 448 |
+
widget = widget.subtrees()[i]
|
| 449 |
+
if isinstance(widget, TreeSegmentWidget):
|
| 450 |
+
widget = widget.label()
|
| 451 |
+
return widget
|
| 452 |
+
|
| 453 |
+
#########################################
|
| 454 |
+
## Main draw procedure
|
| 455 |
+
#########################################
|
| 456 |
+
|
| 457 |
+
    def _redraw(self):
        """Rebuild the entire canvas display from scratch: destroy the old
        tree and text widgets, draw the parser's current tree, lay out the
        sentence words along the bottom, and re-apply highlighting."""
        canvas = self._canvas

        # Delete the old tree, widgets, etc.
        if self._tree is not None:
            self._cframe.destroy_widget(self._tree)
        for twidget in self._textwidgets:
            self._cframe.destroy_widget(twidget)
        if self._textline is not None:
            self._canvas.delete(self._textline)

        # Draw the tree.
        # Negative font sizes select pixel (rather than point) sizing.
        helv = ("helvetica", -self._size.get())
        bold = ("helvetica", -self._size.get(), "bold")
        attribs = {
            "tree_color": "#000000",
            "tree_width": 2,
            "node_font": bold,
            "leaf_font": helv,
        }
        tree = self._parser.tree()
        self._tree = tree_to_treesegment(canvas, tree, **attribs)
        self._cframe.add_widget(self._tree, 30, 5)

        # Draw the text.
        helv = ("helvetica", -self._size.get())
        bottom = y = self._cframe.scrollregion()[3]
        self._textwidgets = [
            TextWidget(canvas, word, font=self._font) for word in self._sent
        ]
        for twidget in self._textwidgets:
            self._cframe.add_widget(twidget, 0, 0)
            # Align each word's bottom edge 5px above the scroll region floor;
            # track the topmost word edge so the separator line clears them all.
            twidget.move(0, bottom - twidget.bbox()[3] - 5)
            y = min(y, twidget.bbox()[1])

        # Draw a line over the text, to separate it from the tree.
        self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".")

        # Highlight appropriate nodes.
        self._highlight_nodes()
        self._highlight_prodlist()

        # Make sure the text lines up.
        self._position_text()
|
| 501 |
+
|
| 502 |
+
    def _redraw_quick(self):
        """Refresh highlighting and text positions without rebuilding the
        tree widgets (cheaper than a full ``_redraw``)."""
        # This should be more-or-less sufficient after an animation.
        self._highlight_nodes()
        self._highlight_prodlist()
        self._position_text()
|
| 507 |
+
|
| 508 |
+
    def _highlight_nodes(self):
        """Color the frontier nodes: the first (next-to-expand) node gets
        bold green, the remaining frontier nodes get teal."""
        # Highlight the list of nodes to be checked.
        bold = ("helvetica", -self._size.get(), "bold")
        for treeloc in self._parser.frontier()[:1]:
            self._get(self._tree, treeloc)["color"] = "#20a050"
            self._get(self._tree, treeloc)["font"] = bold
        for treeloc in self._parser.frontier()[1:]:
            self._get(self._tree, treeloc)["color"] = "#008080"
|
| 516 |
+
|
| 517 |
+
def _highlight_prodlist(self):
|
| 518 |
+
# Highlight the productions that can be expanded.
|
| 519 |
+
# Boy, too bad tkinter doesn't implement Listbox.itemconfig;
|
| 520 |
+
# that would be pretty useful here.
|
| 521 |
+
self._prodlist.delete(0, "end")
|
| 522 |
+
expandable = self._parser.expandable_productions()
|
| 523 |
+
untried = self._parser.untried_expandable_productions()
|
| 524 |
+
productions = self._productions
|
| 525 |
+
for index in range(len(productions)):
|
| 526 |
+
if productions[index] in expandable:
|
| 527 |
+
if productions[index] in untried:
|
| 528 |
+
self._prodlist.insert(index, " %s" % productions[index])
|
| 529 |
+
else:
|
| 530 |
+
self._prodlist.insert(index, " %s (TRIED)" % productions[index])
|
| 531 |
+
self._prodlist.selection_set(index)
|
| 532 |
+
else:
|
| 533 |
+
self._prodlist.insert(index, " %s" % productions[index])
|
| 534 |
+
|
| 535 |
+
    def _position_text(self):
        """Position the sentence-word widgets: matched words are aligned
        under their tree leaves and colored dark green; unmatched words are
        grayed out and laid left-to-right after the last matched word."""
        # Line up the text widgets that are matched against the tree
        numwords = len(self._sent)
        num_matched = numwords - len(self._parser.remaining_text())
        leaves = self._tree_leaves()[:num_matched]
        xmax = self._tree.bbox()[0]
        for i in range(0, len(leaves)):
            widget = self._textwidgets[i]
            leaf = leaves[i]
            widget["color"] = "#006040"
            leaf["color"] = "#006040"
            # Align the word horizontally with its leaf.
            widget.move(leaf.bbox()[0] - widget.bbox()[0], 0)
            xmax = widget.bbox()[2] + 10

        # Line up the text widgets that are not matched against the tree.
        for i in range(len(leaves), numwords):
            widget = self._textwidgets[i]
            widget["color"] = "#a0a0a0"
            widget.move(xmax - widget.bbox()[0], 0)
            xmax = widget.bbox()[2] + 10

        # If we have a complete parse, make everything green :)
        if self._parser.currently_complete():
            for twidget in self._textwidgets:
                twidget["color"] = "#00a000"

        # Move the matched leaves down to the text.
        for i in range(0, len(leaves)):
            widget = self._textwidgets[i]
            leaf = leaves[i]
            dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0
            # Never pull the leaf above its parent node label.
            dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10)
            leaf.move(0, dy)
|
| 568 |
+
|
| 569 |
+
def _tree_leaves(self, tree=None):
|
| 570 |
+
if tree is None:
|
| 571 |
+
tree = self._tree
|
| 572 |
+
if isinstance(tree, TreeSegmentWidget):
|
| 573 |
+
leaves = []
|
| 574 |
+
for child in tree.subtrees():
|
| 575 |
+
leaves += self._tree_leaves(child)
|
| 576 |
+
return leaves
|
| 577 |
+
else:
|
| 578 |
+
return [tree]
|
| 579 |
+
|
| 580 |
+
#########################################
|
| 581 |
+
## Button Callbacks
|
| 582 |
+
#########################################
|
| 583 |
+
|
| 584 |
+
    def destroy(self, *e):
        """Stop auto-stepping and destroy the top-level window.
        Safe to call more than once (idempotent)."""
        self._autostep = 0
        if self._top is None:
            return
        self._top.destroy()
        self._top = None
|
| 590 |
+
|
| 591 |
+
    def reset(self, *e):
        """Reinitialize the parser with the current sentence and redraw
        the display from scratch."""
        self._autostep = 0
        self._parser.initialize(self._sent)
        self._lastoper1["text"] = "Reset Application"
        self._lastoper2["text"] = ""
        self._redraw()
|
| 597 |
+
|
| 598 |
+
    def autostep(self, *e):
        """Toggle auto-stepping mode.  If animation was disabled, switch it
        on (fast) first so the auto-stepping is visible."""
        if self._animation_frames.get() == 0:
            self._animation_frames.set(2)
        if self._autostep:
            self._autostep = 0
        else:
            self._autostep = 1
            self._step()
|
| 606 |
+
|
| 607 |
+
    def cancel_autostep(self, *e):
        """Stop auto-stepping (the current animation frame still finishes)."""
        # self._autostep_button['text'] = 'Autostep'
        self._autostep = 0
|
| 610 |
+
|
| 611 |
+
# Make sure to stop auto-stepping if we get any user input.
|
| 612 |
+
    def step(self, *e):
        """User-initiated single step; cancels auto-stepping first."""
        self._autostep = 0
        self._step()
|
| 615 |
+
|
| 616 |
+
    def match(self, *e):
        """User-initiated match operation; cancels auto-stepping first."""
        self._autostep = 0
        self._match()
|
| 619 |
+
|
| 620 |
+
    def expand(self, *e):
        """User-initiated expand operation; cancels auto-stepping first."""
        self._autostep = 0
        self._expand()
|
| 623 |
+
|
| 624 |
+
    def backtrack(self, *e):
        """User-initiated backtrack operation; cancels auto-stepping first."""
        self._autostep = 0
        self._backtrack()
|
| 627 |
+
|
| 628 |
+
    def _step(self):
        """Perform one parser operation, trying expand, then match, then
        backtrack; report "Finished" when none applies.  No-op while an
        animation is in progress."""
        if self._animating_lock:
            return

        # Try expanding, matching, and backtracking (in that order)
        if self._expand():
            pass
        elif self._parser.untried_match() and self._match():
            pass
        elif self._backtrack():
            pass
        else:
            self._lastoper1["text"] = "Finished"
            self._lastoper2["text"] = ""
            self._autostep = 0

        # Check if we just completed a parse.
        if self._parser.currently_complete():
            self._autostep = 0
            self._lastoper2["text"] += " [COMPLETE PARSE]"
|
| 648 |
+
|
| 649 |
+
    def _expand(self, *e):
        """Apply an untried expansion at the current frontier node.

        Returns True (and animates the expansion) on success, False if all
        expansions have been tried; returns None early if animating.
        """
        if self._animating_lock:
            return
        old_frontier = self._parser.frontier()
        rv = self._parser.expand()
        if rv is not None:
            self._lastoper1["text"] = "Expand:"
            self._lastoper2["text"] = rv
            # Highlight the production that was applied.
            self._prodlist.selection_clear(0, "end")
            index = self._productions.index(rv)
            self._prodlist.selection_set(index)
            self._animate_expand(old_frontier[0])
            return True
        else:
            self._lastoper1["text"] = "Expand:"
            self._lastoper2["text"] = "(all expansions tried)"
            return False
|
| 666 |
+
|
| 667 |
+
    def _match(self, *e):
        """Try to match the next input word against the frontier leaf.

        Returns True (and animates the match) on success, False on failure;
        returns None early if animating.
        """
        if self._animating_lock:
            return
        old_frontier = self._parser.frontier()
        rv = self._parser.match()
        if rv is not None:
            self._lastoper1["text"] = "Match:"
            self._lastoper2["text"] = rv
            self._animate_match(old_frontier[0])
            return True
        else:
            self._lastoper1["text"] = "Match:"
            self._lastoper2["text"] = "(failed)"
            return False
|
| 681 |
+
|
| 682 |
+
    def _backtrack(self, *e):
        """Undo the most recent parser operation.

        Returns True (with the appropriate animation, depending on whether
        the undone element was a subtree or a matched leaf) on success;
        False when there is nothing left to backtrack over.
        """
        if self._animating_lock:
            return
        if self._parser.backtrack():
            # Find the element at the new frontier to decide which
            # animation to run.
            elt = self._parser.tree()
            for i in self._parser.frontier()[0]:
                elt = elt[i]
            self._lastoper1["text"] = "Backtrack"
            self._lastoper2["text"] = ""
            if isinstance(elt, Tree):
                self._animate_backtrack(self._parser.frontier()[0])
            else:
                self._animate_match_backtrack(self._parser.frontier()[0])
            return True
        else:
            self._autostep = 0
            self._lastoper1["text"] = "Finished"
            self._lastoper2["text"] = ""
            return False
|
| 701 |
+
|
| 702 |
+
def about(self, *e):
|
| 703 |
+
ABOUT = (
|
| 704 |
+
"NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper"
|
| 705 |
+
)
|
| 706 |
+
TITLE = "About: Recursive Descent Parser Application"
|
| 707 |
+
try:
|
| 708 |
+
from tkinter.messagebox import Message
|
| 709 |
+
|
| 710 |
+
Message(message=ABOUT, title=TITLE).show()
|
| 711 |
+
except:
|
| 712 |
+
ShowText(self._top, TITLE, ABOUT)
|
| 713 |
+
|
| 714 |
+
def help(self, *e):
|
| 715 |
+
self._autostep = 0
|
| 716 |
+
# The default font's not very legible; try using 'fixed' instead.
|
| 717 |
+
try:
|
| 718 |
+
ShowText(
|
| 719 |
+
self._top,
|
| 720 |
+
"Help: Recursive Descent Parser Application",
|
| 721 |
+
(__doc__ or "").strip(),
|
| 722 |
+
width=75,
|
| 723 |
+
font="fixed",
|
| 724 |
+
)
|
| 725 |
+
except:
|
| 726 |
+
ShowText(
|
| 727 |
+
self._top,
|
| 728 |
+
"Help: Recursive Descent Parser Application",
|
| 729 |
+
(__doc__ or "").strip(),
|
| 730 |
+
width=75,
|
| 731 |
+
)
|
| 732 |
+
|
| 733 |
+
    def postscript(self, *e):
        """Write the current canvas contents to a PostScript file
        (the canvas frame prompts for the filename)."""
        self._autostep = 0
        self._cframe.print_to_file()
|
| 736 |
+
|
| 737 |
+
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        # IDLE already runs its own Tk mainloop; starting another would block.
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)
|
| 747 |
+
|
| 748 |
+
    def resize(self, size=None):
        """Change the display font size (or re-apply the current one when
        *size* is None) and redraw.  Negative sizes select pixel sizing
        in Tk fonts."""
        if size is not None:
            self._size.set(size)
        size = self._size.get()
        self._font.configure(size=-(abs(size)))
        self._boldfont.configure(size=-(abs(size)))
        self._sysfont.configure(size=-(abs(size)))
        # The "big" font stays two sizes larger than the base size.
        self._bigfont.configure(size=-(abs(size + 2)))
        self._redraw()
|
| 757 |
+
|
| 758 |
+
#########################################
|
| 759 |
+
## Expand Production Selection
|
| 760 |
+
#########################################
|
| 761 |
+
|
| 762 |
+
    def _toggle_grammar(self, *e):
        """Show or hide the grammar production list, following the
        ``_show_grammar`` checkbutton variable."""
        if self._show_grammar.get():
            self._prodframe.pack(
                fill="both", side="left", padx=2, after=self._feedbackframe
            )
            self._lastoper1["text"] = "Show Grammar"
        else:
            self._prodframe.pack_forget()
            self._lastoper1["text"] = "Hide Grammar"
        self._lastoper2["text"] = ""
|
| 772 |
+
|
| 773 |
+
# def toggle_grammar(self, *e):
|
| 774 |
+
# self._show_grammar = not self._show_grammar
|
| 775 |
+
# if self._show_grammar:
|
| 776 |
+
# self._prodframe.pack(fill='both', expand='y', side='left',
|
| 777 |
+
# after=self._feedbackframe)
|
| 778 |
+
# self._lastoper1['text'] = 'Show Grammar'
|
| 779 |
+
# else:
|
| 780 |
+
# self._prodframe.pack_forget()
|
| 781 |
+
# self._lastoper1['text'] = 'Hide Grammar'
|
| 782 |
+
# self._lastoper2['text'] = ''
|
| 783 |
+
|
| 784 |
+
    def _prodlist_select(self, event):
        """Handle a selection in the production listbox: try to expand the
        frontier with the chosen production; if it doesn't apply, restore
        the selection to the currently-expandable productions."""
        selection = self._prodlist.curselection()
        if len(selection) != 1:
            return
        index = int(selection[0])
        old_frontier = self._parser.frontier()
        production = self._parser.expand(self._productions[index])

        if production:
            self._lastoper1["text"] = "Expand:"
            self._lastoper2["text"] = production
            self._prodlist.selection_clear(0, "end")
            self._prodlist.selection_set(index)
            self._animate_expand(old_frontier[0])
        else:
            # Reset the production selections.
            self._prodlist.selection_clear(0, "end")
            for prod in self._parser.expandable_productions():
                index = self._productions.index(prod)
                self._prodlist.selection_set(index)
|
| 804 |
+
|
| 805 |
+
#########################################
|
| 806 |
+
## Animation
|
| 807 |
+
#########################################
|
| 808 |
+
|
| 809 |
+
    def _animate_expand(self, treeloc):
        """Replace the node at *treeloc* with its newly-expanded subtree and
        start the fade-in animation for the new children."""
        oldwidget = self._get(self._tree, treeloc)
        oldtree = oldwidget.parent()
        # True if we are replacing the root of the displayed tree.
        top = not isinstance(oldtree.parent(), TreeSegmentWidget)

        # Locate the corresponding subtree in the parser's current tree.
        tree = self._parser.tree()
        for i in treeloc:
            tree = tree[i]

        # Build the replacement widget; children start white so the
        # fade-in animation can darken them frame by frame.
        widget = tree_to_treesegment(
            self._canvas,
            tree,
            node_font=self._boldfont,
            leaf_color="white",
            tree_width=2,
            tree_color="white",
            node_color="white",
            leaf_font=self._font,
        )
        widget.label()["color"] = "#20a050"

        # Align the new widget's node label with the old one's position.
        (oldx, oldy) = oldtree.label().bbox()[:2]
        (newx, newy) = widget.label().bbox()[:2]
        widget.move(oldx - newx, oldy - newy)

        if top:
            self._cframe.add_widget(widget, 0, 5)
            widget.move(30 - widget.label().bbox()[0], 0)
            self._tree = widget
        else:
            oldtree.parent().replace_child(oldtree, widget)

        # Move the children over so they don't overlap.
        # Line the children up in a strange way.
        if widget.subtrees():
            dx = (
                oldx
                + widget.label().width() / 2
                - widget.subtrees()[0].bbox()[0] / 2
                - widget.subtrees()[0].bbox()[2] / 2
            )
            for subtree in widget.subtrees():
                subtree.move(dx, 0)

        self._makeroom(widget)

        if top:
            self._cframe.destroy_widget(oldtree)
        else:
            oldtree.destroy()

        # Shades of gray, darkest last, one per animation frame.
        colors = [
            "gray%d" % (10 * int(10 * x / self._animation_frames.get()))
            for x in range(self._animation_frames.get(), 0, -1)
        ]

        # Move the text string down, if necessary.
        dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1]
        if dy > 0:
            for twidget in self._textwidgets:
                twidget.move(0, dy)
            self._canvas.move(self._textline, 0, dy)

        self._animate_expand_frame(widget, colors)
|
| 873 |
+
|
| 874 |
+
    def _makeroom(self, treeseg):
        """
        Make sure that no sibling tree bbox's overlap.

        Shifts right-hand siblings (and *treeseg* itself when it overlaps
        its left neighbor) apart by a 10px margin, then recurses upward so
        the adjustment propagates to ancestors.
        """
        parent = treeseg.parent()
        if not isinstance(parent, TreeSegmentWidget):
            return

        index = parent.subtrees().index(treeseg)

        # Handle siblings to the right
        rsiblings = parent.subtrees()[index + 1 :]
        if rsiblings:
            dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10
            for sibling in rsiblings:
                sibling.move(dx, 0)

        # Handle siblings to the left
        if index > 0:
            lsibling = parent.subtrees()[index - 1]
            dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10)
            treeseg.move(dx, 0)

        # Keep working up the tree.
        self._makeroom(parent)
|
| 899 |
+
|
| 900 |
+
    def _animate_expand_frame(self, widget, colors):
        """Run one frame of the expand fade-in: apply ``colors[0]`` to the
        widget and its children, then schedule the next frame; when the
        color list is exhausted, finalize to black and release the lock."""
        if len(colors) > 0:
            self._animating_lock = 1
            widget["color"] = colors[0]
            for subtree in widget.subtrees():
                if isinstance(subtree, TreeSegmentWidget):
                    subtree.label()["color"] = colors[0]
                else:
                    subtree["color"] = colors[0]
            # Next frame in 50ms with the remaining colors.
            self._top.after(50, self._animate_expand_frame, widget, colors[1:])
        else:
            widget["color"] = "black"
            for subtree in widget.subtrees():
                if isinstance(subtree, TreeSegmentWidget):
                    subtree.label()["color"] = "black"
                else:
                    subtree["color"] = "black"
            self._redraw_quick()
            widget.label()["color"] = "black"
            self._animating_lock = 0
            # Continue auto-stepping once the animation has finished.
            if self._autostep:
                self._step()
|
| 922 |
+
|
| 923 |
+
    def _animate_backtrack(self, treeloc):
        """Animate the removal of the subtree at *treeloc*: flash red,
        fade to light gray, then delete the children."""
        # Flash red first, if we're animating.
        if self._animation_frames.get() == 0:
            colors = []
        else:
            colors = ["#a00000", "#000000", "#a00000"]
        # Fade-out shades; the range is empty when animation is disabled,
        # so the division by the frame count cannot be reached with 0.
        colors += [
            "gray%d" % (10 * int(10 * x / (self._animation_frames.get())))
            for x in range(1, self._animation_frames.get() + 1)
        ]

        # Collect the segment being removed plus all its child widgets.
        widgets = [self._get(self._tree, treeloc).parent()]
        for subtree in widgets[0].subtrees():
            if isinstance(subtree, TreeSegmentWidget):
                widgets.append(subtree.label())
            else:
                widgets.append(subtree)

        self._animate_backtrack_frame(widgets, colors)
|
| 942 |
+
|
| 943 |
+
    def _animate_backtrack_frame(self, widgets, colors):
        """Run one frame of the backtrack animation; when the color list is
        exhausted, detach and destroy the removed children and release the
        animation lock."""
        if len(colors) > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget["color"] = colors[0]
            self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:])
        else:
            # widgets[0] is the parent segment; drop all of its children.
            for widget in widgets[0].subtrees():
                widgets[0].remove_child(widget)
                widget.destroy()
            self._redraw_quick()
            self._animating_lock = 0
            if self._autostep:
                self._step()
|
| 957 |
+
|
| 958 |
+
    def _animate_match_backtrack(self, treeloc):
        """Animate undoing a match: slide the leaf widget at *treeloc* back
        up toward its parent node, then remove it."""
        widget = self._get(self._tree, treeloc)
        node = widget.parent().label()
        # Per-frame vertical step; max(1, ...) avoids division by zero
        # when animation is disabled (frame count 0).
        dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max(
            1, self._animation_frames.get()
        )
        self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy)
|
| 965 |
+
|
| 966 |
+
    def _animate_match(self, treeloc):
        """Animate a successful match: slide the leaf widget at *treeloc*
        down toward the text line."""
        widget = self._get(self._tree, treeloc)

        # Per-frame vertical step; max(1, ...) avoids division by zero
        # when animation is disabled (frame count 0).
        dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max(
            1, self._animation_frames.get()
        )
        self._animate_match_frame(self._animation_frames.get(), widget, dy)
|
| 973 |
+
|
| 974 |
+
    def _animate_match_frame(self, frame, widget, dy):
        """Run one frame of the match animation (move *widget* by *dy*);
        on the final frame, color it matched-green and release the lock."""
        if frame > 0:
            self._animating_lock = 1
            widget.move(0, dy)
            self._top.after(10, self._animate_match_frame, frame - 1, widget, dy)
        else:
            widget["color"] = "#006040"
            self._redraw_quick()
            self._animating_lock = 0
            if self._autostep:
                self._step()
|
| 985 |
+
|
| 986 |
+
    def _animate_match_backtrack_frame(self, frame, widget, dy):
        """Run one frame of the match-backtrack animation; on the final
        frame, detach and destroy the leaf widget and release the lock."""
        if frame > 0:
            self._animating_lock = 1
            widget.move(0, dy)
            self._top.after(
                10, self._animate_match_backtrack_frame, frame - 1, widget, dy
            )
        else:
            widget.parent().remove_child(widget)
            widget.destroy()
            self._animating_lock = 0
            if self._autostep:
                self._step()
|
| 999 |
+
|
| 1000 |
+
    def edit_grammar(self, *e):
        """Open the CFG editor on the current grammar; ``set_grammar`` is
        invoked as the callback when the user commits changes."""
        CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
|
| 1002 |
+
|
| 1003 |
+
    def set_grammar(self, grammar):
        """Install a new grammar in the parser and repopulate the
        production listbox."""
        self._parser.set_grammar(grammar)
        self._productions = list(grammar.productions())
        self._prodlist.delete(0, "end")
        for production in self._productions:
            self._prodlist.insert("end", (" %s" % production))
|
| 1009 |
+
|
| 1010 |
+
    def edit_sentence(self, *e):
        """Prompt the user for a new sentence; ``set_sentence`` is invoked
        as the callback with the entered text."""
        sentence = " ".join(self._sent)
        title = "Edit Text"
        instr = "Enter a new sentence to parse."
        EntryDialog(self._top, sentence, instr, self.set_sentence, title)
|
| 1015 |
+
|
| 1016 |
+
    def set_sentence(self, sentence):
        """Replace the sentence (whitespace-tokenized) and reset the parser."""
        self._sent = sentence.split()  # [XX] use tagged?
        self.reset()
|
| 1019 |
+
|
| 1020 |
+
|
| 1021 |
+
def app():
    """
    Create a recursive descent parser demo, using a simple grammar and
    text.
    """
    # Imported locally so that importing this module does not require
    # the grammar machinery until the demo is actually launched.
    from nltk.grammar import CFG

    grammar = CFG.fromstring(
        """
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """
    )

    sent = "the dog saw a man in the park".split()

    RecursiveDescentApp(grammar, sent).mainloop()
|
| 1047 |
+
|
| 1048 |
+
|
| 1049 |
+
# Run the demo when executed as a script.
if __name__ == "__main__":
    app()

__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/srparser_app.py
ADDED
|
@@ -0,0 +1,937 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Shift-Reduce Parser Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A graphical tool for exploring the shift-reduce parser.
|
| 10 |
+
|
| 11 |
+
The shift-reduce parser maintains a stack, which records the structure
|
| 12 |
+
of the portion of the text that has been parsed. The stack is
|
| 13 |
+
initially empty. Its contents are shown on the left side of the main
|
| 14 |
+
canvas.
|
| 15 |
+
|
| 16 |
+
On the right side of the main canvas is the remaining text. This is
|
| 17 |
+
the portion of the text which has not yet been considered by the
|
| 18 |
+
parser.
|
| 19 |
+
|
| 20 |
+
The parser builds up a tree structure for the text using two
|
| 21 |
+
operations:
|
| 22 |
+
|
| 23 |
+
- "shift" moves the first token from the remaining text to the top
|
| 24 |
+
of the stack. In the demo, the top of the stack is its right-hand
|
| 25 |
+
side.
|
| 26 |
+
- "reduce" uses a grammar production to combine the rightmost stack
|
| 27 |
+
elements into a single tree token.
|
| 28 |
+
|
| 29 |
+
You can control the parser's operation by using the "shift" and
|
| 30 |
+
"reduce" buttons; or you can use the "step" button to let the parser
|
| 31 |
+
automatically decide which operation to apply. The parser uses the
|
| 32 |
+
following rules to decide which operation to apply:
|
| 33 |
+
|
| 34 |
+
- Only shift if no reductions are available.
|
| 35 |
+
- If multiple reductions are available, then apply the reduction
|
| 36 |
+
whose CFG production is listed earliest in the grammar.
|
| 37 |
+
|
| 38 |
+
The "reduce" button applies the reduction whose CFG production is
|
| 39 |
+
listed earliest in the grammar. There are two ways to manually choose
|
| 40 |
+
which reduction to apply:
|
| 41 |
+
|
| 42 |
+
- Click on a CFG production from the list of available reductions,
|
| 43 |
+
on the left side of the main window. The reduction based on that
|
| 44 |
+
production will be applied to the top of the stack.
|
| 45 |
+
- Click on one of the stack elements. A popup window will appear,
|
| 46 |
+
containing all available reductions. Select one, and it will be
|
| 47 |
+
applied to the top of the stack.
|
| 48 |
+
|
| 49 |
+
Note that reductions can only be applied to the top of the stack.
|
| 50 |
+
|
| 51 |
+
Keyboard Shortcuts::
|
| 52 |
+
[Space]\t Perform the next shift or reduce operation
|
| 53 |
+
[s]\t Perform a shift operation
|
| 54 |
+
[r]\t Perform a reduction operation
|
| 55 |
+
[Ctrl-z]\t Undo most recent operation
|
| 56 |
+
[Delete]\t Reset the parser
|
| 57 |
+
[g]\t Show/hide available production list
|
| 58 |
+
[Ctrl-a]\t Toggle animations
|
| 59 |
+
[h]\t Help
|
| 60 |
+
[Ctrl-p]\t Print
|
| 61 |
+
[q]\t Quit
|
| 62 |
+
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk
|
| 66 |
+
from tkinter.font import Font
|
| 67 |
+
|
| 68 |
+
from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment
|
| 69 |
+
from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget
|
| 70 |
+
from nltk.parse import SteppingShiftReduceParser
|
| 71 |
+
from nltk.tree import Tree
|
| 72 |
+
from nltk.util import in_idle
|
| 73 |
+
|
| 74 |
+
"""
|
| 75 |
+
Possible future improvements:
|
| 76 |
+
- button/window to change and/or select text. Just pop up a window
|
| 77 |
+
with an entry, and let them modify the text; and then retokenize
|
| 78 |
+
it? Maybe give a warning if it contains tokens whose types are
|
| 79 |
+
not in the grammar.
|
| 80 |
+
- button/window to change and/or select grammar. Select from
|
| 81 |
+
several alternative grammars? Or actually change the grammar? If
|
| 82 |
+
the latter, then I'd want to define nltk.draw.cfg, which would be
|
| 83 |
+
responsible for that.
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class ShiftReduceApp:
|
| 88 |
+
"""
|
| 89 |
+
A graphical tool for exploring the shift-reduce parser. The tool
|
| 90 |
+
displays the parser's stack and the remaining text, and allows the
|
| 91 |
+
user to control the parser's operation. In particular, the user
|
| 92 |
+
can shift tokens onto the stack, and can perform reductions on the
|
| 93 |
+
top elements of the stack. A "step" button simply steps through
|
| 94 |
+
the parsing process, performing the operations that
|
| 95 |
+
``nltk.parse.ShiftReduceParser`` would use.
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
    def __init__(self, grammar, sent, trace=0):
        """
        Build the application window and initialize the parser.

        :param grammar: The CFG used to parse ``sent``.
        :param sent: The sentence to parse, as a list of tokens.
        :param trace: Trace level forwarded to the underlying
            ``SteppingShiftReduceParser``.
        """
        self._sent = sent
        self._parser = SteppingShiftReduceParser(grammar, trace)

        # Set up the main window.
        self._top = Tk()
        self._top.title("Shift Reduce Parser Application")

        # Animations.  animating_lock is a lock to prevent the demo
        # from performing new operations while it's animating.
        self._animating_lock = 0
        # _animate holds the number of animation frames; 0 disables
        # animation entirely (see the Animate menu / -,=,+ bindings).
        self._animate = IntVar(self._top)
        self._animate.set(10)  # = medium

        # The user can hide the grammar.
        self._show_grammar = IntVar(self._top)
        self._show_grammar.set(1)

        # Initialize fonts.
        self._init_fonts(self._top)

        # Set up key bindings.
        self._init_bindings()

        # Create the basic frames.  NOTE: order matters here — _init_canvas
        # must run after the fonts and feedback widgets exist.
        self._init_menubar(self._top)
        self._init_buttons(self._top)
        self._init_feedback(self._top)
        self._init_grammar(self._top)
        self._init_canvas(self._top)

        # A popup menu for reducing.
        self._reduce_menu = Menu(self._canvas, tearoff=0)

        # Reset the demo, and set the feedback frame to empty.
        self.reset()
        self._lastoper1["text"] = ""
| 136 |
+
#########################################
|
| 137 |
+
## Initialization Helpers
|
| 138 |
+
#########################################
|
| 139 |
+
|
| 140 |
+
    def _init_fonts(self, root):
        """Create the fonts used by the application, sized from the system font."""
        # See: <http://www.astro.washington.edu/owen/ROTKFolklore.html>
        self._sysfont = Font(font=Button()["font"])
        root.option_add("*Font", self._sysfont)

        # What's our font size (default=same as sysfont)
        self._size = IntVar(root)
        self._size.set(self._sysfont.cget("size"))

        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
        self._font = Font(family="helvetica", size=self._size.get())
|
| 151 |
+
|
| 152 |
+
def _init_grammar(self, parent):
|
| 153 |
+
# Grammar view.
|
| 154 |
+
self._prodframe = listframe = Frame(parent)
|
| 155 |
+
self._prodframe.pack(fill="both", side="left", padx=2)
|
| 156 |
+
self._prodlist_label = Label(
|
| 157 |
+
self._prodframe, font=self._boldfont, text="Available Reductions"
|
| 158 |
+
)
|
| 159 |
+
self._prodlist_label.pack()
|
| 160 |
+
self._prodlist = Listbox(
|
| 161 |
+
self._prodframe,
|
| 162 |
+
selectmode="single",
|
| 163 |
+
relief="groove",
|
| 164 |
+
background="white",
|
| 165 |
+
foreground="#909090",
|
| 166 |
+
font=self._font,
|
| 167 |
+
selectforeground="#004040",
|
| 168 |
+
selectbackground="#c0f0c0",
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
self._prodlist.pack(side="right", fill="both", expand=1)
|
| 172 |
+
|
| 173 |
+
self._productions = list(self._parser.grammar().productions())
|
| 174 |
+
for production in self._productions:
|
| 175 |
+
self._prodlist.insert("end", (" %s" % production))
|
| 176 |
+
self._prodlist.config(height=min(len(self._productions), 25))
|
| 177 |
+
|
| 178 |
+
# Add a scrollbar if there are more than 25 productions.
|
| 179 |
+
if 1: # len(self._productions) > 25:
|
| 180 |
+
listscroll = Scrollbar(self._prodframe, orient="vertical")
|
| 181 |
+
self._prodlist.config(yscrollcommand=listscroll.set)
|
| 182 |
+
listscroll.config(command=self._prodlist.yview)
|
| 183 |
+
listscroll.pack(side="left", fill="y")
|
| 184 |
+
|
| 185 |
+
# If they select a production, apply it.
|
| 186 |
+
self._prodlist.bind("<<ListboxSelect>>", self._prodlist_select)
|
| 187 |
+
|
| 188 |
+
# When they hover over a production, highlight it.
|
| 189 |
+
self._hover = -1
|
| 190 |
+
self._prodlist.bind("<Motion>", self._highlight_hover)
|
| 191 |
+
self._prodlist.bind("<Leave>", self._clear_hover)
|
| 192 |
+
|
| 193 |
+
    def _init_bindings(self):
        """Bind all keyboard shortcuts on the top-level window."""
        # Quit
        self._top.bind("<Control-q>", self.destroy)
        self._top.bind("<Control-x>", self.destroy)
        self._top.bind("<Alt-q>", self.destroy)
        self._top.bind("<Alt-x>", self.destroy)

        # Ops (step, shift, reduce, undo)
        self._top.bind("<space>", self.step)
        self._top.bind("<s>", self.shift)
        self._top.bind("<Alt-s>", self.shift)
        self._top.bind("<Control-s>", self.shift)
        self._top.bind("<r>", self.reduce)
        self._top.bind("<Alt-r>", self.reduce)
        self._top.bind("<Control-r>", self.reduce)
        self._top.bind("<Delete>", self.reset)
        self._top.bind("<u>", self.undo)
        self._top.bind("<Alt-u>", self.undo)
        self._top.bind("<Control-u>", self.undo)
        self._top.bind("<Control-z>", self.undo)
        self._top.bind("<BackSpace>", self.undo)

        # Misc
        self._top.bind("<Control-p>", self.postscript)
        self._top.bind("<Control-h>", self.help)
        self._top.bind("<F1>", self.help)
        self._top.bind("<Control-g>", self.edit_grammar)
        self._top.bind("<Control-t>", self.edit_sentence)

        # Animation speed control.  The value is the number of animation
        # frames, so a larger value means a slower animation.
        self._top.bind("-", lambda e, a=self._animate: a.set(20))
        self._top.bind("=", lambda e, a=self._animate: a.set(10))
        self._top.bind("+", lambda e, a=self._animate: a.set(4))
|
| 226 |
+
|
| 227 |
+
def _init_buttons(self, parent):
|
| 228 |
+
# Set up the frames.
|
| 229 |
+
self._buttonframe = buttonframe = Frame(parent)
|
| 230 |
+
buttonframe.pack(fill="none", side="bottom")
|
| 231 |
+
Button(
|
| 232 |
+
buttonframe,
|
| 233 |
+
text="Step",
|
| 234 |
+
background="#90c0d0",
|
| 235 |
+
foreground="black",
|
| 236 |
+
command=self.step,
|
| 237 |
+
).pack(side="left")
|
| 238 |
+
Button(
|
| 239 |
+
buttonframe,
|
| 240 |
+
text="Shift",
|
| 241 |
+
underline=0,
|
| 242 |
+
background="#90f090",
|
| 243 |
+
foreground="black",
|
| 244 |
+
command=self.shift,
|
| 245 |
+
).pack(side="left")
|
| 246 |
+
Button(
|
| 247 |
+
buttonframe,
|
| 248 |
+
text="Reduce",
|
| 249 |
+
underline=0,
|
| 250 |
+
background="#90f090",
|
| 251 |
+
foreground="black",
|
| 252 |
+
command=self.reduce,
|
| 253 |
+
).pack(side="left")
|
| 254 |
+
Button(
|
| 255 |
+
buttonframe,
|
| 256 |
+
text="Undo",
|
| 257 |
+
underline=0,
|
| 258 |
+
background="#f0a0a0",
|
| 259 |
+
foreground="black",
|
| 260 |
+
command=self.undo,
|
| 261 |
+
).pack(side="left")
|
| 262 |
+
|
| 263 |
+
def _init_menubar(self, parent):
|
| 264 |
+
menubar = Menu(parent)
|
| 265 |
+
|
| 266 |
+
filemenu = Menu(menubar, tearoff=0)
|
| 267 |
+
filemenu.add_command(
|
| 268 |
+
label="Reset Parser", underline=0, command=self.reset, accelerator="Del"
|
| 269 |
+
)
|
| 270 |
+
filemenu.add_command(
|
| 271 |
+
label="Print to Postscript",
|
| 272 |
+
underline=0,
|
| 273 |
+
command=self.postscript,
|
| 274 |
+
accelerator="Ctrl-p",
|
| 275 |
+
)
|
| 276 |
+
filemenu.add_command(
|
| 277 |
+
label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
|
| 278 |
+
)
|
| 279 |
+
menubar.add_cascade(label="File", underline=0, menu=filemenu)
|
| 280 |
+
|
| 281 |
+
editmenu = Menu(menubar, tearoff=0)
|
| 282 |
+
editmenu.add_command(
|
| 283 |
+
label="Edit Grammar",
|
| 284 |
+
underline=5,
|
| 285 |
+
command=self.edit_grammar,
|
| 286 |
+
accelerator="Ctrl-g",
|
| 287 |
+
)
|
| 288 |
+
editmenu.add_command(
|
| 289 |
+
label="Edit Text",
|
| 290 |
+
underline=5,
|
| 291 |
+
command=self.edit_sentence,
|
| 292 |
+
accelerator="Ctrl-t",
|
| 293 |
+
)
|
| 294 |
+
menubar.add_cascade(label="Edit", underline=0, menu=editmenu)
|
| 295 |
+
|
| 296 |
+
rulemenu = Menu(menubar, tearoff=0)
|
| 297 |
+
rulemenu.add_command(
|
| 298 |
+
label="Step", underline=1, command=self.step, accelerator="Space"
|
| 299 |
+
)
|
| 300 |
+
rulemenu.add_separator()
|
| 301 |
+
rulemenu.add_command(
|
| 302 |
+
label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s"
|
| 303 |
+
)
|
| 304 |
+
rulemenu.add_command(
|
| 305 |
+
label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r"
|
| 306 |
+
)
|
| 307 |
+
rulemenu.add_separator()
|
| 308 |
+
rulemenu.add_command(
|
| 309 |
+
label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u"
|
| 310 |
+
)
|
| 311 |
+
menubar.add_cascade(label="Apply", underline=0, menu=rulemenu)
|
| 312 |
+
|
| 313 |
+
viewmenu = Menu(menubar, tearoff=0)
|
| 314 |
+
viewmenu.add_checkbutton(
|
| 315 |
+
label="Show Grammar",
|
| 316 |
+
underline=0,
|
| 317 |
+
variable=self._show_grammar,
|
| 318 |
+
command=self._toggle_grammar,
|
| 319 |
+
)
|
| 320 |
+
viewmenu.add_separator()
|
| 321 |
+
viewmenu.add_radiobutton(
|
| 322 |
+
label="Tiny",
|
| 323 |
+
variable=self._size,
|
| 324 |
+
underline=0,
|
| 325 |
+
value=10,
|
| 326 |
+
command=self.resize,
|
| 327 |
+
)
|
| 328 |
+
viewmenu.add_radiobutton(
|
| 329 |
+
label="Small",
|
| 330 |
+
variable=self._size,
|
| 331 |
+
underline=0,
|
| 332 |
+
value=12,
|
| 333 |
+
command=self.resize,
|
| 334 |
+
)
|
| 335 |
+
viewmenu.add_radiobutton(
|
| 336 |
+
label="Medium",
|
| 337 |
+
variable=self._size,
|
| 338 |
+
underline=0,
|
| 339 |
+
value=14,
|
| 340 |
+
command=self.resize,
|
| 341 |
+
)
|
| 342 |
+
viewmenu.add_radiobutton(
|
| 343 |
+
label="Large",
|
| 344 |
+
variable=self._size,
|
| 345 |
+
underline=0,
|
| 346 |
+
value=18,
|
| 347 |
+
command=self.resize,
|
| 348 |
+
)
|
| 349 |
+
viewmenu.add_radiobutton(
|
| 350 |
+
label="Huge",
|
| 351 |
+
variable=self._size,
|
| 352 |
+
underline=0,
|
| 353 |
+
value=24,
|
| 354 |
+
command=self.resize,
|
| 355 |
+
)
|
| 356 |
+
menubar.add_cascade(label="View", underline=0, menu=viewmenu)
|
| 357 |
+
|
| 358 |
+
animatemenu = Menu(menubar, tearoff=0)
|
| 359 |
+
animatemenu.add_radiobutton(
|
| 360 |
+
label="No Animation", underline=0, variable=self._animate, value=0
|
| 361 |
+
)
|
| 362 |
+
animatemenu.add_radiobutton(
|
| 363 |
+
label="Slow Animation",
|
| 364 |
+
underline=0,
|
| 365 |
+
variable=self._animate,
|
| 366 |
+
value=20,
|
| 367 |
+
accelerator="-",
|
| 368 |
+
)
|
| 369 |
+
animatemenu.add_radiobutton(
|
| 370 |
+
label="Normal Animation",
|
| 371 |
+
underline=0,
|
| 372 |
+
variable=self._animate,
|
| 373 |
+
value=10,
|
| 374 |
+
accelerator="=",
|
| 375 |
+
)
|
| 376 |
+
animatemenu.add_radiobutton(
|
| 377 |
+
label="Fast Animation",
|
| 378 |
+
underline=0,
|
| 379 |
+
variable=self._animate,
|
| 380 |
+
value=4,
|
| 381 |
+
accelerator="+",
|
| 382 |
+
)
|
| 383 |
+
menubar.add_cascade(label="Animate", underline=1, menu=animatemenu)
|
| 384 |
+
|
| 385 |
+
helpmenu = Menu(menubar, tearoff=0)
|
| 386 |
+
helpmenu.add_command(label="About", underline=0, command=self.about)
|
| 387 |
+
helpmenu.add_command(
|
| 388 |
+
label="Instructions", underline=0, command=self.help, accelerator="F1"
|
| 389 |
+
)
|
| 390 |
+
menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
|
| 391 |
+
|
| 392 |
+
parent.config(menu=menubar)
|
| 393 |
+
|
| 394 |
+
    def _init_feedback(self, parent):
        """Create the "Last Operation" feedback bar at the window bottom."""
        self._feedbackframe = feedbackframe = Frame(parent)
        feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3)
        self._lastoper_label = Label(
            feedbackframe, text="Last Operation:", font=self._font
        )
        self._lastoper_label.pack(side="left")
        lastoperframe = Frame(feedbackframe, relief="sunken", border=1)
        lastoperframe.pack(fill="x", side="right", expand=1, padx=5)
        # _lastoper1 shows the operation name (e.g. "Shift:");
        # _lastoper2 shows its argument (e.g. the shifted token).
        self._lastoper1 = Label(
            lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font
        )
        self._lastoper2 = Label(
            lastoperframe,
            anchor="w",
            width=30,
            foreground="#004040",
            background="#f0f0f0",
            font=self._font,
        )
        self._lastoper1.pack(side="left")
        self._lastoper2.pack(side="left", fill="x", expand=1)
|
| 416 |
+
|
| 417 |
+
    def _init_canvas(self, parent):
        """Create the main canvas, its decoration lines, and the
        "Stack" / "Remaining Text" title labels."""
        self._cframe = CanvasFrame(
            parent,
            background="white",
            width=525,
            closeenough=10,
            border=2,
            relief="sunken",
        )
        self._cframe.pack(expand=1, fill="both", side="top", pady=2)
        canvas = self._canvas = self._cframe.canvas()

        # Widgets currently drawn for the stack / the remaining text;
        # rebuilt on every _redraw().
        self._stackwidgets = []
        self._rtextwidgets = []
        # Title-bar rectangle, dashed expression line, and the vertical
        # divider between stack and remaining text.  All are created with
        # zero size here and positioned later by _redraw().
        self._titlebar = canvas.create_rectangle(
            0, 0, 0, 0, fill="#c0f0f0", outline="black"
        )
        self._exprline = canvas.create_line(0, 0, 0, 0, dash=".")
        self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080")
        size = self._size.get() + 4
        self._stacklabel = TextWidget(
            canvas, "Stack", color="#004040", font=self._boldfont
        )
        self._rtextlabel = TextWidget(
            canvas, "Remaining Text", color="#004040", font=self._boldfont
        )
        self._cframe.add_widget(self._stacklabel)
        self._cframe.add_widget(self._rtextlabel)
|
| 446 |
+
#########################################
|
| 447 |
+
## Main draw procedure
|
| 448 |
+
#########################################
|
| 449 |
+
|
| 450 |
+
    def _redraw(self):
        """
        Redraw the whole canvas: reposition the title bar and labels,
        rebuild the stack and remaining-text widgets from the parser's
        current state, and re-highlight the reducible productions.
        """
        scrollregion = self._canvas["scrollregion"].split()
        (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion)

        # Delete the old stack & rtext widgets.
        for stackwidget in self._stackwidgets:
            self._cframe.destroy_widget(stackwidget)
        self._stackwidgets = []
        for rtextwidget in self._rtextwidgets:
            self._cframe.destroy_widget(rtextwidget)
        self._rtextwidgets = []

        # Position the titlebar & exprline
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        y = y2 - y1 + 10
        self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4)
        self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10)

        # Position the titlebar labels..
        (x1, y1, x2, y2) = self._stacklabel.bbox()
        self._stacklabel.move(5 - x1, 3 - y1)
        (x1, y1, x2, y2) = self._rtextlabel.bbox()
        self._rtextlabel.move(cx2 - x2 - 5, 3 - y1)

        # Draw the stack.  Tree tokens become tree segments; plain tokens
        # become text widgets.  Clicking any stack element pops up the
        # reduction menu.
        stackx = 5
        for tok in self._parser.stack():
            if isinstance(tok, Tree):
                attribs = {
                    "tree_color": "#4080a0",
                    "tree_width": 2,
                    "node_font": self._boldfont,
                    "node_color": "#006060",
                    "leaf_color": "#006060",
                    "leaf_font": self._font,
                }
                widget = tree_to_treesegment(self._canvas, tok, **attribs)
                widget.label()["color"] = "#000000"
            else:
                widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            widget.bind_click(self._popup_reduce)
            self._stackwidgets.append(widget)
            self._cframe.add_widget(widget, stackx, y)
            stackx = widget.bbox()[2] + 10

        # Draw the remaining text.
        rtextwidth = 0
        for tok in self._parser.remaining_text():
            widget = TextWidget(self._canvas, tok, color="#000000", font=self._font)
            self._rtextwidgets.append(widget)
            self._cframe.add_widget(widget, rtextwidth, y)
            rtextwidth = widget.bbox()[2] + 4

        # Allow enough room to shift the next token (for animations)
        if len(self._rtextwidgets) > 0:
            stackx += self._rtextwidgets[0].width()

        # Move the remaining text to the correct location (keep it
        # right-justified, when possible); and move the remaining text
        # label, if necessary.
        stackx = max(stackx, self._stacklabel.width() + 25)
        rlabelwidth = self._rtextlabel.width() + 10
        if stackx >= cx2 - max(rtextwidth, rlabelwidth):
            cx2 = stackx + max(rtextwidth, rlabelwidth)
        for rtextwidget in self._rtextwidgets:
            rtextwidget.move(4 + cx2 - rtextwidth, 0)
        self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0)

        # Vertical divider between the stack (left) and remaining text (right).
        midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2
        self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
        (x1, y1, x2, y2) = self._stacklabel.bbox()

        # Set up binding to allow them to shift a token by dragging it.
        if len(self._rtextwidgets) > 0:

            def drag_shift(widget, midx=midx, self=self):
                # Dropping a token past the divider performs a shift;
                # otherwise snap everything back by redrawing.
                if widget.bbox()[0] < midx:
                    self.shift()
                else:
                    self._redraw()

            self._rtextwidgets[0].bind_drag(drag_shift)
            self._rtextwidgets[0].bind_click(self.shift)

        # Draw the stack top.
        self._highlight_productions()
|
| 536 |
+
|
| 537 |
+
def _draw_stack_top(self, widget):
|
| 538 |
+
# hack..
|
| 539 |
+
midx = widget.bbox()[2] + 50
|
| 540 |
+
self._canvas.coords(self._stacktop, midx, 0, midx, 5000)
|
| 541 |
+
|
| 542 |
+
def _highlight_productions(self):
|
| 543 |
+
# Highlight the productions that can be reduced.
|
| 544 |
+
self._prodlist.selection_clear(0, "end")
|
| 545 |
+
for prod in self._parser.reducible_productions():
|
| 546 |
+
index = self._productions.index(prod)
|
| 547 |
+
self._prodlist.selection_set(index)
|
| 548 |
+
|
| 549 |
+
#########################################
|
| 550 |
+
## Button Callbacks
|
| 551 |
+
#########################################
|
| 552 |
+
|
| 553 |
+
def destroy(self, *e):
|
| 554 |
+
if self._top is None:
|
| 555 |
+
return
|
| 556 |
+
self._top.destroy()
|
| 557 |
+
self._top = None
|
| 558 |
+
|
| 559 |
+
def reset(self, *e):
|
| 560 |
+
self._parser.initialize(self._sent)
|
| 561 |
+
self._lastoper1["text"] = "Reset App"
|
| 562 |
+
self._lastoper2["text"] = ""
|
| 563 |
+
self._redraw()
|
| 564 |
+
|
| 565 |
+
def step(self, *e):
|
| 566 |
+
if self.reduce():
|
| 567 |
+
return True
|
| 568 |
+
elif self.shift():
|
| 569 |
+
return True
|
| 570 |
+
else:
|
| 571 |
+
if list(self._parser.parses()):
|
| 572 |
+
self._lastoper1["text"] = "Finished:"
|
| 573 |
+
self._lastoper2["text"] = "Success"
|
| 574 |
+
else:
|
| 575 |
+
self._lastoper1["text"] = "Finished:"
|
| 576 |
+
self._lastoper2["text"] = "Failure"
|
| 577 |
+
|
| 578 |
+
def shift(self, *e):
|
| 579 |
+
if self._animating_lock:
|
| 580 |
+
return
|
| 581 |
+
if self._parser.shift():
|
| 582 |
+
tok = self._parser.stack()[-1]
|
| 583 |
+
self._lastoper1["text"] = "Shift:"
|
| 584 |
+
self._lastoper2["text"] = "%r" % tok
|
| 585 |
+
if self._animate.get():
|
| 586 |
+
self._animate_shift()
|
| 587 |
+
else:
|
| 588 |
+
self._redraw()
|
| 589 |
+
return True
|
| 590 |
+
return False
|
| 591 |
+
|
| 592 |
+
def reduce(self, *e):
|
| 593 |
+
if self._animating_lock:
|
| 594 |
+
return
|
| 595 |
+
production = self._parser.reduce()
|
| 596 |
+
if production:
|
| 597 |
+
self._lastoper1["text"] = "Reduce:"
|
| 598 |
+
self._lastoper2["text"] = "%s" % production
|
| 599 |
+
if self._animate.get():
|
| 600 |
+
self._animate_reduce()
|
| 601 |
+
else:
|
| 602 |
+
self._redraw()
|
| 603 |
+
return production
|
| 604 |
+
|
| 605 |
+
def undo(self, *e):
|
| 606 |
+
if self._animating_lock:
|
| 607 |
+
return
|
| 608 |
+
if self._parser.undo():
|
| 609 |
+
self._redraw()
|
| 610 |
+
|
| 611 |
+
def postscript(self, *e):
|
| 612 |
+
self._cframe.print_to_file()
|
| 613 |
+
|
| 614 |
+
    def mainloop(self, *args, **kwargs):
        """
        Enter the Tkinter mainloop.  This function must be called if
        this demo is created from a non-interactive program (e.g.
        from a script); otherwise, the demo will close as soon as
        the script completes.
        """
        if in_idle():
            return
        self._top.mainloop(*args, **kwargs)
|
| 624 |
+
|
| 625 |
+
#########################################
|
| 626 |
+
## Menubar callbacks
|
| 627 |
+
#########################################
|
| 628 |
+
|
| 629 |
+
def resize(self, size=None):
|
| 630 |
+
if size is not None:
|
| 631 |
+
self._size.set(size)
|
| 632 |
+
size = self._size.get()
|
| 633 |
+
self._font.configure(size=-(abs(size)))
|
| 634 |
+
self._boldfont.configure(size=-(abs(size)))
|
| 635 |
+
self._sysfont.configure(size=-(abs(size)))
|
| 636 |
+
|
| 637 |
+
# self._stacklabel['font'] = ('helvetica', -size-4, 'bold')
|
| 638 |
+
# self._rtextlabel['font'] = ('helvetica', -size-4, 'bold')
|
| 639 |
+
# self._lastoper_label['font'] = ('helvetica', -size)
|
| 640 |
+
# self._lastoper1['font'] = ('helvetica', -size)
|
| 641 |
+
# self._lastoper2['font'] = ('helvetica', -size)
|
| 642 |
+
# self._prodlist['font'] = ('helvetica', -size)
|
| 643 |
+
# self._prodlist_label['font'] = ('helvetica', -size-2, 'bold')
|
| 644 |
+
self._redraw()
|
| 645 |
+
|
| 646 |
+
def help(self, *e):
|
| 647 |
+
# The default font's not very legible; try using 'fixed' instead.
|
| 648 |
+
try:
|
| 649 |
+
ShowText(
|
| 650 |
+
self._top,
|
| 651 |
+
"Help: Shift-Reduce Parser Application",
|
| 652 |
+
(__doc__ or "").strip(),
|
| 653 |
+
width=75,
|
| 654 |
+
font="fixed",
|
| 655 |
+
)
|
| 656 |
+
except:
|
| 657 |
+
ShowText(
|
| 658 |
+
self._top,
|
| 659 |
+
"Help: Shift-Reduce Parser Application",
|
| 660 |
+
(__doc__ or "").strip(),
|
| 661 |
+
width=75,
|
| 662 |
+
)
|
| 663 |
+
|
| 664 |
+
def about(self, *e):
|
| 665 |
+
ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper"
|
| 666 |
+
TITLE = "About: Shift-Reduce Parser Application"
|
| 667 |
+
try:
|
| 668 |
+
from tkinter.messagebox import Message
|
| 669 |
+
|
| 670 |
+
Message(message=ABOUT, title=TITLE).show()
|
| 671 |
+
except:
|
| 672 |
+
ShowText(self._top, TITLE, ABOUT)
|
| 673 |
+
|
| 674 |
+
def edit_grammar(self, *e):
|
| 675 |
+
CFGEditor(self._top, self._parser.grammar(), self.set_grammar)
|
| 676 |
+
|
| 677 |
+
def set_grammar(self, grammar):
|
| 678 |
+
self._parser.set_grammar(grammar)
|
| 679 |
+
self._productions = list(grammar.productions())
|
| 680 |
+
self._prodlist.delete(0, "end")
|
| 681 |
+
for production in self._productions:
|
| 682 |
+
self._prodlist.insert("end", (" %s" % production))
|
| 683 |
+
|
| 684 |
+
def edit_sentence(self, *e):
|
| 685 |
+
sentence = " ".join(self._sent)
|
| 686 |
+
title = "Edit Text"
|
| 687 |
+
instr = "Enter a new sentence to parse."
|
| 688 |
+
EntryDialog(self._top, sentence, instr, self.set_sentence, title)
|
| 689 |
+
|
| 690 |
+
def set_sentence(self, sent):
|
| 691 |
+
self._sent = sent.split() # [XX] use tagged?
|
| 692 |
+
self.reset()
|
| 693 |
+
|
| 694 |
+
#########################################
|
| 695 |
+
## Reduce Production Selection
|
| 696 |
+
#########################################
|
| 697 |
+
|
| 698 |
+
def _toggle_grammar(self, *e):
|
| 699 |
+
if self._show_grammar.get():
|
| 700 |
+
self._prodframe.pack(
|
| 701 |
+
fill="both", side="left", padx=2, after=self._feedbackframe
|
| 702 |
+
)
|
| 703 |
+
self._lastoper1["text"] = "Show Grammar"
|
| 704 |
+
else:
|
| 705 |
+
self._prodframe.pack_forget()
|
| 706 |
+
self._lastoper1["text"] = "Hide Grammar"
|
| 707 |
+
self._lastoper2["text"] = ""
|
| 708 |
+
|
| 709 |
+
def _prodlist_select(self, event):
|
| 710 |
+
selection = self._prodlist.curselection()
|
| 711 |
+
if len(selection) != 1:
|
| 712 |
+
return
|
| 713 |
+
index = int(selection[0])
|
| 714 |
+
production = self._parser.reduce(self._productions[index])
|
| 715 |
+
if production:
|
| 716 |
+
self._lastoper1["text"] = "Reduce:"
|
| 717 |
+
self._lastoper2["text"] = "%s" % production
|
| 718 |
+
if self._animate.get():
|
| 719 |
+
self._animate_reduce()
|
| 720 |
+
else:
|
| 721 |
+
self._redraw()
|
| 722 |
+
else:
|
| 723 |
+
# Reset the production selections.
|
| 724 |
+
self._prodlist.selection_clear(0, "end")
|
| 725 |
+
for prod in self._parser.reducible_productions():
|
| 726 |
+
index = self._productions.index(prod)
|
| 727 |
+
self._prodlist.selection_set(index)
|
| 728 |
+
|
| 729 |
+
def _popup_reduce(self, widget):
|
| 730 |
+
# Remove old commands.
|
| 731 |
+
productions = self._parser.reducible_productions()
|
| 732 |
+
if len(productions) == 0:
|
| 733 |
+
return
|
| 734 |
+
|
| 735 |
+
self._reduce_menu.delete(0, "end")
|
| 736 |
+
for production in productions:
|
| 737 |
+
self._reduce_menu.add_command(label=str(production), command=self.reduce)
|
| 738 |
+
self._reduce_menu.post(
|
| 739 |
+
self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()
|
| 740 |
+
)
|
| 741 |
+
|
| 742 |
+
#########################################
|
| 743 |
+
## Animations
|
| 744 |
+
#########################################
|
| 745 |
+
|
| 746 |
+
    def _animate_shift(self):
        """Animate the most recent shift by sliding the first remaining-text
        widget leftward onto the top of the stack."""
        # What widget are we shifting?
        widget = self._rtextwidgets[0]

        # Where are we shifting from & to?
        right = widget.bbox()[0]
        if len(self._stackwidgets) == 0:
            left = 5
        else:
            left = self._stackwidgets[-1].bbox()[2] + 10

        # Start animating.  dt is the frame count (see self._animate);
        # dx is the per-frame horizontal displacement.
        dt = self._animate.get()
        dx = (left - right) * 1.0 / dt
        self._animate_shift_frame(dt, widget, dx)
|
| 761 |
+
|
| 762 |
+
    def _animate_shift_frame(self, frame, widget, dx):
        """Draw one frame of the shift animation.

        Reschedules itself (via Tk's ``after``) until ``frame`` reaches
        zero, then finalizes the shift by moving the widget from the
        remaining-text list onto the stack.
        """
        if frame > 0:
            self._animating_lock = 1
            widget.move(dx, 0)
            self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx)
        else:
            # but: stacktop??

            # Shift the widget to the stack.
            del self._rtextwidgets[0]
            self._stackwidgets.append(widget)
            self._animating_lock = 0

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()
|
| 778 |
+
|
| 779 |
+
    def _animate_reduce(self):
        """Animate a reduce operation already performed by the parser.

        The stack widgets corresponding to the reduced children are moved
        downward before being re-attached under a new tree-segment node by
        ``_animate_reduce_frame``.
        """
        # What widgets are we shifting?
        numwidgets = len(self._parser.stack()[-1])  # number of children
        widgets = self._stackwidgets[-numwidgets:]

        # How far are we moving?  Leave room for the new node's label.
        if isinstance(widgets[0], TreeSegmentWidget):
            ydist = 15 + widgets[0].label().height()
        else:
            ydist = 15 + widgets[0].height()

        # Start animating.
        dt = self._animate.get()
        dy = ydist * 2.0 / dt
        self._animate_reduce_frame(dt / 2, widgets, dy)
|
| 794 |
+
|
| 795 |
+
    def _animate_reduce_frame(self, frame, widgets, dy):
        """Draw one frame of the reduce animation.

        Reschedules itself until ``frame`` reaches zero, then replaces the
        reduced child widgets with a single new tree-segment widget on the
        stack and refreshes the production display.
        """
        if frame > 0:
            self._animating_lock = 1
            for widget in widgets:
                widget.move(0, dy)
            self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy)
        else:
            # Animation finished: detach the old child widgets from the frame.
            del self._stackwidgets[-len(widgets) :]
            for widget in widgets:
                self._cframe.remove_widget(widget)
            tok = self._parser.stack()[-1]
            if not isinstance(tok, Tree):
                # A reduce always produces a Tree on the stack top.
                raise ValueError()
            # Build a tree segment for the newly reduced constituent and
            # place it just right of the remaining stack widgets.
            label = TextWidget(
                self._canvas, str(tok.label()), color="#006060", font=self._boldfont
            )
            widget = TreeSegmentWidget(self._canvas, label, widgets, width=2)
            (x1, y1, x2, y2) = self._stacklabel.bbox()
            y = y2 - y1 + 10
            if not self._stackwidgets:
                x = 5
            else:
                x = self._stackwidgets[-1].bbox()[2] + 10
            self._cframe.add_widget(widget, x, y)
            self._stackwidgets.append(widget)

            # Display the available productions.
            self._draw_stack_top(widget)
            self._highlight_productions()

            # (A commented-out alternative implementation that destroyed and
            # rebuilt the stack widgets was removed here.)
            self._animating_lock = 0
|
| 852 |
+
|
| 853 |
+
#########################################
|
| 854 |
+
## Hovering.
|
| 855 |
+
#########################################
|
| 856 |
+
|
| 857 |
+
    def _highlight_hover(self, event):
        """Highlight the stack widgets a hovered production would reduce.

        Only productions that are currently selected (i.e. reducible) cause
        highlighting; the RHS length determines how many stack-top widgets
        turn green.
        """
        # What production are we hovering over?
        index = self._prodlist.nearest(event.y)
        if self._hover == index:
            # Still on the same production; nothing to do.
            return

        # Clear any previous hover highlighting.
        self._clear_hover()

        # If the production corresponds to an available reduction,
        # highlight the stack.
        selection = [int(s) for s in self._prodlist.curselection()]
        if index in selection:
            rhslen = len(self._productions[index].rhs())
            for stackwidget in self._stackwidgets[-rhslen:]:
                if isinstance(stackwidget, TreeSegmentWidget):
                    stackwidget.label()["color"] = "#00a000"
                else:
                    stackwidget["color"] = "#00a000"

        # Remember what production we're hovering over.
        self._hover = index
|
| 879 |
+
|
| 880 |
+
    def _clear_hover(self, *event):
        """Restore the default color of all stack widgets after a hover.

        No-op if nothing is currently hover-highlighted (``self._hover == -1``).
        """
        # Clear any previous hover highlighting.
        if self._hover == -1:
            return
        self._hover = -1
        for stackwidget in self._stackwidgets:
            if isinstance(stackwidget, TreeSegmentWidget):
                stackwidget.label()["color"] = "black"
            else:
                stackwidget["color"] = "black"
|
| 890 |
+
|
| 891 |
+
|
| 892 |
+
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.

    Builds a tiny English PP-attachment grammar, tokenizes a demo
    sentence, and runs the ShiftReduceApp Tk main loop (blocks until
    the window is closed).
    """

    from nltk.grammar import CFG, Nonterminal, Production

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split())

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
|
| 932 |
+
|
| 933 |
+
|
| 934 |
+
if __name__ == "__main__":
|
| 935 |
+
app()
|
| 936 |
+
|
| 937 |
+
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/wordfreq_app.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Wordfreq Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from matplotlib import pylab
|
| 9 |
+
|
| 10 |
+
from nltk.corpus import gutenberg
|
| 11 |
+
from nltk.text import Text
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def plot_word_freq_dist(text):
    """Plot the cumulative frequency curve of the 50 most common words.

    :param text: an ``nltk.text.Text``; its vocabulary frequency
        distribution supplies the counts and its ``name`` the plot title.

    Opens a blocking matplotlib window (``pylab.show()``).
    """
    fd = text.vocab()

    samples = [item for item, _ in fd.most_common(50)]
    values = [fd[sample] for sample in samples]
    # Convert raw counts into cumulative percentages of all tokens.
    values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))]
    pylab.title(text.name)
    pylab.xlabel("Samples")
    pylab.ylabel("Cumulative Percentage")
    pylab.plot(values)
    pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
    pylab.show()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def app():
    """Plot the cumulative word-frequency distribution for Moby Dick."""
    t1 = Text(gutenberg.words("melville-moby_dick.txt"))
    plot_word_freq_dist(t1)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
app()
|
| 35 |
+
|
| 36 |
+
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/app/wordnet_app.py
ADDED
|
@@ -0,0 +1,997 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: WordNet Browser Application
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
|
| 5 |
+
# Paul Bone <pbone@students.csse.unimelb.edu.au>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
A WordNet Browser application which launches the default browser
|
| 11 |
+
(if it is not already running) and opens a new tab with a connection
|
| 12 |
+
to http://localhost:port/ . It also starts an HTTP server on the
|
| 13 |
+
specified port and begins serving browser requests. The default
|
| 14 |
+
port is 8000. (For command-line help, run "python wordnet -h")
|
| 15 |
+
This application requires that the user's web browser supports
|
| 16 |
+
Javascript.
|
| 17 |
+
|
| 18 |
+
BrowServer is a server for browsing the NLTK Wordnet database It first
|
| 19 |
+
launches a browser client to be used for browsing and then starts
|
| 20 |
+
serving the requests of that and maybe other clients
|
| 21 |
+
|
| 22 |
+
Usage::
|
| 23 |
+
|
| 24 |
+
browserver.py -h
|
| 25 |
+
browserver.py [-s] [-p <port>]
|
| 26 |
+
|
| 27 |
+
Options::
|
| 28 |
+
|
| 29 |
+
-h or --help
|
| 30 |
+
Display this help message.
|
| 31 |
+
|
| 32 |
+
-l <file> or --log-file <file>
|
| 33 |
+
Logs messages to the given file, If this option is not specified
|
| 34 |
+
messages are silently dropped.
|
| 35 |
+
|
| 36 |
+
-p <port> or --port <port>
|
| 37 |
+
Run the web server on this TCP port, defaults to 8000.
|
| 38 |
+
|
| 39 |
+
-s or --server-mode
|
| 40 |
+
Do not start a web browser, and do not allow a user to
|
| 41 |
+
shutdown the server through the web interface.
|
| 42 |
+
"""
|
| 43 |
+
# TODO: throughout this package variable names and docstrings need
|
| 44 |
+
# modifying to be compliant with NLTK's coding standards. Tests also
|
| 45 |
+
# need to be develop to ensure this continues to work in the face of
|
| 46 |
+
# changes to other NLTK packages.
|
| 47 |
+
|
| 48 |
+
import base64
|
| 49 |
+
import copy
|
| 50 |
+
import datetime
|
| 51 |
+
import getopt
|
| 52 |
+
import os
|
| 53 |
+
import pickle
|
| 54 |
+
import re
|
| 55 |
+
import sys
|
| 56 |
+
import threading
|
| 57 |
+
import time
|
| 58 |
+
import webbrowser
|
| 59 |
+
from collections import defaultdict
|
| 60 |
+
from http.server import BaseHTTPRequestHandler, HTTPServer
|
| 61 |
+
|
| 62 |
+
# Allow this program to run inside the NLTK source tree.
|
| 63 |
+
from sys import argv, path
|
| 64 |
+
from urllib.parse import unquote_plus
|
| 65 |
+
|
| 66 |
+
from nltk.corpus import wordnet as wn
|
| 67 |
+
from nltk.corpus.reader.wordnet import Lemma, Synset
|
| 68 |
+
|
| 69 |
+
# now included in local file
|
| 70 |
+
# from util import html_header, html_trailer, \
|
| 71 |
+
# get_static_index_page, get_static_page_by_path, \
|
| 72 |
+
# page_from_word, page_from_href
|
| 73 |
+
|
| 74 |
+
firstClient = True
|
| 75 |
+
|
| 76 |
+
# True if we're not also running a web browser. The value f server_mode
|
| 77 |
+
# gets set by demo().
|
| 78 |
+
server_mode = None
|
| 79 |
+
|
| 80 |
+
# If set this is a file object for writing log messages.
|
| 81 |
+
logfile = None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class MyServerHandler(BaseHTTPRequestHandler):
    """HTTP request handler for the NLTK WordNet browser server.

    Routes GET requests to the static index page, static HTML files,
    word searches (``search?...``), hyperlink lookups (``lookup_...``)
    and the start page, rendering HTML (or plain text for errors).
    Reads the module globals ``firstClient``, ``server_mode`` and
    ``logfile``.
    """

    def do_HEAD(self):
        # Respond to HEAD with the same headers a GET would produce.
        self.send_head()

    def do_GET(self):
        """Dispatch a GET request on its path and write the response body."""
        global firstClient
        sp = self.path[1:]  # Strip the leading '/'.
        if unquote_plus(sp) == "SHUTDOWN THE SERVER":
            # Remote shutdown is only honored when a local browser was
            # launched alongside the server (i.e. not in server mode).
            if server_mode:
                page = "Server must be killed with SIGTERM."
                type = "text/plain"
            else:
                print("Server shutting down!")
                os._exit(0)

        elif sp == "":  # First request.
            type = "text/html"
            if not server_mode and firstClient:
                firstClient = False
                page = get_static_index_page(True)
            else:
                page = get_static_index_page(False)
            word = "green"

        elif sp.endswith(".html"):  # Trying to fetch a HTML file TODO:
            type = "text/html"
            usp = unquote_plus(sp)
            if usp == "NLTK Wordnet Browser Database Info.html":
                word = "* Database Info *"
                if os.path.isfile(usp):
                    with open(usp) as infile:
                        page = infile.read()
                else:
                    # The pre-generated database-info page is missing;
                    # tell the user how to build it.
                    page = (
                        (html_header % word) + "<p>The database info file:"
                        "<p><b>"
                        + usp
                        + "</b>"
                        + "<p>was not found. Run this:"
                        + "<p><b>python dbinfo_html.py</b>"
                        + "<p>to produce it."
                        + html_trailer
                    )
            else:
                # Handle files here.
                word = sp
                page = get_static_page_by_path(usp)
        elif sp.startswith("search"):
            # This doesn't seem to work with MWEs.
            type = "text/html"
            # Pull the value of the 'nextWord' query parameter.
            parts = (sp.split("?")[1]).split("&")
            word = [
                p.split("=")[1].replace("+", " ")
                for p in parts
                if p.startswith("nextWord")
            ][0]
            page, word = page_from_word(word)
        elif sp.startswith("lookup_"):
            # TODO add a variation of this that takes a non ecoded word or MWE.
            type = "text/html"
            sp = sp[len("lookup_") :]
            page, word = page_from_href(sp)
        elif sp == "start_page":
            # if this is the first request we should display help
            # information, and possibly set a default word.
            type = "text/html"
            page, word = page_from_word("wordnet")
        else:
            type = "text/plain"
            page = "Could not parse request: '%s'" % sp

        # Send result.
        self.send_head(type)
        self.wfile.write(page.encode("utf8"))

    def send_head(self, type=None):
        """Send a 200 response with the given Content-type header.

        NOTE(review): ``type`` shadows the builtin; kept for interface
        compatibility.
        """
        self.send_response(200)
        self.send_header("Content-type", type)
        self.end_headers()

    def log_message(self, format, *args):
        """Append an access-log line to the module-level ``logfile``.

        Messages are silently dropped when no log file is configured.
        """
        global logfile

        if logfile:
            logfile.write(
                "%s - - [%s] %s\n"
                % (self.address_string(), self.log_date_time_string(), format % args)
            )
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def get_unique_counter_from_url(sp):
    """
    Extract the unique counter from the URL if it has one. Otherwise return
    null.

    The counter is the integer following the last URL-encoded '#' ("%23").
    """
    _, marker, tail = sp.rpartition("%23")
    if not marker:
        # No encoded '#' anywhere in the URL.
        return None
    return int(tail)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def wnb(port=8000, runBrowser=True, logfilename=None):
    """
    Run NLTK Wordnet Browser Server.

    :param port: The port number for the server to listen on, defaults to
                 8000
    :type port: int

    :param runBrowser: True to start a web browser and point it at the web
                       server.
    :type runBrowser: bool

    :param logfilename: Optional path of a file access-log messages are
                        appended to; if None, messages are dropped.
    :type logfilename: str
    """
    # The webbrowser module is unpredictable: console browsers block and
    # GUI browsers don't, and the HTTP server itself blocks in
    # serve_forever().  So the browser is launched from a separate thread,
    # synchronized through an Event so it does not connect before the
    # server socket exists.
    global server_mode, logfile
    server_mode = not runBrowser

    # Setup logging.
    if logfilename:
        try:
            logfile = open(logfilename, "a", 1)  # 1 means 'line buffering'
        except OSError as e:
            # Bug fix: the original passed extra arguments to
            # sys.stderr.write(), which raised a TypeError instead of
            # printing the message.
            sys.stderr.write("Couldn't open %s for writing: %s\n" % (logfilename, e))
            sys.exit(1)
    else:
        logfile = None

    # Compute URL and start web browser
    url = "http://localhost:" + str(port)

    server_ready = None
    browser_thread = None

    if runBrowser:
        server_ready = threading.Event()
        browser_thread = startBrowser(url, server_ready)

    # Start the server.
    server = HTTPServer(("", port), MyServerHandler)
    if logfile:
        logfile.write("NLTK Wordnet browser server running serving: %s\n" % url)
    if runBrowser:
        # The server object exists now, so the browser thread may connect.
        server_ready.set()

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass

    if runBrowser:
        browser_thread.join()

    if logfile:
        logfile.close()
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def startBrowser(url, server_ready):
    """Open *url* in the default web browser from a background thread.

    The thread waits on *server_ready* (a ``threading.Event``) so the
    browser does not connect before the HTTP server is listening.

    :return: the started ``threading.Thread`` (callers may ``join()`` it).
    """
    def run():
        # Block until the server signals readiness.
        server_ready.wait()
        time.sleep(1)  # Wait a little bit more, there's still the chance of
        # a race condition.
        webbrowser.open(url, new=2, autoraise=1)

    t = threading.Thread(target=run)
    t.start()
    return t
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
#####################################################################
|
| 270 |
+
# Utilities
|
| 271 |
+
#####################################################################
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
"""
|
| 275 |
+
WordNet Browser Utilities.
|
| 276 |
+
|
| 277 |
+
This provides a backend to both wxbrowse and browserver.py.
|
| 278 |
+
"""
|
| 279 |
+
|
| 280 |
+
################################################################################
|
| 281 |
+
#
|
| 282 |
+
# Main logic for wordnet browser.
|
| 283 |
+
#
|
| 284 |
+
|
| 285 |
+
# This is wrapped inside a function since wn is only available if the
|
| 286 |
+
# WordNet corpus is installed.
|
| 287 |
+
def _pos_tuples():
    """Return (wordnet POS constant, single-letter code, display name)
    triples for each part of speech the browser supports."""
    return [
        (wn.NOUN, "N", "noun"),
        (wn.VERB, "V", "verb"),
        (wn.ADJ, "J", "adj"),
        (wn.ADV, "R", "adv"),
    ]
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _pos_match(pos_tuple):
    """
    This function returns the complete pos tuple for the partial pos
    tuple given to it.  It attempts to match it against the first
    non-null component of the given pos tuple.
    """
    if pos_tuple[0] == "s":
        # Satellite adjectives are looked up as plain adjectives.
        pos_tuple = ("a", pos_tuple[1], pos_tuple[2])
    # Index of the first non-null component (falls back to the last
    # index when every component is None, matching the original loop).
    idx = 0
    for idx, component in enumerate(pos_tuple):
        if component is not None:
            break
    return next(
        (candidate for candidate in _pos_tuples() if candidate[idx] == pos_tuple[idx]),
        None,
    )
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
# Relation-type identifiers used to tag the sections of a synset display.
HYPONYM = 0
HYPERNYM = 1
CLASS_REGIONAL = 2
PART_HOLONYM = 3
PART_MERONYM = 4
ATTRIBUTE = 5
SUBSTANCE_HOLONYM = 6
SUBSTANCE_MERONYM = 7
MEMBER_HOLONYM = 8
MEMBER_MERONYM = 9
VERB_GROUP = 10
INSTANCE_HYPONYM = 12
INSTANCE_HYPERNYM = 13
CAUSE = 14
ALSO_SEE = 15
SIMILAR = 16
ENTAILMENT = 17
ANTONYM = 18
FRAMES = 19
PERTAINYM = 20

# NOTE(review): the next five lines rebind CLASS_CATEGORY, CLASS_USAGE and
# CLASS_REGIONAL (also bound above); only the final values survive
# (CLASS_CATEGORY == 11, CLASS_USAGE == 24, CLASS_REGIONAL == 23).
# Preserved as-is to avoid changing any identifier's runtime value.
CLASS_CATEGORY = 21
CLASS_USAGE = 22
CLASS_REGIONAL = 23
CLASS_USAGE = 24
CLASS_CATEGORY = 11

DERIVATIONALLY_RELATED_FORM = 25

INDIRECT_HYPERNYMS = 26
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
def lemma_property(word, synset, func):
    """Collect a per-lemma property for the lemmas of *synset* named *word*.

    :param word: the lemma name to match
    :param synset: the synset whose lemmas are examined
    :param func: callable mapping a lemma to a list of values
    :return: the concatenation of ``func(lemma)`` for every matching lemma
    """
    def flattern(l):
        # Concatenate the per-lemma result lists into one flat list.
        if l == []:
            return []
        else:
            return l[0] + flattern(l[1:])

    # Bug fix: ``Lemma.name`` is a method in NLTK 3.x; the original compared
    # the bound method object itself to ``word``, which never matched, so
    # this function always returned [].
    return flattern([func(l) for l in synset.lemmas() if l.name() == word])
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def rebuild_tree(orig_tree):
    """Recursively convert a nested-list tree (first element is the node,
    the rest are subtrees) into ``(node, [children...])`` pairs."""
    head, *subtrees = orig_tree
    rebuilt_children = []
    for subtree in subtrees:
        rebuilt_children.append(rebuild_tree(subtree))
    return (head, rebuilt_children)
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def get_relations_data(word, synset):
    """
    Get synset relations data for a synset.  Note that this doesn't
    yet support things such as full hyponym vs direct hyponym.

    :param word: the word the display is for (used for lemma-level relations)
    :param synset: the synset whose relations are collected
    :return: a tuple of ``(relation-id, heading, related-items)`` triples
    :raises TypeError: if the synset's part of speech is not handled
    """
    if synset.pos() == wn.NOUN:
        return (
            (HYPONYM, "Hyponyms", synset.hyponyms()),
            (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()),
            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
            (
                INDIRECT_HYPERNYMS,
                "Indirect hypernyms",
                rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
            ),
            # hypernyms', 'Sister terms',
            (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()),
            # (CLASS_REGIONAL, ['domain term region'], ),
            (PART_HOLONYM, "Part holonyms", synset.part_holonyms()),
            (PART_MERONYM, "Part meronyms", synset.part_meronyms()),
            (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()),
            (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()),
            (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()),
            (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()),
            (ATTRIBUTE, "Attributes", synset.attributes()),
            (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())),
            (
                DERIVATIONALLY_RELATED_FORM,
                "Derivationally related form",
                lemma_property(
                    word, synset, lambda l: l.derivationally_related_forms()
                ),
            ),
        )
    elif synset.pos() == wn.VERB:
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
            (HYPONYM, "Hyponym", synset.hyponyms()),
            (HYPERNYM, "Direct hypernyms", synset.hypernyms()),
            (
                INDIRECT_HYPERNYMS,
                "Indirect hypernyms",
                rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1],
            ),
            (ENTAILMENT, "Entailments", synset.entailments()),
            (CAUSE, "Causes", synset.causes()),
            (ALSO_SEE, "Also see", synset.also_sees()),
            (VERB_GROUP, "Verb Groups", synset.verb_groups()),
            (
                DERIVATIONALLY_RELATED_FORM,
                "Derivationally related form",
                lemma_property(
                    word, synset, lambda l: l.derivationally_related_forms()
                ),
            ),
        )
    # Bug fix: the original wrote ``synset.pos == wn.ADJ_SAT`` (comparing the
    # bound method, which is always False), so satellite adjectives fell
    # through to the TypeError branch below.
    elif synset.pos() == wn.ADJ or synset.pos() == wn.ADJ_SAT:
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
            (SIMILAR, "Similar to", synset.similar_tos()),
            # Participle of verb - not supported by corpus
            (
                PERTAINYM,
                "Pertainyms",
                lemma_property(word, synset, lambda l: l.pertainyms()),
            ),
            (ATTRIBUTE, "Attributes", synset.attributes()),
            (ALSO_SEE, "Also see", synset.also_sees()),
        )
    elif synset.pos() == wn.ADV:
        # This is weird. adverbs such as 'quick' and 'fast' don't seem
        # to have antonyms returned by the corpus.a
        return (
            (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())),
        )
        # Derived from adjective - not supported by corpus
    else:
        raise TypeError("Unhandled synset POS type: " + str(synset.pos()))
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
html_header = """
|
| 442 |
+
<!DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
|
| 443 |
+
'http://www.w3.org/TR/html4/strict.dtd'>
|
| 444 |
+
<html>
|
| 445 |
+
<head>
|
| 446 |
+
<meta name='generator' content=
|
| 447 |
+
'HTML Tidy for Windows (vers 14 February 2006), see www.w3.org'>
|
| 448 |
+
<meta http-equiv='Content-Type' content=
|
| 449 |
+
'text/html; charset=us-ascii'>
|
| 450 |
+
<title>NLTK Wordnet Browser display of: %s</title></head>
|
| 451 |
+
<body bgcolor='#F5F5F5' text='#000000'>
|
| 452 |
+
"""
|
| 453 |
+
html_trailer = """
|
| 454 |
+
</body>
|
| 455 |
+
</html>
|
| 456 |
+
"""
|
| 457 |
+
|
| 458 |
+
explanation = """
|
| 459 |
+
<h3>Search Help</h3>
|
| 460 |
+
<ul><li>The display below the line is an example of the output the browser
|
| 461 |
+
shows you when you enter a search word. The search word was <b>green</b>.</li>
|
| 462 |
+
<li>The search result shows for different parts of speech the <b>synsets</b>
|
| 463 |
+
i.e. different meanings for the word.</li>
|
| 464 |
+
<li>All underlined texts are hypertext links. There are two types of links:
|
| 465 |
+
word links and others. Clicking a word link carries out a search for the word
|
| 466 |
+
in the Wordnet database.</li>
|
| 467 |
+
<li>Clicking a link of the other type opens a display section of data attached
|
| 468 |
+
to that link. Clicking that link a second time closes the section again.</li>
|
| 469 |
+
<li>Clicking <u>S:</u> opens a section showing the relations for that synset.
|
| 470 |
+
</li>
|
| 471 |
+
<li>Clicking on a relation name opens a section that displays the associated
|
| 472 |
+
synsets.</li>
|
| 473 |
+
<li>Type a search word in the <b>Word</b> field and start the search by the
|
| 474 |
+
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
|
| 475 |
+
</ul>
|
| 476 |
+
<hr width='100%'>
|
| 477 |
+
"""
|
| 478 |
+
|
| 479 |
+
# HTML oriented functions
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
def _bold(txt):
|
| 483 |
+
return "<b>%s</b>" % txt
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
def _center(txt):
|
| 487 |
+
return "<center>%s</center>" % txt
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
def _hlev(n, txt):
|
| 491 |
+
return "<h%d>%s</h%d>" % (n, txt, n)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
def _italic(txt):
|
| 495 |
+
return "<i>%s</i>" % txt
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def _li(txt):
|
| 499 |
+
return "<li>%s</li>" % txt
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
def pg(word, body):
|
| 503 |
+
"""
|
| 504 |
+
Return a HTML page of NLTK Browser format constructed from the
|
| 505 |
+
word and body
|
| 506 |
+
|
| 507 |
+
:param word: The word that the body corresponds to
|
| 508 |
+
:type word: str
|
| 509 |
+
:param body: The HTML body corresponding to the word
|
| 510 |
+
:type body: str
|
| 511 |
+
:return: a HTML page for the word-body combination
|
| 512 |
+
:rtype: str
|
| 513 |
+
"""
|
| 514 |
+
return (html_header % word) + body + html_trailer
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
def _ul(txt):
|
| 518 |
+
return "<ul>" + txt + "</ul>"
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def _abbc(txt):
|
| 522 |
+
"""
|
| 523 |
+
abbc = asterisks, breaks, bold, center
|
| 524 |
+
"""
|
| 525 |
+
return _center(_bold("<br>" * 10 + "*" * 10 + " " + txt + " " + "*" * 10))
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n"
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
def _get_synset(synset_key):
|
| 532 |
+
"""
|
| 533 |
+
The synset key is the unique name of the synset, this can be
|
| 534 |
+
retrieved via synset.name()
|
| 535 |
+
"""
|
| 536 |
+
return wn.synset(synset_key)
|
| 537 |
+
|
| 538 |
+
|
| 539 |
+
def _collect_one_synset(word, synset, synset_relations):
|
| 540 |
+
"""
|
| 541 |
+
Returns the HTML string for one synset or word
|
| 542 |
+
|
| 543 |
+
:param word: the current word
|
| 544 |
+
:type word: str
|
| 545 |
+
:param synset: a synset
|
| 546 |
+
:type synset: synset
|
| 547 |
+
:param synset_relations: information about which synset relations
|
| 548 |
+
to display.
|
| 549 |
+
:type synset_relations: dict(synset_key, set(relation_id))
|
| 550 |
+
:return: The HTML string built for this synset
|
| 551 |
+
:rtype: str
|
| 552 |
+
"""
|
| 553 |
+
if isinstance(synset, tuple): # It's a word
|
| 554 |
+
raise NotImplementedError("word not supported by _collect_one_synset")
|
| 555 |
+
|
| 556 |
+
typ = "S"
|
| 557 |
+
pos_tuple = _pos_match((synset.pos(), None, None))
|
| 558 |
+
assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos()
|
| 559 |
+
descr = pos_tuple[2]
|
| 560 |
+
ref = copy.deepcopy(Reference(word, synset_relations))
|
| 561 |
+
ref.toggle_synset(synset)
|
| 562 |
+
synset_label = typ + ";"
|
| 563 |
+
if synset.name() in synset_relations:
|
| 564 |
+
synset_label = _bold(synset_label)
|
| 565 |
+
s = f"<li>{make_lookup_link(ref, synset_label)} ({descr}) "
|
| 566 |
+
|
| 567 |
+
def format_lemma(w):
|
| 568 |
+
w = w.replace("_", " ")
|
| 569 |
+
if w.lower() == word:
|
| 570 |
+
return _bold(w)
|
| 571 |
+
else:
|
| 572 |
+
ref = Reference(w)
|
| 573 |
+
return make_lookup_link(ref, w)
|
| 574 |
+
|
| 575 |
+
s += ", ".join(format_lemma(l.name()) for l in synset.lemmas())
|
| 576 |
+
|
| 577 |
+
gl = " ({}) <i>{}</i> ".format(
|
| 578 |
+
synset.definition(),
|
| 579 |
+
"; ".join('"%s"' % e for e in synset.examples()),
|
| 580 |
+
)
|
| 581 |
+
return s + gl + _synset_relations(word, synset, synset_relations) + "</li>\n"
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
def _collect_all_synsets(word, pos, synset_relations=dict()):
|
| 585 |
+
"""
|
| 586 |
+
Return a HTML unordered list of synsets for the given word and
|
| 587 |
+
part of speech.
|
| 588 |
+
"""
|
| 589 |
+
return "<ul>%s\n</ul>\n" % "".join(
|
| 590 |
+
_collect_one_synset(word, synset, synset_relations)
|
| 591 |
+
for synset in wn.synsets(word, pos)
|
| 592 |
+
)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def _synset_relations(word, synset, synset_relations):
|
| 596 |
+
"""
|
| 597 |
+
Builds the HTML string for the relations of a synset
|
| 598 |
+
|
| 599 |
+
:param word: The current word
|
| 600 |
+
:type word: str
|
| 601 |
+
:param synset: The synset for which we're building the relations.
|
| 602 |
+
:type synset: Synset
|
| 603 |
+
:param synset_relations: synset keys and relation types for which to display relations.
|
| 604 |
+
:type synset_relations: dict(synset_key, set(relation_type))
|
| 605 |
+
:return: The HTML for a synset's relations
|
| 606 |
+
:rtype: str
|
| 607 |
+
"""
|
| 608 |
+
|
| 609 |
+
if not synset.name() in synset_relations:
|
| 610 |
+
return ""
|
| 611 |
+
ref = Reference(word, synset_relations)
|
| 612 |
+
|
| 613 |
+
def relation_html(r):
|
| 614 |
+
if isinstance(r, Synset):
|
| 615 |
+
return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0])
|
| 616 |
+
elif isinstance(r, Lemma):
|
| 617 |
+
return relation_html(r.synset())
|
| 618 |
+
elif isinstance(r, tuple):
|
| 619 |
+
# It's probably a tuple containing a Synset and a list of
|
| 620 |
+
# similar tuples. This forms a tree of synsets.
|
| 621 |
+
return "{}\n<ul>{}</ul>\n".format(
|
| 622 |
+
relation_html(r[0]),
|
| 623 |
+
"".join("<li>%s</li>\n" % relation_html(sr) for sr in r[1]),
|
| 624 |
+
)
|
| 625 |
+
else:
|
| 626 |
+
raise TypeError(
|
| 627 |
+
"r must be a synset, lemma or list, it was: type(r) = %s, r = %s"
|
| 628 |
+
% (type(r), r)
|
| 629 |
+
)
|
| 630 |
+
|
| 631 |
+
def make_synset_html(db_name, disp_name, rels):
|
| 632 |
+
synset_html = "<i>%s</i>\n" % make_lookup_link(
|
| 633 |
+
copy.deepcopy(ref).toggle_synset_relation(synset, db_name),
|
| 634 |
+
disp_name,
|
| 635 |
+
)
|
| 636 |
+
|
| 637 |
+
if db_name in ref.synset_relations[synset.name()]:
|
| 638 |
+
synset_html += "<ul>%s</ul>\n" % "".join(
|
| 639 |
+
"<li>%s</li>\n" % relation_html(r) for r in rels
|
| 640 |
+
)
|
| 641 |
+
|
| 642 |
+
return synset_html
|
| 643 |
+
|
| 644 |
+
html = (
|
| 645 |
+
"<ul>"
|
| 646 |
+
+ "\n".join(
|
| 647 |
+
"<li>%s</li>" % make_synset_html(*rel_data)
|
| 648 |
+
for rel_data in get_relations_data(word, synset)
|
| 649 |
+
if rel_data[2] != []
|
| 650 |
+
)
|
| 651 |
+
+ "</ul>"
|
| 652 |
+
)
|
| 653 |
+
|
| 654 |
+
return html
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
class Reference:
|
| 658 |
+
"""
|
| 659 |
+
A reference to a page that may be generated by page_word
|
| 660 |
+
"""
|
| 661 |
+
|
| 662 |
+
def __init__(self, word, synset_relations=dict()):
|
| 663 |
+
"""
|
| 664 |
+
Build a reference to a new page.
|
| 665 |
+
|
| 666 |
+
word is the word or words (separated by commas) for which to
|
| 667 |
+
search for synsets of
|
| 668 |
+
|
| 669 |
+
synset_relations is a dictionary of synset keys to sets of
|
| 670 |
+
synset relation identifaiers to unfold a list of synset
|
| 671 |
+
relations for.
|
| 672 |
+
"""
|
| 673 |
+
self.word = word
|
| 674 |
+
self.synset_relations = synset_relations
|
| 675 |
+
|
| 676 |
+
def encode(self):
|
| 677 |
+
"""
|
| 678 |
+
Encode this reference into a string to be used in a URL.
|
| 679 |
+
"""
|
| 680 |
+
# This uses a tuple rather than an object since the python
|
| 681 |
+
# pickle representation is much smaller and there is no need
|
| 682 |
+
# to represent the complete object.
|
| 683 |
+
string = pickle.dumps((self.word, self.synset_relations), -1)
|
| 684 |
+
return base64.urlsafe_b64encode(string).decode()
|
| 685 |
+
|
| 686 |
+
@staticmethod
|
| 687 |
+
def decode(string):
|
| 688 |
+
"""
|
| 689 |
+
Decode a reference encoded with Reference.encode
|
| 690 |
+
"""
|
| 691 |
+
string = base64.urlsafe_b64decode(string.encode())
|
| 692 |
+
word, synset_relations = pickle.loads(string)
|
| 693 |
+
return Reference(word, synset_relations)
|
| 694 |
+
|
| 695 |
+
def toggle_synset_relation(self, synset, relation):
|
| 696 |
+
"""
|
| 697 |
+
Toggle the display of the relations for the given synset and
|
| 698 |
+
relation type.
|
| 699 |
+
|
| 700 |
+
This function will throw a KeyError if the synset is currently
|
| 701 |
+
not being displayed.
|
| 702 |
+
"""
|
| 703 |
+
if relation in self.synset_relations[synset.name()]:
|
| 704 |
+
self.synset_relations[synset.name()].remove(relation)
|
| 705 |
+
else:
|
| 706 |
+
self.synset_relations[synset.name()].add(relation)
|
| 707 |
+
|
| 708 |
+
return self
|
| 709 |
+
|
| 710 |
+
def toggle_synset(self, synset):
|
| 711 |
+
"""
|
| 712 |
+
Toggle displaying of the relation types for the given synset
|
| 713 |
+
"""
|
| 714 |
+
if synset.name() in self.synset_relations:
|
| 715 |
+
del self.synset_relations[synset.name()]
|
| 716 |
+
else:
|
| 717 |
+
self.synset_relations[synset.name()] = set()
|
| 718 |
+
|
| 719 |
+
return self
|
| 720 |
+
|
| 721 |
+
|
| 722 |
+
def make_lookup_link(ref, label):
|
| 723 |
+
return f'<a href="lookup_{ref.encode()}">{label}</a>'
|
| 724 |
+
|
| 725 |
+
|
| 726 |
+
def page_from_word(word):
|
| 727 |
+
"""
|
| 728 |
+
Return a HTML page for the given word.
|
| 729 |
+
|
| 730 |
+
:type word: str
|
| 731 |
+
:param word: The currently active word
|
| 732 |
+
:return: A tuple (page,word), where page is the new current HTML page
|
| 733 |
+
to be sent to the browser and
|
| 734 |
+
word is the new current word
|
| 735 |
+
:rtype: A tuple (str,str)
|
| 736 |
+
"""
|
| 737 |
+
return page_from_reference(Reference(word))
|
| 738 |
+
|
| 739 |
+
|
| 740 |
+
def page_from_href(href):
|
| 741 |
+
"""
|
| 742 |
+
Returns a tuple of the HTML page built and the new current word
|
| 743 |
+
|
| 744 |
+
:param href: The hypertext reference to be solved
|
| 745 |
+
:type href: str
|
| 746 |
+
:return: A tuple (page,word), where page is the new current HTML page
|
| 747 |
+
to be sent to the browser and
|
| 748 |
+
word is the new current word
|
| 749 |
+
:rtype: A tuple (str,str)
|
| 750 |
+
"""
|
| 751 |
+
return page_from_reference(Reference.decode(href))
|
| 752 |
+
|
| 753 |
+
|
| 754 |
+
def page_from_reference(href):
|
| 755 |
+
"""
|
| 756 |
+
Returns a tuple of the HTML page built and the new current word
|
| 757 |
+
|
| 758 |
+
:param href: The hypertext reference to be solved
|
| 759 |
+
:type href: str
|
| 760 |
+
:return: A tuple (page,word), where page is the new current HTML page
|
| 761 |
+
to be sent to the browser and
|
| 762 |
+
word is the new current word
|
| 763 |
+
:rtype: A tuple (str,str)
|
| 764 |
+
"""
|
| 765 |
+
word = href.word
|
| 766 |
+
pos_forms = defaultdict(list)
|
| 767 |
+
words = word.split(",")
|
| 768 |
+
words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""]
|
| 769 |
+
if len(words) == 0:
|
| 770 |
+
# No words were found.
|
| 771 |
+
return "", "Please specify a word to search for."
|
| 772 |
+
|
| 773 |
+
# This looks up multiple words at once. This is probably not
|
| 774 |
+
# necessary and may lead to problems.
|
| 775 |
+
for w in words:
|
| 776 |
+
for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
|
| 777 |
+
form = wn.morphy(w, pos)
|
| 778 |
+
if form and form not in pos_forms[pos]:
|
| 779 |
+
pos_forms[pos].append(form)
|
| 780 |
+
body = ""
|
| 781 |
+
for pos, pos_str, name in _pos_tuples():
|
| 782 |
+
if pos in pos_forms:
|
| 783 |
+
body += _hlev(3, name) + "\n"
|
| 784 |
+
for w in pos_forms[pos]:
|
| 785 |
+
# Not all words of exc files are in the database, skip
|
| 786 |
+
# to the next word if a KeyError is raised.
|
| 787 |
+
try:
|
| 788 |
+
body += _collect_all_synsets(w, pos, href.synset_relations)
|
| 789 |
+
except KeyError:
|
| 790 |
+
pass
|
| 791 |
+
if not body:
|
| 792 |
+
body = "The word or words '%s' where not found in the dictionary." % word
|
| 793 |
+
return body, word
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
#####################################################################
|
| 797 |
+
# Static pages
|
| 798 |
+
#####################################################################
|
| 799 |
+
|
| 800 |
+
|
| 801 |
+
def get_static_page_by_path(path):
|
| 802 |
+
"""
|
| 803 |
+
Return a static HTML page from the path given.
|
| 804 |
+
"""
|
| 805 |
+
if path == "index_2.html":
|
| 806 |
+
return get_static_index_page(False)
|
| 807 |
+
elif path == "index.html":
|
| 808 |
+
return get_static_index_page(True)
|
| 809 |
+
elif path == "NLTK Wordnet Browser Database Info.html":
|
| 810 |
+
return "Display of Wordnet Database Statistics is not supported"
|
| 811 |
+
elif path == "upper_2.html":
|
| 812 |
+
return get_static_upper_page(False)
|
| 813 |
+
elif path == "upper.html":
|
| 814 |
+
return get_static_upper_page(True)
|
| 815 |
+
elif path == "web_help.html":
|
| 816 |
+
return get_static_web_help_page()
|
| 817 |
+
elif path == "wx_help.html":
|
| 818 |
+
return get_static_wx_help_page()
|
| 819 |
+
else:
|
| 820 |
+
return "Internal error: Path for static page '%s' is unknown" % path
|
| 821 |
+
|
| 822 |
+
|
| 823 |
+
def get_static_web_help_page():
|
| 824 |
+
"""
|
| 825 |
+
Return the static web help page.
|
| 826 |
+
"""
|
| 827 |
+
return """
|
| 828 |
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
| 829 |
+
<html>
|
| 830 |
+
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
|
| 831 |
+
Copyright (C) 2001-2022 NLTK Project
|
| 832 |
+
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
|
| 833 |
+
URL: <https://www.nltk.org/>
|
| 834 |
+
For license information, see LICENSE.TXT -->
|
| 835 |
+
<head>
|
| 836 |
+
<meta http-equiv='Content-Type' content='text/html; charset=us-ascii'>
|
| 837 |
+
<title>NLTK Wordnet Browser display of: * Help *</title>
|
| 838 |
+
</head>
|
| 839 |
+
<body bgcolor='#F5F5F5' text='#000000'>
|
| 840 |
+
<h2>NLTK Wordnet Browser Help</h2>
|
| 841 |
+
<p>The NLTK Wordnet Browser is a tool to use in browsing the Wordnet database. It tries to behave like the Wordnet project's web browser but the difference is that the NLTK Wordnet Browser uses a local Wordnet database.
|
| 842 |
+
<p><b>You are using the Javascript client part of the NLTK Wordnet BrowseServer.</b> We assume your browser is in tab sheets enabled mode.</p>
|
| 843 |
+
<p>For background information on Wordnet, see the Wordnet project home page: <a href="https://wordnet.princeton.edu/"><b> https://wordnet.princeton.edu/</b></a>. For more information on the NLTK project, see the project home:
|
| 844 |
+
<a href="https://www.nltk.org/"><b>https://www.nltk.org/</b></a>. To get an idea of what the Wordnet version used by this browser includes choose <b>Show Database Info</b> from the <b>View</b> submenu.</p>
|
| 845 |
+
<h3>Word search</h3>
|
| 846 |
+
<p>The word to be searched is typed into the <b>New Word</b> field and the search started with Enter or by clicking the <b>Search</b> button. There is no uppercase/lowercase distinction: the search word is transformed to lowercase before the search.</p>
|
| 847 |
+
<p>In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by making certain morphological substitutions. Typing <b>fLIeS</b> as an obscure example gives one <a href="MfLIeS">this</a>. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination.</p>
|
| 848 |
+
<p>The result of a search is a display of one or more
|
| 849 |
+
<b>synsets</b> for every part of speech in which a form of the
|
| 850 |
+
search word was found to occur. A synset is a set of words
|
| 851 |
+
having the same sense or meaning. Each word in a synset that is
|
| 852 |
+
underlined is a hyperlink which can be clicked to trigger an
|
| 853 |
+
automatic search for that word.</p>
|
| 854 |
+
<p>Every synset has a hyperlink <b>S:</b> at the start of its
|
| 855 |
+
display line. Clicking that symbol shows you the name of every
|
| 856 |
+
<b>relation</b> that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.</p>
|
| 857 |
+
<p>It is also possible to give two or more words or collocations to be searched at the same time separating them with a comma like this <a href="Mcheer up,clear up">cheer up,clear up</a>, for example. Click the previous link to see what this kind of search looks like and then come back to this page by using the <b>Alt+LeftArrow</b> key combination. As you could see the search result includes the synsets found in the same order than the forms were given in the search field.</p>
|
| 858 |
+
<p>
|
| 859 |
+
There are also word level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink <b>W:</b> at their beginning. Clicking this link shows more info on the word in question.</p>
|
| 860 |
+
<h3>The Buttons</h3>
|
| 861 |
+
<p>The <b>Search</b> and <b>Help</b> buttons need no more explanation. </p>
|
| 862 |
+
<p>The <b>Show Database Info</b> button shows a collection of Wordnet database statistics.</p>
|
| 863 |
+
<p>The <b>Shutdown the Server</b> button is shown for the first client of the BrowServer program i.e. for the client that is automatically launched when the BrowServer is started but not for the succeeding clients in order to protect the server from accidental shutdowns.
|
| 864 |
+
</p></body>
|
| 865 |
+
</html>
|
| 866 |
+
"""
|
| 867 |
+
|
| 868 |
+
|
| 869 |
+
def get_static_welcome_message():
|
| 870 |
+
"""
|
| 871 |
+
Get the static welcome page.
|
| 872 |
+
"""
|
| 873 |
+
return """
|
| 874 |
+
<h3>Search Help</h3>
|
| 875 |
+
<ul><li>The display below the line is an example of the output the browser
|
| 876 |
+
shows you when you enter a search word. The search word was <b>green</b>.</li>
|
| 877 |
+
<li>The search result shows for different parts of speech the <b>synsets</b>
|
| 878 |
+
i.e. different meanings for the word.</li>
|
| 879 |
+
<li>All underlined texts are hypertext links. There are two types of links:
|
| 880 |
+
word links and others. Clicking a word link carries out a search for the word
|
| 881 |
+
in the Wordnet database.</li>
|
| 882 |
+
<li>Clicking a link of the other type opens a display section of data attached
|
| 883 |
+
to that link. Clicking that link a second time closes the section again.</li>
|
| 884 |
+
<li>Clicking <u>S:</u> opens a section showing the relations for that synset.</li>
|
| 885 |
+
<li>Clicking on a relation name opens a section that displays the associated
|
| 886 |
+
synsets.</li>
|
| 887 |
+
<li>Type a search word in the <b>Next Word</b> field and start the search by the
|
| 888 |
+
<b>Enter/Return</b> key or click the <b>Search</b> button.</li>
|
| 889 |
+
</ul>
|
| 890 |
+
"""
|
| 891 |
+
|
| 892 |
+
|
| 893 |
+
def get_static_index_page(with_shutdown):
|
| 894 |
+
"""
|
| 895 |
+
Get the static index page.
|
| 896 |
+
"""
|
| 897 |
+
template = """
|
| 898 |
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN" "http://www.w3.org/TR/html4/frameset.dtd">
|
| 899 |
+
<HTML>
|
| 900 |
+
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
|
| 901 |
+
Copyright (C) 2001-2022 NLTK Project
|
| 902 |
+
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
|
| 903 |
+
URL: <https://www.nltk.org/>
|
| 904 |
+
For license information, see LICENSE.TXT -->
|
| 905 |
+
<HEAD>
|
| 906 |
+
<TITLE>NLTK Wordnet Browser</TITLE>
|
| 907 |
+
</HEAD>
|
| 908 |
+
|
| 909 |
+
<frameset rows="7%%,93%%">
|
| 910 |
+
<frame src="%s" name="header">
|
| 911 |
+
<frame src="start_page" name="body">
|
| 912 |
+
</frameset>
|
| 913 |
+
</HTML>
|
| 914 |
+
"""
|
| 915 |
+
if with_shutdown:
|
| 916 |
+
upper_link = "upper.html"
|
| 917 |
+
else:
|
| 918 |
+
upper_link = "upper_2.html"
|
| 919 |
+
|
| 920 |
+
return template % upper_link
|
| 921 |
+
|
| 922 |
+
|
| 923 |
+
def get_static_upper_page(with_shutdown):
|
| 924 |
+
"""
|
| 925 |
+
Return the upper frame page,
|
| 926 |
+
|
| 927 |
+
If with_shutdown is True then a 'shutdown' button is also provided
|
| 928 |
+
to shutdown the server.
|
| 929 |
+
"""
|
| 930 |
+
template = """
|
| 931 |
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
|
| 932 |
+
<html>
|
| 933 |
+
<!-- Natural Language Toolkit: Wordnet Interface: Graphical Wordnet Browser
|
| 934 |
+
Copyright (C) 2001-2022 NLTK Project
|
| 935 |
+
Author: Jussi Salmela <jtsalmela@users.sourceforge.net>
|
| 936 |
+
URL: <https://www.nltk.org/>
|
| 937 |
+
For license information, see LICENSE.TXT -->
|
| 938 |
+
<head>
|
| 939 |
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
| 940 |
+
<title>Untitled Document</title>
|
| 941 |
+
</head>
|
| 942 |
+
<body>
|
| 943 |
+
<form method="GET" action="search" target="body">
|
| 944 |
+
Current Word: <input type="text" id="currentWord" size="10" disabled>
|
| 945 |
+
Next Word: <input type="text" id="nextWord" name="nextWord" size="10">
|
| 946 |
+
<input name="searchButton" type="submit" value="Search">
|
| 947 |
+
</form>
|
| 948 |
+
<a target="body" href="web_help.html">Help</a>
|
| 949 |
+
%s
|
| 950 |
+
|
| 951 |
+
</body>
|
| 952 |
+
</html>
|
| 953 |
+
"""
|
| 954 |
+
if with_shutdown:
|
| 955 |
+
shutdown_link = '<a href="SHUTDOWN THE SERVER">Shutdown</a>'
|
| 956 |
+
else:
|
| 957 |
+
shutdown_link = ""
|
| 958 |
+
|
| 959 |
+
return template % shutdown_link
|
| 960 |
+
|
| 961 |
+
|
| 962 |
+
def usage():
|
| 963 |
+
"""
|
| 964 |
+
Display the command line help message.
|
| 965 |
+
"""
|
| 966 |
+
print(__doc__)
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
def app():
|
| 970 |
+
# Parse and interpret options.
|
| 971 |
+
(opts, _) = getopt.getopt(
|
| 972 |
+
argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"]
|
| 973 |
+
)
|
| 974 |
+
port = 8000
|
| 975 |
+
server_mode = False
|
| 976 |
+
help_mode = False
|
| 977 |
+
logfilename = None
|
| 978 |
+
for (opt, value) in opts:
|
| 979 |
+
if (opt == "-l") or (opt == "--logfile"):
|
| 980 |
+
logfilename = str(value)
|
| 981 |
+
elif (opt == "-p") or (opt == "--port"):
|
| 982 |
+
port = int(value)
|
| 983 |
+
elif (opt == "-s") or (opt == "--server-mode"):
|
| 984 |
+
server_mode = True
|
| 985 |
+
elif (opt == "-h") or (opt == "--help"):
|
| 986 |
+
help_mode = True
|
| 987 |
+
|
| 988 |
+
if help_mode:
|
| 989 |
+
usage()
|
| 990 |
+
else:
|
| 991 |
+
wnb(port, not server_mode, logfilename)
|
| 992 |
+
|
| 993 |
+
|
| 994 |
+
if __name__ == "__main__":
|
| 995 |
+
app()
|
| 996 |
+
|
| 997 |
+
__all__ = ["app"]
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Combinatory Categorial Grammar
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Combinatory Categorial Grammar.
|
| 10 |
+
|
| 11 |
+
For more information see nltk/doc/contrib/ccg/ccg.pdf
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge
|
| 15 |
+
from nltk.ccg.combinator import (
|
| 16 |
+
BackwardApplication,
|
| 17 |
+
BackwardBx,
|
| 18 |
+
BackwardCombinator,
|
| 19 |
+
BackwardComposition,
|
| 20 |
+
BackwardSx,
|
| 21 |
+
BackwardT,
|
| 22 |
+
DirectedBinaryCombinator,
|
| 23 |
+
ForwardApplication,
|
| 24 |
+
ForwardCombinator,
|
| 25 |
+
ForwardComposition,
|
| 26 |
+
ForwardSubstitution,
|
| 27 |
+
ForwardT,
|
| 28 |
+
UndirectedBinaryCombinator,
|
| 29 |
+
UndirectedComposition,
|
| 30 |
+
UndirectedFunctionApplication,
|
| 31 |
+
UndirectedSubstitution,
|
| 32 |
+
UndirectedTypeRaise,
|
| 33 |
+
)
|
| 34 |
+
from nltk.ccg.lexicon import CCGLexicon
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/api.py
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: CCG Categories
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from abc import ABCMeta, abstractmethod
|
| 9 |
+
from functools import total_ordering
|
| 10 |
+
|
| 11 |
+
from nltk.internals import raise_unorderable_types
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@total_ordering
|
| 15 |
+
class AbstractCCGCategory(metaclass=ABCMeta):
|
| 16 |
+
"""
|
| 17 |
+
Interface for categories in combinatory grammars.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
@abstractmethod
|
| 21 |
+
def is_primitive(self):
|
| 22 |
+
"""
|
| 23 |
+
Returns true if the category is primitive.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
@abstractmethod
|
| 27 |
+
def is_function(self):
|
| 28 |
+
"""
|
| 29 |
+
Returns true if the category is a function application.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
@abstractmethod
|
| 33 |
+
def is_var(self):
|
| 34 |
+
"""
|
| 35 |
+
Returns true if the category is a variable.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
@abstractmethod
|
| 39 |
+
def substitute(self, substitutions):
|
| 40 |
+
"""
|
| 41 |
+
Takes a set of (var, category) substitutions, and replaces every
|
| 42 |
+
occurrence of the variable with the corresponding category.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
@abstractmethod
|
| 46 |
+
def can_unify(self, other):
|
| 47 |
+
"""
|
| 48 |
+
Determines whether two categories can be unified.
|
| 49 |
+
- Returns None if they cannot be unified
|
| 50 |
+
- Returns a list of necessary substitutions if they can.
|
| 51 |
+
"""
|
| 52 |
+
|
| 53 |
+
# Utility functions: comparison, strings and hashing.
|
| 54 |
+
@abstractmethod
|
| 55 |
+
def __str__(self):
|
| 56 |
+
pass
|
| 57 |
+
|
| 58 |
+
def __eq__(self, other):
|
| 59 |
+
return (
|
| 60 |
+
self.__class__ is other.__class__
|
| 61 |
+
and self._comparison_key == other._comparison_key
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
def __ne__(self, other):
|
| 65 |
+
return not self == other
|
| 66 |
+
|
| 67 |
+
def __lt__(self, other):
|
| 68 |
+
if not isinstance(other, AbstractCCGCategory):
|
| 69 |
+
raise_unorderable_types("<", self, other)
|
| 70 |
+
if self.__class__ is other.__class__:
|
| 71 |
+
return self._comparison_key < other._comparison_key
|
| 72 |
+
else:
|
| 73 |
+
return self.__class__.__name__ < other.__class__.__name__
|
| 74 |
+
|
| 75 |
+
def __hash__(self):
|
| 76 |
+
try:
|
| 77 |
+
return self._hash
|
| 78 |
+
except AttributeError:
|
| 79 |
+
self._hash = hash(self._comparison_key)
|
| 80 |
+
return self._hash
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class CCGVar(AbstractCCGCategory):
|
| 84 |
+
"""
|
| 85 |
+
Class representing a variable CCG category.
|
| 86 |
+
Used for conjunctions (and possibly type-raising, if implemented as a
|
| 87 |
+
unary rule).
|
| 88 |
+
"""
|
| 89 |
+
|
| 90 |
+
_maxID = 0
|
| 91 |
+
|
| 92 |
+
def __init__(self, prim_only=False):
|
| 93 |
+
"""Initialize a variable (selects a new identifier)
|
| 94 |
+
|
| 95 |
+
:param prim_only: a boolean that determines whether the variable is
|
| 96 |
+
restricted to primitives
|
| 97 |
+
:type prim_only: bool
|
| 98 |
+
"""
|
| 99 |
+
self._id = self.new_id()
|
| 100 |
+
self._prim_only = prim_only
|
| 101 |
+
self._comparison_key = self._id
|
| 102 |
+
|
| 103 |
+
@classmethod
|
| 104 |
+
def new_id(cls):
|
| 105 |
+
"""
|
| 106 |
+
A class method allowing generation of unique variable identifiers.
|
| 107 |
+
"""
|
| 108 |
+
cls._maxID = cls._maxID + 1
|
| 109 |
+
return cls._maxID - 1
|
| 110 |
+
|
| 111 |
+
@classmethod
|
| 112 |
+
def reset_id(cls):
|
| 113 |
+
cls._maxID = 0
|
| 114 |
+
|
| 115 |
+
def is_primitive(self):
|
| 116 |
+
return False
|
| 117 |
+
|
| 118 |
+
def is_function(self):
|
| 119 |
+
return False
|
| 120 |
+
|
| 121 |
+
def is_var(self):
|
| 122 |
+
return True
|
| 123 |
+
|
| 124 |
+
def substitute(self, substitutions):
|
| 125 |
+
"""If there is a substitution corresponding to this variable,
|
| 126 |
+
return the substituted category.
|
| 127 |
+
"""
|
| 128 |
+
for (var, cat) in substitutions:
|
| 129 |
+
if var == self:
|
| 130 |
+
return cat
|
| 131 |
+
return self
|
| 132 |
+
|
| 133 |
+
def can_unify(self, other):
|
| 134 |
+
"""If the variable can be replaced with other
|
| 135 |
+
a substitution is returned.
|
| 136 |
+
"""
|
| 137 |
+
if other.is_primitive() or not self._prim_only:
|
| 138 |
+
return [(self, other)]
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
def id(self):
|
| 142 |
+
return self._id
|
| 143 |
+
|
| 144 |
+
def __str__(self):
|
| 145 |
+
return "_var" + str(self._id)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
@total_ordering
class Direction:
    """
    Represents the direction of a CCG function application
    (forward ``/`` or backward ``\\``), together with restriction flags
    controlling which combinators may apply to the category.
    """

    def __init__(self, dir, restrictions):
        # ``dir`` is "/" or "\\"; ``restrictions`` is an iterable of
        # single-character flags (see restrs()).
        self._dir = dir
        self._restrs = restrictions
        self._comparison_key = (dir, tuple(restrictions))

    # --- direction tests -------------------------------------------------
    def is_forward(self):
        """True for a forward ("/") application direction."""
        return "/" == self._dir

    def is_backward(self):
        """True for a backward ("\\") application direction."""
        return "\\" == self._dir

    def dir(self):
        """Return the raw direction character."""
        return self._dir

    def restrs(self):
        """A list of restrictions on the combinators.
        '.' denotes that permuting operations are disallowed
        ',' denotes that function composition is disallowed
        '_' denotes that the direction has variable restrictions.
        (This is redundant in the current implementation of type-raising)
        """
        return self._restrs

    def is_variable(self):
        """True when the restrictions are variable ("_")."""
        return "_" == self._restrs

    # --- unification / substitution of variable directions ---------------
    # Only needed when type-raising is implemented as a unary rule, since
    # the raised category must inherit restrictions from its argument.
    def can_unify(self, other):
        if other.is_variable():
            return [("_", self.restrs())]
        if self.is_variable():
            return [("_", other.restrs())]
        return [] if self.restrs() == other.restrs() else None

    def substitute(self, subs):
        if not self.is_variable():
            return self
        for (var, restrs) in subs:
            if var == "_":
                return Direction(self._dir, restrs)
        return self

    # --- permitted combinators -------------------------------------------
    def can_compose(self):
        """True unless composition is blocked by a ',' restriction."""
        return "," not in self._restrs

    def can_cross(self):
        """True unless permutation is blocked by a '.' restriction."""
        return "." not in self._restrs

    def __eq__(self, other):
        same_type = self.__class__ is other.__class__
        return same_type and self._comparison_key == other._comparison_key

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        if not isinstance(other, Direction):
            raise_unorderable_types("<", self, other)
        if self.__class__ is not other.__class__:
            return self.__class__.__name__ < other.__class__.__name__
        return self._comparison_key < other._comparison_key

    def __hash__(self):
        # Hash lazily and cache; the hash mirrors the comparison key.
        try:
            return self._hash
        except AttributeError:
            self._hash = hash(self._comparison_key)
            return self._hash

    def __str__(self):
        return self._dir + "".join("%s" % r for r in self._restrs)

    # The negation operator reverses the direction of the application.
    def __neg__(self):
        flipped = "\\" if self._dir == "/" else "/"
        return Direction(flipped, self._restrs)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
class PrimitiveCategory(AbstractCCGCategory):
    """
    Class representing primitive categories.
    Takes a string representation of the category, and a
    list of strings specifying the morphological subcategories.
    """

    def __init__(self, categ, restrictions=None):
        # ``restrictions=None`` replaces the original mutable-default
        # ``restrictions=[]`` (shared-list pitfall); an omitted argument
        # behaves exactly as before.
        self._categ = categ
        self._restrs = [] if restrictions is None else restrictions
        self._comparison_key = (categ, tuple(self._restrs))

    def is_primitive(self):
        return True

    def is_function(self):
        return False

    def is_var(self):
        return False

    def restrs(self):
        """Return the morphological subcategories."""
        return self._restrs

    def categ(self):
        """Return the base category string."""
        return self._categ

    # Substitution does nothing to a primitive category
    def substitute(self, subs):
        return self

    # A primitive can be unified with a category of the same base,
    # given that the other category shares all of its subclasses,
    # or with a variable.
    def can_unify(self, other):
        if not other.is_primitive():
            return None
        if other.is_var():
            return [(other, self)]
        if other.categ() == self.categ():
            for restr in self._restrs:
                if restr not in other.restrs():
                    return None
            return []
        return None

    def __str__(self):
        if self._restrs == []:
            return "%s" % self._categ
        restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs)
        return f"{self._categ}{restrictions}"
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
class FunctionalCategory(AbstractCCGCategory):
    """
    Class that represents a function application category.
    Consists of argument and result categories, together with
    an application direction.
    """

    def __init__(self, res, arg, dir):
        self._res = res
        self._arg = arg
        self._dir = dir
        self._comparison_key = (arg, dir, res)

    def is_primitive(self):
        return False

    def is_function(self):
        return True

    def is_var(self):
        return False

    # Substitution returns the category consisting of the
    # substitution applied to each of its constituents.
    def substitute(self, subs):
        sub_res = self._res.substitute(subs)
        sub_arg = self._arg.substitute(subs)
        # NOTE(review): the original also computed
        # ``self._dir.substitute(subs)`` but discarded the result and kept
        # ``self._dir``.  That behavior is preserved here (only the dead
        # computation is removed), though using the substituted direction
        # may have been the intent -- confirm before changing.
        return FunctionalCategory(sub_res, sub_arg, self._dir)

    # A function can unify with another function whose constituents
    # can unify, or with an unrestricted variable.
    def can_unify(self, other):
        if other.is_var():
            return [(other, self)]
        if other.is_function():
            sa = self._res.can_unify(other.res())
            sd = self._dir.can_unify(other.dir())
            if sa is not None and sd is not None:
                sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa))
                if sb is not None:
                    return sa + sb
        return None

    # Constituent accessors
    def arg(self):
        return self._arg

    def res(self):
        return self._res

    def dir(self):
        return self._dir

    def __str__(self):
        return f"({self._res}{self._dir}{self._arg})"
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/chart.py
ADDED
|
@@ -0,0 +1,480 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Combinatory Categorial Grammar
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
The lexicon is constructed by calling
|
| 10 |
+
``lexicon.fromstring(<lexicon string>)``.
|
| 11 |
+
|
| 12 |
+
In order to construct a parser, you also need a rule set.
|
| 13 |
+
The standard English rules are provided in chart as
|
| 14 |
+
``chart.DefaultRuleSet``.
|
| 15 |
+
|
| 16 |
+
The parser can then be constructed by calling, for example:
|
| 17 |
+
``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
|
| 18 |
+
|
| 19 |
+
Parsing is then performed by running
|
| 20 |
+
``parser.parse(<sentence>.split())``.
|
| 21 |
+
|
| 22 |
+
While this returns a list of trees, the default representation
|
| 23 |
+
of the produced trees is not very enlightening, particularly
|
| 24 |
+
given that it uses the same tree class as the CFG parsers.
|
| 25 |
+
It is probably better to call:
|
| 26 |
+
``chart.printCCGDerivation(<parse tree extracted from list>)``
|
| 27 |
+
which should print a nice representation of the derivation.
|
| 28 |
+
|
| 29 |
+
This entire process is shown far more clearly in the demonstration:
|
| 30 |
+
python chart.py
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
import itertools
|
| 34 |
+
|
| 35 |
+
from nltk.ccg.combinator import *
|
| 36 |
+
from nltk.ccg.combinator import (
|
| 37 |
+
BackwardApplication,
|
| 38 |
+
BackwardBx,
|
| 39 |
+
BackwardComposition,
|
| 40 |
+
BackwardSx,
|
| 41 |
+
BackwardT,
|
| 42 |
+
ForwardApplication,
|
| 43 |
+
ForwardComposition,
|
| 44 |
+
ForwardSubstitution,
|
| 45 |
+
ForwardT,
|
| 46 |
+
)
|
| 47 |
+
from nltk.ccg.lexicon import Token, fromstring
|
| 48 |
+
from nltk.ccg.logic import *
|
| 49 |
+
from nltk.parse import ParserI
|
| 50 |
+
from nltk.parse.chart import AbstractChartRule, Chart, EdgeI
|
| 51 |
+
from nltk.sem.logic import *
|
| 52 |
+
from nltk.tree import Tree
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Based on the EdgeI class from NLTK.
|
| 56 |
+
# A number of the properties of the EdgeI interface don't
|
| 57 |
+
# transfer well to CCGs, however.
|
| 58 |
+
class CCGEdge(EdgeI):
    """A complete chart edge covering ``span`` with CCG category ``categ``,
    produced by combinator ``rule``.

    Based on the EdgeI class from NLTK; several EdgeI properties do not
    transfer well to CCGs, so they are stubbed out (rhs, dot, nextsym).
    """

    def __init__(self, span, categ, rule):
        self._span = span
        self._categ = categ
        self._rule = rule
        self._comparison_key = (span, categ, rule)

    # Accessors
    def lhs(self):
        return self._categ

    def span(self):
        return self._span

    def start(self):
        return self._span[0]

    def end(self):
        return self._span[1]

    def length(self):
        # Bug fix: the original computed ``self._span[1] - self.span[0]``,
        # indexing the bound ``span`` method (TypeError at call time);
        # index the stored tuple instead.
        return self._span[1] - self._span[0]

    def rhs(self):
        return ()

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def categ(self):
        return self._categ

    def rule(self):
        return self._rule
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class CCGLeafEdge(EdgeI):
    """
    Class representing leaf edges in a CCG derivation.

    A leaf edge always covers exactly one token and is always complete.
    """

    def __init__(self, pos, token, leaf):
        self._pos = pos
        self._token = token
        self._leaf = leaf
        self._comparison_key = (pos, token.categ(), leaf)

    # Accessors
    def lhs(self):
        return self._token.categ()

    def span(self):
        return (self._pos, self._pos + 1)

    def start(self):
        return self._pos

    def end(self):
        return 1 + self._pos

    def length(self):
        # A leaf spans a single token by definition.
        return 1

    def rhs(self):
        return self._leaf

    def dot(self):
        return 0

    def is_complete(self):
        return True

    def is_incomplete(self):
        return False

    def nextsym(self):
        return None

    def token(self):
        return self._token

    def categ(self):
        return self._token.categ()

    def leaf(self):
        return self._leaf
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class BinaryCombinatorRule(AbstractChartRule):
    """
    Class implementing application of a binary combinator to a chart.
    Takes the directed combinator to apply.
    """

    NUMEDGES = 2

    def __init__(self, combinator):
        self._combinator = combinator

    def apply(self, chart, grammar, left_edge, right_edge):
        """Yield every new edge produced by combining two adjacent edges."""
        # The left & right edges must be touching in the chart.
        if left_edge.end() != right_edge.start():
            return

        combinator = self._combinator
        # Check whether the two edges are permitted to combine; if so,
        # generate an edge for each resulting category.
        if not combinator.can_combine(left_edge.categ(), right_edge.categ()):
            return
        for categ in combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(
                span=(left_edge.start(), right_edge.end()),
                categ=categ,
                rule=combinator,
            )
            if chart.insert(edge, (left_edge, right_edge)):
                yield edge

    # The representation of the combinator (for printing derivations).
    def __str__(self):
        return str(self._combinator)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# Type-raising must be handled slightly differently to the other rules, as the
|
| 190 |
+
# resulting rules only span a single edge, rather than both edges.
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class ForwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying forward type raising.

    Type-raising differs from the binary rules: the resulting edge spans
    only the left edge, although the raised category is chosen by looking
    at the adjacent right edge.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = ForwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The edges must still be adjacent in the chart.
        if left_edge.end() != right_edge.start():
            return

        for categ in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(span=left_edge.span(), categ=categ, rule=self._combinator)
            if chart.insert(edge, (left_edge,)):
                yield edge

    def __str__(self):
        return str(self._combinator)
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class BackwardTypeRaiseRule(AbstractChartRule):
    """
    Class for applying backward type raising.

    Mirror image of ForwardTypeRaiseRule: the resulting edge spans only
    the right edge.
    """

    NUMEDGES = 2

    def __init__(self):
        self._combinator = BackwardT

    def apply(self, chart, grammar, left_edge, right_edge):
        # The edges must still be adjacent in the chart.
        if left_edge.end() != right_edge.start():
            return

        for categ in self._combinator.combine(left_edge.categ(), right_edge.categ()):
            edge = CCGEdge(span=right_edge.span(), categ=categ, rule=self._combinator)
            if chart.insert(edge, (right_edge,)):
                yield edge

    def __str__(self):
        return str(self._combinator)
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
# Common sets of combinators used for English derivations.
# Forward (">") and backward ("<") function application.
ApplicationRuleSet = [
    BinaryCombinatorRule(ForwardApplication),
    BinaryCombinatorRule(BackwardApplication),
]
# Harmonic composition plus backward crossed composition.
CompositionRuleSet = [
    BinaryCombinatorRule(ForwardComposition),
    BinaryCombinatorRule(BackwardComposition),
    BinaryCombinatorRule(BackwardBx),
]
# Forward substitution and backward crossed substitution.
SubstitutionRuleSet = [
    BinaryCombinatorRule(ForwardSubstitution),
    BinaryCombinatorRule(BackwardSx),
]
# Unary type-raising rules (see the *TypeRaiseRule classes above).
TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]

# The standard English rule set.
DefaultRuleSet = (
    ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
)
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
class CCGChartParser(ParserI):
    """
    Chart parser for CCGs.
    Based largely on the ChartParser class from NLTK.
    """

    def __init__(self, lexicon, rules, trace=0):
        self._lexicon = lexicon
        self._rules = rules
        self._trace = trace

    def lexicon(self):
        return self._lexicon

    # Implements the CYK algorithm
    def parse(self, tokens):
        """Parse ``tokens`` and return an iterator over CCG derivations."""
        tokens = list(tokens)
        # ``tokens`` is already a private copy; the original made a second,
        # redundant ``list(tokens)`` copy here.
        chart = CCGChart(tokens)
        lex = self._lexicon

        # Initialize leaf edges.
        for index in range(chart.num_leaves()):
            for token in lex.categories(chart.leaf(index)):
                new_edge = CCGLeafEdge(index, token, chart.leaf(index))
                chart.insert(new_edge, ())

        # Combine edges over successively larger spans (CYK order).
        for span in range(2, chart.num_leaves() + 1):
            for start in range(chart.num_leaves() - span + 1):
                # Try all split points that could generate an edge
                # for this span.
                for part in range(1, span):
                    mid = start + part
                    rend = start + span

                    for left in chart.select(span=(start, mid)):
                        for right in chart.select(span=(mid, rend)):
                            # Generate all possible combinations of the
                            # two edges.  ``apply`` is a generator whose
                            # side effect is inserting edges into the
                            # chart, so it must be exhausted.  (The
                            # original kept an unused per-rule edge
                            # counter; removed here.)
                            for rule in self._rules:
                                for _ in rule.apply(chart, lex, left, right):
                                    pass

        # Output the resulting parses
        return chart.parses(lex.start())
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
class CCGChart(Chart):
    def __init__(self, tokens):
        Chart.__init__(self, tokens)

    # The parse trees for a CCG chart must be constructed slightly
    # differently from those of the default Chart class, so _trees is
    # reimplemented here.
    def _trees(self, edge, complete, memo, tree_class):
        assert complete, "CCGChart cannot build incomplete trees"

        if edge in memo:
            return memo[edge]

        if isinstance(edge, CCGLeafEdge):
            word = tree_class(edge.token(), [self._tokens[edge.start()]])
            leaf = tree_class((edge.token(), "Leaf"), [word])
            memo[edge] = [leaf]
            return [leaf]

        # Record an empty result before recursing to guard against cycles.
        memo[edge] = []
        trees = []

        for child_pointers in self.child_pointer_lists(edge):
            choices = [
                self._trees(child, complete, memo, tree_class)
                for child in child_pointers
            ]
            for children in itertools.product(*choices):
                token = Token(
                    self._tokens[edge.start() : edge.end()],
                    edge.lhs(),
                    compute_semantics(children, edge),
                )
                label = (token, str(edge.rule()))
                trees.append(tree_class(label, children))

        memo[edge] = trees
        return trees
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def compute_semantics(children, edge):
    """Compute the semantics of ``edge`` from its child subtrees, or
    return None when the leaves carry no semantic annotations."""
    if children[0].label()[0].semantics() is None:
        return None

    if len(children) == 2:
        # For backward combinators the functor is on the right.
        if isinstance(edge.rule(), BackwardCombinator):
            children = [children[1], children[0]]

        combinator = edge.rule()._combinator
        function = children[0].label()[0].semantics()
        argument = children[1].label()[0].semantics()

        if isinstance(combinator, UndirectedFunctionApplication):
            return compute_function_semantics(function, argument)
        elif isinstance(combinator, UndirectedComposition):
            return compute_composition_semantics(function, argument)
        elif isinstance(combinator, UndirectedSubstitution):
            return compute_substitution_semantics(function, argument)
        else:
            # Bug fix: the original concatenated ``str + combinator``
            # (a non-str object), which raised TypeError instead of the
            # intended AssertionError message.
            raise AssertionError(f"Unsupported combinator '{combinator}'")
    else:
        # Unary case: type raising.
        return compute_type_raised_semantics(children[0].label()[0].semantics())
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# --------
|
| 373 |
+
# Displaying derivations
|
| 374 |
+
# --------
|
| 375 |
+
def printCCGDerivation(tree):
    """Print a readable CCG derivation for ``tree``."""

    def _centered(text, width):
        # Same rounding as the original arithmetic: any extra space goes
        # on the right-hand side.
        pad = width - len(text)
        left = pad // 2
        return " " * left + text + " " * (pad - left)

    # Build two aligned lines: each leaf word centered over/under its
    # lexical category.
    leafstr = ""
    catstr = ""
    for leaf, cat in tree.pos():
        cat_str = "%s" % cat
        width = 2 + max(len(leaf), len(cat_str))
        catstr += _centered(cat_str, width)
        leafstr += _centered(leaf, width)
    print(leafstr.rstrip())
    print(catstr.rstrip())

    # Display the derivation steps
    printCCGTree(0, tree)
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
# Prints the sequence of derivation steps.
|
| 400 |
+
# Prints the sequence of derivation steps, returning the right-hand
# column reached so that siblings can be laid out side by side.
def printCCGTree(lwidth, tree):
    rwidth = lwidth

    # A bare string is a leaf word: just account for the space it occupies.
    if not isinstance(tree, Tree):
        return 2 + lwidth + len(tree)

    # Recurse to find the width of the current derivation step.
    for child in tree:
        rwidth = max(rwidth, printCCGTree(rwidth, child))

    # A node whose label is not a (token, op) pair is a lexical leaf node:
    # print nothing, but account for the space occupied.
    if not isinstance(tree.label(), tuple):
        label_width = 2 + lwidth + len("%s" % tree.label())
        word_width = 2 + lwidth + len(tree[0])
        return max(rwidth, label_width, word_width)

    (token, op) = tree.label()

    if op == "Leaf":
        return rwidth

    # Pad to the left with spaces, then a run of '-' and the rule name.
    print(" " * lwidth + "-" * (rwidth - lwidth) + "%s" % op)

    # Print the resulting category (and semantics, if any) centered below.
    str_res = "%s" % token.categ()
    if token.semantics() is not None:
        str_res += " {" + str(token.semantics()) + "}"
    respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
    print(" " * respadlen + str_res)
    return rwidth
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
### Demonstration code
|
| 437 |
+
|
| 438 |
+
# Construct the lexicon used by demo() below.
# NOTE(review): reconstructed from a garbled rendering -- the exact
# whitespace inside the lexicon string should be confirmed against the
# original file.
lex = fromstring(
    """
    :- S, NP, N, VP    # Primitive categories, S is the target primitive

    Det :: NP/N         # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP # Backslashes need to be escaped

    I => Pro             # Word -> Category mapping
    you => Pro

    the => Det

    # Variables have the special keyword 'var'
    # '.' prevents permutation
    # ',' prevents composition
    and => var\\.,var/.,var

    which => (N\\N)/(S/NP)

    will => Modal        # Categories can be either explicit, or families.
    might => Modal

    cook => TV
    eat => TV

    mushrooms => N
    parsnips => N
    bacon => N
    """
)
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
def demo():
    """Parse an example sentence with the default English rule set and
    print every resulting CCG derivation."""
    parser = CCGChartParser(lex, DefaultRuleSet)
    sentence = "I might cook and eat the bacon".split()
    for derivation in parser.parse(sentence):
        printCCGDerivation(derivation)


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/combinator.py
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Combinatory Categorial Grammar
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
"""
|
| 8 |
+
CCG Combinators
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from abc import ABCMeta, abstractmethod
|
| 12 |
+
|
| 13 |
+
from nltk.ccg.api import FunctionalCategory
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class UndirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract base class for binary combinators.

    Defines only the interface for checking whether a function and an
    argument category may be combined, and for producing the resulting
    categories.  Because no direction is assumed, a concrete undirected
    combinator can serve as the core of forward, backward and crossed
    variants; those restrictions are imposed by the wrapping rule class.
    """

    @abstractmethod
    def can_combine(self, function, argument):
        """Return whether ``function`` may be combined with ``argument``."""

    @abstractmethod
    def combine(self, function, argument):
        """Yield the categories resulting from the combination."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class DirectedBinaryCombinator(metaclass=ABCMeta):
    """
    Abstract wrapper interface for an undirected binary combinator.

    Implementations receive the left and right categories, decide which
    plays the role of function and which of argument, and determine
    whether the pair may be combined.
    """

    @abstractmethod
    def can_combine(self, left, right):
        """Return whether ``left`` and ``right`` may be combined."""

    @abstractmethod
    def combine(self, left, right):
        """Yield the categories resulting from the combination."""
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ForwardCombinator(DirectedBinaryCombinator):
    """
    Class representing combinators where the primary functor is on the left.

    Wraps an undirected combinator together with a predicate that adds
    constraints restricting the cases in which it may apply.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # Both the underlying combinator and the direction predicate
        # must accept the pair.
        return self._combinator.can_combine(left, right) and self._predicate(left, right)

    def combine(self, left, right):
        for result in self._combinator.combine(left, right):
            yield result

    def __str__(self):
        return f">{self._combinator}{self._suffix}"
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class BackwardCombinator(DirectedBinaryCombinator):
    """
    The backward equivalent of the ForwardCombinator class:
    the primary functor is on the right, so the operands are swapped
    before being handed to the undirected combinator.
    """

    def __init__(self, combinator, predicate, suffix=""):
        self._combinator = combinator
        self._predicate = predicate
        self._suffix = suffix

    def can_combine(self, left, right):
        # The functor/argument order is reversed for the combinator, but
        # the predicate still sees the original left/right order.
        return self._combinator.can_combine(right, left) and self._predicate(left, right)

    def combine(self, left, right):
        for result in self._combinator.combine(right, left):
            yield result

    def __str__(self):
        return f"<{self._combinator}{self._suffix}"
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class UndirectedFunctionApplication(UndirectedBinaryCombinator):
    """
    Class representing function application.
    Implements rules of the form:
    X/Y Y -> X (>)
    And the corresponding backwards application rule
    """

    def can_combine(self, function, argument):
        """Return True iff ``function`` is a functional category whose
        argument slot unifies with ``argument``."""
        if not function.is_function():
            return False

        # Idiom fix: the original used ``not ... is None``.
        return function.arg().can_unify(argument) is not None

    def combine(self, function, argument):
        if not function.is_function():
            return

        subs = function.arg().can_unify(argument)
        if subs is None:
            return

        yield function.res().substitute(subs)

    def __str__(self):
        # Function application has no marker of its own; the directed
        # wrappers supply "<" or ">".
        return ""
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Predicates for function application.
|
| 130 |
+
|
| 131 |
+
# Ensures the left functor takes an argument on the right
|
| 132 |
+
def forwardOnly(left, right):
    """Predicate: hold only when the left functor seeks its argument to the right."""
    direction = left.dir()
    return direction.is_forward()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# Ensures the right functor takes an argument on the left
|
| 137 |
+
def backwardOnly(left, right):
    """Predicate: hold only when the right functor seeks its argument to the left."""
    direction = right.dir()
    return direction.is_backward()
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# Application combinator instances
# Forward (>) and backward (<) function application, built by pairing the
# undirected application rule with the matching direction predicate.
ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly)
BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class UndirectedComposition(UndirectedBinaryCombinator):
    """
    Functional composition (harmonic) combinator.
    Implements rules of the form
    X/Y Y/Z -> X/Z (B>)
    And the corresponding backwards and crossed variations.
    """

    def can_combine(self, function, argument):
        # Can only combine two functions, and both functions must
        # allow composition.
        if not (function.is_function() and argument.is_function()):
            return False
        if function.dir().can_compose() and argument.dir().can_compose():
            # Composition is possible iff the primary functor's argument
            # slot unifies with the secondary functor's result
            # (can_unify returns None on failure).
            return not function.arg().can_unify(argument.res()) is None
        return False

    def combine(self, function, argument):
        if not (function.is_function() and argument.is_function()):
            return
        if function.dir().can_compose() and argument.dir().can_compose():
            subs = function.arg().can_unify(argument.res())
            if subs is not None:
                # Build X/Z: result of the primary functor over the
                # argument of the secondary functor, keeping the
                # secondary functor's direction.
                yield FunctionalCategory(
                    function.res().substitute(subs),
                    argument.arg().substitute(subs),
                    argument.dir(),
                )

    def __str__(self):
        # "B" is the standard CCG name for composition.
        return "B"
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# Predicates for restricting application of straight composition.
|
| 180 |
+
def bothForward(left, right):
    """Predicate: both functors seek their arguments to the right."""
    if not left.dir().is_forward():
        return False
    return right.dir().is_forward()
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def bothBackward(left, right):
    """Predicate: both functors seek their arguments to the left."""
    if not left.dir().is_backward():
        return False
    return right.dir().is_backward()
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# Predicates for crossed composition
|
| 189 |
+
def crossedDirs(left, right):
    """Predicate: the functors point inwards (left forward, right backward)."""
    if not left.dir().is_forward():
        return False
    return right.dir().is_backward()
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def backwardBxConstraint(left, right):
    """Predicate restricting backward crossed composition (Bx)."""
    # The functors must be crossed inwards
    if not crossedDirs(left, right):
        return False
    # Permuting combinators must be allowed on BOTH functors.
    # Fix: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and right...` due to operator precedence,
    # so it never checked that the left functor itself can cross.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    # The resulting argument category is restricted to be primitive
    return left.arg().is_primitive()
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
# Straight composition combinators
# Harmonic composition in both directions (B> and B<).
ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly)
BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly)

# Backward crossed composition
# The "x" suffix marks the crossed (permuting) variant in rule names.
BackwardBx = BackwardCombinator(
    UndirectedComposition(), backwardBxConstraint, suffix="x"
)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class UndirectedSubstitution(UndirectedBinaryCombinator):
    r"""
    Substitution (permutation) combinator.
    Implements rules of the form
    Y/Z (X\Y)/Z -> X/Z (<Sx)
    And other variations.
    """

    def can_combine(self, function, argument):
        # Substitution requires two functional categories.
        if function.is_primitive() or argument.is_primitive():
            return False

        # These could potentially be moved to the predicates, as the
        # constraints may not be general to all languages.
        if function.res().is_primitive():
            return False
        if not function.arg().is_primitive():
            return False

        if not (function.dir().can_compose() and argument.dir().can_compose()):
            return False
        # Both functors must share the same Z argument, and the primary
        # functor's inner result must consume the secondary functor's result.
        return (function.res().arg() == argument.res()) and (
            function.arg() == argument.arg()
        )

    def combine(self, function, argument):
        if self.can_combine(function, argument):
            # Produce X/Z, keeping the secondary functor's direction.
            yield FunctionalCategory(
                function.res().res(), argument.arg(), argument.dir()
            )

    def __str__(self):
        # "S" is the standard CCG name for substitution.
        return "S"
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# Predicate for forward substitution
|
| 250 |
+
def forwardSConstraint(left, right):
    """Predicate restricting forward substitution (S>)."""
    if not bothForward(left, right):
        return False
    if not left.res().dir().is_forward():
        return False
    return left.arg().is_primitive()
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
# Predicate for backward crossed substitution
|
| 257 |
+
def backwardSxConstraint(left, right):
    """Predicate restricting backward crossed substitution (Sx<)."""
    # Both functors must permit the crossing (permuting) variant.
    # Fix: the original `not left.dir().can_cross() and right.dir().can_cross()`
    # parsed as `(not left...) and right...` due to operator precedence,
    # so it never checked that the left functor itself can cross.
    if not (left.dir().can_cross() and right.dir().can_cross()):
        return False
    if not bothForward(left, right):
        return False
    return right.res().dir().is_backward() and right.arg().is_primitive()
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
# Instances of substitution combinators
# Forward substitution (S>) and backward crossed substitution (Sx<).
ForwardSubstitution = ForwardCombinator(UndirectedSubstitution(), forwardSConstraint)
BackwardSx = BackwardCombinator(UndirectedSubstitution(), backwardSxConstraint, "x")
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def innermostFunction(categ):
    """
    Retrieve the left-most functional category,
    e.g. (N\\N)/(S/NP) => N\\N.
    """
    current = categ
    while current.res().is_function():
        current = current.res()
    return current
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
class UndirectedTypeRaise(UndirectedBinaryCombinator):
    """
    Undirected combinator for type raising.
    """

    def can_combine(self, function, arg):
        # The argument must be a function.
        # The restriction that arg.res() must be a function
        # merely reduces redundant type-raising; if arg.res() is
        # primitive, we have:
        # X Y\X =>(<T) Y/(Y\X) Y\X =>(>) Y
        # which is equivalent to
        # X Y\X =>(<) Y
        if not (arg.is_function() and arg.res().is_function()):
            return False

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        # Fix: the original referenced the undefined names `left` and
        # `arg_categ` (a NameError at runtime); the category being raised
        # is `function`, unified against the innermost argument slot of
        # `arg`, mirroring combine() below.
        subs = function.can_unify(arg.arg())
        if subs is not None:
            return True
        return False

    def combine(self, function, arg):
        if not (
            function.is_primitive() and arg.is_function() and arg.res().is_function()
        ):
            return

        # Type-raising matches only the innermost application.
        arg = innermostFunction(arg)

        subs = function.can_unify(arg.arg())
        if subs is not None:
            xcat = arg.res().substitute(subs)
            # Produce Y|(Y|X): the raised category applies in the
            # direction opposite to the original argument slot.
            yield FunctionalCategory(
                xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())
            )

    def __str__(self):
        # "T" is the standard CCG name for type raising.
        return "T"
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# Predicates for type-raising
|
| 323 |
+
# The direction of the innermost category must be towards
|
| 324 |
+
# the primary functor.
|
| 325 |
+
# The restriction that the variable must be primitive is not
|
| 326 |
+
# common to all versions of CCGs; some authors have other restrictions.
|
| 327 |
+
def forwardTConstraint(left, right):
    """Restrict forward type-raising: the innermost functor of the right
    category must be backward with a primitive result."""
    innermost = innermostFunction(right)
    return innermost.dir().is_backward() and innermost.res().is_primitive()
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def backwardTConstraint(left, right):
    """Restrict backward type-raising: the innermost functor of the left
    category must be forward with a primitive result."""
    innermost = innermostFunction(left)
    return innermost.dir().is_forward() and innermost.res().is_primitive()
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
# Instances of type-raising combinators
# Forward (>T) and backward (<T) type raising.
ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint)
BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint)
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/lexicon.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Combinatory Categorial Grammar
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Graeme Gange <ggange@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
"""
|
| 8 |
+
CCG Lexicons
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import re
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
|
| 14 |
+
from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory
|
| 15 |
+
from nltk.internals import deprecated
|
| 16 |
+
from nltk.sem.logic import Expression
|
| 17 |
+
|
| 18 |
+
# ------------
|
| 19 |
+
# Regular expressions used for parsing components of the lexicon
|
| 20 |
+
# ------------
|
| 21 |
+
|
| 22 |
+
# Parses a primitive category and subscripts
# e.g. "NP[sg]" -> groups ("NP", "[sg]"); the subscript group may be None.
PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""")

# Separates the next primitive category from the remainder of the
# string
NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""")

# Separates the next application operator from the remainder
# Group 1 is the slash (\ or /); groups 2-3 are optional "." / "," modifiers.
APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""")

# Parses the definition of the right-hand side (rhs) of either a word or a family
# Group 2 distinguishes family ("::") from word ("=>" style) definitions.
LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE)

# Parses the right hand side that contains category and maybe semantic predicate
RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE)

# Parses the semantic predicate
SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE)

# Strips comments from a line
COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""")
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class Token:
    """
    Class representing a token.

    token => category {semantics}
    e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)}

    * `token` (string)
    * `categ` (string)
    * `semantics` (Expression)
    """

    def __init__(self, token, categ, semantics=None):
        self._token = token
        self._categ = categ
        self._semantics = semantics

    def categ(self):
        """Return the token's syntactic category."""
        return self._categ

    def semantics(self):
        """Return the token's semantic expression, or None if absent."""
        return self._semantics

    def __str__(self):
        semantics_str = ""
        if self._semantics is not None:
            semantics_str = " {" + str(self._semantics) + "}"
        return str(self._categ) + semantics_str

    def __cmp__(self, other):
        # Fix: the original called the Python-2-only builtin `cmp` (a
        # NameError on Python 3) and passed other's fields as two extra
        # positional arguments instead of one tuple. Compare the
        # (categ, semantics) pairs directly instead.
        if not isinstance(other, Token):
            return -1
        this = (self._categ, self._semantics)
        that = (other.categ(), other.semantics())
        return (this > that) - (this < that)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class CCGLexicon:
    """
    Class representing a lexicon for CCG grammars.

    * `primitives`: The list of primitive categories for the lexicon
    * `families`: Families of categories
    * `entries`: A mapping of words to possible categories
    """

    def __init__(self, start, primitives, families, entries):
        # The start symbol is stored as a category object, not a bare string.
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        """
        Returns all the possible categories for a word
        """
        return self._entries[word]

    def start(self):
        """
        Return the target category for the parser
        """
        return self._start

    def __str__(self):
        """
        String representation of the lexicon. Used for debugging.
        """
        string = ""
        first = True
        for ident in sorted(self._entries):
            if not first:
                string = string + "\n"
            string = string + ident + " => "

            # `first` is reused across both loops: the inner loop leaves it
            # False, which makes subsequent entries take the "\n" branch above.
            first = True
            for cat in self._entries[ident]:
                if not first:
                    string = string + " | "
                else:
                    first = False
                string = string + "%s" % cat
        return string
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# -----------
|
| 129 |
+
# Parsing lexicons
|
| 130 |
+
# -----------
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def matchBrackets(string):
    """
    Separate the contents matching the first set of brackets from the rest
    of the input.
    """
    collected = "("
    remainder = string[1:]

    while remainder and remainder[0] != ")":
        if remainder[0] == "(":
            # Recurse to consume a nested bracketed group whole.
            part, remainder = matchBrackets(remainder)
            collected += part
        else:
            collected += remainder[0]
            remainder = remainder[1:]
    if remainder.startswith(")"):
        return (collected + ")", remainder[1:])
    raise AssertionError("Unmatched bracket in string '" + string + "'")
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def nextCategory(string):
    """
    Separate the string for the next portion of the category from the rest
    of the string.
    """
    if not string.startswith("("):
        return NEXTPRIM_RE.match(string).groups()
    return matchBrackets(string)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def parseApplication(app):
    """
    Parse an application operator: the slash plus any "." / "," modifiers.
    """
    slash, modifiers = app[0], app[1:]
    return Direction(slash, modifiers)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def parseSubscripts(subscr):
    """
    Parse the subscripts for a primitive category: "[sg,pl]" becomes
    ["sg", "pl"]; a missing/empty subscript yields [].
    """
    return subscr[1:-1].split(",") if subscr else []
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def parsePrimitiveCategory(chunks, primitives, families, var):
    """
    Parse a primitive category

    If the primitive is the special category 'var', replace it with the
    correct `CCGVar`.

    `chunks` is the (name, subscript) pair produced by PRIM_RE; `var` is
    the category variable in use so far (or None).
    """
    if chunks[0] == "var":
        # A bare 'var' (no subscript) stands for the category variable;
        # reuse the existing one so every 'var' in a definition unifies.
        if chunks[1] is None:
            if var is None:
                var = CCGVar()
            return (var, var)

    catstr = chunks[0]
    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            # Rename the family's own variable to the one already in use.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)
    raise AssertionError(
        "String '" + catstr + "' is neither a family nor primitive category."
    )
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def augParseCategory(line, primitives, families, var=None):
    """
    Parse a string representing a category, and returns a tuple with
    (possibly) the CCG variable for the category
    """
    # Parse the leading (primitive or parenthesised) category.
    (cat_string, rest) = nextCategory(line)

    if cat_string.startswith("("):
        # Recurse on the bracket contents (brackets stripped).
        (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var)

    else:
        (res, var) = parsePrimitiveCategory(
            PRIM_RE.match(cat_string).groups(), primitives, families, var
        )

    # Repeatedly consume "<slash><category>" pairs, building a
    # left-associative functional category.
    while rest != "":
        app = APP_RE.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_string, rest) = nextCategory(rest)
        if cat_string.startswith("("):
            (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var)
        else:
            (arg, var) = parsePrimitiveCategory(
                PRIM_RE.match(cat_string).groups(), primitives, families, var
            )
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def fromstring(lex_str, include_semantics=False):
    """
    Convert string representation into a lexicon for CCGs.
    """
    CCGVar.reset_id()
    primitives = []
    families = {}
    entries = defaultdict(list)
    for line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = COMMENTS_RE.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(":-"):
            # A line of primitive categories.
            # The first one is the target category
            # ie, :- S, N, NP, VP
            primitives = primitives + [
                prim.strip() for prim in line[2:].strip().split(",")
            ]
        else:
            # Either a family definition, or a word definition
            (ident, sep, rhs) = LEX_RE.match(line).groups()
            (catstr, semantics_str) = RHS_RE.match(rhs).groups()
            (cat, var) = augParseCategory(catstr, primitives, families)

            if sep == "::":
                # Family definition
                # ie, Det :: NP/N
                families[ident] = (cat, var)
            else:
                semantics = None
                if include_semantics is True:
                    if semantics_str is None:
                        raise AssertionError(
                            line
                            + " must contain semantics because include_semantics is set to True"
                        )
                    else:
                        semantics = Expression.fromstring(
                            SEMANTICS_RE.match(semantics_str).groups()[0]
                        )
                # Word definition
                # ie, which => (N\N)/(S/NP)
                entries[ident].append(Token(ident, cat, semantics))
    # NOTE: assumes at least one ":-" line supplied the start category
    # (primitives[0]); an empty lexicon string would raise IndexError here.
    return CCGLexicon(primitives[0], primitives, families, entries)
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
@deprecated("Use fromstring() instead.")
def parseLexicon(lex_str):
    # Backwards-compatible alias kept for old callers; delegates directly.
    return fromstring(lex_str)
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# Demonstration lexicon, parsed once at import time.
openccg_tinytiny = fromstring(
    """
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural

the => NP[sg]/N[sg]
the => NP[pl]/N[pl]

I => Pro
me => Pro
we => Pro
us => Pro

book => N[sg]
books => N[pl]

peach => N[sg]
peaches => N[pl]

policeman => N[sg]
policemen => N[pl]

boy => N[sg]
boys => N[pl]

sleep => IntransVsg
sleep => IntransVpl

eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg

see => TransVpl
sees => TransVsg
"""
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/ccg/logic.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Combinatory Categorial Grammar
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Tanin Na Nakorn (@tanin)
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
"""
|
| 8 |
+
Helper functions for CCG semantics computation
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from nltk.sem.logic import *
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def compute_type_raised_semantics(semantics):
    # Strip leading lambda binders to reach the core expression; `parent`
    # remembers the innermost lambda so it can be re-linked below.
    core = semantics
    parent = None
    while isinstance(core, LambdaExpression):
        parent = core
        core = core.term

    # Pick a fresh function variable F that is not free in the core.
    var = Variable("F")
    while var in core.free():
        var = unique_variable(pattern=var)
    core = ApplicationExpression(FunctionVariableExpression(var), core)

    # NOTE(review): when the input has lambda binders this mutates it in
    # place (parent.term is reassigned) — confirm callers do not reuse
    # `semantics` afterwards.
    if parent is not None:
        parent.term = core
    else:
        semantics = core

    return LambdaExpression(var, semantics)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def compute_function_semantics(function, argument):
    """Apply `function` to `argument` and beta-reduce the result."""
    applied = ApplicationExpression(function, argument)
    return applied.simplify()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def compute_composition_semantics(function, argument):
    """Compose `function` with lambda-expression `argument` (B rule)."""
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )
    # \x. function(argument-body), reusing the argument's bound variable.
    composed_body = ApplicationExpression(function, argument.term).simplify()
    return LambdaExpression(argument.variable, composed_body)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def compute_substitution_semantics(function, argument):
    """Combine `function` and `argument` via the substitution (S) rule."""
    assert isinstance(function, LambdaExpression) and isinstance(
        function.term, LambdaExpression
    ), ("`" + str(function) + "` must be a lambda expression with 2 arguments")
    assert isinstance(argument, LambdaExpression), (
        "`" + str(argument) + "` must be a lambda expression"
    )

    # Both functors share the outer bound variable of `function`.
    shared = VariableExpression(function.variable)
    new_argument = ApplicationExpression(argument, shared).simplify()
    new_term = ApplicationExpression(function.term, new_argument).simplify()

    return LambdaExpression(function.variable, new_term)
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/__init__.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chatbots
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Authors: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
| 9 |
+
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
|
| 10 |
+
|
| 11 |
+
"""
|
| 12 |
+
A class for simple chatbots. These perform simple pattern matching on sentences
|
| 13 |
+
typed by users, and respond with automatically generated sentences.
|
| 14 |
+
|
| 15 |
+
These chatbots may not work using the windows command line or the
|
| 16 |
+
windows IDLE GUI.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from nltk.chat.eliza import eliza_chat
|
| 20 |
+
from nltk.chat.iesha import iesha_chat
|
| 21 |
+
from nltk.chat.rude import rude_chat
|
| 22 |
+
from nltk.chat.suntsu import suntsu_chat
|
| 23 |
+
from nltk.chat.util import Chat
|
| 24 |
+
from nltk.chat.zen import zen_chat
|
| 25 |
+
|
| 26 |
+
# (chat function, human-readable description) pairs, in menu order;
# chatbots() below indexes into this list by the user's 1-based choice.
bots = [
    (eliza_chat, "Eliza (psycho-babble)"),
    (iesha_chat, "Iesha (teen anime junky)"),
    (rude_chat, "Rude (abusive bot)"),
    (suntsu_chat, "Suntsu (Chinese sayings)"),
    (zen_chat, "Zen (gems of wisdom)"),
]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def chatbots():
    """Interactively prompt for a chatbot from `bots` and start a session."""
    print("Which chatbot would you like to talk to?")
    count = len(bots)
    for index, (_, description) in enumerate(bots, start=1):
        print(" %d: %s" % (index, description))
    while True:
        choice = input(f"\nEnter a number in the range 1-{count}: ").strip()
        if choice.isdigit() and 0 <= int(choice) - 1 < count:
            break
        print(" Error: bad chatbot number")

    selected = bots[int(choice) - 1][0]
    selected()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/eliza.py
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Eliza
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Authors: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
| 10 |
+
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <mailto:jez@jezuk.co.uk>.
|
| 11 |
+
|
| 12 |
+
# a translation table used to convert things you say into things the
|
| 13 |
+
# computer says back, e.g. "I am" --> "you are"
|
| 14 |
+
|
| 15 |
+
from nltk.chat.util import Chat, reflections
|
| 16 |
+
|
| 17 |
+
# a table of response pairs, where each pair consists of a
|
| 18 |
+
# regular expression, and a list of possible responses,
|
| 19 |
+
# with group-macros labelled as %1, %2.
|
| 20 |
+
|
| 21 |
+
pairs = (
|
| 22 |
+
(
|
| 23 |
+
r"I need (.*)",
|
| 24 |
+
(
|
| 25 |
+
"Why do you need %1?",
|
| 26 |
+
"Would it really help you to get %1?",
|
| 27 |
+
"Are you sure you need %1?",
|
| 28 |
+
),
|
| 29 |
+
),
|
| 30 |
+
(
|
| 31 |
+
r"Why don\'t you (.*)",
|
| 32 |
+
(
|
| 33 |
+
"Do you really think I don't %1?",
|
| 34 |
+
"Perhaps eventually I will %1.",
|
| 35 |
+
"Do you really want me to %1?",
|
| 36 |
+
),
|
| 37 |
+
),
|
| 38 |
+
(
|
| 39 |
+
r"Why can\'t I (.*)",
|
| 40 |
+
(
|
| 41 |
+
"Do you think you should be able to %1?",
|
| 42 |
+
"If you could %1, what would you do?",
|
| 43 |
+
"I don't know -- why can't you %1?",
|
| 44 |
+
"Have you really tried?",
|
| 45 |
+
),
|
| 46 |
+
),
|
| 47 |
+
(
|
| 48 |
+
r"I can\'t (.*)",
|
| 49 |
+
(
|
| 50 |
+
"How do you know you can't %1?",
|
| 51 |
+
"Perhaps you could %1 if you tried.",
|
| 52 |
+
"What would it take for you to %1?",
|
| 53 |
+
),
|
| 54 |
+
),
|
| 55 |
+
(
|
| 56 |
+
r"I am (.*)",
|
| 57 |
+
(
|
| 58 |
+
"Did you come to me because you are %1?",
|
| 59 |
+
"How long have you been %1?",
|
| 60 |
+
"How do you feel about being %1?",
|
| 61 |
+
),
|
| 62 |
+
),
|
| 63 |
+
(
|
| 64 |
+
r"I\'m (.*)",
|
| 65 |
+
(
|
| 66 |
+
"How does being %1 make you feel?",
|
| 67 |
+
"Do you enjoy being %1?",
|
| 68 |
+
"Why do you tell me you're %1?",
|
| 69 |
+
"Why do you think you're %1?",
|
| 70 |
+
),
|
| 71 |
+
),
|
| 72 |
+
(
|
| 73 |
+
r"Are you (.*)",
|
| 74 |
+
(
|
| 75 |
+
"Why does it matter whether I am %1?",
|
| 76 |
+
"Would you prefer it if I were not %1?",
|
| 77 |
+
"Perhaps you believe I am %1.",
|
| 78 |
+
"I may be %1 -- what do you think?",
|
| 79 |
+
),
|
| 80 |
+
),
|
| 81 |
+
(
|
| 82 |
+
r"What (.*)",
|
| 83 |
+
(
|
| 84 |
+
"Why do you ask?",
|
| 85 |
+
"How would an answer to that help you?",
|
| 86 |
+
"What do you think?",
|
| 87 |
+
),
|
| 88 |
+
),
|
| 89 |
+
(
|
| 90 |
+
r"How (.*)",
|
| 91 |
+
(
|
| 92 |
+
"How do you suppose?",
|
| 93 |
+
"Perhaps you can answer your own question.",
|
| 94 |
+
"What is it you're really asking?",
|
| 95 |
+
),
|
| 96 |
+
),
|
| 97 |
+
(
|
| 98 |
+
r"Because (.*)",
|
| 99 |
+
(
|
| 100 |
+
"Is that the real reason?",
|
| 101 |
+
"What other reasons come to mind?",
|
| 102 |
+
"Does that reason apply to anything else?",
|
| 103 |
+
"If %1, what else must be true?",
|
| 104 |
+
),
|
| 105 |
+
),
|
| 106 |
+
(
|
| 107 |
+
r"(.*) sorry (.*)",
|
| 108 |
+
(
|
| 109 |
+
"There are many times when no apology is needed.",
|
| 110 |
+
"What feelings do you have when you apologize?",
|
| 111 |
+
),
|
| 112 |
+
),
|
| 113 |
+
(
|
| 114 |
+
r"Hello(.*)",
|
| 115 |
+
(
|
| 116 |
+
"Hello... I'm glad you could drop by today.",
|
| 117 |
+
"Hi there... how are you today?",
|
| 118 |
+
"Hello, how are you feeling today?",
|
| 119 |
+
),
|
| 120 |
+
),
|
| 121 |
+
(
|
| 122 |
+
r"I think (.*)",
|
| 123 |
+
("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"),
|
| 124 |
+
),
|
| 125 |
+
(
|
| 126 |
+
r"(.*) friend (.*)",
|
| 127 |
+
(
|
| 128 |
+
"Tell me more about your friends.",
|
| 129 |
+
"When you think of a friend, what comes to mind?",
|
| 130 |
+
"Why don't you tell me about a childhood friend?",
|
| 131 |
+
),
|
| 132 |
+
),
|
| 133 |
+
(r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")),
|
| 134 |
+
(
|
| 135 |
+
r"(.*) computer(.*)",
|
| 136 |
+
(
|
| 137 |
+
"Are you really talking about me?",
|
| 138 |
+
"Does it seem strange to talk to a computer?",
|
| 139 |
+
"How do computers make you feel?",
|
| 140 |
+
"Do you feel threatened by computers?",
|
| 141 |
+
),
|
| 142 |
+
),
|
| 143 |
+
(
|
| 144 |
+
r"Is it (.*)",
|
| 145 |
+
(
|
| 146 |
+
"Do you think it is %1?",
|
| 147 |
+
"Perhaps it's %1 -- what do you think?",
|
| 148 |
+
"If it were %1, what would you do?",
|
| 149 |
+
"It could well be that %1.",
|
| 150 |
+
),
|
| 151 |
+
),
|
| 152 |
+
(
|
| 153 |
+
r"It is (.*)",
|
| 154 |
+
(
|
| 155 |
+
"You seem very certain.",
|
| 156 |
+
"If I told you that it probably isn't %1, what would you feel?",
|
| 157 |
+
),
|
| 158 |
+
),
|
| 159 |
+
(
|
| 160 |
+
r"Can you (.*)",
|
| 161 |
+
(
|
| 162 |
+
"What makes you think I can't %1?",
|
| 163 |
+
"If I could %1, then what?",
|
| 164 |
+
"Why do you ask if I can %1?",
|
| 165 |
+
),
|
| 166 |
+
),
|
| 167 |
+
(
|
| 168 |
+
r"Can I (.*)",
|
| 169 |
+
(
|
| 170 |
+
"Perhaps you don't want to %1.",
|
| 171 |
+
"Do you want to be able to %1?",
|
| 172 |
+
"If you could %1, would you?",
|
| 173 |
+
),
|
| 174 |
+
),
|
| 175 |
+
(
|
| 176 |
+
r"You are (.*)",
|
| 177 |
+
(
|
| 178 |
+
"Why do you think I am %1?",
|
| 179 |
+
"Does it please you to think that I'm %1?",
|
| 180 |
+
"Perhaps you would like me to be %1.",
|
| 181 |
+
"Perhaps you're really talking about yourself?",
|
| 182 |
+
),
|
| 183 |
+
),
|
| 184 |
+
(
|
| 185 |
+
r"You\'re (.*)",
|
| 186 |
+
(
|
| 187 |
+
"Why do you say I am %1?",
|
| 188 |
+
"Why do you think I am %1?",
|
| 189 |
+
"Are we talking about you, or me?",
|
| 190 |
+
),
|
| 191 |
+
),
|
| 192 |
+
(
|
| 193 |
+
r"I don\'t (.*)",
|
| 194 |
+
("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"),
|
| 195 |
+
),
|
| 196 |
+
(
|
| 197 |
+
r"I feel (.*)",
|
| 198 |
+
(
|
| 199 |
+
"Good, tell me more about these feelings.",
|
| 200 |
+
"Do you often feel %1?",
|
| 201 |
+
"When do you usually feel %1?",
|
| 202 |
+
"When you feel %1, what do you do?",
|
| 203 |
+
),
|
| 204 |
+
),
|
| 205 |
+
(
|
| 206 |
+
r"I have (.*)",
|
| 207 |
+
(
|
| 208 |
+
"Why do you tell me that you've %1?",
|
| 209 |
+
"Have you really %1?",
|
| 210 |
+
"Now that you have %1, what will you do next?",
|
| 211 |
+
),
|
| 212 |
+
),
|
| 213 |
+
(
|
| 214 |
+
r"I would (.*)",
|
| 215 |
+
(
|
| 216 |
+
"Could you explain why you would %1?",
|
| 217 |
+
"Why would you %1?",
|
| 218 |
+
"Who else knows that you would %1?",
|
| 219 |
+
),
|
| 220 |
+
),
|
| 221 |
+
(
|
| 222 |
+
r"Is there (.*)",
|
| 223 |
+
(
|
| 224 |
+
"Do you think there is %1?",
|
| 225 |
+
"It's likely that there is %1.",
|
| 226 |
+
"Would you like there to be %1?",
|
| 227 |
+
),
|
| 228 |
+
),
|
| 229 |
+
(
|
| 230 |
+
r"My (.*)",
|
| 231 |
+
(
|
| 232 |
+
"I see, your %1.",
|
| 233 |
+
"Why do you say that your %1?",
|
| 234 |
+
"When your %1, how do you feel?",
|
| 235 |
+
),
|
| 236 |
+
),
|
| 237 |
+
(
|
| 238 |
+
r"You (.*)",
|
| 239 |
+
(
|
| 240 |
+
"We should be discussing you, not me.",
|
| 241 |
+
"Why do you say that about me?",
|
| 242 |
+
"Why do you care whether I %1?",
|
| 243 |
+
),
|
| 244 |
+
),
|
| 245 |
+
(r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")),
|
| 246 |
+
(
|
| 247 |
+
r"I want (.*)",
|
| 248 |
+
(
|
| 249 |
+
"What would it mean to you if you got %1?",
|
| 250 |
+
"Why do you want %1?",
|
| 251 |
+
"What would you do if you got %1?",
|
| 252 |
+
"If you got %1, then what would you do?",
|
| 253 |
+
),
|
| 254 |
+
),
|
| 255 |
+
(
|
| 256 |
+
r"(.*) mother(.*)",
|
| 257 |
+
(
|
| 258 |
+
"Tell me more about your mother.",
|
| 259 |
+
"What was your relationship with your mother like?",
|
| 260 |
+
"How do you feel about your mother?",
|
| 261 |
+
"How does this relate to your feelings today?",
|
| 262 |
+
"Good family relations are important.",
|
| 263 |
+
),
|
| 264 |
+
),
|
| 265 |
+
(
|
| 266 |
+
r"(.*) father(.*)",
|
| 267 |
+
(
|
| 268 |
+
"Tell me more about your father.",
|
| 269 |
+
"How did your father make you feel?",
|
| 270 |
+
"How do you feel about your father?",
|
| 271 |
+
"Does your relationship with your father relate to your feelings today?",
|
| 272 |
+
"Do you have trouble showing affection with your family?",
|
| 273 |
+
),
|
| 274 |
+
),
|
| 275 |
+
(
|
| 276 |
+
r"(.*) child(.*)",
|
| 277 |
+
(
|
| 278 |
+
"Did you have close friends as a child?",
|
| 279 |
+
"What is your favorite childhood memory?",
|
| 280 |
+
"Do you remember any dreams or nightmares from childhood?",
|
| 281 |
+
"Did the other children sometimes tease you?",
|
| 282 |
+
"How do you think your childhood experiences relate to your feelings today?",
|
| 283 |
+
),
|
| 284 |
+
),
|
| 285 |
+
(
|
| 286 |
+
r"(.*)\?",
|
| 287 |
+
(
|
| 288 |
+
"Why do you ask that?",
|
| 289 |
+
"Please consider whether you can answer your own question.",
|
| 290 |
+
"Perhaps the answer lies within yourself?",
|
| 291 |
+
"Why don't you tell me?",
|
| 292 |
+
),
|
| 293 |
+
),
|
| 294 |
+
(
|
| 295 |
+
r"quit",
|
| 296 |
+
(
|
| 297 |
+
"Thank you for talking with me.",
|
| 298 |
+
"Good-bye.",
|
| 299 |
+
"Thank you, that will be $150. Have a good day!",
|
| 300 |
+
),
|
| 301 |
+
),
|
| 302 |
+
(
|
| 303 |
+
r"(.*)",
|
| 304 |
+
(
|
| 305 |
+
"Please tell me more.",
|
| 306 |
+
"Let's change focus a bit... Tell me about your family.",
|
| 307 |
+
"Can you elaborate on that?",
|
| 308 |
+
"Why do you say that %1?",
|
| 309 |
+
"I see.",
|
| 310 |
+
"Very interesting.",
|
| 311 |
+
"%1.",
|
| 312 |
+
"I see. And what does that tell you?",
|
| 313 |
+
"How does that make you feel?",
|
| 314 |
+
"How do you feel when you say that?",
|
| 315 |
+
),
|
| 316 |
+
),
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
eliza_chatbot = Chat(pairs, reflections)
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def eliza_chat():
|
| 323 |
+
print("Therapist\n---------")
|
| 324 |
+
print("Talk to the program by typing in plain English, using normal upper-")
|
| 325 |
+
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
| 326 |
+
print("=" * 72)
|
| 327 |
+
print("Hello. How are you feeling today?")
|
| 328 |
+
|
| 329 |
+
eliza_chatbot.converse()
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
def demo():
|
| 333 |
+
eliza_chat()
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
if __name__ == "__main__":
|
| 337 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/iesha.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Teen Chatbot
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Selina Dennis <sjmd@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
This chatbot is a tongue-in-cheek take on the average teen
|
| 10 |
+
anime junky that frequents YahooMessenger or MSNM.
|
| 11 |
+
All spelling mistakes and flawed grammar are intentional.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from nltk.chat.util import Chat
|
| 15 |
+
|
| 16 |
+
reflections = {
|
| 17 |
+
"am": "r",
|
| 18 |
+
"was": "were",
|
| 19 |
+
"i": "u",
|
| 20 |
+
"i'd": "u'd",
|
| 21 |
+
"i've": "u'v",
|
| 22 |
+
"ive": "u'v",
|
| 23 |
+
"i'll": "u'll",
|
| 24 |
+
"my": "ur",
|
| 25 |
+
"are": "am",
|
| 26 |
+
"you're": "im",
|
| 27 |
+
"you've": "ive",
|
| 28 |
+
"you'll": "i'll",
|
| 29 |
+
"your": "my",
|
| 30 |
+
"yours": "mine",
|
| 31 |
+
"you": "me",
|
| 32 |
+
"u": "me",
|
| 33 |
+
"ur": "my",
|
| 34 |
+
"urs": "mine",
|
| 35 |
+
"me": "u",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
# Note: %1/2/etc are used without spaces prior as the chat bot seems
|
| 39 |
+
# to add a superfluous space when matching.
|
| 40 |
+
|
| 41 |
+
pairs = (
|
| 42 |
+
(
|
| 43 |
+
r"I\'m (.*)",
|
| 44 |
+
(
|
| 45 |
+
"ur%1?? that's so cool! kekekekeke ^_^ tell me more!",
|
| 46 |
+
"ur%1? neat!! kekeke >_<",
|
| 47 |
+
),
|
| 48 |
+
),
|
| 49 |
+
(
|
| 50 |
+
r"(.*) don\'t you (.*)",
|
| 51 |
+
(
|
| 52 |
+
r"u think I can%2??! really?? kekeke \<_\<",
|
| 53 |
+
"what do u mean%2??!",
|
| 54 |
+
"i could if i wanted, don't you think!! kekeke",
|
| 55 |
+
),
|
| 56 |
+
),
|
| 57 |
+
(r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")),
|
| 58 |
+
(
|
| 59 |
+
r"do (you|u) (.*)\??",
|
| 60 |
+
("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"),
|
| 61 |
+
),
|
| 62 |
+
(
|
| 63 |
+
r"(.*)\?",
|
| 64 |
+
(
|
| 65 |
+
"man u ask lots of questions!",
|
| 66 |
+
"booooring! how old r u??",
|
| 67 |
+
"boooooring!! ur not very fun",
|
| 68 |
+
),
|
| 69 |
+
),
|
| 70 |
+
(
|
| 71 |
+
r"(cos|because) (.*)",
|
| 72 |
+
("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"),
|
| 73 |
+
),
|
| 74 |
+
(
|
| 75 |
+
r"why can\'t [iI] (.*)",
|
| 76 |
+
(
|
| 77 |
+
"i dunno! y u askin me for!",
|
| 78 |
+
"try harder, silly! hee! ^_^",
|
| 79 |
+
"i dunno! but when i can't%1 i jump up and down!",
|
| 80 |
+
),
|
| 81 |
+
),
|
| 82 |
+
(
|
| 83 |
+
r"I can\'t (.*)",
|
| 84 |
+
(
|
| 85 |
+
"u can't what??! >_<",
|
| 86 |
+
"that's ok! i can't%1 either! kekekekeke ^_^",
|
| 87 |
+
"try harder, silly! hee! ^&^",
|
| 88 |
+
),
|
| 89 |
+
),
|
| 90 |
+
(
|
| 91 |
+
r"(.*) (like|love|watch) anime",
|
| 92 |
+
(
|
| 93 |
+
"omg i love anime!! do u like sailor moon??! ^&^",
|
| 94 |
+
"anime yay! anime rocks sooooo much!",
|
| 95 |
+
"oooh anime! i love anime more than anything!",
|
| 96 |
+
"anime is the bestest evar! evangelion is the best!",
|
| 97 |
+
"hee anime is the best! do you have ur fav??",
|
| 98 |
+
),
|
| 99 |
+
),
|
| 100 |
+
(
|
| 101 |
+
r"I (like|love|watch|play) (.*)",
|
| 102 |
+
("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"),
|
| 103 |
+
),
|
| 104 |
+
(
|
| 105 |
+
r"anime sucks|(.*) (hate|detest) anime",
|
| 106 |
+
(
|
| 107 |
+
"ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*",
|
| 108 |
+
"no way! anime is the best ever!",
|
| 109 |
+
"nuh-uh, anime is the best!",
|
| 110 |
+
),
|
| 111 |
+
),
|
| 112 |
+
(
|
| 113 |
+
r"(are|r) (you|u) (.*)",
|
| 114 |
+
("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"),
|
| 115 |
+
),
|
| 116 |
+
(
|
| 117 |
+
r"what (.*)",
|
| 118 |
+
("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"),
|
| 119 |
+
),
|
| 120 |
+
(r"how (.*)", ("not tellin!! kekekekekeke ^_^",)),
|
| 121 |
+
(r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)),
|
| 122 |
+
(
|
| 123 |
+
r"quit",
|
| 124 |
+
(
|
| 125 |
+
"mom says i have to go eat dinner now :,( bye!!",
|
| 126 |
+
"awww u have to go?? see u next time!!",
|
| 127 |
+
"how to see u again soon! ^_^",
|
| 128 |
+
),
|
| 129 |
+
),
|
| 130 |
+
(
|
| 131 |
+
r"(.*)",
|
| 132 |
+
(
|
| 133 |
+
"ur funny! kekeke",
|
| 134 |
+
"boooooring! talk about something else! tell me wat u like!",
|
| 135 |
+
"do u like anime??",
|
| 136 |
+
"do u watch anime? i like sailor moon! ^_^",
|
| 137 |
+
"i wish i was a kitty!! kekekeke ^_^",
|
| 138 |
+
),
|
| 139 |
+
),
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
+
iesha_chatbot = Chat(pairs, reflections)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def iesha_chat():
|
| 146 |
+
print("Iesha the TeenBoT\n---------")
|
| 147 |
+
print("Talk to the program by typing in plain English, using normal upper-")
|
| 148 |
+
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
| 149 |
+
print("=" * 72)
|
| 150 |
+
print("hi!! i'm iesha! who r u??!")
|
| 151 |
+
|
| 152 |
+
iesha_chatbot.converse()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def demo():
|
| 156 |
+
iesha_chat()
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
if __name__ == "__main__":
|
| 160 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/rude.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Rude Chatbot
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Peter Spiller <pspiller@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
from nltk.chat.util import Chat, reflections
|
| 9 |
+
|
| 10 |
+
pairs = (
|
| 11 |
+
(
|
| 12 |
+
r"We (.*)",
|
| 13 |
+
(
|
| 14 |
+
"What do you mean, 'we'?",
|
| 15 |
+
"Don't include me in that!",
|
| 16 |
+
"I wouldn't be so sure about that.",
|
| 17 |
+
),
|
| 18 |
+
),
|
| 19 |
+
(
|
| 20 |
+
r"You should (.*)",
|
| 21 |
+
("Don't tell me what to do, buddy.", "Really? I should, should I?"),
|
| 22 |
+
),
|
| 23 |
+
(
|
| 24 |
+
r"You\'re(.*)",
|
| 25 |
+
(
|
| 26 |
+
"More like YOU'RE %1!",
|
| 27 |
+
"Hah! Look who's talking.",
|
| 28 |
+
"Come over here and tell me I'm %1.",
|
| 29 |
+
),
|
| 30 |
+
),
|
| 31 |
+
(
|
| 32 |
+
r"You are(.*)",
|
| 33 |
+
(
|
| 34 |
+
"More like YOU'RE %1!",
|
| 35 |
+
"Hah! Look who's talking.",
|
| 36 |
+
"Come over here and tell me I'm %1.",
|
| 37 |
+
),
|
| 38 |
+
),
|
| 39 |
+
(
|
| 40 |
+
r"I can\'t(.*)",
|
| 41 |
+
(
|
| 42 |
+
"You do sound like the type who can't %1.",
|
| 43 |
+
"Hear that splashing sound? That's my heart bleeding for you.",
|
| 44 |
+
"Tell somebody who might actually care.",
|
| 45 |
+
),
|
| 46 |
+
),
|
| 47 |
+
(
|
| 48 |
+
r"I think (.*)",
|
| 49 |
+
(
|
| 50 |
+
"I wouldn't think too hard if I were you.",
|
| 51 |
+
"You actually think? I'd never have guessed...",
|
| 52 |
+
),
|
| 53 |
+
),
|
| 54 |
+
(
|
| 55 |
+
r"I (.*)",
|
| 56 |
+
(
|
| 57 |
+
"I'm getting a bit tired of hearing about you.",
|
| 58 |
+
"How about we talk about me instead?",
|
| 59 |
+
"Me, me, me... Frankly, I don't care.",
|
| 60 |
+
),
|
| 61 |
+
),
|
| 62 |
+
(
|
| 63 |
+
r"How (.*)",
|
| 64 |
+
(
|
| 65 |
+
"How do you think?",
|
| 66 |
+
"Take a wild guess.",
|
| 67 |
+
"I'm not even going to dignify that with an answer.",
|
| 68 |
+
),
|
| 69 |
+
),
|
| 70 |
+
(r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")),
|
| 71 |
+
(
|
| 72 |
+
r"Why (.*)",
|
| 73 |
+
(
|
| 74 |
+
"Why not?",
|
| 75 |
+
"That's so obvious I thought even you'd have already figured it out.",
|
| 76 |
+
),
|
| 77 |
+
),
|
| 78 |
+
(
|
| 79 |
+
r"(.*)shut up(.*)",
|
| 80 |
+
(
|
| 81 |
+
"Make me.",
|
| 82 |
+
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
|
| 83 |
+
"Say that again, I dare you.",
|
| 84 |
+
),
|
| 85 |
+
),
|
| 86 |
+
(
|
| 87 |
+
r"Shut up(.*)",
|
| 88 |
+
(
|
| 89 |
+
"Make me.",
|
| 90 |
+
"Getting angry at a feeble NLP assignment? Somebody's losing it.",
|
| 91 |
+
"Say that again, I dare you.",
|
| 92 |
+
),
|
| 93 |
+
),
|
| 94 |
+
(
|
| 95 |
+
r"Hello(.*)",
|
| 96 |
+
("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."),
|
| 97 |
+
),
|
| 98 |
+
(
|
| 99 |
+
r"(.*)",
|
| 100 |
+
(
|
| 101 |
+
"I'm getting bored here. Become more interesting.",
|
| 102 |
+
"Either become more thrilling or get lost, buddy.",
|
| 103 |
+
"Change the subject before I die of fatal boredom.",
|
| 104 |
+
),
|
| 105 |
+
),
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
rude_chatbot = Chat(pairs, reflections)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def rude_chat():
|
| 112 |
+
print("Talk to the program by typing in plain English, using normal upper-")
|
| 113 |
+
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
| 114 |
+
print("=" * 72)
|
| 115 |
+
print("I suppose I should say hello.")
|
| 116 |
+
|
| 117 |
+
rude_chatbot.converse()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def demo():
|
| 121 |
+
rude_chat()
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/suntsu.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Sun Tsu-Bot
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Sam Huston 2007
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Tsu bot responds to all queries with a Sun Tsu sayings
|
| 10 |
+
|
| 11 |
+
Quoted from Sun Tsu's The Art of War
|
| 12 |
+
Translated by LIONEL GILES, M.A. 1910
|
| 13 |
+
Hosted by the Gutenberg Project
|
| 14 |
+
https://www.gutenberg.org/
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from nltk.chat.util import Chat, reflections
|
| 18 |
+
|
| 19 |
+
pairs = (
|
| 20 |
+
(r"quit", ("Good-bye.", "Plan well", "May victory be your future")),
|
| 21 |
+
(
|
| 22 |
+
r"[^\?]*\?",
|
| 23 |
+
(
|
| 24 |
+
"Please consider whether you can answer your own question.",
|
| 25 |
+
"Ask me no questions!",
|
| 26 |
+
),
|
| 27 |
+
),
|
| 28 |
+
(
|
| 29 |
+
r"[0-9]+(.*)",
|
| 30 |
+
(
|
| 31 |
+
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
|
| 32 |
+
"There are five essentials for victory",
|
| 33 |
+
),
|
| 34 |
+
),
|
| 35 |
+
(
|
| 36 |
+
r"[A-Ca-c](.*)",
|
| 37 |
+
(
|
| 38 |
+
"The art of war is of vital importance to the State.",
|
| 39 |
+
"All warfare is based on deception.",
|
| 40 |
+
"If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.",
|
| 41 |
+
"If the campaign is protracted, the resources of the State will not be equal to the strain.",
|
| 42 |
+
"Attack him where he is unprepared, appear where you are not expected.",
|
| 43 |
+
"There is no instance of a country having benefited from prolonged warfare.",
|
| 44 |
+
),
|
| 45 |
+
),
|
| 46 |
+
(
|
| 47 |
+
r"[D-Fd-f](.*)",
|
| 48 |
+
(
|
| 49 |
+
"The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.",
|
| 50 |
+
"Bring war material with you from home, but forage on the enemy.",
|
| 51 |
+
"In war, then, let your great object be victory, not lengthy campaigns.",
|
| 52 |
+
"To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.",
|
| 53 |
+
),
|
| 54 |
+
),
|
| 55 |
+
(
|
| 56 |
+
r"[G-Ig-i](.*)",
|
| 57 |
+
(
|
| 58 |
+
"Heaven signifies night and day, cold and heat, times and seasons.",
|
| 59 |
+
"It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.",
|
| 60 |
+
"The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.",
|
| 61 |
+
"One may know how to conquer without being able to do it.",
|
| 62 |
+
),
|
| 63 |
+
),
|
| 64 |
+
(
|
| 65 |
+
r"[J-Lj-l](.*)",
|
| 66 |
+
(
|
| 67 |
+
"There are three ways in which a ruler can bring misfortune upon his army.",
|
| 68 |
+
"By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.",
|
| 69 |
+
"By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.",
|
| 70 |
+
"By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.",
|
| 71 |
+
"There are five essentials for victory",
|
| 72 |
+
"He will win who knows when to fight and when not to fight.",
|
| 73 |
+
"He will win who knows how to handle both superior and inferior forces.",
|
| 74 |
+
"He will win whose army is animated by the same spirit throughout all its ranks.",
|
| 75 |
+
"He will win who, prepared himself, waits to take the enemy unprepared.",
|
| 76 |
+
"He will win who has military capacity and is not interfered with by the sovereign.",
|
| 77 |
+
),
|
| 78 |
+
),
|
| 79 |
+
(
|
| 80 |
+
r"[M-Om-o](.*)",
|
| 81 |
+
(
|
| 82 |
+
"If you know the enemy and know yourself, you need not fear the result of a hundred battles.",
|
| 83 |
+
"If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.",
|
| 84 |
+
"If you know neither the enemy nor yourself, you will succumb in every battle.",
|
| 85 |
+
"The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.",
|
| 86 |
+
),
|
| 87 |
+
),
|
| 88 |
+
(
|
| 89 |
+
r"[P-Rp-r](.*)",
|
| 90 |
+
(
|
| 91 |
+
"Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.",
|
| 92 |
+
"Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.",
|
| 93 |
+
"He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.",
|
| 94 |
+
"A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.",
|
| 95 |
+
"The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.",
|
| 96 |
+
),
|
| 97 |
+
),
|
| 98 |
+
(
|
| 99 |
+
r"[S-Us-u](.*)",
|
| 100 |
+
(
|
| 101 |
+
"What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.",
|
| 102 |
+
"Hence his victories bring him neither reputation for wisdom nor credit for courage.",
|
| 103 |
+
"Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.",
|
| 104 |
+
"In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.",
|
| 105 |
+
"There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.",
|
| 106 |
+
"Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.",
|
| 107 |
+
),
|
| 108 |
+
),
|
| 109 |
+
(
|
| 110 |
+
r"[V-Zv-z](.*)",
|
| 111 |
+
(
|
| 112 |
+
"It is a matter of life and death, a road either to safety or to ruin.",
|
| 113 |
+
"Hold out baits to entice the enemy. Feign disorder, and crush him.",
|
| 114 |
+
"All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.",
|
| 115 |
+
"Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.",
|
| 116 |
+
"So in war, the way is to avoid what is strong and to strike at what is weak.",
|
| 117 |
+
"Just as water retains no constant shape, so in warfare there are no constant conditions.",
|
| 118 |
+
),
|
| 119 |
+
),
|
| 120 |
+
(r"(.*)", ("Your statement insults me.", "")),
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
suntsu_chatbot = Chat(pairs, reflections)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def suntsu_chat():
|
| 127 |
+
print("Talk to the program by typing in plain English, using normal upper-")
|
| 128 |
+
print('and lower-case letters and punctuation. Enter "quit" when done.')
|
| 129 |
+
print("=" * 72)
|
| 130 |
+
print("You seek enlightenment?")
|
| 131 |
+
|
| 132 |
+
suntsu_chatbot.converse()
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def demo():
|
| 136 |
+
suntsu_chat()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
if __name__ == "__main__":
|
| 140 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/util.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chatbot Utilities
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Authors: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
# Based on an Eliza implementation by Joe Strout <joe@strout.net>,
|
| 9 |
+
# Jeff Epler <jepler@inetnebr.com> and Jez Higgins <jez@jezuk.co.uk>.
|
| 10 |
+
|
| 11 |
+
import random
|
| 12 |
+
import re
|
| 13 |
+
|
| 14 |
+
reflections = {
|
| 15 |
+
"i am": "you are",
|
| 16 |
+
"i was": "you were",
|
| 17 |
+
"i": "you",
|
| 18 |
+
"i'm": "you are",
|
| 19 |
+
"i'd": "you would",
|
| 20 |
+
"i've": "you have",
|
| 21 |
+
"i'll": "you will",
|
| 22 |
+
"my": "your",
|
| 23 |
+
"you are": "I am",
|
| 24 |
+
"you were": "I was",
|
| 25 |
+
"you've": "I have",
|
| 26 |
+
"you'll": "I will",
|
| 27 |
+
"your": "my",
|
| 28 |
+
"yours": "mine",
|
| 29 |
+
"you": "me",
|
| 30 |
+
"me": "you",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class Chat:
    def __init__(self, pairs, reflections=None):
        """
        Initialize the chatbot.  Pairs is a list of patterns and responses.  Each
        pattern is a regular expression matching the user's statement or question,
        e.g. r'I like (.*)'.  For each such pattern a list of possible responses
        is given, e.g. ['Why do you like %1', 'Did you ever dislike %1'].  Material
        which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to
        the numbered positions in the responses, e.g. %1.

        :type pairs: list of tuple
        :param pairs: The patterns and responses
        :type reflections: dict
        :param reflections: A mapping between first and second person expressions
        :rtype: None
        """
        # The original signature used a mutable default (reflections={}), which
        # is shared between every instance constructed without the argument.
        # Use None as the sentinel so each instance gets its own dict.
        if reflections is None:
            reflections = {}
        self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs]
        self._reflections = reflections
        self._regex = self._compile_reflections()

    def _compile_reflections(self):
        # Sort keys longest-first so that e.g. "i am" is matched before "i".
        sorted_refl = sorted(self._reflections, key=len, reverse=True)
        return re.compile(
            r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE
        )

    def _substitute(self, text):
        """
        Substitute words in the string, according to the specified reflections,
        e.g. "I'm" -> "you are"

        :type text: str
        :param text: The string to be mapped
        :rtype: str
        """
        # With no reflections the compiled pattern is r"\b()\b", which matches
        # the empty string at every word boundary and would raise KeyError in
        # the replacement callback; short-circuit that case.
        if not self._reflections:
            return text.lower()
        return self._regex.sub(
            lambda mo: self._reflections[mo.string[mo.start() : mo.end()]],
            text.lower(),
        )

    def _wildcards(self, response, match):
        # Replace each single-digit "%n" placeholder with the (reflected)
        # text captured by the n-th group of the matched pattern.
        pos = response.find("%")
        while pos >= 0:
            num = int(response[pos + 1 : pos + 2])
            response = (
                response[:pos]
                + self._substitute(match.group(num))
                + response[pos + 2 :]
            )
            pos = response.find("%")
        return response

    def respond(self, str):
        """
        Generate a response to the user input.

        :type str: str
        :param str: The string to be mapped
        :rtype: str

        Returns None if no pattern matches the input.
        """
        # check each pattern, top to bottom; first match wins
        for (pattern, response) in self._pairs:
            match = pattern.match(str)

            # did the pattern match?
            if match:
                resp = random.choice(response)  # pick a random response
                resp = self._wildcards(resp, match)  # process wildcards

                # fix munged punctuation at the end (a substituted group may
                # already carry a trailing '?')
                if resp[-2:] == "?.":
                    resp = resp[:-2] + "."
                if resp[-2:] == "??":
                    resp = resp[:-2] + "?"
                return resp

    # Hold a conversation with a chatbot
    def converse(self, quit="quit"):
        """
        Read lines from stdin and print a response for each, until the
        user types *quit* (or sends EOF).
        """
        user_input = ""
        while user_input != quit:
            user_input = quit  # EOF leaves this value set, ending the loop
            try:
                user_input = input(">")
            except EOFError:
                print(user_input)
            if user_input:
                # Strip trailing '!' and '.' so punctuation does not defeat
                # a pattern match.  rstrip also guards against inputs that
                # are *only* punctuation: the original
                # `while user_input[-1] in "!."` raised IndexError on "!".
                user_input = user_input.rstrip("!.")
                if user_input:
                    print(self.respond(user_input))
|
.eggs/nltk-3.8-py3.10.egg/nltk/chat/zen.py
ADDED
|
@@ -0,0 +1,329 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Zen Chatbot
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Amy Holland <amyrh@csse.unimelb.edu.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Zen Chatbot talks in gems of Zen wisdom.
|
| 10 |
+
|
| 11 |
+
This is a sample conversation with Zen Chatbot:
|
| 12 |
+
ZC: Welcome, my child.
|
| 13 |
+
me: Good afternoon.
|
| 14 |
+
ZC: Ask the question you have come to ask.
|
| 15 |
+
me: How can I achieve enlightenment?
|
| 16 |
+
ZC: How do you suppose?
|
| 17 |
+
me: Through meditation.
|
| 18 |
+
ZC: Form is emptiness, and emptiness form.
|
| 19 |
+
me: How can I empty my mind of worldly troubles?
|
| 20 |
+
ZC: Will an answer to that really help in your search for enlightenment?
|
| 21 |
+
me: Yes.
|
| 22 |
+
ZC: It is better to be right than to be certain.
|
| 23 |
+
me: I seek truth and wisdom.
|
| 24 |
+
ZC: The search for truth is a long journey.
|
| 25 |
+
me: Are you sure?
|
| 26 |
+
ZC: Maybe sure, maybe not sure.
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
The chatbot structure is based on that of chat.eliza. Thus, it uses
|
| 30 |
+
a translation table to convert from question to response
|
| 31 |
+
i.e. "I am" --> "you are"
|
| 32 |
+
|
| 33 |
+
Of course, since Zen Chatbot does not understand the meaning of any words,
|
| 34 |
+
responses are very limited. Zen Chatbot will usually answer very vaguely, or
|
| 35 |
+
respond to a question by asking a different question, in much the same way
|
| 36 |
+
as Eliza.
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
from nltk.chat.util import Chat, reflections
|
| 40 |
+
|
| 41 |
+
# responses are matched top to bottom, so non-specific matches occur later
# for each match, a list of possible responses is provided
responses = (
    # Zen Chatbot opens with the line "Welcome, my child." The usual
    # response will be a greeting problem: 'good' matches "good morning",
    # "good day" etc, but also "good grief!" and other sentences starting
    # with the word 'good' that may not be a greeting
    (
        r"(hello(.*))|(good [a-zA-Z]+)",
        (
            "The path to enlightenment is often difficult to see.",
            "Greetings. I sense your mind is troubled. Tell me of your troubles.",
            "Ask the question you have come to ask.",
            "Hello. Do you seek englightenment?",
        ),
    ),
    # "I need" and "I want" can be followed by a thing (eg 'help')
    # or an action (eg 'to see you')
    #
    # This is a problem with this style of response -
    # person:  "I need you"
    # chatbot: "me can be achieved by hard work and dedication of the mind"
    # i.e. 'you' is not really a thing that can be mapped this way, so this
    # interpretation only makes sense for some inputs
    #
    (
        r"i need (.*)",
        (
            "%1 can be achieved by hard work and dedication of the mind.",
            "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.",
            "Focus your mind on%1, and you will find what you need.",
        ),
    ),
    (
        r"i want (.*)",
        (
            "Desires of the heart will distract you from the path to enlightenment.",
            "Will%1 help you attain enlightenment?",
            "Is%1 a desire of the mind, or of the heart?",
        ),
    ),
    # why questions are separated into three types:
    # "why..I"   e.g. "why am I here?" "Why do I like cake?"
    # "why..you" e.g. "why are you here?" "Why won't you tell me?"
    # "why..."   e.g. "Why is the sky blue?"
    # problems:
    #     person:  "Why can't you tell me?"
    #     chatbot: "Are you sure I tell you?"
    # - this style works for positives (e.g. "why do you like cake?")
    #   but does not work for negatives (e.g. "why don't you like cake?")
    (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")),
    (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")),
    (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")),
    # e.g. "are you listening?", "are you a duck"
    (
        r"are you (.*)\?",
        ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."),
    ),
    # e.g. "am I a duck?", "am I going to die?"
    (
        r"am i (.*)\?",
        ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."),
    ),
    # what questions, e.g. "what time is it?"
    # problems:
    #     person:  "What do you want?"
    #     chatbot: "Seek truth, not what do me want."
    (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")),
    # how questions, e.g. "how do you do?"
    (
        r"how (.*)\?",
        (
            "How do you suppose?",
            "Will an answer to that really help in your search for enlightenment?",
            "Ask yourself not how, but why.",
        ),
    ),
    # can questions, e.g. "can you run?", "can you come over here please?"
    (
        r"can you (.*)\?",
        (
            "I probably can, but I may not.",
            "Maybe I can%1, and maybe I cannot.",
            "I can do all, and I can do nothing.",
        ),
    ),
    # can questions, e.g. "can I have some cake?", "can I know truth?"
    (
        r"can i (.*)\?",
        (
            "You can%1 if you believe you can%1, and have a pure spirit.",
            "Seek truth and you will know if you can%1.",
        ),
    ),
    # e.g. "It is raining" - implies the speaker is certain of a fact
    (
        r"it is (.*)",
        (
            "How can you be certain that%1, when you do not even know yourself?",
            "Whether it is%1 or not does not change the way the world is.",
        ),
    ),
    # e.g. "is there a doctor in the house?"
    (
        r"is there (.*)\?",
        ("There is%1 if you believe there is.", "It is possible that there is%1."),
    ),
    # e.g. "is it possible?", "is this true?"
    (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")),
    # non-specific question
    (
        r"(.*)\?",
        (
            "Do you think %1?",
            "You seek the truth. Does the truth seek you?",
            "If you intentionally pursue the answers to your questions, the answers become hard to see.",
            "The answer to your question cannot be told. It must be experienced.",
        ),
    ),
    # expression of hate of form "I hate you" or "Kelly hates cheese"
    (
        r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)",
        (
            "Perhaps it is not about hating %2, but about hate from within.",
            "Weeds only grow when we dislike them",
            "Hate is a very strong emotion.",
        ),
    ),
    # statement containing the word 'truth'
    (
        r"(.*) truth(.*)",
        (
            "Seek truth, and truth will seek you.",
            "Remember, it is not the spoon which bends - only yourself.",
            "The search for truth is a long journey.",
        ),
    ),
    # desire to do an action
    # e.g. "I want to go shopping"
    (
        r"i want to (.*)",
        ("You may %1 if your heart truly desires to.", "You may have to %1."),
    ),
    # desire for an object
    # e.g. "I want a pony"
    (
        r"i want (.*)",
        (
            "Does your heart truly desire %1?",
            "Is this a desire of the heart, or of the mind?",
        ),
    ),
    # e.g. "I can't wait" or "I can't do this"
    (
        r"i can\'t (.*)",
        (
            "What we can and can't do is a limitation of the mind.",
            "There are limitations of the body, and limitations of the mind.",
            "Have you tried to%1 with a clear mind?",
        ),
    ),
    # "I think.." indicates uncertainty. e.g. "I think so."
    # problem: exceptions...
    # e.g. "I think, therefore I am"
    (
        r"i think (.*)",
        (
            "Uncertainty in an uncertain world.",
            "Indeed, how can we be certain of anything in such uncertain times.",
            "Are you not, in fact, certain that%1?",
        ),
    ),
    # "I feel...emotions/sick/light-headed..."
    (
        r"i feel (.*)",
        (
            # NOTE(review): the two adjacent string literals below are
            # implicitly concatenated into ONE response with no separator
            # between "mind." and "What" -- this looks like a missing comma;
            # confirm against upstream before changing (a comma would alter
            # runtime behavior).
            "Your body and your emotions are both symptoms of your mind."
            "What do you believe is the root of such feelings?",
            "Feeling%1 can be a sign of your state-of-mind.",
        ),
    ),
    # exclaimation mark indicating emotion
    # e.g. "Wow!" or "No!"
    (
        r"(.*)!",
        (
            "I sense that you are feeling emotional today.",
            "You need to calm your emotions.",
        ),
    ),
    # because [statement]
    # e.g. "because I said so"
    (
        r"because (.*)",
        (
            "Does knowning the reasons behind things help you to understand"
            " the things themselves?",
            "If%1, what else must be true?",
        ),
    ),
    # yes or no - raise an issue of certainty/correctness
    (
        r"(yes)|(no)",
        (
            "Is there certainty in an uncertain world?",
            "It is better to be right than to be certain.",
        ),
    ),
    # sentence containing word 'love'
    (
        r"(.*)love(.*)",
        (
            "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.",
            "Free love!",
        ),
    ),
    # sentence containing word 'understand' - r
    (
        r"(.*)understand(.*)",
        (
            "If you understand, things are just as they are;"
            " if you do not understand, things are just as they are.",
            "Imagination is more important than knowledge.",
        ),
    ),
    # 'I', 'me', 'my' - person is talking about themself.
    # this breaks down when words contain these - eg 'Thyme', 'Irish'
    (
        r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)",
        (
            "'I', 'me', 'my'... these are selfish expressions.",
            "Have you ever considered that you might be a selfish person?",
            "Try to consider others, not just yourself.",
            "Think not just of yourself, but of others.",
        ),
    ),
    # 'you' starting a sentence
    # e.g. "you stink!"
    (
        r"you (.*)",
        ("My path is not of concern to you.", "I am but one, and you but one more."),
    ),
    # say goodbye with some extra Zen wisdom.
    (
        r"exit",
        (
            "Farewell. The obstacle is the path.",
            "Farewell. Life is a journey, not a destination.",
            "Good bye. We are cups, constantly and quietly being filled."
            "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.",
        ),
    ),
    # fall through case -
    # when stumped, respond with generic zen wisdom
    #
    (
        r"(.*)",
        (
            "When you're enlightened, every word is wisdom.",
            "Random talk is useless.",
            "The reverse side also has a reverse side.",
            "Form is emptiness, and emptiness is form.",
            "I pour out a cup of water. Is the cup empty?",
        ),
    ),
)
|
| 307 |
+
|
| 308 |
+
zen_chatbot = Chat(responses, reflections)


def zen_chat():
    """Print the welcome banner, then hand control to the interactive loop."""
    border = "*" * 75
    banner = (
        border,
        "Zen Chatbot!".center(75),
        border,
        '"Look beyond mere words and letters - look into your mind"'.center(75),
        "* Talk your way to truth with Zen Chatbot.",
        "* Type 'quit' when you have had enough.",
        border,
        "Welcome, my child.",
    )
    for line in banner:
        print(line)

    zen_chatbot.converse()
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def demo():
    # Convenience entry point; simply starts an interactive Zen Chatbot session.
    zen_chat()


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/__init__.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chunkers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
Classes and interfaces for identifying non-overlapping linguistic
|
| 12 |
+
groups (such as base noun phrases) in unrestricted text. This task is
|
| 13 |
+
called "chunk parsing" or "chunking", and the identified groups are
|
| 14 |
+
called "chunks". The chunked text is represented using a shallow
|
| 15 |
+
tree called a "chunk structure." A chunk structure is a tree
|
| 16 |
+
containing tokens and chunks, where each chunk is a subtree containing
|
| 17 |
+
only tokens. For example, the chunk structure for base noun phrase
|
| 18 |
+
chunks in the sentence "I saw the big dog on the hill" is::
|
| 19 |
+
|
| 20 |
+
(SENTENCE:
|
| 21 |
+
(NP: <I>)
|
| 22 |
+
<saw>
|
| 23 |
+
(NP: <the> <big> <dog>)
|
| 24 |
+
<on>
|
| 25 |
+
(NP: <the> <hill>))
|
| 26 |
+
|
| 27 |
+
To convert a chunk structure back to a list of tokens, simply use the
|
| 28 |
+
chunk structure's ``leaves()`` method.
|
| 29 |
+
|
| 30 |
+
This module defines ``ChunkParserI``, a standard interface for
|
| 31 |
+
chunking texts; and ``RegexpChunkParser``, a regular-expression based
|
| 32 |
+
implementation of that interface. It also defines ``ChunkScore``, a
|
| 33 |
+
utility class for scoring chunk parsers.
|
| 34 |
+
|
| 35 |
+
RegexpChunkParser
|
| 36 |
+
=================
|
| 37 |
+
|
| 38 |
+
``RegexpChunkParser`` is an implementation of the chunk parser interface
|
| 39 |
+
that uses regular-expressions over tags to chunk a text. Its
|
| 40 |
+
``parse()`` method first constructs a ``ChunkString``, which encodes a
|
| 41 |
+
particular chunking of the input text. Initially, nothing is
|
| 42 |
+
chunked. ``parse.RegexpChunkParser`` then applies a sequence of
|
| 43 |
+
``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies
|
| 44 |
+
the chunking that it encodes. Finally, the ``ChunkString`` is
|
| 45 |
+
transformed back into a chunk structure, which is returned.
|
| 46 |
+
|
| 47 |
+
``RegexpChunkParser`` can only be used to chunk a single kind of phrase.
|
| 48 |
+
For example, you can use an ``RegexpChunkParser`` to chunk the noun
|
| 49 |
+
phrases in a text, or the verb phrases in a text; but you can not
|
| 50 |
+
use it to simultaneously chunk both noun phrases and verb phrases in
|
| 51 |
+
the same text. (This is a limitation of ``RegexpChunkParser``, not of
|
| 52 |
+
chunk parsers in general.)
|
| 53 |
+
|
| 54 |
+
RegexpChunkRules
|
| 55 |
+
----------------
|
| 56 |
+
|
| 57 |
+
A ``RegexpChunkRule`` is a transformational rule that updates the
|
| 58 |
+
chunking of a text by modifying its ``ChunkString``. Each
|
| 59 |
+
``RegexpChunkRule`` defines the ``apply()`` method, which modifies
|
| 60 |
+
the chunking encoded by a ``ChunkString``. The
|
| 61 |
+
``RegexpChunkRule`` class itself can be used to implement any
|
| 62 |
+
transformational rule based on regular expressions. There are
|
| 63 |
+
also a number of subclasses, which can be used to implement
|
| 64 |
+
simpler types of rules:
|
| 65 |
+
|
| 66 |
+
- ``ChunkRule`` chunks anything that matches a given regular
|
| 67 |
+
expression.
|
| 68 |
+
- ``StripRule`` strips anything that matches a given regular
|
| 69 |
+
expression.
|
| 70 |
+
- ``UnChunkRule`` will un-chunk any chunk that matches a given
|
| 71 |
+
regular expression.
|
| 72 |
+
- ``MergeRule`` can be used to merge two contiguous chunks.
|
| 73 |
+
- ``SplitRule`` can be used to split a single chunk into two
|
| 74 |
+
smaller chunks.
|
| 75 |
+
- ``ExpandLeftRule`` will expand a chunk to incorporate new
|
| 76 |
+
unchunked material on the left.
|
| 77 |
+
- ``ExpandRightRule`` will expand a chunk to incorporate new
|
| 78 |
+
unchunked material on the right.
|
| 79 |
+
|
| 80 |
+
Tag Patterns
|
| 81 |
+
~~~~~~~~~~~~
|
| 82 |
+
|
| 83 |
+
A ``RegexpChunkRule`` uses a modified version of regular
|
| 84 |
+
expression patterns, called "tag patterns". Tag patterns are
|
| 85 |
+
used to match sequences of tags. Examples of tag patterns are::
|
| 86 |
+
|
| 87 |
+
r'(<DT>|<JJ>|<NN>)+'
|
| 88 |
+
r'<NN>+'
|
| 89 |
+
r'<NN.*>'
|
| 90 |
+
|
| 91 |
+
The differences between regular expression patterns and tag
|
| 92 |
+
patterns are:
|
| 93 |
+
|
| 94 |
+
- In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
|
| 95 |
+
``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
|
| 96 |
+
``'<NN'`` followed by one or more repetitions of ``'>'``.
|
| 97 |
+
- Whitespace in tag patterns is ignored. So
|
| 98 |
+
``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
|
| 99 |
+
- In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
|
| 100 |
+
``'<NN.*>'`` matches any single tag starting with ``'NN'``.
|
| 101 |
+
|
| 102 |
+
The function ``tag_pattern2re_pattern`` can be used to transform
|
| 103 |
+
a tag pattern to an equivalent regular expression pattern.
|
| 104 |
+
|
| 105 |
+
Efficiency
|
| 106 |
+
----------
|
| 107 |
+
|
| 108 |
+
Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a
|
| 109 |
+
rate of about 300 tokens/second, with a moderately complex rule set.
|
| 110 |
+
|
| 111 |
+
There may be problems if ``RegexpChunkParser`` is used with more than
|
| 112 |
+
5,000 tokens at a time. In particular, evaluation of some regular
|
| 113 |
+
expressions may cause the Python regular expression engine to
|
| 114 |
+
exceed its maximum recursion depth. We have attempted to minimize
|
| 115 |
+
these problems, but it is impossible to avoid them completely. We
|
| 116 |
+
therefore recommend that you apply the chunk parser to a single
|
| 117 |
+
sentence at a time.
|
| 118 |
+
|
| 119 |
+
Emacs Tip
|
| 120 |
+
---------
|
| 121 |
+
|
| 122 |
+
If you evaluate the following elisp expression in emacs, it will
|
| 123 |
+
colorize a ``ChunkString`` when you use an interactive python shell
|
| 124 |
+
with emacs or xemacs ("C-c !")::
|
| 125 |
+
|
| 126 |
+
(let ()
|
| 127 |
+
(defconst comint-mode-font-lock-keywords
|
| 128 |
+
'(("<[^>]+>" 0 'font-lock-reference-face)
|
| 129 |
+
("[{}]" 0 'font-lock-function-name-face)))
|
| 130 |
+
(add-hook 'comint-mode-hook (lambda () (turn-on-font-lock))))
|
| 131 |
+
|
| 132 |
+
You can evaluate this code by copying it to a temporary buffer,
|
| 133 |
+
placing the cursor after the last close parenthesis, and typing
|
| 134 |
+
"``C-x C-e``". You should evaluate it before running the interactive
|
| 135 |
+
session. The change will last until you close emacs.
|
| 136 |
+
|
| 137 |
+
Unresolved Issues
|
| 138 |
+
-----------------
|
| 139 |
+
|
| 140 |
+
If we use the ``re`` module for regular expressions, Python's
|
| 141 |
+
regular expression engine generates "maximum recursion depth
|
| 142 |
+
exceeded" errors when processing very large texts, even for
|
| 143 |
+
regular expressions that should not require any recursion. We
|
| 144 |
+
therefore use the ``pre`` module instead. But note that ``pre``
|
| 145 |
+
does not include Unicode support, so this module will not work
|
| 146 |
+
with unicode strings. Note also that ``pre`` regular expressions
|
| 147 |
+
are not quite as advanced as ``re`` ones (e.g., no leftward
|
| 148 |
+
zero-length assertions).
|
| 149 |
+
|
| 150 |
+
:type CHUNK_TAG_PATTERN: regexp
|
| 151 |
+
:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag
|
| 152 |
+
pattern is valid.
|
| 153 |
+
"""
|
| 154 |
+
|
| 155 |
+
from nltk.chunk.api import ChunkParserI
|
| 156 |
+
from nltk.chunk.regexp import RegexpChunkParser, RegexpParser
|
| 157 |
+
from nltk.chunk.util import (
|
| 158 |
+
ChunkScore,
|
| 159 |
+
accuracy,
|
| 160 |
+
conllstr2tree,
|
| 161 |
+
conlltags2tree,
|
| 162 |
+
ieerstr2tree,
|
| 163 |
+
tagstr2tree,
|
| 164 |
+
tree2conllstr,
|
| 165 |
+
tree2conlltags,
|
| 166 |
+
)
|
| 167 |
+
from nltk.data import load
|
| 168 |
+
|
| 169 |
+
# Paths (within nltk_data) of the pickled named-entity chunker models.
# NOTE(review): the original comment here read "Standard treebank POS tagger",
# which appears to be a copy-paste leftover -- these constants name NE
# chunkers, not a POS tagger.
_BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle"
_MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle"
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def ne_chunk(tagged_tokens, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to
    chunk the given list of tagged tokens.

    :param tagged_tokens: the (word, tag) tokens of a single sentence
    :param binary: if True, load the binary model; otherwise the
        multiclass model
    """
    pickle_path = _BINARY_NE_CHUNKER if binary else _MULTICLASS_NE_CHUNKER
    return load(pickle_path).parse(tagged_tokens)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def ne_chunk_sents(tagged_sentences, binary=False):
    """
    Use NLTK's currently recommended named entity chunker to chunk the
    given list of tagged sentences, each consisting of a list of tagged tokens.

    :param tagged_sentences: a list of sentences, each a list of (word, tag)
        tokens
    :param binary: if True, load the binary model; otherwise the
        multiclass model
    """
    pickle_path = _BINARY_NE_CHUNKER if binary else _MULTICLASS_NE_CHUNKER
    return load(pickle_path).parse_sents(tagged_sentences)
|
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/api.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chunk parsing API
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
##//////////////////////////////////////////////////////
|
| 10 |
+
## Chunk Parser Interface
|
| 11 |
+
##//////////////////////////////////////////////////////
|
| 12 |
+
|
| 13 |
+
from nltk.chunk.util import ChunkScore
|
| 14 |
+
from nltk.internals import deprecated
|
| 15 |
+
from nltk.parse import ParserI
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """

    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        Subclasses must override this; the base implementation raises
        NotImplementedError.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        raise NotImplementedError()

    @deprecated("Use accuracy(gold) instead.")
    def evaluate(self, gold):
        # Backward-compatibility shim: delegates to accuracy().
        return self.accuracy(gold)

    def accuracy(self, gold):
        """
        Score the accuracy of the chunker against the gold standard.
        Remove the chunking of the gold standard text, rechunk it using
        the chunker, and return a ``ChunkScore`` object
        reflecting the performance of this chunk parser.

        :type gold: list(Tree)
        :param gold: The list of chunked sentences to score the chunker on.
        :rtype: ChunkScore
        """
        chunkscore = ChunkScore()
        # Re-parse each gold sentence from its leaves (the unchunked
        # (word, tag) tokens) and score the result against the original.
        for correct in gold:
            chunkscore.score(correct, self.parse(correct.leaves()))
        return chunkscore
|
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/named_entity.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chunk parsing API
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Named entity chunker
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import pickle
|
| 14 |
+
import re
|
| 15 |
+
from xml.etree import ElementTree as ET
|
| 16 |
+
|
| 17 |
+
from nltk.tag import ClassifierBasedTagger, pos_tag
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from nltk.classify import MaxentClassifier
|
| 21 |
+
except ImportError:
|
| 22 |
+
pass
|
| 23 |
+
|
| 24 |
+
from nltk.chunk.api import ChunkParserI
|
| 25 |
+
from nltk.chunk.util import ChunkScore
|
| 26 |
+
from nltk.data import find
|
| 27 |
+
from nltk.tokenize import word_tokenize
|
| 28 |
+
from nltk.tree import Tree
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class NEChunkParserTagger(ClassifierBasedTagger):
    """
    The IOB tagger used by the chunk parser.

    Combines a ``ClassifierBasedTagger`` with a maxent classifier
    (trained via the external "megam" optimizer) and a feature
    detector tuned for named-entity recognition.
    """

    def __init__(self, train):
        """
        :param train: A list of IOB-tagged training sentences.
        """
        ClassifierBasedTagger.__init__(
            self, train=train, classifier_builder=self._classifier_builder
        )

    def _classifier_builder(self, train):
        # NOTE: the "megam" algorithm requires the external megam
        # binary to be installed and discoverable by nltk.
        return MaxentClassifier.train(
            train, algorithm="megam", gaussian_prior_sigma=1, trace=2
        )

    def _english_wordlist(self):
        # Lazily build and cache the basic-English word set backing the
        # "en-wordlist" feature; the corpus import is deferred so it is
        # only paid for on first use.
        try:
            wl = self._en_wordlist
        except AttributeError:
            from nltk.corpus import words

            self._en_wordlist = set(words.words("en-basic"))
            wl = self._en_wordlist
        return wl

    def _feature_detector(self, tokens, index, history):
        """
        Build the feature dictionary for the token at ``index``.

        :param tokens: the sentence, as a list of (word, pos) pairs.
        :param index: position of the token to describe.
        :param history: IOB tags already assigned to ``tokens[:index]``.
        """
        word = tokens[index][0]
        pos = simplify_pos(tokens[index][1])
        # Context before the current token.
        if index == 0:
            prevword = prevprevword = None
            prevpos = prevprevpos = None
            prevshape = prevtag = prevprevtag = None
        elif index == 1:
            prevword = tokens[index - 1][0].lower()
            prevprevword = None
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = None
            # Fixed: previously this read history[index - 1][0], which
            # truncated the IOB tag to its first character ("B-NE" ->
            # "B"), inconsistent with the general case below.
            prevtag = history[index - 1]
            prevshape = prevprevtag = None
        else:
            prevword = tokens[index - 1][0].lower()
            prevprevword = tokens[index - 2][0].lower()
            prevpos = simplify_pos(tokens[index - 1][1])
            prevprevpos = simplify_pos(tokens[index - 2][1])
            prevtag = history[index - 1]
            prevprevtag = history[index - 2]
            prevshape = shape(prevword)
        # Context after the current token.  NOTE(review): nextpos is
        # lowercased rather than passed through simplify_pos(), unlike
        # prevpos; preserved as-is since trained models depend on it.
        if index == len(tokens) - 1:
            nextword = nextnextword = None
            nextpos = nextnextpos = None
        elif index == len(tokens) - 2:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = None
            nextnextpos = None
        else:
            nextword = tokens[index + 1][0].lower()
            nextpos = tokens[index + 1][1].lower()
            nextnextword = tokens[index + 2][0].lower()
            nextnextpos = tokens[index + 2][1].lower()

        # 89.6
        features = {
            "bias": True,
            "shape": shape(word),
            "wordlen": len(word),
            "prefix3": word[:3].lower(),
            "suffix3": word[-3:].lower(),
            "pos": pos,
            "word": word,
            "en-wordlist": (word in self._english_wordlist()),
            "prevtag": prevtag,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevword": prevword,
            "nextword": nextword,
            "word+nextpos": f"{word.lower()}+{nextpos}",
            "pos+prevtag": f"{pos}+{prevtag}",
            "shape+prevtag": f"{prevshape}+{prevtag}",
        }

        return features
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class NEChunkParser(ChunkParserI):
    """
    Expected input: list of pos-tagged words
    """

    def __init__(self, train):
        self._train(train)

    def parse(self, tokens):
        """
        Each token should be a pos-tagged word
        """
        return self._tagged_to_parse(self._tagger.tag(tokens))

    def _train(self, corpus):
        # Train the IOB tagger on the corpus, converted from chunk
        # trees to IOB-tagged sequences.
        tagged_corpus = [self._parse_to_tagged(sentence) for sentence in corpus]
        self._tagger = NEChunkParserTagger(train=tagged_corpus)

    def _tagged_to_parse(self, tagged_tokens):
        """
        Convert a list of tagged tokens to a chunk-parse tree.
        """
        sent = Tree("S", [])
        for token, iob in tagged_tokens:
            if iob == "O":
                sent.append(token)
            elif iob.startswith("B-"):
                sent.append(Tree(iob[2:], [token]))
            elif iob.startswith("I-"):
                # Attach to the open chunk of the same type if there is
                # one; otherwise treat the stray I- tag as a chunk start.
                last = sent[-1] if sent else None
                if isinstance(last, Tree) and last.label() == iob[2:]:
                    last.append(token)
                else:
                    sent.append(Tree(iob[2:], [token]))
        return sent

    @staticmethod
    def _parse_to_tagged(sent):
        """
        Convert a chunk-parse tree to a list of tagged tokens.
        """
        tagged = []
        for child in sent:
            if not isinstance(child, Tree):
                tagged.append((child, "O"))
                continue
            if len(child) == 0:
                print("Warning -- empty chunk in sentence")
                continue
            label = child.label()
            tagged.append((child[0], f"B-{label}"))
            tagged.extend((tok, f"I-{label}") for tok in child[1:])
        return tagged
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def shape(word):
    """
    Classify the orthographic "shape" of *word*.

    :return: one of ``"number"``, ``"punct"``, ``"upcase"``,
        ``"downcase"``, ``"mixedcase"`` or ``"other"``.
    """
    # Fixed: the whole alternation is now anchored.  Previously the
    # trailing "$" applied only to the second alternative, so a string
    # like "123abc" matched "[0-9]+" and was misclassified as a number.
    if re.match(r"([0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+)$", word, re.UNICODE):
        return "number"
    elif re.match(r"\W+$", word, re.UNICODE):
        return "punct"
    elif re.match(r"\w+$", word, re.UNICODE):
        if word.istitle():
            return "upcase"
        elif word.islower():
            return "downcase"
        else:
            return "mixedcase"
    else:
        return "other"
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def simplify_pos(s):
    """Collapse a POS tag: any verb tag becomes "V"; otherwise drop any hyphenated suffix."""
    return "V" if s.startswith("V") else s.split("-")[0]
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def postag_tree(tree):
    """Return a copy of *tree* whose leaves are replaced by (word, pos) pairs."""
    # POS-tag all leaves in one pass, then consume the tags in order
    # while rebuilding the tree structure.
    tag_stream = iter(pos for _, pos in pos_tag(tree.leaves()))
    tagged = Tree("S", [])
    for child in tree:
        if isinstance(child, Tree):
            subtree = Tree(child.label(), [])
            for leaf in child:
                subtree.append((leaf, next(tag_stream)))
            tagged.append(subtree)
        else:
            tagged.append((child, next(tag_stream)))
    return tagged
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def load_ace_data(roots, fmt="binary", skip_bnews=True):
    """Walk each directory in *roots* and yield parsed trees for every .sgm file found."""
    for top in roots:
        for dirpath, _dirs, filenames in os.walk(top):
            # Optionally skip the broadcast-news portion of the corpus.
            if skip_bnews and dirpath.endswith("bnews"):
                continue
            for filename in filenames:
                if filename.endswith(".sgm"):
                    yield from load_ace_file(os.path.join(dirpath, filename), fmt)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def load_ace_file(textfile, fmt):
    """
    Parse a single ACE ``.sgm`` file (plus its ``.tmx.rdc.xml``
    annotation file) and yield one chunk tree for the document.

    :param textfile: path to the ``.sgm`` source file.
    :param fmt: ``"binary"`` (every entity labeled ``NE``) or
        ``"multiclass"`` (entities keep their ACE entity type).
    :raise ValueError: if *fmt* is not a recognized format.
    """
    # Validate fmt up front instead of after all the file I/O.
    if fmt not in ("binary", "multiclass"):
        raise ValueError("bad fmt value")

    print(f" - {os.path.split(textfile)[1]}")
    annfile = textfile + ".tmx.rdc.xml"

    # Read the xml file, and get a list of entities
    entities = []
    with open(annfile) as infile:
        xml = ET.parse(infile).getroot()
    for entity in xml.findall("document/entity"):
        typ = entity.find("entity_type").text
        for mention in entity.findall("entity_mention"):
            if mention.get("TYPE") != "NAME":
                continue  # only NEs
            s = int(mention.find("head/charseq/start").text)
            e = int(mention.find("head/charseq/end").text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    with open(textfile) as infile:
        text = infile.read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub("<(?!/?TEXT)[^>]+>", "", text)

    # Blank out anything before/after <TEXT> (replacement preserves the
    # character offsets used by the annotations).
    def subfunc(m):
        return " " * (m.end() - m.start() - 6)

    text = re.sub(r"[\s\S]*<TEXT>", subfunc, text)
    text = re.sub(r"</TEXT>[\s\S]*", "", text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    # Build the chunk tree.  The two output formats differ only in the
    # label given to each entity chunk, so they share one code path
    # (previously the whole loop was duplicated per format).
    i = 0
    toks = Tree("S", [])
    for (s, e, typ) in sorted(entities):
        if s < i:
            s = i  # Overlapping!  Deal with this better?
        if e <= s:
            continue
        toks.extend(word_tokenize(text[i:s]))
        label = "NE" if fmt == "binary" else typ
        toks.append(Tree(label, text[s:e].split()))
        i = e
    toks.extend(word_tokenize(text[i:]))
    yield toks
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# This probably belongs in a more general-purpose location (as does
|
| 294 |
+
# the parse_to_tagged function).
|
| 295 |
+
def cmp_chunks(correct, guessed):
    """
    Print a side-by-side comparison of correct vs. guessed IOB tags,
    collapsing runs where both taggings agree on "O" into an ellipsis
    row after the first agreeing token.
    """
    correct = NEChunkParser._parse_to_tagged(correct)
    guessed = NEChunkParser._parse_to_tagged(guessed)
    ellipsis = False
    for (w, ct), (w, gt) in zip(correct, guessed):
        if ct == gt == "O":
            if not ellipsis:
                print(f" {ct:15} {gt:15} {w}")
                # Fixed: the original format string mixed automatic and
                # explicit field numbering ("{:15} {:15} {2}"), which
                # raises ValueError at runtime.
                print(" {:15} {:15} {}".format("...", "...", "..."))
                ellipsis = True
        else:
            ellipsis = False
            print(f" {ct:15} {gt:15} {w}")
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def build_model(fmt="binary"):
    """Train an NE chunker on the ACE data, evaluate it, pickle it under /tmp, and return it."""
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_data = [postag_tree(tree) for tree in load_ace_data(train_paths, fmt)]
    print("Training...")
    cp = NEChunkParser(train_data)
    # Free the training set before loading the evaluation data.
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_data = [postag_tree(tree) for tree in load_ace_data(eval_paths, fmt)]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for index, gold in enumerate(eval_data):
        prediction = cp.parse(gold.leaves())
        chunkscore.score(gold, prediction)
        # Show a detailed comparison for the first few sentences only.
        if index < 3:
            cmp_chunks(gold, prediction)
    print(chunkscore)

    outfilename = f"/tmp/ne_chunker_{fmt}.pickle"
    print(f"Saving chunker to {outfilename}...")

    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
if __name__ == "__main__":
    # Make sure that the pickled object has the right class name:
    # importing build_model from its canonical module path means the
    # pickled chunker's classes resolve to "nltk.chunk.named_entity"
    # rather than "__main__", so library users can unpickle it.
    from nltk.chunk.named_entity import build_model

    build_model("binary")
    build_model("multiclass")
|
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/regexp.py
ADDED
|
@@ -0,0 +1,1475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Regular Expression Chunkers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
import regex
|
| 12 |
+
|
| 13 |
+
from nltk.chunk.api import ChunkParserI
|
| 14 |
+
from nltk.tree import Tree
|
| 15 |
+
|
| 16 |
+
# //////////////////////////////////////////////////////
|
| 17 |
+
# ChunkString
|
| 18 |
+
# //////////////////////////////////////////////////////
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class ChunkString:
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text.  This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces.  An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` are created from tagged texts (i.e., lists of
    ``tokens`` whose type is ``TaggedType``).  Initially, nothing is
    chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation.  These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding.  This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces.  An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _debug: The debug level.  See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_STRIP_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in strips.
    """

    # A tag character is anything except braces and angle brackets.
    CHUNK_TAG_CHAR = r"[^\{\}<>]"
    CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR

    # Zero-width lookaheads: "inside a chunk" means a "}" comes before
    # the next "{"; "inside a strip" means a "{" (or end) comes first.
    IN_CHUNK_PATTERN = r"(?=[^\{]*\})"
    IN_STRIP_PATTERN = r"(?=[^\}]*(\{|$))"

    # These are used by _verify
    _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG
    _STRIP = r"(%s+?)+?" % CHUNK_TAG
    _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG)
    _BRACKETS = re.compile(r"[^\{\}]+")
    # After _BRACKETS strips the tag text, a legal string reduces to a
    # sequence of matched "{}" pairs.
    _BALANCED_BRACKETS = re.compile(r"(\{\})*$")

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text ``tagged_tokens``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``.  The
            valid levels are:

            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
              each transformation.
            - 3: full check on to_chunkstruct and full check after
              each transformation.

            We recommend you use at least level 1.  You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        """
        self._root_label = chunk_struct.label()
        self._pieces = chunk_struct[:]
        # Encode each piece as "<TAG>"; the string starts unchunked
        # (no braces anywhere).
        tags = [self._tag(tok) for tok in self._pieces]
        self._str = "<" + "><".join(tags) + ">"
        self._debug = debug_level

    def _tag(self, tok):
        # Return the tag used to encode a piece: the POS tag of a
        # (word, tag) pair, or the node label of a subtree.
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError("chunk structures must contain tagged " "tokens or trees")

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked.  If this is false, ``_verify`` will check to make
            sure that ``_str`` encodes a chunked version of *some*
            list of tokens.  If this is true, then ``_verify`` will
            check to make sure that the tags in ``_str`` match those in
            ``_pieces``.

        :raise ValueError: if the internal string representation of
            this ``ChunkString`` is invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError(
                "Transformation generated invalid " "chunkstring:\n  %s" % s
            )

        # Check that parens are balanced.  If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.
        brackets = ChunkString._BRACKETS.sub("", s)
        for i in range(1 + len(brackets) // 5000):
            substr = brackets[i * 5000 : i * 5000 + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    "Transformation generated invalid " "chunkstring:\n  %s" % s
                )

        if verify_tags <= 0:
            return

        # Compare the tag sequence in s against the pieces' tags; a
        # legal transformation may move braces but never change tags.
        tags1 = (re.split(r"[\{\}<>]+", s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError(
                "Transformation generated invalid " "chunkstring: tag changed"
            )

    def to_chunkstruct(self, chunk_label="CHUNK"):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Use this alternating list to create the chunkstruct.
        # Splitting on braces alternates strip / chunk / strip / ...
        pieces = []
        index = 0
        piece_in_chunk = 0
        for piece in re.split("[{}]", self._str):

            # Find the list of tokens contained in this piece.
            length = piece.count("<")
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``.  In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags.  Furthermore, this transformation may not result in
        improper bracketing.  Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced.  This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring.  Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}").  Remove them, so they don't
        # interfere with other transformations.
        s = re.sub(r"\{\}", "", s)

        # Make sure that the transformation was legal.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return "<ChunkString: %s>" % repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        # NOTE: the local name shadows the builtin str within this method.
        str = re.sub(r">(?!\})", r"> ", self._str)
        str = re.sub(r"([^\{])<", r"\1 <", str)
        if str[0] == "<":
            str = " " + str
        return str
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
# //////////////////////////////////////////////////////
|
| 254 |
+
# Chunking Rules
|
| 255 |
+
# //////////////////////////////////////////////////////
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
class RegexpChunkRule:
    """
    A rule specifying how to modify the chunking in a ``ChunkString``,
    using a transformational regular expression.  The
    ``RegexpChunkRule`` class itself can be used to implement any
    transformational rule based on regular expressions.  There are
    also a number of subclasses, which can be used to implement
    simpler types of rules, based on matching regular expressions.

    Each ``RegexpChunkRule`` has a regular expression and a
    replacement expression.  When a ``RegexpChunkRule`` is "applied"
    to a ``ChunkString``, it searches the ``ChunkString`` for any
    substring that matches the regular expression, and replaces it
    using the replacement expression.  This search/replace operation
    has the same semantics as ``re.sub``.

    Each ``RegexpChunkRule`` also has a description string, which
    gives a short (typically less than 75 characters) description of
    the purpose of the rule.

    This transformation defined by this ``RegexpChunkRule`` should
    only add and remove braces; it should *not* modify the sequence
    of angle-bracket delimited tags.  Furthermore, this transformation
    may not result in nested or mismatched bracketing.
    """

    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        :type regexp: regexp or str
        :param regexp: The regular expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any
            substring that matches ``regexp`` will be replaced using
            the replacement string ``repl``.  Note that this must be a
            normal regular expression, not a tag pattern.
        :type repl: str
        :param repl: The replacement expression for this ``RegexpChunkRule``.
            When this rule is applied to a ``ChunkString``, any substring
            that matches ``regexp`` will be replaced using ``repl``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Accept either a pre-compiled pattern object or a pattern string.
        if isinstance(regexp, str):
            regexp = re.compile(regexp)
        self._repl = repl
        self._descr = descr
        self._regexp = regexp

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given ``ChunkString``.  See the
        class reference documentation for a description of what it
        means to apply a rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunkstring to which this rule is applied.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        Return a short description of the purpose and/or effect of
        this rule.

        :rtype: str
        """
        return self._descr

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<RegexpChunkRule: "
            + repr(self._regexp.pattern)
            + "->"
            + repr(self._repl)
            + ">"
        )

    @staticmethod
    def fromstring(s):
        """
        Create a RegexpChunkRule from a string description.
        Currently, the following formats are supported::

          {regexp}         # chunk rule
          }regexp{         # strip rule
          regexp}{regexp   # split rule
          regexp{}regexp   # merge rule

        Where ``regexp`` is a regular expression for the rule.  Any
        text following the comment marker (``#``) will be used as
        the rule's description:

        >>> from nltk.chunk.regexp import RegexpChunkRule
        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
        <ChunkRule: '<DT>?<NN.*>+'>
        """
        # Split off the comment (but don't split on '\#')
        m = re.match(r"(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?", s)
        rule = m.group("rule").strip()
        comment = (m.group("comment") or "")[1:].strip()

        # Pattern bodies: chunk, strip, split, merge.  The brace layout
        # of ``rule`` selects which subclass to construct.
        try:
            if not rule:
                raise ValueError("Empty chunk pattern")
            if rule[0] == "{" and rule[-1] == "}":
                return ChunkRule(rule[1:-1], comment)
            elif rule[0] == "}" and rule[-1] == "{":
                return StripRule(rule[1:-1], comment)
            elif "}{" in rule:
                left, right = rule.split("}{")
                return SplitRule(left, right, comment)
            elif "{}" in rule:
                left, right = rule.split("{}")
                return MergeRule(left, right, comment)
            elif re.match("[^{}]*{[^{}]*}[^{}]*", rule):
                left, chunk, right = re.split("[{}]", rule)
                return ChunkRuleWithContext(left, chunk, right, comment)
            else:
                raise ValueError("Illegal chunk pattern: %s" % rule)
        # ``re.error`` covers malformed tag patterns raised while the
        # subclass compiles its regexp; both surface as one error type.
        except (ValueError, re.error) as e:
            raise ValueError("Illegal chunk pattern: %s" % rule) from e
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
class ChunkRule(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using a
    matching tag pattern.  When applied to a ``ChunkString``, it will
    find any substring that matches this tag pattern and that is not
    already part of a chunk, and create a new chunk containing that
    substring.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  Any substring
            that matches this tag pattern and that is not already
            part of a chunk will be chunked.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        chunk_body = tag_pattern2re_pattern(tag_pattern)
        # Only match material that lies outside any existing chunk.
        regexp = re.compile(
            "(?P<chunk>{}){}".format(chunk_body, ChunkString.IN_STRIP_PATTERN)
        )
        RegexpChunkRule.__init__(self, regexp, r"{\g<chunk>}", descr)

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <ChunkRule: '<IN|VB.*>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<ChunkRule: {!r}>".format(self._pattern)
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
class StripRule(RegexpChunkRule):
    """
    A rule specifying how to remove material from chunks in a
    ``ChunkString``, using a matching tag pattern.  When applied, it
    locates any substring inside a chunk that matches the tag pattern
    and strips it out of that chunk, producing two new chunks.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``StripRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  Any substring
            inside a chunk that matches this pattern is removed from
            the chunk, splitting it into two new chunks.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        strip_body = tag_pattern2re_pattern(tag_pattern)
        # Only match material that lies inside an existing chunk.
        regexp = re.compile(
            "(?P<strip>{}){}".format(strip_body, ChunkString.IN_CHUNK_PATTERN)
        )
        RegexpChunkRule.__init__(self, regexp, r"}\g<strip>{", descr)

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <StripRule: '<IN|VB.*>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<StripRule: {!r}>".format(self._pattern)
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
class UnChunkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chunks from a ``ChunkString``,
    using a matching tag pattern.  When applied, it locates any
    complete chunk whose contents match the tag pattern and
    un-chunks it.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``UnChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  Any complete
            chunk that matches this pattern is un-chunked.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        chunk_re = tag_pattern2re_pattern(tag_pattern)
        # Match a full chunk -- braces included -- and replace it with
        # just its contents, discarding the braces.
        RegexpChunkRule.__init__(
            self, re.compile(r"\{(?P<chunk>%s)\}" % chunk_re), r"\g<chunk>", descr
        )

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <UnChunkRule: '<IN|VB.*>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<UnChunkRule: {!r}>".format(self._pattern)
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern and a right pattern.
    When applied, it finds any chunk whose end matches the left
    pattern that is immediately followed by a chunk whose beginning
    matches the right pattern, and joins the two into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag pattern.  A
            chunk whose end matches this pattern, immediately followed
            by a chunk whose beginning matches ``right_tag_pattern``,
            is merged with that following chunk.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag pattern.  A
            chunk whose beginning matches this pattern, immediately
            preceded by a chunk whose end matches
            ``left_tag_pattern``, is merged with that preceding chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure each half is a coherent pattern on its own; e.g. if
        # left='(' and right=')', compiling it alone raises here rather
        # than producing a silently-wrong combined pattern.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(left_re)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The lookahead leaves the right-hand material unconsumed so it
        # can participate in subsequent matches.
        regexp = re.compile("(?P<left>%s)}{(?=%s)" % (left_re, right_re))
        RegexpChunkRule.__init__(self, regexp, r"\g<left>", descr)

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<MergeRule: {!r}, {!r}>".format(
            self._left_tag_pattern, self._right_tag_pattern
        )
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
class SplitRule(RegexpChunkRule):
    """
    A rule specifying how to split chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern and a right pattern.
    When applied, it finds any chunk containing material that matches
    the left pattern immediately followed by material matching the
    right pattern, and splits the chunk in two at the point between
    the two matches.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``SplitRule``.

        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag pattern.  Any
            chunk containing a substring that matches this pattern
            followed by ``right_tag_pattern`` is split in two at the
            point between the matches.
        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag pattern.  Any
            chunk containing a substring that matches
            ``left_tag_pattern`` followed by this pattern is split in
            two at the point between the matches.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure each half is a coherent pattern on its own; e.g. if
        # left='(' and right=')', compiling it alone raises here rather
        # than producing a silently-wrong combined pattern.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(left_re)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The lookahead leaves the right-hand material unconsumed so it
        # can participate in subsequent matches.
        regexp = re.compile("(?P<left>%s)(?=%s)" % (left_re, right_re))
        RegexpChunkRule.__init__(self, regexp, r"\g<left>}{", descr)

    def __repr__(self):
        """
        Return a string representation of this rule, e.g.::

            <SplitRule: '<NN>', '<DT>'>

        The description string is not included; it can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<SplitRule: {!r}, {!r}>".format(
            self._left_tag_pattern, self._right_tag_pattern
        )
|
| 665 |
+
|
| 666 |
+
|
| 667 |
+
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
    using two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose beginning
    matches right pattern, and immediately preceded by a strip whose
    end matches left pattern.  It will then expand the chunk to incorporate
    the new material on the left.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandLeftRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            this pattern, and immediately preceded by a strip
            whose end matches ``left_tag_pattern``.  It will
            then expand the chunk to incorporate the new material
            on the left.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            ``right_tag_pattern``, and immediately preceded by a strip
            whose end matches this pattern.  It will
            then expand the chunk to incorporate the new material on the left.

        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match strip material followed by a chunk-opening brace; the
        # replacement moves the brace to the left of that material.
        regexp = re.compile(
            r"(?P<left>%s)\{(?P<right>%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, regexp, r"{\g<left>\g<right>", descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<ExpandLeftRule: "
            + repr(self._left_tag_pattern)
            + ", "
            + repr(self._right_tag_pattern)
            + ">"
        )
|
| 735 |
+
|
| 736 |
+
|
| 737 |
+
class ExpandRightRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the
    right, using two matching tag patterns: a left pattern, and a
    right pattern.  When applied to a ``ChunkString``, it will find any
    chunk whose end matches left pattern, and immediately followed by
    a strip whose beginning matches right pattern.  It will then
    expand the chunk to incorporate the new material on the right.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandRightRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a strip
            whose beginning matches this pattern.  It will
            then expand the chunk to incorporate the new material
            on the right.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a strip
            whose beginning matches ``right_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the right.

        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Match a chunk-closing brace followed by strip material; the
        # replacement moves the brace to the right of that material.
        regexp = re.compile(
            r"(?P<left>%s)\}(?P<right>%s)"
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        RegexpChunkRule.__init__(self, regexp, r"\g<left>\g<right>}", descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            "<ExpandRightRule: "
            + repr(self._left_tag_pattern)
            + ", "
            + repr(self._right_tag_pattern)
            + ">"
        )
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using
    three matching tag patterns: one for the left context, one for the
    chunk, and one for the right context.  When applied to a
    ``ChunkString``, it will find any substring that matches the chunk
    tag pattern, is surrounded by substrings that match the two
    context patterns, and is not already part of a chunk; and create a
    new chunk containing the substring that matched the chunk tag
    pattern.

    Caveat: Both the left and right context are consumed when this
    rule matches; therefore, if you need to find overlapping matches,
    you will need to apply your rule more than once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: A tag pattern that must match
            the left context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: A tag pattern that must match for this
            rule to apply.  If the rule does apply, then this pattern
            also identifies the substring that will be made into a chunk.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: A tag pattern that must match
            the right context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
        re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_context_tag_pattern))

        self._left_context_tag_pattern = left_context_tag_pattern
        self._chunk_tag_pattern = chunk_tag_pattern
        self._right_context_tag_pattern = right_context_tag_pattern
        # The trailing IN_STRIP_PATTERN restricts matches to material
        # that is not already inside a chunk.
        regexp = re.compile(
            "(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s"
            % (
                tag_pattern2re_pattern(left_context_tag_pattern),
                tag_pattern2re_pattern(chunk_tag_pattern),
                tag_pattern2re_pattern(right_context_tag_pattern),
                ChunkString.IN_STRIP_PATTERN,
            )
        )
        # Re-emit both contexts unchanged; only the middle group gets braces.
        replacement = r"\g<left>{\g<chunk>}\g<right>"
        RegexpChunkRule.__init__(self, regexp, replacement, descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return "<ChunkRuleWithContext: {!r}, {!r}, {!r}>".format(
            self._left_context_tag_pattern,
            self._chunk_tag_pattern,
            self._right_context_tag_pattern,
        )
|
| 887 |
+
|
| 888 |
+
|
| 889 |
+
# //////////////////////////////////////////////////////
|
| 890 |
+
# Tag Pattern Format Conversion
|
| 891 |
+
# //////////////////////////////////////////////////////
|
| 892 |
+
|
| 893 |
+
# Validates a whole tag pattern: zero or more runs of either bare
# regex characters (with counted repetitions such as ``{3}``/``{2,5}``
# allowed) or a single angle-bracketed tag expression.
# this should probably be made more strict than it is -- e.g., it
# currently accepts 'foo'.
CHUNK_TAG_PATTERN = re.compile(
    r"^(({}|<{}>)*)$".format(r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+")
)
|
| 898 |
+
|
| 899 |
+
|
| 900 |
+
def tag_pattern2re_pattern(tag_pattern):
    """
    Convert a tag pattern to a regular expression pattern.  A "tag
    pattern" is a modified version of a regular expression, designed
    for matching sequences of tags.  The differences between regular
    expression patterns and tag patterns are:

    - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
      ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
      ``'<NN'`` followed by one or more repetitions of ``'>'``.
    - Whitespace in tag patterns is ignored.  So
      ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
    - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
      ``'<NN.*>'`` matches any single tag starting with ``'NN'``.

    In particular, ``tag_pattern2re_pattern`` performs the following
    transformations on the given pattern:

    - Replace '.' with '[^<>{}]'
    - Remove any whitespace
    - Add extra parens around '<' and '>', to make '<' and '>' act
      like parentheses.  E.g., so that in '<NN>+', the '+' has scope
      over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
      scope over 'NN' and 'IN', but not '<' or '>'.
    - Check to make sure the resulting pattern is valid.

    :type tag_pattern: str
    :param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern.
        In particular, ``tag_pattern`` should not include braces; and it
        should not contain nested or mismatched angle-brackets.
    :rtype: str
    :return: A regular expression pattern corresponding to
        ``tag_pattern``.
    """
    # Clean up the regular expression
    tag_pattern = re.sub(r"\s", "", tag_pattern)
    # Turn each angle bracket into a bracket-plus-group so that
    # quantifiers and alternation scope over whole tags.
    tag_pattern = re.sub(r"<", "(<(", tag_pattern)
    tag_pattern = re.sub(r">", ")>)", tag_pattern)

    # Check the regular expression
    if not CHUNK_TAG_PATTERN.match(tag_pattern):
        raise ValueError("Bad tag pattern: %r" % tag_pattern)

    # Replace "." with CHUNK_TAG_CHAR.
    # We have to do this after, since it adds {}[]<>s, which would
    # confuse CHUNK_TAG_PATTERN.
    # PRE doesn't have lookback assertions, so reverse twice, and do
    # the pattern backwards (with lookahead assertions).  This can be
    # made much cleaner once we can switch back to SRE.
    # NOTE(review): this comment is stale -- the modern ``re`` module
    # does support lookbehind, so the reverse-twice trick could likely
    # be replaced with a lookbehind; confirm before changing.
    def reverse_str(str):
        # (The parameter name shadows the builtin ``str``.)
        lst = list(str)
        lst.reverse()
        return "".join(lst)

    tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR)
    reversed = reverse_str(tag_pattern)
    # In the reversed string, replace any '.' that is NOT escaped in
    # the original (i.e. not preceded there by an odd run of
    # backslashes) with the reversed CHUNK_TAG_CHAR class.
    reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed)
    tag_pattern = reverse_str(reversed)

    return tag_pattern
|
| 962 |
+
|
| 963 |
+
|
| 964 |
+
# //////////////////////////////////////////////////////
|
| 965 |
+
# RegexpChunkParser
|
| 966 |
+
# //////////////////////////////////////////////////////
|
| 967 |
+
|
| 968 |
+
|
| 969 |
+
class RegexpChunkParser(ChunkParserI):
    """
    A regular expression based chunk parser.  ``RegexpChunkParser`` uses a
    sequence of "rules" to find chunks of a single type within a
    text.  The chunking of the text is encoded using a ``ChunkString``,
    and each rule acts by modifying the chunking in the
    ``ChunkString``.  The rules are all implemented using regular
    expression matching and substitution.

    The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``,
    ``StripRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``)
    define the rules that are used by ``RegexpChunkParser``.  Each rule
    defines an ``apply()`` method, which modifies the chunking encoded
    by a given ``ChunkString``.

    :type _rules: list(RegexpChunkRule)
    :ivar _rules: The list of rules that should be applied to a text.
    :type _trace: int
    :ivar _trace: The default level of tracing.

    """

    def __init__(self, rules, chunk_label="NP", root_label="S", trace=0):
        """
        Construct a new ``RegexpChunkParser``.

        :type rules: list(RegexpChunkRule)
        :param rules: The sequence of rules that should be used to
            generate the chunking for a tagged text.
        :type chunk_label: str
        :param chunk_label: The node value that should be used for
            chunk subtrees.  This is typically a short string
            describing the type of information contained by the chunk,
            such as ``"NP"`` for base noun phrases.
        :type root_label: str
        :param root_label: The node value that should be used for the
            top node of the chunk structure.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        """
        self._rules = rules
        self._trace = trace
        self._chunk_label = chunk_label
        self._root_label = root_label

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in
        turn.  Generate trace output between each rule.  If ``verbose``
        is true, then generate verbose output.

        :type chunkstr: ChunkString
        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type verbose: bool
        :param verbose: Whether output should be verbose.
        :rtype: None
        """
        print("# Input:")
        print(chunkstr)
        for rule in self._rules:
            rule.apply(chunkstr)
            if verbose:
                print("#", rule.descr() + " (" + repr(rule) + "):")
            else:
                print("#", rule.descr() + ":")
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in
        turn.

        :param chunkstr: The chunk string to which each rule should be
            applied.
        :type chunkstr: ChunkString
        :rtype: None
        """
        for rule in self._rules:
            rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :rtype: Tree
        :return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  The set of chunks
            identified in the chunk structure depends on the rules
            used to define this ``RegexpChunkParser``.
        """
        if len(chunk_struct) == 0:
            print("Warning: parsing empty text")
            return Tree(self._root_label, [])

        # Wrap a bare token sequence in a root Tree if necessary.
        try:
            chunk_struct.label()
        except AttributeError:
            chunk_struct = Tree(self._root_label, chunk_struct)

        # Use the default trace value?
        if trace is None:
            trace = self._trace

        chunkstr = ChunkString(chunk_struct)

        # Apply the sequence of rules to the chunkstring.
        if trace:
            verbose = trace > 1
            self._trace_apply(chunkstr, verbose)
        else:
            self._notrace_apply(chunkstr)

        # Use the chunkstring to create a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_label)

    def rules(self):
        """
        :return: the sequence of rules used by ``RegexpChunkParser``.
        :rtype: list(RegexpChunkRule)
        """
        return self._rules

    def __repr__(self):
        """
        :return: a concise string representation of this
            ``RegexpChunkParser``.
        :rtype: str
        """
        return "<RegexpChunkParser with %d rules>" % len(self._rules)

    def __str__(self):
        """
        :return: a verbose string representation of this ``RegexpChunkParser``.
        :rtype: str
        """
        s = "RegexpChunkParser with %d rules:\n" % len(self._rules)
        margin = 0
        for rule in self._rules:
            margin = max(margin, len(rule.descr()))
        # NOTE: renamed from `format` to `fmt` to avoid shadowing the builtin.
        if margin < 35:
            fmt = "    %" + repr(-(margin + 3)) + "s%s\n"
        else:
            fmt = "    %s\n      %s\n"
        for rule in self._rules:
            s += fmt % (rule.descr(), repr(rule))
        return s[:-1]
|
| 1128 |
+
|
| 1129 |
+
|
| 1130 |
+
# //////////////////////////////////////////////////////
|
| 1131 |
+
# Chunk Grammar
|
| 1132 |
+
# //////////////////////////////////////////////////////
|
| 1133 |
+
|
| 1134 |
+
|
| 1135 |
+
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

        NP:
          {<DT|JJ>}          # chunk determiners and adjectives
          }<[\.VI].*>+{      # strip any tag beginning with V, I, or .
          <.*>}{<DT>         # split a chunk at a determiner
          <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                             # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _start: str
    :ivar _start: The start symbol of the grammar (the root node of
        resulting trees)
    :type _stages: int
    :ivar _stages: The list of parsing stages corresponding to the grammar

    """

    def __init__(self, grammar, root_label="S", loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, str):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = (
                "Expected string or list of RegexpChunkParsers " "for the grammar."
            )
            try:
                grammar = list(grammar)
            except BaseException as e:
                raise TypeError(type_err) from e
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.
        """
        rules = []
        lhs = None
        pattern = regex.compile("(?P<nonterminal>(\\.|[^:])*)(:(?P<rule>.*))")
        for line in grammar.split("\n"):
            line = line.strip()

            # New stage begins if there's an unescaped ':'
            m = pattern.match(line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group("nonterminal").strip()
                rules = []
                line = m.group("rule").strip()

            # Skip blank & comment-only lines
            if line == "" or line.startswith("#"):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.
        """
        if rules != []:
            if not lhs:
                raise ValueError("Expected stage marker (eg NP:)")
            parser = RegexpChunkParser(
                rules, chunk_label=lhs, root_label=root_label, trace=trace
            )
            self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
            (this tree is modified, and is also returned)
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output.
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        for i in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``.
        :rtype: str
        """
        # NOTE: removed an unused `margin = 0` local that had been copied
        # from RegexpChunkParser.__str__ but never read here.
        s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
        for parser in self._stages:
            s += "%s\n" % parser
        return s[:-1]
|
| 1300 |
+
|
| 1301 |
+
|
| 1302 |
+
# //////////////////////////////////////////////////////
|
| 1303 |
+
# Demonstration code
|
| 1304 |
+
# //////////////////////////////////////////////////////
|
| 1305 |
+
|
| 1306 |
+
|
| 1307 |
+
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    ``ChunkScore``.  This function assumes that ``text`` contains one
    sentence per line, and that each sentence has the form expected by
    ``tree.chunk``.  It runs the given chunk parser on each sentence in
    the text, and scores the result.  It prints the final score
    (precision, recall, and f-measure); and reports the set of chunks
    that were missed and the set of chunks that were incorrect.  (At
    most 10 missing chunks and 10 incorrect chunks are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split("\n"):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree("S", tokens), trace=1)
        chunkscore.score(gold, test)
        print()

    print("/" + ("=" * 75) + "\\")
    print("Scoring", chunkparser)
    print("-" * 77)
    print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ")
    print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ")
    print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100))

    # Missed chunks.
    # NOTE: loop variable renamed from `chunk` to `chk` so it no longer
    # shadows the `chunk` module imported at the top of this function.
    if chunkscore.missed():
        print("Missed:")
        missed = chunkscore.missed()
        for chk in missed[:10]:
            print("  ", " ".join(map(str, chk)))
        if len(chunkscore.missed()) > 10:
            print("  ...")

    # Incorrect chunks.
    if chunkscore.incorrect():
        print("Incorrect:")
        incorrect = chunkscore.incorrect()
        for chk in incorrect[:10]:
            print("  ", " ".join(map(str, chk)))
        if len(chunkscore.incorrect()) > 10:
            print("  ...")

    print("\\" + ("=" * 75) + "/")
    print()
|
| 1368 |
+
|
| 1369 |
+
|
| 1370 |
+
def demo():
    """
    A demonstration for the ``RegexpChunkParser`` class.  A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """

    from nltk import Tree, chunk

    # Three bracket-annotated sentences used as the evaluation corpus.
    text = """\
[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
[ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
[ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
"""

    print("*" * 75)
    print("Evaluation text:")
    print(text)
    print("*" * 75)
    print()

    # Strategy 1: chunk by building NPs directly.
    grammar = r"""
    NP:                   # NP stage
      {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
      {<NNP>+}            # chunk proper nouns
    """
    parser = chunk.RegexpParser(grammar)
    demo_eval(parser, text)

    # Strategy 2: chunk everything, then strip and merge.
    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    parser = chunk.RegexpParser(grammar)
    demo_eval(parser, text)

    # Strategy 3: a two-stage cascade with NP and VP clauses.
    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}    # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}       # VP = verb words
    """
    parser = chunk.RegexpParser(grammar)
    demo_eval(parser, text)

    # Strategy 4: a three-stage cascade (NP, PP, VP).
    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # strip any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    parser = chunk.RegexpParser(grammar)
    demo_eval(parser, text)

    # Evaluation

    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")

    parser = chunk.RegexpParser("")
    print(chunk.accuracy(parser, conll2000.chunked_sents("test.txt", chunk_types=("NP",))))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")

    grammar = r"""
    NP:
      {<.*>}              # start by chunking each tag
      }<[\.VI].*>+{       # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>     # merge det/adj with nouns
    """
    parser = chunk.RegexpParser(grammar)
    print(chunk.accuracy(parser, conll2000.chunked_sents("test.txt")[:5]))

    print()
    print("Demonstration of tagged token input")

    grammar = r"""
    NP: {<.*>*}             # start by chunking everything
        }<[\.VI].*>+{       # strip any verbs, prepositions or periods
        <.*>}{<DT>          # separate on determiners
    PP: {<IN><NP>}          # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}    # VP = verb words + NPs and PPs
    """
    parser = chunk.RegexpParser(grammar)
    print(
        parser.parse(
            [
                ("the", "DT"),
                ("little", "JJ"),
                ("cat", "NN"),
                ("sat", "VBD"),
                ("on", "IN"),
                ("the", "DT"),
                ("mat", "NN"),
                (".", "."),
            ]
        )
    )


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/chunk/util.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chunk format conversions
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
|
| 11 |
+
from nltk.metrics import accuracy as _accuracy
|
| 12 |
+
from nltk.tag.mapping import map_tag
|
| 13 |
+
from nltk.tag.util import str2tuple
|
| 14 |
+
from nltk.tree import Tree
|
| 15 |
+
|
| 16 |
+
##//////////////////////////////////////////////////////
|
| 17 |
+
## EVALUATION
|
| 18 |
+
##//////////////////////////////////////////////////////
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    # Collect CoNLL (IOB) tag sequences for the gold trees and for the
    # chunker's re-parse of the flattened (chunk-stripped) gold trees.
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

    # NOTE: removed stale commented-out debug prints of the tag lists.
    return _accuracy(gold_tags, test_tags)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
|
| 47 |
+
# -- statistics are evaluated only on demand, instead of at every sentence evaluation
|
| 48 |
+
#
|
| 49 |
+
# SB: use nltk.metrics for precision/recall scoring?
|
| 50 |
+
#
|
| 51 |
+
class ChunkScore:
|
| 52 |
+
"""
|
| 53 |
+
A utility class for scoring chunk parsers. ``ChunkScore`` can
|
| 54 |
+
evaluate a chunk parser's output, based on a number of statistics
|
| 55 |
+
(precision, recall, f-measure, misssed chunks, incorrect chunks).
|
| 56 |
+
It can also combine the scores from the parsing of multiple texts;
|
| 57 |
+
this makes it significantly easier to evaluate a chunk parser that
|
| 58 |
+
operates one sentence at a time.
|
| 59 |
+
|
| 60 |
+
Texts are evaluated with the ``score`` method. The results of
|
| 61 |
+
evaluation can be accessed via a number of accessor methods, such
|
| 62 |
+
as ``precision`` and ``f_measure``. A typical use of the
|
| 63 |
+
``ChunkScore`` class is::
|
| 64 |
+
|
| 65 |
+
>>> chunkscore = ChunkScore() # doctest: +SKIP
|
| 66 |
+
>>> for correct in correct_sentences: # doctest: +SKIP
|
| 67 |
+
... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP
|
| 68 |
+
... chunkscore.score(correct, guess) # doctest: +SKIP
|
| 69 |
+
>>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP
|
| 70 |
+
F Measure: 0.823
|
| 71 |
+
|
| 72 |
+
:ivar kwargs: Keyword arguments:
|
| 73 |
+
|
| 74 |
+
- max_tp_examples: The maximum number actual examples of true
|
| 75 |
+
positives to record. This affects the ``correct`` member
|
| 76 |
+
function: ``correct`` will not return more than this number
|
| 77 |
+
of true positive examples. This does *not* affect any of
|
| 78 |
+
the numerical metrics (precision, recall, or f-measure)
|
| 79 |
+
|
| 80 |
+
- max_fp_examples: The maximum number actual examples of false
|
| 81 |
+
positives to record. This affects the ``incorrect`` member
|
| 82 |
+
function and the ``guessed`` member function: ``incorrect``
|
| 83 |
+
will not return more than this number of examples, and
|
| 84 |
+
``guessed`` will not return more than this number of true
|
| 85 |
+
positive examples. This does *not* affect any of the
|
| 86 |
+
numerical metrics (precision, recall, or f-measure)
|
| 87 |
+
|
| 88 |
+
- max_fn_examples: The maximum number actual examples of false
|
| 89 |
+
negatives to record. This affects the ``missed`` member
|
| 90 |
+
function and the ``correct`` member function: ``missed``
|
| 91 |
+
will not return more than this number of examples, and
|
| 92 |
+
``correct`` will not return more than this number of true
|
| 93 |
+
negative examples. This does *not* affect any of the
|
| 94 |
+
numerical metrics (precision, recall, or f-measure)
|
| 95 |
+
|
| 96 |
+
- chunk_label: A regular expression indicating which chunks
|
| 97 |
+
should be compared. Defaults to ``'.*'`` (i.e., all chunks).
|
| 98 |
+
|
| 99 |
+
:type _tp: list(Token)
|
| 100 |
+
:ivar _tp: List of true positives
|
| 101 |
+
:type _fp: list(Token)
|
| 102 |
+
:ivar _fp: List of false positives
|
| 103 |
+
:type _fn: list(Token)
|
| 104 |
+
:ivar _fn: List of false negatives
|
| 105 |
+
|
| 106 |
+
:type _tp_num: int
|
| 107 |
+
:ivar _tp_num: Number of true positives
|
| 108 |
+
:type _fp_num: int
|
| 109 |
+
:ivar _fp_num: Number of false positives
|
| 110 |
+
:type _fn_num: int
|
| 111 |
+
:ivar _fn_num: Number of false negatives.
|
| 112 |
+
"""
|
| 113 |
+
|
| 114 |
+
def __init__(self, **kwargs):
|
| 115 |
+
self._correct = set()
|
| 116 |
+
self._guessed = set()
|
| 117 |
+
self._tp = set()
|
| 118 |
+
self._fp = set()
|
| 119 |
+
self._fn = set()
|
| 120 |
+
self._max_tp = kwargs.get("max_tp_examples", 100)
|
| 121 |
+
self._max_fp = kwargs.get("max_fp_examples", 100)
|
| 122 |
+
self._max_fn = kwargs.get("max_fn_examples", 100)
|
| 123 |
+
self._chunk_label = kwargs.get("chunk_label", ".*")
|
| 124 |
+
self._tp_num = 0
|
| 125 |
+
self._fp_num = 0
|
| 126 |
+
self._fn_num = 0
|
| 127 |
+
self._count = 0
|
| 128 |
+
self._tags_correct = 0.0
|
| 129 |
+
self._tags_total = 0.0
|
| 130 |
+
|
| 131 |
+
self._measuresNeedUpdate = False
|
| 132 |
+
|
| 133 |
+
def _updateMeasures(self):
|
| 134 |
+
if self._measuresNeedUpdate:
|
| 135 |
+
self._tp = self._guessed & self._correct
|
| 136 |
+
self._fn = self._correct - self._guessed
|
| 137 |
+
self._fp = self._guessed - self._correct
|
| 138 |
+
self._tp_num = len(self._tp)
|
| 139 |
+
self._fp_num = len(self._fp)
|
| 140 |
+
self._fn_num = len(self._fn)
|
| 141 |
+
self._measuresNeedUpdate = False
|
| 142 |
+
|
| 143 |
+
def score(self, correct, guessed):
|
| 144 |
+
"""
|
| 145 |
+
Given a correctly chunked sentence, score another chunked
|
| 146 |
+
version of the same sentence.
|
| 147 |
+
|
| 148 |
+
:type correct: chunk structure
|
| 149 |
+
:param correct: The known-correct ("gold standard") chunked
|
| 150 |
+
sentence.
|
| 151 |
+
:type guessed: chunk structure
|
| 152 |
+
:param guessed: The chunked sentence to be scored.
|
| 153 |
+
"""
|
| 154 |
+
self._correct |= _chunksets(correct, self._count, self._chunk_label)
|
| 155 |
+
self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
|
| 156 |
+
self._count += 1
|
| 157 |
+
self._measuresNeedUpdate = True
|
| 158 |
+
# Keep track of per-tag accuracy (if possible)
|
| 159 |
+
try:
|
| 160 |
+
correct_tags = tree2conlltags(correct)
|
| 161 |
+
guessed_tags = tree2conlltags(guessed)
|
| 162 |
+
except ValueError:
|
| 163 |
+
# This exception case is for nested chunk structures,
|
| 164 |
+
# where tree2conlltags will fail with a ValueError: "Tree
|
| 165 |
+
# is too deeply nested to be printed in CoNLL format."
|
| 166 |
+
correct_tags = guessed_tags = ()
|
| 167 |
+
self._tags_total += len(correct_tags)
|
| 168 |
+
self._tags_correct += sum(
|
| 169 |
+
1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
def accuracy(self):
|
| 173 |
+
"""
|
| 174 |
+
Return the overall tag-based accuracy for all text that have
|
| 175 |
+
been scored by this ``ChunkScore``, using the IOB (conll2000)
|
| 176 |
+
tag encoding.
|
| 177 |
+
|
| 178 |
+
:rtype: float
|
| 179 |
+
"""
|
| 180 |
+
if self._tags_total == 0:
|
| 181 |
+
return 1
|
| 182 |
+
return self._tags_correct / self._tags_total
|
| 183 |
+
|
| 184 |
+
def precision(self):
|
| 185 |
+
"""
|
| 186 |
+
Return the overall precision for all texts that have been
|
| 187 |
+
scored by this ``ChunkScore``.
|
| 188 |
+
|
| 189 |
+
:rtype: float
|
| 190 |
+
"""
|
| 191 |
+
self._updateMeasures()
|
| 192 |
+
div = self._tp_num + self._fp_num
|
| 193 |
+
if div == 0:
|
| 194 |
+
return 0
|
| 195 |
+
else:
|
| 196 |
+
return self._tp_num / div
|
| 197 |
+
|
| 198 |
+
def recall(self):
    """
    Return the overall recall (true positives over all gold-standard
    chunks) for all texts that have been scored by this ``ChunkScore``.

    :rtype: float
    """
    self._updateMeasures()
    gold_total = self._tp_num + self._fn_num
    # No gold chunks at all: recall is defined as 0 here.
    return self._tp_num / gold_total if gold_total else 0
|
| 211 |
+
|
| 212 |
+
def f_measure(self, alpha=0.5):
    """
    Return the overall F measure (weighted harmonic mean of precision
    and recall) for all texts that have been scored by this
    ``ChunkScore``.

    :param alpha: the relative weighting of precision and recall.
        Larger alpha biases the score towards the precision value,
        while smaller alpha biases the score towards the recall
        value.  ``alpha`` should have a value in the range [0,1].
    :type alpha: float
    :rtype: float
    """
    self._updateMeasures()
    p = self.precision()
    r = self.recall()
    # NOTE(review): when either measure is 0 this returns 0 even for
    # alpha exactly 0 or 1, where the other measure alone would define
    # F -- preserved as-is (the original flagged the same question).
    if not p or not r:
        return 0
    return 1 / (alpha / p + (1 - alpha) / r)
|
| 230 |
+
|
| 231 |
+
def missed(self):
    """
    Return the chunks which were included in the correct chunk
    structures, but not in the guessed chunk structures, listed in
    input order.

    :rtype: list of chunks
    """
    self._updateMeasures()
    # Entries are ((sentence, position), chunk); keep only the chunk.
    return [chunk for _, chunk in list(self._fn)]
|
| 242 |
+
|
| 243 |
+
def incorrect(self):
    """
    Return the chunks which were included in the guessed chunk
    structures, but not in the correct chunk structures, listed in
    input order.

    :rtype: list of chunks
    """
    self._updateMeasures()
    # Entries are ((sentence, position), chunk); keep only the chunk.
    return [chunk for _, chunk in list(self._fp)]
|
| 253 |
+
|
| 254 |
+
def correct(self):
    """
    Return the chunks which were included in the correct chunk
    structures, listed in input order.

    :rtype: list of chunks
    """
    # Entries are ((sentence, position), chunk); keep only the chunk.
    return [chunk for _, chunk in list(self._correct)]
|
| 263 |
+
|
| 264 |
+
def guessed(self):
    """
    Return the chunks which were included in the guessed chunk
    structures, listed in input order.

    :rtype: list of chunks
    """
    # Entries are ((sentence, position), chunk); keep only the chunk.
    return [chunk for _, chunk in list(self._guessed)]
|
| 273 |
+
|
| 274 |
+
def __len__(self):
    """
    Return the number of chunks in the gold standard seen so far
    (true positives plus false negatives).
    """
    self._updateMeasures()
    return self._tp_num + self._fn_num
|
| 277 |
+
|
| 278 |
+
def __repr__(self):
    """
    Return a concise representation of this ``ChunkScoring``.

    :rtype: str
    """
    return f"<ChunkScoring of {len(self)!r} chunks>"
|
| 285 |
+
|
| 286 |
+
def __str__(self):
    """
    Return a verbose representation of this ``ChunkScoring``.
    This representation includes the precision, recall, and
    f-measure scores.  For other information about the score,
    use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

    :rtype: str
    """
    # BUG FIX: the %-format -> f-string conversion kept "%%", but in
    # an f-string "%%" is a literal double percent sign (the escape
    # only exists in %-formatting), so scores printed as e.g.
    # "45.1%%".  Use a single "%".
    return (
        "ChunkParse score:\n"
        + f"    IOB Accuracy: {self.accuracy() * 100:5.1f}%\n"
        + f"    Precision:    {self.precision() * 100:5.1f}%\n"
        + f"    Recall:       {self.recall() * 100:5.1f}%\n"
        + f"    F-Measure:    {self.f_measure() * 100:5.1f}%"
    )
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# extract chunks, and assign unique id, the absolute position of
|
| 305 |
+
# the first word of the chunk
|
| 306 |
+
def _chunksets(t, count, chunk_label):
    """
    Extract the chunks of ``t`` whose labels match ``chunk_label``,
    and return them as a set.  Each chunk is paired with a unique id:
    ``(count, pos)``, where ``pos`` is the absolute position of the
    chunk's first word within the sentence.
    """
    found = set()
    pos = 0
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_label, child.label()):
                # freeze() makes the subtree hashable for set membership.
                found.add(((count, pos), child.freeze()))
            pos += len(child.leaves())
        else:
            # A bare leaf token occupies one word position.
            pos += 1
    return found
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into chunks and
    unchunked tokens, and produce a Tree.  Chunks are marked by
    square brackets (``[...]``).  Words are delimited by whitespace,
    and each word should have the form ``text/tag``.  Words that do
    not contain a slash are assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The word/tag separator; if None, tokens are kept as
        plain strings instead of being split into (word, tag) pairs.
    :param source_tagset: Together with ``target_tagset``, requests a
        tagset mapping for each tag (via ``map_tag``).
    :param target_tagset: See ``source_tagset``.
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    root = Tree(root_label, [])
    stack = [root]
    for match in WORD_OR_BRACKET.finditer(s):
        token = match.group()
        if token.startswith("["):
            # Chunks may not nest: "[" is only legal at the top level.
            if len(stack) != 1:
                raise ValueError(f"Unexpected [ at char {match.start():d}")
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif token.startswith("]"):
            if len(stack) != 2:
                raise ValueError(f"Unexpected ] at char {match.start():d}")
            stack.pop()
        elif sep is None:
            stack[-1].append(token)
        else:
            word, tag = str2tuple(token, sep)
            if source_tagset and target_tagset:
                tag = map_tag(source_tagset, target_tagset, tag)
            stack[-1].append((word, tag))

    # An unclosed chunk at end of input is an error.
    if len(stack) != 1:
        raise ValueError(f"Expected ] at char {len(s):d}")
    return root
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
### CONLL
|
| 369 |
+
|
| 370 |
+
_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?")
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"):
    """
    Return a chunk structure for a single sentence encoded in the
    given CONLL 2000 style string.  This function converts a CoNLL
    IOB string into a tree.  It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; other chunk
        types are treated as unchunked ("O") material.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split("\n")):
        if not line.strip():
            continue

        # Each line is "word tag IOBstate[-chunktype]".
        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError(f"Error on line {lineno:d}")
        word, tag, state, chunk_type = parsed.groups()

        # Uninteresting chunk types are demoted to "outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = "O"

        # Close the open chunk (if any) on "B"/"O", and also on an "I"
        # whose type disagrees with the chunk currently being built.
        mismatched_inside = state == "I" and chunk_type != stack[-1].label()
        if state in "BO" or mismatched_inside:
            if len(stack) == 2:
                stack.pop()

        # Open a fresh chunk on "B" (or on a mismatched "I").
        if state == "B" or mismatched_inside:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Attach the token to whatever is currently open.
        stack[-1].append((word, tag))

    return stack[0]
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format: top-level leaves get
    the IOB tag "O"; the tokens of each chunk get "B-<label>" for the
    first token and "I-<label>" for the rest.

    :param t: The tree to be converted.
    :type t: Tree
    :raises ValueError: if a chunk contains a nested subtree (CoNLL
        format cannot represent nesting).
    :rtype: list(tuple)
    """

    rows = []
    for child in t:
        try:
            label = child.label()
        except AttributeError:
            # A bare (word, tag) leaf outside any chunk.
            rows.append((child[0], child[1], "O"))
            continue
        prefix = "B-"
        for token in child:
            if isinstance(token, Tree):
                raise ValueError(
                    "Tree is too deeply nested to be printed in CoNLL format"
                )
            rows.append((token[0], token[1], prefix + label))
            prefix = "I-"
    return rows
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def conlltags2tree(
    sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: a sequence of ``(word, postag, chunktag)`` triples.
    :param chunk_types: accepted for API symmetry with
        ``conllstr2tree`` (NOTE(review): not consulted here).
    :param root_label: label for the root node of the result.
    :param strict: if true, raise ValueError on ill-formed tag
        sequences instead of silently repairing them.
    :rtype: Tree
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Repair: a missing chunk tag is treated as "O".
            tree.append((word, postag))
        elif chunktag.startswith("B-"):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith("I-"):
            label = chunktag[2:]
            continues_open_chunk = (
                len(tree) > 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == label
            )
            if continues_open_chunk:
                tree[-1].append((word, postag))
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Repair: a stray "I-" starts a new chunk, as if "B-".
                tree.append(Tree(label, [(word, postag)]))
        elif chunktag == "O":
            tree.append((word, postag))
        else:
            raise ValueError(f"Bad conll tag {chunktag!r}")
    return tree
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag
    and IOB tag; i.e. convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return "\n".join(" ".join(triple) for triple in tree2conlltags(t))
|
| 500 |
+
|
| 501 |
+
|
| 502 |
+
### IEER
|
| 503 |
+
|
| 504 |
+
_IEER_DOC_RE = re.compile(
|
| 505 |
+
r"<DOC>\s*"
|
| 506 |
+
r"(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?"
|
| 507 |
+
r"(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?"
|
| 508 |
+
r"(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?"
|
| 509 |
+
r"<BODY>\s*"
|
| 510 |
+
r"(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?"
|
| 511 |
+
r"<TEXT>(?P<text>.*?)</TEXT>\s*"
|
| 512 |
+
r"</BODY>\s*</DOC>\s*",
|
| 513 |
+
re.DOTALL,
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
def _ieer_read_text(s, root_label):
    """
    Parse the body of an IEER document into a chunk tree rooted at
    ``root_label``.  Named entities are delimited by ``<b_TYPE ...>``
    and ``<e_TYPE>`` pseudo-tags.  Returns the empty list (rather than
    a Tree) when ``s`` is None, e.g. for a document with no headline.
    """
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text; return the
    # empty list in place of a Tree.
    if s is None:
        return []
    for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s):
        piece = piece_m.group()
        try:
            if piece.startswith("<b_"):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    # NOTE(review): execution falls through and crashes
                    # on m.group() below -- preserved as-is.
                    print("XXXX", piece)
                chunk = Tree(m.group("type"), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith("<e_"):
                stack.pop()
            else:
                # Any other token is attached to the open chunk/root.
                stack[-1].append(piece)
        except (IndexError, ValueError) as e:
            raise ValueError(
                f"Bad IEER string (error at character {piece_m.start():d})"
            ) from e
    if len(stack) != 1:
        raise ValueError("Bad IEER string")
    return stack[0]
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
def ieerstr2tree(
    s,
    chunk_types=[
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that
    is encoded in the given IEER style string.  Chunks are of several
    types: LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    If ``s`` contains a complete ``<DOC>`` element, a dict of parsed
    document fields is returned instead of a bare Tree.

    NOTE(review): ``chunk_types`` is accepted for API consistency but
    never consulted here; the mutable default list is never mutated.

    :rtype: Tree
    """

    # Try looking for a single document.  If that doesn't work, then
    # just treat everything as if it was within the <TEXT>...</TEXT>.
    doc = _IEER_DOC_RE.match(s)
    if doc is None:
        return _ieer_read_text(s, root_label)
    return {
        "text": _ieer_read_text(doc.group("text"), root_label),
        "docno": doc.group("docno"),
        "doctype": doc.group("doctype"),
        "date_time": doc.group("date_time"),
        # The headline is parsed too, so its named entities are kept.
        "headline": _ieer_read_text(doc.group("headline"), root_label),
    }
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
def demo():
    """Demonstrate bracket-string and CoNLL-string tree conversion."""
    import nltk

    bracketted = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    tree = nltk.chunk.tagstr2tree(bracketted, chunk_label="NP")
    tree.pprint()
    print()

    conll_sample = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""

    conll_tree = conllstr2tree(conll_sample, chunk_types=("NP", "PP"))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
if __name__ == "__main__":
|
| 643 |
+
demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/__init__.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Classifiers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Classes and interfaces for labeling tokens with category labels (or
|
| 10 |
+
"class labels"). Typically, labels are represented with strings
|
| 11 |
+
(such as ``'health'`` or ``'sports'``). Classifiers can be used to
|
| 12 |
+
perform a wide range of classification tasks. For example,
|
| 13 |
+
classifiers can be used...
|
| 14 |
+
|
| 15 |
+
- to classify documents by topic
|
| 16 |
+
- to classify ambiguous words by which word sense is intended
|
| 17 |
+
- to classify acoustic signals by which phoneme they represent
|
| 18 |
+
- to classify sentences by their author
|
| 19 |
+
|
| 20 |
+
Features
|
| 21 |
+
========
|
| 22 |
+
In order to decide which category label is appropriate for a given
|
| 23 |
+
token, classifiers examine one or more 'features' of the token. These
|
| 24 |
+
"features" are typically chosen by hand, and indicate which aspects
|
| 25 |
+
of the token are relevant to the classification decision. For
|
| 26 |
+
example, a document classifier might use a separate feature for each
|
| 27 |
+
word, recording how often that word occurred in the document.
|
| 28 |
+
|
| 29 |
+
Featuresets
|
| 30 |
+
===========
|
| 31 |
+
The features describing a token are encoded using a "featureset",
|
| 32 |
+
which is a dictionary that maps from "feature names" to "feature
|
| 33 |
+
values". Feature names are unique strings that indicate what aspect
|
| 34 |
+
of the token is encoded by the feature. Examples include
|
| 35 |
+
``'prevword'``, for a feature whose value is the previous word; and
|
| 36 |
+
``'contains-word(library)'`` for a feature that is true when a document
|
| 37 |
+
contains the word ``'library'``. Feature values are typically
|
| 38 |
+
booleans, numbers, or strings, depending on which feature they
|
| 39 |
+
describe.
|
| 40 |
+
|
| 41 |
+
Featuresets are typically constructed using a "feature detector"
|
| 42 |
+
(also known as a "feature extractor"). A feature detector is a
|
| 43 |
+
function that takes a token (and sometimes information about its
|
| 44 |
+
context) as its input, and returns a featureset describing that token.
|
| 45 |
+
For example, the following feature detector converts a document
|
| 46 |
+
(stored as a list of words) to a featureset describing the set of
|
| 47 |
+
words included in the document:
|
| 48 |
+
|
| 49 |
+
>>> # Define a feature detector function.
|
| 50 |
+
>>> def document_features(document):
|
| 51 |
+
... return dict([('contains-word(%s)' % w, True) for w in document])
|
| 52 |
+
|
| 53 |
+
Feature detectors are typically applied to each token before it is fed
|
| 54 |
+
to the classifier:
|
| 55 |
+
|
| 56 |
+
>>> # Classify each Gutenberg document.
|
| 57 |
+
>>> from nltk.corpus import gutenberg
|
| 58 |
+
>>> for fileid in gutenberg.fileids(): # doctest: +SKIP
|
| 59 |
+
... doc = gutenberg.words(fileid) # doctest: +SKIP
|
| 60 |
+
... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP
|
| 61 |
+
|
| 62 |
+
The parameters that a feature detector expects will vary, depending on
|
| 63 |
+
the task and the needs of the feature detector. For example, a
|
| 64 |
+
feature detector for word sense disambiguation (WSD) might take as its
|
| 65 |
+
input a sentence, and the index of a word that should be classified,
|
| 66 |
+
and return a featureset for that word. The following feature detector
|
| 67 |
+
for WSD includes features describing the left and right contexts of
|
| 68 |
+
the target word:
|
| 69 |
+
|
| 70 |
+
>>> def wsd_features(sentence, index):
|
| 71 |
+
... featureset = {}
|
| 72 |
+
... for i in range(max(0, index-3), index):
|
| 73 |
+
... featureset['left-context(%s)' % sentence[i]] = True
|
| 74 |
+
... for i in range(index, max(index+3, len(sentence))):
|
| 75 |
+
... featureset['right-context(%s)' % sentence[i]] = True
|
| 76 |
+
... return featureset
|
| 77 |
+
|
| 78 |
+
Training Classifiers
|
| 79 |
+
====================
|
| 80 |
+
Most classifiers are built by training them on a list of hand-labeled
|
| 81 |
+
examples, known as the "training set". Training sets are represented
|
| 82 |
+
as lists of ``(featuredict, label)`` tuples.
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
from nltk.classify.api import ClassifierI, MultiClassifierI
|
| 86 |
+
from nltk.classify.decisiontree import DecisionTreeClassifier
|
| 87 |
+
from nltk.classify.maxent import (
|
| 88 |
+
BinaryMaxentFeatureEncoding,
|
| 89 |
+
ConditionalExponentialClassifier,
|
| 90 |
+
MaxentClassifier,
|
| 91 |
+
TypedMaxentFeatureEncoding,
|
| 92 |
+
)
|
| 93 |
+
from nltk.classify.megam import call_megam, config_megam
|
| 94 |
+
from nltk.classify.naivebayes import NaiveBayesClassifier
|
| 95 |
+
from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier
|
| 96 |
+
from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features
|
| 97 |
+
from nltk.classify.scikitlearn import SklearnClassifier
|
| 98 |
+
from nltk.classify.senna import Senna
|
| 99 |
+
from nltk.classify.textcat import TextCat
|
| 100 |
+
from nltk.classify.util import accuracy, apply_features, log_likelihood
|
| 101 |
+
from nltk.classify.weka import WekaClassifier, config_weka
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/api.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Classifier Interface
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Interfaces for labeling tokens with category labels (or "class labels").
|
| 11 |
+
|
| 12 |
+
``ClassifierI`` is a standard interface for "single-category
|
| 13 |
+
classification", in which the set of categories is known, the number
|
| 14 |
+
of categories is finite, and each text belongs to exactly one
|
| 15 |
+
category.
|
| 16 |
+
|
| 17 |
+
``MultiClassifierI`` is a standard interface for "multi-category
|
| 18 |
+
classification", which is like single-category classification except
|
| 19 |
+
that each text belongs to zero or more categories.
|
| 20 |
+
"""
|
| 21 |
+
from nltk.internals import overridden
|
| 22 |
+
|
| 23 |
+
##//////////////////////////////////////////////////////
|
| 24 |
+
# { Classification Interfaces
|
| 25 |
+
##//////////////////////////////////////////////////////
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class ClassifierI:
    """
    A processing interface for labeling tokens with a single category
    label (or "class").  Labels are typically strs or ints, but can
    be any immutable type.  The set of labels that the classifier
    chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate label for the given featureset.
        :rtype: label
        """
        # Delegate to a subclass-provided batch implementation, if any.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over labels for the given
            featureset.
        :rtype: ProbDistI
        """
        # Delegate to a subclass-provided batch implementation, if any.
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(label)
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class MultiClassifierI:
    """
    A processing interface for labeling tokens with zero or more
    category labels (or "labels").  Labels are typically strs or
    ints, but can be any immutable type.  The set of labels that the
    multi-classifier chooses from must be fixed and finite.

    Subclasses must define:
      - ``labels()``
      - either ``classify()`` or ``classify_many()`` (or both)

    Subclasses may define:
      - either ``prob_classify()`` or ``prob_classify_many()`` (or both)
    """

    def labels(self):
        """
        :return: the list of category labels used by this classifier.
        :rtype: list of (immutable)
        """
        raise NotImplementedError()

    def classify(self, featureset):
        """
        :return: the most appropriate set of labels for the given featureset.
        :rtype: set(label)
        """
        # Delegate to a subclass-provided batch implementation, if any.
        if not overridden(self.classify_many):
            raise NotImplementedError()
        return self.classify_many([featureset])[0]

    def prob_classify(self, featureset):
        """
        :return: a probability distribution over sets of labels for the
            given featureset.
        :rtype: ProbDistI
        """
        # Delegate to a subclass-provided batch implementation, if any.
        if not overridden(self.prob_classify_many):
            raise NotImplementedError()
        return self.prob_classify_many([featureset])[0]

    def classify_many(self, featuresets):
        """
        Apply ``self.classify()`` to each element of ``featuresets``.  I.e.:

            return [self.classify(fs) for fs in featuresets]

        :rtype: list(set(label))
        """
        return [self.classify(fs) for fs in featuresets]

    def prob_classify_many(self, featuresets):
        """
        Apply ``self.prob_classify()`` to each element of ``featuresets``.  I.e.:

            return [self.prob_classify(fs) for fs in featuresets]

        :rtype: list(ProbDistI)
        """
        return [self.prob_classify(fs) for fs in featuresets]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# # [XX] IN PROGRESS:
|
| 157 |
+
# class SequenceClassifierI:
|
| 158 |
+
# """
|
| 159 |
+
# A processing interface for labeling sequences of tokens with a
|
| 160 |
+
# single category label (or "class"). Labels are typically
|
| 161 |
+
# strs or ints, but can be any immutable type. The set
|
| 162 |
+
# of labels that the classifier chooses from must be fixed and
|
| 163 |
+
# finite.
|
| 164 |
+
# """
|
| 165 |
+
# def labels(self):
|
| 166 |
+
# """
|
| 167 |
+
# :return: the list of category labels used by this classifier.
|
| 168 |
+
# :rtype: list of (immutable)
|
| 169 |
+
# """
|
| 170 |
+
# raise NotImplementedError()
|
| 171 |
+
|
| 172 |
+
# def prob_classify(self, featureset):
|
| 173 |
+
# """
|
| 174 |
+
# Return a probability distribution over labels for the given
|
| 175 |
+
# featureset.
|
| 176 |
+
|
| 177 |
+
# If ``featureset`` is a list of featuresets, then return a
|
| 178 |
+
# corresponding list containing the probability distribution
|
| 179 |
+
# over labels for each of the given featuresets, where the
|
| 180 |
+
# *i*\ th element of this list is the most appropriate label for
|
| 181 |
+
# the *i*\ th element of ``featuresets``.
|
| 182 |
+
# """
|
| 183 |
+
# raise NotImplementedError()
|
| 184 |
+
|
| 185 |
+
# def classify(self, featureset):
|
| 186 |
+
# """
|
| 187 |
+
# Return the most appropriate label for the given featureset.
|
| 188 |
+
|
| 189 |
+
# If ``featureset`` is a list of featuresets, then return a
|
| 190 |
+
# corresponding list containing the most appropriate label for
|
| 191 |
+
# each of the given featuresets, where the *i*\ th element of
|
| 192 |
+
# this list is the most appropriate label for the *i*\ th element
|
| 193 |
+
# of ``featuresets``.
|
| 194 |
+
# """
|
| 195 |
+
# raise NotImplementedError()
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/decisiontree.py
ADDED
|
@@ -0,0 +1,349 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Decision Tree Classifiers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A classifier model that decides which label to assign to a token on
|
| 10 |
+
the basis of a tree structure, where branches correspond to conditions
|
| 11 |
+
on feature values, and leaves correspond to label assignments.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from collections import defaultdict
|
| 15 |
+
|
| 16 |
+
from nltk.classify.api import ClassifierI
|
| 17 |
+
from nltk.probability import FreqDist, MLEProbDist, entropy
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class DecisionTreeClassifier(ClassifierI):
    """A classifier that assigns labels by walking a tree of feature tests.

    Each internal node tests a single feature; branches correspond to
    observed feature values and leaves carry label assignments.
    """

    def __init__(self, label, feature_name=None, decisions=None, default=None):
        """
        :param label: The most likely label for tokens that reach
            this node in the decision tree.  If this decision tree
            has no children, then this label will be assigned to
            any token that reaches this decision tree.
        :param feature_name: The name of the feature that this
            decision tree selects for.
        :param decisions: A dictionary mapping from feature values
            for the feature identified by ``feature_name`` to
            child decision trees.
        :param default: The child that will be used if the value of
            feature ``feature_name`` does not match any of the keys in
            ``decisions``.  This is used when constructing binary
            decision trees.
        """
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
        self._default = default

    def labels(self):
        """Return the deduplicated list of labels reachable from this node."""
        found = [self._label]
        if self._decisions is not None:
            for child in self._decisions.values():
                found.extend(child.labels())
        if self._default is not None:
            found.extend(self._default.labels())
        return list(set(found))

    def classify(self, featureset):
        """Walk the tree according to ``featureset`` and return a label."""
        # A node with no feature name is a leaf.
        if self._fname is None:
            return self._label
        value = featureset.get(self._fname)
        if value in self._decisions:
            return self._decisions[value].classify(featureset)
        if self._default is not None:
            return self._default.classify(featureset)
        # No matching branch and no default: fall back on this node's label.
        return self._label

    def error(self, labeled_featuresets):
        """Return the fraction of ``labeled_featuresets`` misclassified."""
        wrong = sum(
            1
            for featureset, label in labeled_featuresets
            if self.classify(featureset) != label
        )
        return wrong / len(labeled_featuresets)

    def pretty_format(self, width=70, prefix="", depth=4):
        """
        Return a string containing a pretty-printed version of this
        decision tree.  Each line in this string corresponds to a
        single decision tree node or leaf, and indentation is used to
        display the structure of the decision tree.
        """
        # [xx] display default!!
        if self._fname is None:
            n = width - len(prefix) - 15
            return "{}{} {}\n".format(prefix, "." * n, self._label)
        pieces = []
        # Sort plain values alphabetically, pushing None/False/True last.
        ordered = sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        )
        for value, child in ordered:
            hdr = f"{prefix}{self._fname}={value}? "
            n = width - 15 - len(hdr)
            pieces.append("{}{} {}\n".format(hdr, "." * (n), child._label))
            if child._fname is not None and depth > 1:
                pieces.append(child.pretty_format(width, prefix + "  ", depth - 1))
        if self._default is not None:
            n = width - len(prefix) - 21
            pieces.append(
                "{}else: {} {}\n".format(prefix, "." * n, self._default._label)
            )
            if self._default._fname is not None and depth > 1:
                pieces.append(
                    self._default.pretty_format(width, prefix + "  ", depth - 1)
                )
        return "".join(pieces)

    def pseudocode(self, prefix="", depth=4):
        """
        Return a string representation of this decision tree that
        expresses the decisions it makes as a nested set of pseudocode
        if statements.
        """
        if self._fname is None:
            return f"{prefix}return {self._label!r}\n"
        pieces = []
        ordered = sorted(
            self._decisions.items(),
            key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()),
        )
        for value, child in ordered:
            pieces.append(f"{prefix}if {self._fname} == {value!r}: ")
            if child._fname is not None and depth > 1:
                pieces.append("\n" + child.pseudocode(prefix + "  ", depth - 1))
            else:
                pieces.append(f"return {child._label!r}\n")
        if self._default is not None:
            # With exactly one decision, "!=" reads better than "else".
            if len(self._decisions) == 1:
                pieces.append(
                    "{}if {} != {!r}: ".format(
                        prefix, self._fname, list(self._decisions.keys())[0]
                    )
                )
            else:
                pieces.append(f"{prefix}else: ")
            if self._default._fname is not None and depth > 1:
                pieces.append("\n" + self._default.pseudocode(prefix + "  ", depth - 1))
            else:
                pieces.append(f"return {self._default._label!r}\n")
        return "".join(pieces)

    def __str__(self):
        return self.pretty_format()

    @staticmethod
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        :param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        """
        # Collect the set of feature names seen in the training data.
        feature_names = {
            fname for featureset, _ in labeled_featuresets for fname in featureset
        }

        # Collect the values each feature can take (needed for binary mode).
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, _ in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with the single best one-level tree (a "stump") ...
        if binary:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose
            )
        else:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose
            )

        # ... then recursively grow its branches.
        tree.refine(
            labeled_featuresets,
            entropy_cutoff,
            depth_cutoff - 1,
            support_cutoff,
            binary,
            feature_values,
            verbose,
        )
        return tree

    @staticmethod
    def leaf(labeled_featuresets):
        """Return a childless tree labeled with the majority label."""
        majority = FreqDist(label for (_, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(majority)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        """Return a one-level tree that branches on ``feature_name``."""
        majority = FreqDist(label for (_, label) in labeled_featuresets).max()
        # Tally label frequencies conditioned on each observed feature value.
        by_value = defaultdict(FreqDist)
        for featureset, label in labeled_featuresets:
            by_value[featureset.get(feature_name)][label] += 1
        branches = {
            value: DecisionTreeClassifier(dist.max())
            for value, dist in by_value.items()
        }
        return DecisionTreeClassifier(majority, feature_name, branches)

    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """Recursively replace high-entropy children with deeper subtrees."""
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:  # leaf: nothing to refine
            return
        if depth_cutoff <= 0:
            return
        for value in self._decisions:
            subset = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == value
            ]
            label_freqs = FreqDist(label for (_, label) in subset)
            # Only grow a branch whose labels are still uncertain.
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[value] = DecisionTreeClassifier.train(
                    subset,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        if self._default is not None:
            leftovers = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (_, label) in leftovers)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    leftovers,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )

    @staticmethod
    def best_stump(feature_names, labeled_featuresets, verbose=False):
        """Return the one-level tree with the lowest training error."""
        best = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_err = best.error(labeled_featuresets)
        for fname in feature_names:
            candidate = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            candidate_err = candidate.error(labeled_featuresets)
            if candidate_err < best_err:
                best_err = candidate_err
                best = candidate
        if verbose:
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), best._fname, best_err
                )
            )
        return best

    @staticmethod
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        """Return a two-way tree testing ``feature_name == feature_value``."""
        majority = FreqDist(label for (_, label) in labeled_featuresets).max()
        # Split label counts by whether the feature matched the target value.
        matched = FreqDist()
        unmatched = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                matched[label] += 1
            else:
                unmatched[label] += 1

        branches = {}
        fallback = majority
        # But hopefully we have observations!
        if matched.N() > 0:
            branches = {feature_value: DecisionTreeClassifier(matched.max())}
        if unmatched.N() > 0:
            fallback = DecisionTreeClassifier(unmatched.max())

        return DecisionTreeClassifier(majority, feature_name, branches, fallback)

    @staticmethod
    def best_binary_stump(
        feature_names, labeled_featuresets, feature_values, verbose=False
    ):
        """Return the best single feature==value split by training error."""
        best = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_err = best.error(labeled_featuresets)
        for fname in feature_names:
            for fval in feature_values[fname]:
                candidate = DecisionTreeClassifier.binary_stump(
                    fname, fval, labeled_featuresets
                )
                candidate_err = candidate.error(labeled_featuresets)
                if candidate_err < best_err:
                    best_err = candidate_err
                    best = candidate
        if verbose:
            if best._decisions:
                descr = "{}={}".format(best._fname, list(best._decisions.keys())[0])
            else:
                descr = "(default)"
            print(
                "best stump for {:6d} toks uses {:20} err={:6.4f}".format(
                    len(labeled_featuresets), descr, best_err
                )
            )
        return best
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
##//////////////////////////////////////////////////////
|
| 330 |
+
## Demo
|
| 331 |
+
##//////////////////////////////////////////////////////
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def f(x):
    # Demo helper passed to names_demo below: trains a binary decision
    # tree with verbose progress output.
    return DecisionTreeClassifier.train(x, binary=True, verbose=True)
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def demo():
    """Train a decision tree on the names-gender demo corpus and print it."""
    from nltk.classify.util import binary_names_demo_features, names_demo

    classifier = names_demo(
        f, binary_names_demo_features  # DecisionTreeClassifier.train,
    )
    # Show the learned tree both as an indented table and as pseudocode.
    print(classifier.pretty_format(depth=7))
    print(classifier.pseudocode(depth=7))
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# Run the demo when this module is executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/maxent.py
ADDED
|
@@ -0,0 +1,1569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Maximum Entropy Classifiers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
A classifier model based on maximum entropy modeling framework. This
|
| 11 |
+
framework considers all of the probability distributions that are
|
| 12 |
+
empirically consistent with the training data; and chooses the
|
| 13 |
+
distribution with the highest entropy. A probability distribution is
|
| 14 |
+
"empirically consistent" with a set of training data if its estimated
|
| 15 |
+
frequency with which a class and a feature vector value co-occur is
|
| 16 |
+
equal to the actual frequency in the data.
|
| 17 |
+
|
| 18 |
+
Terminology: 'feature'
|
| 19 |
+
======================
|
| 20 |
+
The term *feature* is usually used to refer to some property of an
|
| 21 |
+
unlabeled token. For example, when performing word sense
|
| 22 |
+
disambiguation, we might define a ``'prevword'`` feature whose value is
|
| 23 |
+
the word preceding the target word. However, in the context of
|
| 24 |
+
maxent modeling, the term *feature* is typically used to refer to a
|
| 25 |
+
property of a "labeled" token. In order to prevent confusion, we
|
| 26 |
+
will introduce two distinct terms to disambiguate these two different
|
| 27 |
+
concepts:
|
| 28 |
+
|
| 29 |
+
- An "input-feature" is a property of an unlabeled token.
|
| 30 |
+
- A "joint-feature" is a property of a labeled token.
|
| 31 |
+
|
| 32 |
+
In the rest of the ``nltk.classify`` module, the term "features" is
|
| 33 |
+
used to refer to what we will call "input-features" in this module.
|
| 34 |
+
|
| 35 |
+
In literature that describes and discusses maximum entropy models,
|
| 36 |
+
input-features are typically called "contexts", and joint-features
|
| 37 |
+
are simply referred to as "features".
|
| 38 |
+
|
| 39 |
+
Converting Input-Features to Joint-Features
|
| 40 |
+
-------------------------------------------
|
| 41 |
+
In maximum entropy models, joint-features are required to have numeric
|
| 42 |
+
values. Typically, each input-feature ``input_feat`` is mapped to a
|
| 43 |
+
set of joint-features of the form:
|
| 44 |
+
|
| 45 |
+
| joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|
| 46 |
+
| { and label == some_label
|
| 47 |
+
| {
|
| 48 |
+
| { 0 otherwise
|
| 49 |
+
|
| 50 |
+
For all values of ``feat_val`` and ``some_label``. This mapping is
|
| 51 |
+
performed by classes that implement the ``MaxentFeatureEncodingI``
|
| 52 |
+
interface.
|
| 53 |
+
"""
|
| 54 |
+
try:
|
| 55 |
+
import numpy
|
| 56 |
+
except ImportError:
|
| 57 |
+
pass
|
| 58 |
+
|
| 59 |
+
import os
|
| 60 |
+
import tempfile
|
| 61 |
+
from collections import defaultdict
|
| 62 |
+
|
| 63 |
+
from nltk.classify.api import ClassifierI
|
| 64 |
+
from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file
|
| 65 |
+
from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file
|
| 66 |
+
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
|
| 67 |
+
from nltk.data import gzip_open_unicode
|
| 68 |
+
from nltk.probability import DictionaryProbDist
|
| 69 |
+
from nltk.util import OrderedDict
|
| 70 |
+
|
| 71 |
+
__docformat__ = "epytext en"
|
| 72 |
+
|
| 73 |
+
######################################################################
|
| 74 |
+
# { Classifier Model
|
| 75 |
+
######################################################################
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class MaxentClassifier(ClassifierI):
|
| 79 |
+
"""
|
| 80 |
+
A maximum entropy classifier (also known as a "conditional
|
| 81 |
+
exponential classifier"). This classifier is parameterized by a
|
| 82 |
+
set of "weights", which are used to combine the joint-features
|
| 83 |
+
that are generated from a featureset by an "encoding". In
|
| 84 |
+
particular, the encoding maps each ``(featureset, label)`` pair to
|
| 85 |
+
a vector. The probability of each label is then computed using
|
| 86 |
+
the following equation::
|
| 87 |
+
|
| 88 |
+
dotprod(weights, encode(fs,label))
|
| 89 |
+
prob(fs|label) = ---------------------------------------------------
|
| 90 |
+
sum(dotprod(weights, encode(fs,l)) for l in labels)
|
| 91 |
+
|
| 92 |
+
Where ``dotprod`` is the dot product::
|
| 93 |
+
|
| 94 |
+
dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights: The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        # self._logarithmic = False
        # One weight per joint-feature defined by the encoding.
        assert encoding.length() == len(weights)
|
| 119 |
+
|
| 120 |
+
    def labels(self):
        # The label set is fixed by the feature encoding.
        return self._encoding.labels()
|
| 122 |
+
|
| 123 |
+
    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.
        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        # The weight vector must stay aligned with the encoding's features.
        assert self._encoding.length() == len(new_weights)
|
| 131 |
+
|
| 132 |
+
    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        # Note: this returns the live vector, not a copy.
        return self._weights
|
| 138 |
+
|
| 139 |
+
    def classify(self, featureset):
        # Return the most probable label under the model's distribution.
        return self.prob_classify(featureset).max()
|
| 141 |
+
|
| 142 |
+
def prob_classify(self, featureset):
    """Return a probability distribution over labels for ``featureset``."""
    scores = {}
    for candidate in self._encoding.labels():
        vector = self._encoding.encode(featureset, candidate)

        if self._logarithmic:
            # Log-space score: dot product of weights and feature values.
            scores[candidate] = sum(
                self._weights[fid] * val for (fid, val) in vector
            )
        else:
            # Linear-space score: product of weight ** value terms.
            score = 1.0
            for (fid, val) in vector:
                score *= self._weights[fid] ** val
            scores[candidate] = score

    # Normalize the scores into a probability distribution.
    return DictionaryProbDist(scores, log=self._logarithmic, normalize=True)
|
| 161 |
+
|
| 162 |
+
def explain(self, featureset, columns=4):
    """
    Print a table showing the effect of each of the features in
    the given feature set, and how they combine to determine the
    probabilities of each label for that featureset.

    :param featureset: The featureset to explain.
    :param columns: Maximum number of labels (table columns) to show;
        the most probable labels are shown first.
    """
    # Fixed width of the feature-description column.
    descr_width = 50
    TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f"

    # Rank labels by probability and keep the top ``columns`` of them.
    pdist = self.prob_classify(featureset)
    labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
    labels = labels[:columns]
    # Header row: label names truncated to 7 characters each.
    print(
        " Feature".ljust(descr_width)
        + "".join("%8s" % (("%s" % l)[:7]) for l in labels)
    )
    print(" " + "-" * (descr_width - 2 + 8 * len(labels)))
    sums = defaultdict(int)
    for i, label in enumerate(labels):
        feature_vector = self._encoding.encode(featureset, label)
        # Show the most influential features (largest |weight|) first.
        feature_vector.sort(
            key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
        )
        for (f_id, f_val) in feature_vector:
            # Per-feature contribution to this label's score (mirrors
            # the scoring in prob_classify).
            if self._logarithmic:
                score = self._weights[f_id] * f_val
            else:
                score = self._weights[f_id] ** f_val
            descr = self._encoding.describe(f_id)
            descr = descr.split(" and label is ")[0]  # hack
            descr += " (%s)" % f_val  # hack
            # Truncate long descriptions so the columns stay aligned.
            if len(descr) > 47:
                descr = descr[:44] + "..."
            # ``i * 8 * " "`` shifts the score into the i-th label column.
            print(TEMPLATE % (descr, i * 8 * " ", score))
            sums[label] += score
    print(" " + "-" * (descr_width - 1 + 8 * len(labels)))
    print(
        " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels)
    )
    print(
        " PROBS:".ljust(descr_width)
        + "".join("%8.3f" % pdist.prob(l) for l in labels)
    )
|
| 205 |
+
|
| 206 |
+
def most_informative_features(self, n=10):
    """
    Return the ``n`` feature ids ranked from most to least
    informative (largest to smallest absolute weight).  The full
    ranking is computed once and cached on the instance.
    """
    if not hasattr(self, "_most_informative_features"):
        # Rank every feature id by the magnitude of its weight.
        ranked = sorted(
            range(len(self._weights)),
            key=lambda fid: abs(self._weights[fid]),
            reverse=True,
        )
        self._most_informative_features = ranked
    return self._most_informative_features[:n]
|
| 219 |
+
|
| 220 |
+
def show_most_informative_features(self, n=10, show="all"):
    """
    Print the ``n`` most informative features, one per line, as
    ``weight  description``.

    :param show: all, neg, or pos (for negative-only or positive-only)
    :type show: str
    :param n: The no. of top features
    :type n: int
    """
    # Passing None retrieves the full ranked feature list.
    fids = self.most_informative_features(None)
    if show == "pos":
        fids = [f for f in fids if self._weights[f] > 0]
    elif show == "neg":
        fids = [f for f in fids if self._weights[f] < 0]
    for fid in fids[:n]:
        print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}")
|
| 235 |
+
|
| 236 |
+
def __repr__(self):
    """Return a short summary of the classifier's size."""
    n_labels = len(self._encoding.labels())
    n_features = self._encoding.length()
    return "<ConditionalExponentialClassifier: %d labels, %d features>" % (
        n_labels,
        n_features,
    )
|
| 241 |
+
|
| 242 |
+
#: A list of the algorithm names that are accepted for the
#: ``train()`` method's ``algorithm`` parameter.
#: (Matching in ``train()`` is case-insensitive.)
ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"]
|
| 245 |
+
|
| 246 |
+
@classmethod
def train(
    cls,
    train_toks,
    algorithm=None,
    trace=3,
    encoding=None,
    labels=None,
    gaussian_prior_sigma=0,
    **cutoffs,
):
    """
    Train a new maxent classifier based on the given corpus of
    training samples.  This classifier will have its weights
    chosen to maximize entropy while remaining empirically
    consistent with the training corpus.

    :rtype: MaxentClassifier
    :return: The new maxent classifier

    :type train_toks: list
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a featureset,
        and the second of which is a classification label.

    :type algorithm: str
    :param algorithm: A case-insensitive string, specifying which
        algorithm should be used to train the classifier.  The
        following algorithms are currently available.

        - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
          Improved Iterative Scaling (``'IIS'``)
        - External Libraries (requiring megam):
          LM-BFGS algorithm, with training performed by Megam (``'megam'``)
        - External Libraries (requiring tadm):
          training performed by TADM (``'tadm'``)

        The default algorithm is ``'IIS'``.

    :type trace: int
    :param trace: The level of diagnostic tracing output to produce.
        Higher values produce more verbose output.
    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  If none is specified, then a
        ``BinaryMaxentFeatureEncoding`` will be built based on the
        features that are attested in the training corpus.
    :type labels: list(str)
    :param labels: The set of possible labels.  If none is given, then
        the set of all labels attested in the training data will be
        used instead.
    :param gaussian_prior_sigma: The sigma value for a gaussian
        prior on model weights.  Currently, this is supported by
        ``megam``.  For other algorithms, its value is ignored.
    :param cutoffs: Arguments specifying various conditions under
        which the training should be halted.  (Some of the cutoff
        conditions are not supported by some algorithms.)

        - ``max_iter=v``: Terminate after ``v`` iterations.
        - ``min_ll=v``: Terminate after the negative average
          log-likelihood drops under ``v``.
        - ``min_lldelta=v``: Terminate if a single iteration improves
          log likelihood by less than ``v``.
    """
    if algorithm is None:
        algorithm = "iis"
    # Reject unknown cutoff keywords up front, before any training starts.
    for key in cutoffs:
        if key not in (
            "max_iter",
            "min_ll",
            "min_lldelta",
            "max_acc",
            "min_accdelta",
            "count_cutoff",
            "norm",
            "explicit",
            "bernoulli",
        ):
            raise TypeError("Unexpected keyword arg %r" % key)
    # Dispatch to the trainer matching the (case-insensitive) algorithm name.
    algorithm = algorithm.lower()
    if algorithm == "iis":
        return train_maxent_classifier_with_iis(
            train_toks, trace, encoding, labels, **cutoffs
        )
    elif algorithm == "gis":
        return train_maxent_classifier_with_gis(
            train_toks, trace, encoding, labels, **cutoffs
        )
    elif algorithm == "megam":
        return train_maxent_classifier_with_megam(
            train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
        )
    elif algorithm == "tadm":
        # TADM takes everything as keyword arguments; fold the shared
        # parameters into the cutoffs dict and delegate.
        kwargs = cutoffs
        kwargs["trace"] = trace
        kwargs["encoding"] = encoding
        kwargs["labels"] = labels
        kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma
        return TadmMaxentClassifier.train(train_toks, **kwargs)
    else:
        raise ValueError("Unknown algorithm %s" % algorithm)
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
######################################################################
|
| 352 |
+
# { Feature Encodings
|
| 353 |
+
######################################################################
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
class MaxentFeatureEncodingI:
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the "known labels" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    # Fix: this interface hook takes ``cls`` and is meant to construct a
    # new encoding (concrete subclasses implement it as a classmethod),
    # but the decorator was missing, so ``SomeEncoding.train(toks)`` on a
    # non-overriding subclass would misbind ``toks`` to ``cls``.
    @classmethod
    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
             and a label, and returns the sparse joint feature vector
             that encodes them::

                 func(featureset, label) -> feature_vector

             This sparse joint feature vector (``feature_vector``) is a
             list of ``(index,value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the "known labels" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        # Delegate entirely to the user-supplied encoding function.
        return self._func(featureset, label)

    def length(self):
        # Fixed vector length, as supplied at construction time.
        return self._length

    def labels(self):
        # Known labels, as supplied at construction time.
        return self._labels

    def describe(self, fid):
        # The backing function is opaque, so features cannot be described.
        return "no description available"
|
| 474 |
+
|
| 475 |
+
|
| 476 |
+
class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing a binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form::

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the "known labels" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
           features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
           features in the generated joint-feature vectors.
        """
        # Feature ids must form a dense 0..n-1 range so they can index
        # directly into the classifier's weight vector.
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                "Mapping values must be exactly the "
                "set of integers from 0...len(mapping)"
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            # Always-on features get fresh ids appended after the base
            # (fname, fval, label) features.
            self._alwayson = {
                label: i + self._length for (i, label) in enumerate(labels)
            }
            self._length += len(self._alwayson)

        if unseen_features:
            # One unseen-value feature per distinct feature *name*,
            # appended after any always-on features.
            fnames = {fname for (fname, fval, label) in mapping}
            self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                # (for/else: runs only when the loop did NOT break).
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, int):
            raise TypeError("describe() expected an int")
        # Lazily build and cache the fid -> (fname, fval, label) inverse
        # of the base mapping on first use.
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return f"{fname}=={fval!r} and label is {label!r}"
        elif self._alwayson and f_id in self._alwayson.values():
            # Reverse-search the always-on table for this fid.
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return "label is %r" % label
        elif self._unseen and f_id in self._unseen.values():
            # Reverse-search the unseen-value table for this fid.
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return "%s is unseen" % fname
        else:
            raise ValueError("Bad feature id")

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature's value is 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError("Unexpected label %s" % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
|
| 677 |
+
|
| 678 |
+
|
| 679 |
+
class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

    - At least one feature vector index must be nonzero for every
      token.
    - The feature vector must sum to a constant non-negative number
      for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            # Default correction constant: one more than the number of
            # distinct feature names in the mapping.
            distinct_fnames = {fname for (fname, _, _) in mapping}
            C = len(distinct_fnames) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Start from the plain binary encoding.
        vector = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Append the correction feature so the vector sums to exactly C.
        total = sum(value for (_, value) in vector)
        if total >= self._C:
            raise ValueError("Correction feature is not high enough!")
        vector.append((base_length, self._C - total))

        return vector

    def length(self):
        # One extra slot for the correction feature.
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        # The correction feature sits at the first id past the base encoding.
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return "Correction feature (%s)" % self._C
        return BinaryMaxentFeatureEncoding.describe(self, f_id)
|
| 738 |
+
|
| 739 |
+
|
| 740 |
+
class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    """
    Feature encoding for the external TADM trainer.  Unlike the base
    binary encoding, the mapping is keyed on ``(feature, label)`` pairs
    and is extended lazily by ``encode()``.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        # OrderedDicts preserve insertion order, which fixes the feature
        # ids assigned as the mappings grow.
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        # NOTE(review): unlike the base class, this method *mutates* the
        # encoding -- unseen (feature, label) pairs and unseen non-int
        # values are assigned fresh ids on the fly; confirm callers
        # expect the encoding to grow during classification.
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    # Non-integer values get dense integer codes.
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    # Integer values are used as-is.
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        # Inherit docs.
        return self._labels

    def describe(self, fid):
        # NOTE(review): returns the (feature, label) tuple itself rather
        # than a string (the interface documents a str return), and
        # falls through to None for unknown fids.
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        # Inherit docs.
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        # Build a (feature, label) -> fid mapping covering every feature
        # name crossed with every label seen in the training data.
        # NOTE(review): ``count_cutoff`` is accepted but never applied
        # in this implementation.
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        # First pass: collect every attested label.
        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        # Second pass: assign an id to every (feature, label) pair.
        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
|
| 797 |
+
"""
|
| 798 |
+
A feature encoding that generates vectors containing integer,
|
| 799 |
+
float and binary joint-features of the form:
|
| 800 |
+
|
| 801 |
+
Binary (for string and boolean features):
|
| 802 |
+
|
| 803 |
+
| joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
|
| 804 |
+
| {
|
| 805 |
+
| { 0 otherwise
|
| 806 |
+
|
| 807 |
+
Value (for integer and float features):
|
| 808 |
+
|
| 809 |
+
| joint_feat(fs, l) = { fval if (fs[fname] == type(fval))
|
| 810 |
+
| { and (l == label)
|
| 811 |
+
| {
|
| 812 |
+
| { not encoded otherwise
|
| 813 |
+
|
| 814 |
+
Where ``fname`` is the name of an input-feature, ``fval`` is a value
|
| 815 |
+
for that input-feature, and ``label`` is a label.
|
| 816 |
+
|
| 817 |
+
Typically, these features are constructed based on a training
|
| 818 |
+
corpus, using the ``train()`` method.
|
| 819 |
+
|
| 820 |
+
For string and boolean features [type(fval) not in (int, float)]
|
| 821 |
+
this method will create one feature for each combination of
|
| 822 |
+
``fname``, ``fval``, and ``label`` that occurs at least once in the
|
| 823 |
+
training corpus.
|
| 824 |
+
|
| 825 |
+
For integer and float features [type(fval) in (int, float)] this
|
| 826 |
+
method will create one feature for each combination of ``fname``
|
| 827 |
+
and ``label`` that occurs at least once in the training corpus.
|
| 828 |
+
|
| 829 |
+
For binary features the ``unseen_features`` parameter can be used
|
| 830 |
+
to add "unseen-value features", which are used whenever an input
|
| 831 |
+
feature has a value that was not encountered in the training
|
| 832 |
+
corpus. These features have the form:
|
| 833 |
+
|
| 834 |
+
| joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
|
| 835 |
+
| { and l == label
|
| 836 |
+
| {
|
| 837 |
+
| { 0 otherwise
|
| 838 |
+
|
| 839 |
+
Where ``is_unseen(fname, fval)`` is true if the encoding does not
|
| 840 |
+
contain any joint features that are true when ``fs[fname]==fval``.
|
| 841 |
+
|
| 842 |
+
The ``alwayson_features`` parameter can be used to add "always-on
|
| 843 |
+
features", which have the form:
|
| 844 |
+
|
| 845 |
+
| joint_feat(fs, l) = { 1 if (l == label)
|
| 846 |
+
| {
|
| 847 |
+
| { 0 otherwise
|
| 848 |
+
|
| 849 |
+
These always-on features allow the maxent model to directly model
|
| 850 |
+
the prior probabilities of each label.
|
| 851 |
+
"""
|
| 852 |
+
|
| 853 |
+
def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
    """
    :param labels: A list of the "known labels" for this encoding.

    :param mapping: A dictionary mapping from ``(fname,fval,label)``
        tuples to corresponding joint-feature indexes.  These
        indexes must be the set of integers from 0...len(mapping).
        If ``mapping[fname,fval,label]=id``, then
        ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
        otherwise, it is 0.

    :param unseen_features: If true, then include unseen value
       features in the generated joint-feature vectors.

    :param alwayson_features: If true, then include always-on
       features in the generated joint-feature vectors.
    """
    # Feature ids must form a dense 0..n-1 range so they can index
    # directly into the classifier's weight vector.
    if set(mapping.values()) != set(range(len(mapping))):
        raise ValueError(
            "Mapping values must be exactly the "
            "set of integers from 0...len(mapping)"
        )

    self._labels = list(labels)
    """A list of attested labels."""

    self._mapping = mapping
    """dict mapping from (fname,fval,label) -> fid"""

    self._length = len(mapping)
    """The length of generated joint feature vectors."""

    self._alwayson = None
    """dict mapping from label -> fid"""

    self._unseen = None
    """dict mapping from fname -> fid"""

    if alwayson_features:
        # Always-on features get fresh ids appended after the base features.
        self._alwayson = {
            label: i + self._length for (i, label) in enumerate(labels)
        }
        self._length += len(self._alwayson)

    if unseen_features:
        # One unseen-value feature per distinct feature *name*.
        fnames = {fname for (fname, fval, label) in mapping}
        self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)}
        self._length += len(fnames)
|
| 901 |
+
|
| 902 |
+
def encode(self, featureset, label):
    """
    Convert ``(featureset, label)`` to a sparse joint-feature vector,
    returned as a list of ``(feature_id, value)`` pairs.  Numeric
    (int/float) feature values are looked up by *type* and contribute
    their value; all other values are treated as binary features.
    """
    joint = []

    for fname, fval in featureset.items():
        if isinstance(fval, (int, float)):
            # Numeric feature: the mapping is keyed on the value's type,
            # and the raw value itself becomes the feature weight.
            fid = self._mapping.get((fname, type(fval), label))
            if fid is not None:
                joint.append((fid, fval))
        else:
            # Binary feature: keyed on the exact (fname, fval, label).
            fid = self._mapping.get((fname, fval, label))
            if fid is not None:
                joint.append((fid, 1))
            elif self._unseen:
                # Fire the unseen-value feature only when this
                # fname/fval combination was never seen with any label.
                seen = any(
                    (fname, fval, known) in self._mapping
                    for known in self._labels
                )
                if not seen and fname in self._unseen:
                    joint.append((self._unseen[fname], 1))

    # Add the always-on feature for this label, if enabled.
    if self._alwayson and label in self._alwayson:
        joint.append((self._alwayson[label], 1))

    return joint
def describe(self, f_id):
    """
    Return a human-readable description of joint-feature ``f_id``,
    covering base mapping features, always-on features, and
    unseen-value features.

    :raises TypeError: If ``f_id`` is not an int.
    :raises ValueError: If ``f_id`` matches no known feature.
    """
    if not isinstance(f_id, int):
        raise TypeError("describe() expected an int")

    # Lazily build and cache the fid -> (fname, fval, label) table.
    if not hasattr(self, "_inv_mapping"):
        inverse = [-1] * len(self._mapping)
        for info, fid in self._mapping.items():
            inverse[fid] = info
        self._inv_mapping = inverse

    if f_id < len(self._mapping):
        fname, fval, label = self._inv_mapping[f_id]
        return f"{fname}=={fval!r} and label is {label!r}"
    elif self._alwayson and f_id in self._alwayson.values():
        for label, fid2 in self._alwayson.items():
            if f_id == fid2:
                return f"label is {label!r}"
    elif self._unseen and f_id in self._unseen.values():
        for fname, fid2 in self._unseen.items():
            if f_id == fid2:
                return f"{fname} is unseen"
    else:
        raise ValueError("Bad feature id")
def labels(self):
    """Return the list of labels known to this encoding."""
    return self._labels
def length(self):
    """Return the length of the joint-feature vectors this encoding generates."""
    return self._length
@classmethod
def train(cls, train_toks, count_cutoff=0, labels=None, **options):
    """
    Construct and return a new feature encoding based on the given
    training corpus ``train_toks``.  See the class description of
    ``TypedMaxentFeatureEncoding`` for the joint-features included.

    Note: recognized feature value types are (int, float); all other
    types are interpreted as regular binary features.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type count_cutoff: int
    :param count_cutoff: A cutoff value used to discard rare
        joint-features.  If a joint-feature's value is 1 fewer than
        ``count_cutoff`` times in the training corpus, then that
        joint-feature is not included in the generated encoding.

    :type labels: list
    :param labels: A list of labels that should be used by the
        classifier.  If not specified, then the set of labels
        attested in ``train_toks`` will be used.

    :param options: Extra parameters for the constructor, such as
        ``unseen_features`` and ``alwayson_features``.

    :raises ValueError: If a token's label is not in ``labels``.
    """
    mapping = {}  # (fname, fval, label) -> fid
    seen_labels = set()  # labels encountered in the corpus
    count = defaultdict(int)  # (fname, fval) -> occurrence count

    for tok, label in train_toks:
        if labels and label not in labels:
            raise ValueError("Unexpected label %s" % label)
        seen_labels.add(label)

        for fname, fval in tok.items():
            # Numeric values collapse onto their type; note that bool is
            # deliberately excluded (type(True) is bool, not int).
            if type(fval) in (int, float):
                fval = type(fval)
            # Only admit a joint feature once the (fname, fval) pair has
            # been seen at least count_cutoff times.
            count[fname, fval] += 1
            if count[fname, fval] >= count_cutoff and (fname, fval, label) not in mapping:
                mapping[fname, fval, label] = len(mapping)

    if labels is None:
        labels = seen_labels
    return cls(labels, mapping, **options)
######################################################################
|
| 1025 |
+
# { Classifier Trainer: Generalized Iterative Scaling
|
| 1026 |
+
######################################################################
|
| 1027 |
+
|
| 1028 |
+
|
| 1029 |
+
def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :raises TypeError: If ``encoding`` does not define ``C`` (GIS
        requires an encoding such as ``GISEncoding``).
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, "C"):
        raise TypeError(
            "The GIS algorithm requires an encoding that "
            "defines C (e.g., GISEncoding)."
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    # NOTE: use -numpy.inf rather than numpy.NINF; the NINF alias was
    # removed in NumPy 2.0.
    weights = numpy.zeros(len(empirical_fcount), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights.
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        # Allow interactive users to stop early and keep the partial model.
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier
def calculate_empirical_fcount(train_toks, encoding):
    """
    Return an array whose *i*-th entry is the total value of
    joint-feature *i* summed over every ``(featureset, label)`` pair
    in ``train_toks``, using ``encoding`` to produce the sparse
    joint-feature vectors.
    """
    fcount = numpy.zeros(encoding.length(), "d")

    for featureset, label in train_toks:
        for fid, fval in encoding.encode(featureset, label):
            fcount[fid] += fval

    return fcount
def calculate_estimated_fcount(classifier, train_toks, encoding):
    """
    Return an array whose *i*-th entry is the model-expected count of
    joint-feature *i*: for each training featureset, each label's
    joint-feature values are weighted by the classifier's estimated
    probability of that label.
    """
    fcount = numpy.zeros(encoding.length(), "d")

    for featureset, _ in train_toks:
        pdist = classifier.prob_classify(featureset)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for fid, fval in encoding.encode(featureset, label):
                fcount[fid] += prob * fval

    return fcount
######################################################################
|
| 1151 |
+
# { Classifier Trainer: Improved Iterative Scaling
|
| 1152 |
+
######################################################################
|
| 1153 |
+
|
| 1154 |
+
|
| 1155 |
+
def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault("max_iter", 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count the empirical frequency of each feature in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nftranspose.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list;
    # nfarray performs the reverse operation.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d")
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    # NOTE: use -numpy.inf rather than numpy.NINF; the NINF alias was
    # removed in NumPy 2.0.
    weights = numpy.zeros(len(empirical_ffreq), "d")
    for fid in unattested:
        weights[fid] = -numpy.inf
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print("  ==> Training (%d iterations)" % cutoffs["max_iter"])
    if trace > 2:
        print()
        print("      Iteration    Log Likelihood    Accuracy")
        print("      ---------------------------------------")

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print("     %9d    %14.5f    %9.3f" % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        # Allow interactive users to stop early and keep the partial model.
        print("      Training stopped: keyboard interrupt")

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print(f"         Final    {ll:14.5f}    {acc:9.3f}")

    # Return the classifier.
    return classifier
def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*, i.e. the number of active features for a given
    labeled text.  This function finds every value of *nf* attested by
    at least one (token, label) combination in ``train_toks`` and maps
    those attested values onto a contiguous range *0...N*.  For
    example, if the attested values were 3, 5 and 7, the result might
    be ``{3: 0, 5: 1, 7: 2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    attested = set()
    for featureset, _ in train_toks:
        for label in encoding.labels():
            attested.add(sum(v for (_, v) in encoding.encode(featureset, label)))
    return {nf: i for (i, nf) in enumerate(attested)}
def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    r"""
    Calculate the update values for the classifier weights for this
    iteration of IIS.  The update ``delta[i]`` solves::

        ffreq_empirical[i]
          =
        SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                   feature_vector(fs,l)[i] *
                   exp(delta[i] * nf(feature_vector(fs,l))))

    where *(fs,l)* ranges over ``train_toks``,
    *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` and
    *nf(vector)* = ``sum(val for (id, val) in vector)``.

    Newton's method is used, starting from ``delta[i] = 1`` and
    iterating ``delta[i] -= (ffreq_empirical[i] - sum1[i]) / (-sum2[i])``
    until convergence, where ``sum1`` and ``sum2`` are the model
    expectations of the feature (and feature times nf) under the
    current deltas; both depend on ``delta`` and are recomputed each
    iteration.

    ``nfmap``, ``nfarray`` and ``nftranspose`` give a dense encoding of
    the attested *nf* values, allowing ``sum1``/``sum2`` to be computed
    with matrix operations for a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: The empirical frequency of each feature;
        element *i* corresponds to feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: The ids of features with zero empirical
        frequency (``unattested`` contains *i* iff
        ``ffreq_empirical[i] == 0``).
    :type unattested: sequence of int
    :param nfmap: A map compressing attested ``nf`` values to 0...N.
    :type nfmap: dict(int -> int)
    :param nfarray: The inverse of ``nfmap``, as a dense array.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``.
    :type nftranspose: array(float)
    """
    # Convergence parameters for the inner Newton iteration.  It
    # probably should be possible to set these manually, via keyword
    # arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), "d")

    # Precompute A[nf][fid] = sum of p(fs) * p(label|fs) * f[fid](fs, label)
    # over all (label, fs) pairs whose feature vector has nf active features.
    A = numpy.zeros((len(nfmap), encoding.length()), "d")
    for featureset, _ in train_toks:
        dist = classifier.prob_classify(featureset)
        for label in encoding.labels():
            vector = encoding.encode(featureset, label)
            active = sum(val for (fid, val) in vector)
            for fid, val in vector:
                A[nfmap[active], fid] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta, using:
    #   nf_delta[x][y]        = nfarray[x] * deltas[y]
    #   exp_nf_delta[x][y]    = 2 ** (nf[x] * deltas[y])
    #   nf_exp_nf_delta[x][y] = nf[x] * 2 ** (nf[x] * deltas[y])
    for _ in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2**nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Guard unattested features against division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Newton step.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # Stop once the relative error becomes negligible.
        n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas
######################################################################
|
| 1402 |
+
# { Classifier Trainer: megam
|
| 1403 |
+
######################################################################
|
| 1404 |
+
|
| 1405 |
+
# [xx] possible extension: add support for using implicit file format;
|
| 1406 |
+
# this would need to put requirements on what encoding is used. But
|
| 1407 |
+
# we may need this for other maxent classifier trainers that require
|
| 1408 |
+
# implicit formats anyway.
|
| 1409 |
+
def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """
    explicit = kwargs.get("explicit", True)
    bernoulli = kwargs.get("bernoulli", True)

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option.  Not sure where the best place for it is.
        count_cutoff = kwargs.get("count_cutoff", 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError("Specify encoding or labels, not both")

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix="nltk-")
        with open(trainfile_name, "w") as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, ValueError) as e:
        raise ValueError("Error while creating megam training file: %s" % e) from e

    # Assemble megam's command-line options.
    options = ["-nobias", "-repeat", "10"]
    if explicit:
        options.append("-explicit")
    if not bernoulli:
        options.append("-fvals")
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf
        inv_variance = 1.0 / gaussian_prior_sigma**2
    else:
        inv_variance = 0
    options += ["-lambda", "%.2f" % inv_variance, "-tune"]
    if trace < 3:
        options.append("-quiet")
    if "max_iter" in kwargs:
        options += ["-maxi", "%s" % kwargs["max_iter"]]
    if "ll_delta" in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ["-dpp", "%s" % abs(kwargs["ll_delta"])]
    if hasattr(encoding, "cost"):
        options.append("-multilabel")  # each possible la
    options += ["multiclass", trainfile_name]

    # Run megam on the training file.
    stdout = call_megam(options)
    # print('./megam_i686.opt ', ' '.join(options))

    # Delete the training file; a failure here is non-fatal.
    try:
        os.remove(trainfile_name)
    except OSError as e:
        print(f"Warning: unable to delete {trainfile_name}: {e}")

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier.
    return MaxentClassifier(encoding, weights)
######################################################################
|
| 1497 |
+
# { Classifier Trainer: tadm
|
| 1498 |
+
######################################################################
|
| 1499 |
+
|
| 1500 |
+
|
| 1501 |
+
class TadmMaxentClassifier(MaxentClassifier):
    """Maxent classifier whose weights are estimated by the external ``tadm`` tool."""

    @classmethod
    def train(cls, train_toks, **kwargs):
        """
        Train a new classifier by writing the training events to a
        temporary gzipped file, running ``tadm`` on it, and parsing the
        resulting weight file.  Recognized keyword arguments:
        ``algorithm``, ``trace``, ``encoding``, ``labels``,
        ``gaussian_prior_sigma``, ``count_cutoff``, ``max_iter``,
        ``min_lldelta``.
        """
        algorithm = kwargs.get("algorithm", "tao_lmvm")
        trace = kwargs.get("trace", 3)
        encoding = kwargs.get("encoding", None)
        labels = kwargs.get("labels", None)
        sigma = kwargs.get("gaussian_prior_sigma", 0)
        count_cutoff = kwargs.get("count_cutoff", 0)
        max_iter = kwargs.get("max_iter")
        ll_delta = kwargs.get("min_lldelta")

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        # Temporary files for the event stream and the learned weights.
        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix="nltk-tadm-events-", suffix=".gz"
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-")

        trainfile = gzip_open_unicode(trainfile_name, "w")
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        # Assemble tadm's command-line options.
        options = ["-monitor", "-method", algorithm]
        if sigma:
            # tadm expects the variance, not the standard deviation.
            options.extend(["-l2", "%.6f" % sigma**2])
        if max_iter:
            options.extend(["-max_it", "%d" % max_iter])
        if ll_delta:
            options.extend(["-fatol", "%.6f" % abs(ll_delta)])
        options.extend(["-events_in", trainfile_name])
        options.extend(["-params_out", weightfile_name])
        if trace < 3:
            options.extend(["2>&1"])
        else:
            options.extend(["-summary"])

        call_tadm(options)

        with open(weightfile_name) as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier.
        return cls(encoding, weights)
+
|
| 1559 |
+
######################################################################
|
| 1560 |
+
# { Demo
|
| 1561 |
+
######################################################################
|
| 1562 |
+
def demo():
    """Train a ``MaxentClassifier`` on the names-gender demo corpus."""
    from nltk.classify.util import names_demo

    names_demo(MaxentClassifier.train)
# Run the demonstration when this module is executed as a script.
if __name__ == "__main__":
    demo()
.eggs/nltk-3.8-py3.10.egg/nltk/classify/megam.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to Megam Classifier
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A set of functions used to interface with the external megam_ maxent
|
| 10 |
+
optimization package. Before megam can be used, you should tell NLTK where it
|
| 11 |
+
can find the megam binary, using the ``config_megam()`` function. Typical
|
| 12 |
+
usage:
|
| 13 |
+
|
| 14 |
+
>>> from nltk.classify import megam
|
| 15 |
+
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
|
| 16 |
+
[Found megam: ...]
|
| 17 |
+
|
| 18 |
+
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
|
| 19 |
+
for details.
|
| 20 |
+
|
| 21 |
+
nltk.classify.MaxentClassifier.train(corpus, 'megam')
|
| 22 |
+
|
| 23 |
+
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
|
| 24 |
+
"""
|
| 25 |
+
import subprocess
|
| 26 |
+
|
| 27 |
+
from nltk.internals import find_binary
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
import numpy
|
| 31 |
+
except ImportError:
|
| 32 |
+
numpy = None
|
| 33 |
+
|
| 34 |
+
######################################################################
|
| 35 |
+
# { Configuration
|
| 36 |
+
######################################################################
|
| 37 |
+
|
| 38 |
+
_megam_bin = None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin
    # Locate the binary first; only overwrite the module-level cache
    # once the search has succeeded (find_binary raises on failure).
    located = find_binary(
        "megam",
        bin,
        env_vars=["MEGAM"],
        binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        url="https://www.umiacs.umd.edu/~hal/megam/index.html",
    )
    _megam_bin = located
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
######################################################################
|
| 62 |
+
# { Megam Interface Functions
|
| 63 |
+
######################################################################
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors.  May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    # Megam identifies labels by their index in the encoding's label list.
    labels = encoding.labels()
    label_index = {}
    for position, known_label in enumerate(labels):
        label_index[known_label] = position

    has_costs = hasattr(encoding, "cost")

    # One output line per training instance.
    for featureset, label in train_toks:
        # Line prefix: either a colon-separated cost for every label
        # (weighted multiclass), or the index of the correct label.
        if has_costs:
            per_label_costs = [
                str(encoding.cost(featureset, label, candidate))
                for candidate in labels
            ]
            stream.write(":".join(per_label_costs))
        else:
            stream.write("%d" % label_index[label])

        if explicit:
            # Explicit format: for every candidate label, emit the
            # features that would fire, each group introduced by " #".
            for candidate in labels:
                stream.write(" #")
                _write_megam_features(
                    encoding.encode(featureset, candidate), stream, bernoulli
                )
        else:
            # Implicit format: only the features firing for the actual label.
            _write_megam_features(
                encoding.encode(featureset, label), stream, bernoulli
            )

        # Terminate the instance.
        stream.write("\n")
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"
    # Weights default to zero; megam only reports nonzero feature ids.
    weights = numpy.zeros(features_count, "d")
    for line in s.strip().split("\n"):
        if not line.strip():
            continue
        fid, weight = line.split()
        weights[int(fid)] = float(weight)
    return weights
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _write_megam_features(vector, stream, bernoulli):
|
| 145 |
+
if not vector:
|
| 146 |
+
raise ValueError(
|
| 147 |
+
"MEGAM classifier requires the use of an " "always-on feature."
|
| 148 |
+
)
|
| 149 |
+
for (fid, fval) in vector:
|
| 150 |
+
if bernoulli:
|
| 151 |
+
if fval == 1:
|
| 152 |
+
stream.write(" %s" % fid)
|
| 153 |
+
elif fval != 0:
|
| 154 |
+
raise ValueError(
|
| 155 |
+
"If bernoulli=True, then all" "features must be binary."
|
| 156 |
+
)
|
| 157 |
+
else:
|
| 158 |
+
stream.write(f" {fid} {fval}")
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    :param args: A list of command-line argument strings (not a single
        shell string).
    :raises TypeError: If *args* is a string rather than a list.
    :raises OSError: If megam exits with a nonzero status.
    :return: megam's stdout, decoded to ``str``.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _megam_bin is None:
        config_megam()

    # Call megam via a subprocess.
    # Bug fix: the original did not pass stderr=subprocess.PIPE, so
    # communicate() always returned stderr=None and the failure path
    # printed "None" instead of megam's actual error output.
    cmd = [_megam_bin] + args
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()

    # Check the return code, surfacing megam's diagnostics on failure.
    if p.returncode != 0:
        print()
        if isinstance(stderr, bytes):
            print(stderr.decode("utf-8", "replace"))
        else:
            print(stderr)
        raise OSError("megam command failed!")

    if isinstance(stdout, str):
        return stdout
    else:
        return stdout.decode("utf-8")
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/naivebayes.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Naive Bayes Classifiers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A classifier based on the Naive Bayes algorithm. In order to find the
|
| 10 |
+
probability for a label, this algorithm first uses the Bayes rule to
|
| 11 |
+
express P(label|features) in terms of P(label) and P(features|label):
|
| 12 |
+
|
| 13 |
+
| P(label) * P(features|label)
|
| 14 |
+
| P(label|features) = ------------------------------
|
| 15 |
+
| P(features)
|
| 16 |
+
|
| 17 |
+
The algorithm then makes the 'naive' assumption that all features are
|
| 18 |
+
independent, given the label:
|
| 19 |
+
|
| 20 |
+
| P(label) * P(f1|label) * ... * P(fn|label)
|
| 21 |
+
| P(label|features) = --------------------------------------------
|
| 22 |
+
| P(features)
|
| 23 |
+
|
| 24 |
+
Rather than computing P(features) explicitly, the algorithm just
|
| 25 |
+
calculates the numerator for each label, and normalizes them so they
|
| 26 |
+
sum to one:
|
| 27 |
+
|
| 28 |
+
| P(label) * P(f1|label) * ... * P(fn|label)
|
| 29 |
+
| P(label|features) = --------------------------------------------
|
| 30 |
+
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
from collections import defaultdict
|
| 34 |
+
|
| 35 |
+
from nltk.classify.api import ClassifierI
|
| 36 |
+
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
|
| 37 |
+
|
| 38 |
+
##//////////////////////////////////////////////////////
|
| 39 |
+
## Naive Bayes Classifier
|
| 40 |
+
##//////////////////////////////////////////////////////
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier.  Naive Bayes classifiers are
    paramaterized by two probability distributions:

    - P(label) gives the probability that an input will receive each
      label, given no information about the input's features.

    - P(fname=fval|label) gives the probability that a given feature
      (fname) will receive a given value (fval), given that the
      label (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        # Cache the label list; every classification iterates over it.
        self._labels = list(label_probdist.samples())

    def labels(self):
        """Return the list of labels this classifier can assign."""
        return self._labels

    def classify(self, featureset):
        """Return the single most probable label for *featureset*."""
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        """Return a ``DictionaryProbDist`` mapping each label to its
        probability given *featureset* (normalized in log space)."""
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print('Ignoring unseen feature %s' % fname)
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        """Print a table of the *n* most informative features, with the
        ratio of P(fname=fval|label) between the extreme labels."""
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print("Most Informative Features")

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            # Labels that have actually seen this (fname, fval) pair,
            # ordered so that labels[0] has the lowest probability and
            # labels[-1] the highest (sorted ascending then reversed on
            # a negated key).
            labels = sorted(
                (l for l in self._labels if fval in cpdist[l, fname].samples()),
                key=lambda element: (-labelprob(element), element),
                reverse=True,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = "INF"
            else:
                ratio = "%8.1f" % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                "%24s = %-14r %6s : %-6s = %s : 1.0"
                % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # Results are memoized on the instance after the first call.
        if hasattr(self, "_most_informative_features"):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair.  Maps (fname,fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    # A zero minimum would make the ratio infinite;
                    # such features are dropped from the ranking.
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features,
                key=lambda feature_: (
                    minprob[feature_] / maxprob[feature_],
                    feature_[0],
                    feature_[1] in [None, False, True],
                    str(feature_[1]).lower(),
                ),
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
##//////////////////////////////////////////////////////
|
| 248 |
+
## Demo
|
| 249 |
+
##//////////////////////////////////////////////////////
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def demo():
    """Train a NaiveBayesClassifier on the names-gender demo corpus and
    display its most informative features."""
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/positivenaivebayes.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Positive Naive Bayes Classifier
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2012 NLTK Project
|
| 4 |
+
# Author: Alessandro Presta <alessandro.presta@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
A variant of the Naive Bayes Classifier that performs binary classification with
|
| 10 |
+
partially-labeled training sets. In other words, assume we want to build a classifier
|
| 11 |
+
that assigns each example to one of two complementary classes (e.g., male names and
|
| 12 |
+
female names).
|
| 13 |
+
If we have a training set with labeled examples for both classes, we can use a
|
| 14 |
+
standard Naive Bayes Classifier. However, consider the case when we only have labeled
|
| 15 |
+
examples for one of the classes, and other, unlabeled, examples.
|
| 16 |
+
Then, assuming a prior distribution on the two labels, we can use the unlabeled set
|
| 17 |
+
to estimate the frequencies of the various features.
|
| 18 |
+
|
| 19 |
+
Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
|
| 20 |
+
and unlabeled examples. We are also given an estimate of P(1).
|
| 21 |
+
|
| 22 |
+
We compute P(feature|1) exactly as in the standard case.
|
| 23 |
+
|
| 24 |
+
To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
|
| 25 |
+
assuming that the unlabeled examples are drawn according to the given prior distribution)
|
| 26 |
+
and then express the conditional probability as:
|
| 27 |
+
|
| 28 |
+
| P(feature) - P(feature|1) * P(1)
|
| 29 |
+
| P(feature|0) = ----------------------------------
|
| 30 |
+
| P(0)
|
| 31 |
+
|
| 32 |
+
Example:
|
| 33 |
+
|
| 34 |
+
>>> from nltk.classify import PositiveNaiveBayesClassifier
|
| 35 |
+
|
| 36 |
+
Some sentences about sports:
|
| 37 |
+
|
| 38 |
+
>>> sports_sentences = [ 'The team dominated the game',
|
| 39 |
+
... 'They lost the ball',
|
| 40 |
+
... 'The game was intense',
|
| 41 |
+
... 'The goalkeeper catched the ball',
|
| 42 |
+
... 'The other team controlled the ball' ]
|
| 43 |
+
|
| 44 |
+
Mixed topics, including sports:
|
| 45 |
+
|
| 46 |
+
>>> various_sentences = [ 'The President did not comment',
|
| 47 |
+
... 'I lost the keys',
|
| 48 |
+
... 'The team won the game',
|
| 49 |
+
... 'Sara has two kids',
|
| 50 |
+
... 'The ball went off the court',
|
| 51 |
+
... 'They had the ball for the whole game',
|
| 52 |
+
... 'The show is over' ]
|
| 53 |
+
|
| 54 |
+
The features of a sentence are simply the words it contains:
|
| 55 |
+
|
| 56 |
+
>>> def features(sentence):
|
| 57 |
+
... words = sentence.lower().split()
|
| 58 |
+
... return dict(('contains(%s)' % w, True) for w in words)
|
| 59 |
+
|
| 60 |
+
We use the sports sentences as positive examples, the mixed ones ad unlabeled examples:
|
| 61 |
+
|
| 62 |
+
>>> positive_featuresets = map(features, sports_sentences)
|
| 63 |
+
>>> unlabeled_featuresets = map(features, various_sentences)
|
| 64 |
+
>>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
|
| 65 |
+
... unlabeled_featuresets)
|
| 66 |
+
|
| 67 |
+
Is the following sentence about sports?
|
| 68 |
+
|
| 69 |
+
>>> classifier.classify(features('The cat is on the table'))
|
| 70 |
+
False
|
| 71 |
+
|
| 72 |
+
What about this one?
|
| 73 |
+
|
| 74 |
+
>>> classifier.classify(features('My team lost the game'))
|
| 75 |
+
True
|
| 76 |
+
"""
|
| 77 |
+
|
| 78 |
+
from collections import defaultdict
|
| 79 |
+
|
| 80 |
+
from nltk.classify.naivebayes import NaiveBayesClassifier
|
| 81 |
+
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist
|
| 82 |
+
|
| 83 |
+
##//////////////////////////////////////////////////////
|
| 84 |
+
## Positive Naive Bayes Classifier
|
| 85 |
+
##//////////////////////////////////////////////////////
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    """A binary Naive Bayes classifier trained from positive and
    unlabeled examples only; see the module docstring for how
    P(feature|0) is estimated from the unlabeled set."""

    @staticmethod
    def train(
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        :param positive_featuresets: An iterable of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        num_positive_examples = 0
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_positive_examples += 1

        # Count up how many times each feature value occurred in unlabeled examples.
        num_unlabeled_examples = 0
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_unlabeled_examples += 1

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname][None] += num_positive_examples - count
            feature_values[fname].add(None)

        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: negative_prob_prior}
        )

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            # Estimate P(feature) from the unlabeled data, then solve
            #   P(feature|0) = (P(feature) - P(feature|1)*P(1)) / P(0)
            # for each feature value (see the module docstring).
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                prob = (
                    global_probdist.prob(fval)
                    - positive_prob_prior * feature_probdist[True, fname].prob(fval)
                ) / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True
            )

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
##//////////////////////////////////////////////////////
|
| 172 |
+
## Demo
|
| 173 |
+
##//////////////////////////////////////////////////////
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def demo():
    """Run the partially-labeled names demo and display the most
    informative features of the resulting classifier."""
    from nltk.classify.util import partial_names_demo

    classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
    classifier.show_most_informative_features()
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/util.py
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Classifier Utility Functions
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Utility functions and classes for classifiers.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import math
|
| 14 |
+
|
| 15 |
+
# from nltk.util import Deprecated
|
| 16 |
+
import nltk.classify.util # for accuracy & log_likelihood
|
| 17 |
+
from nltk.util import LazyMap
|
| 18 |
+
|
| 19 |
+
######################################################################
|
| 20 |
+
# { Helper Functions
|
| 21 |
+
######################################################################
|
| 22 |
+
|
| 23 |
+
# alternative name possibility: 'map_featurefunc()'?
|
| 24 |
+
# alternative name possibility: 'detect_features()'?
|
| 25 |
+
# alternative name possibility: 'map_featuredetect()'?
|
| 26 |
+
# or.. just have users use LazyMap directly?
|
| 27 |
+
def apply_features(feature_func, toks, labeled=None):
    """
    Build a lazy, list-like view equivalent to ``map(feature_func, toks)``.

    With ``labeled=False`` the result behaves like::

        [feature_func(tok) for tok in toks]

    With ``labeled=True`` each element of ``toks`` is a ``(tok, label)``
    pair and the result behaves like::

        [(feature_func(tok), label) for (tok, label) in toks]

    Featuresets are computed on demand rather than stored up front, which
    avoids the memory overhead of materializing a featureset per token --
    especially significant when ``toks`` is itself a lazy corpus view.

    :param feature_func: Function mapping a token to a featureset, i.e.
        a dict from feature names to feature values.
    :param toks: The tokens (or ``(tok, label)`` pairs) to process.
    :param labeled: Whether ``toks`` holds labeled tokens.  If ``None``,
        auto-detect by checking whether the first element is a tuple/list.
    """
    if labeled is None:
        # Heuristic: labeled corpora store (token, label) pairs.
        labeled = bool(toks) and isinstance(toks[0], (tuple, list))

    if not labeled:
        return LazyMap(feature_func, toks)

    def featurize_pair(labeled_token):
        tok, label = labeled_token
        return (feature_func(tok), label)

    return LazyMap(featurize_pair, toks)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def attested_labels(tokens):
    """
    Collect every label that occurs in a list of classified tokens.

    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    :return: The distinct attested labels, in arbitrary order.
    :rtype: tuple of (immutable)
    """
    seen = set()
    for _tok, label in tokens:
        seen.add(label)
    return tuple(seen)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def log_likelihood(classifier, gold):
    """
    Return the log of the classifier's mean probability mass on the gold
    labels.

    :param classifier: A classifier supporting ``prob_classify_many``.
    :param gold: A list of ``(featureset, label)`` pairs.
    """
    featuresets = [fs for (fs, _label) in gold]
    dists = classifier.prob_classify_many(featuresets)
    probs = [dist.prob(label) for ((_fs, label), dist) in zip(gold, dists)]
    return math.log(sum(probs) / len(probs))
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def accuracy(classifier, gold):
    """
    Return the fraction of ``gold`` items that the classifier labels
    correctly.

    :param classifier: A classifier supporting ``classify_many``.
    :param gold: A list of ``(featureset, label)`` pairs.
    :return: Accuracy in ``[0, 1]``, or ``0`` when ``gold`` is empty.
    """
    predictions = classifier.classify_many([fs for (fs, _label) in gold])
    matches = [label == guess for ((_fs, label), guess) in zip(gold, predictions)]
    if not matches:
        return 0
    return sum(matches) / len(matches)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class CutoffChecker:
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        # Work on a private copy so the caller's dict is never mutated.
        # Bug fix: the sign normalization below previously modified the
        # caller's dict *instead of* the copy, so self.cutoffs was never
        # normalized at all.
        self.cutoffs = cutoffs.copy()
        if "min_ll" in self.cutoffs:
            # Log likelihood is always <= 0, so store as a negative bound.
            self.cutoffs["min_ll"] = -abs(self.cutoffs["min_ll"])
        if "min_lldelta" in self.cutoffs:
            # Deltas are compared as magnitudes.
            self.cutoffs["min_lldelta"] = abs(self.cutoffs["min_lldelta"])
        self.ll = None    # log likelihood from the previous iteration
        self.acc = None   # accuracy from the previous iteration
        self.iter = 1     # current iteration number

    def check(self, classifier, train_toks):
        """
        Return True if any configured cutoff has been reached and
        training should stop, False otherwise.  Recognized cutoff keys:
        ``max_iter``, ``min_ll``, ``min_lldelta``, ``max_acc``,
        ``min_accdelta``.
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]:
            return True  # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            # Training diverged; stop rather than loop on NaN.
            return True

        if "min_ll" in cutoffs or "min_lldelta" in cutoffs:
            if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]:
                return True  # log likelihood cutoff
            if (
                "min_lldelta" in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"]))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if "max_acc" in cutoffs or "min_accdelta" in cutoffs:
            # Bug fix: this previously recomputed log_likelihood instead of
            # accuracy, so accuracy cutoffs compared against log values.
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]:
                return True  # accuracy cutoff
            if (
                "min_accdelta" in cutoffs
                # NOTE(review): truthiness test means a previous accuracy of
                # exactly 0.0 skips the delta check (pre-existing behavior).
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"]))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
######################################################################
|
| 155 |
+
# { Demos
|
| 156 |
+
######################################################################
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def names_demo_features(name):
    """
    Extract demo features from a name: an always-on bias feature, the
    lowercased first and last letters, and per-letter counts and
    presence flags over a-z.
    """
    lowered = name.lower()
    features = {
        "alwayson": True,
        "startswith": name[0].lower(),
        "endswith": name[-1].lower(),
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
    return features
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def binary_names_demo_features(name):
    """
    Like :func:`names_demo_features`, but additionally emits per-letter
    startswith/endswith flags and vowel flags, giving a feature set
    suitable for classifiers that expect binary features.
    """
    lowered = name.lower()
    first = name[0].lower()
    last = name[-1].lower()
    features = {
        "alwayson": True,
        "startswith(vowel)": first in "aeiouy",
        "endswith(vowel)": last in "aeiouy",
    }
    for letter in "abcdefghijklmnopqrstuvwxyz":
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
        features["startswith(%s)" % letter] = letter == first
        features["endswith(%s)" % letter] = letter == last
    return features
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a gender classifier on the NLTK names corpus.

    Names are labeled "male"/"female", shuffled with a fixed seed, and
    split 5000/500 into train/test.  Accuracy is printed, and -- for
    classifiers that support probability output -- average log likelihood
    plus a few sample probability distributions.

    :param trainer: Callable mapping a list of (featureset, label) pairs
        to a classifier.
    :param features: Feature-extraction function applied to each name.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, "male") for name in names.words("male.txt")] + [
        (name, "female") for name in names.words("female.txt")
    ]

    # Randomly split the names into a test & train set.
    # Fixed seed keeps the demo reproducible across runs.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Show the first five test names; '*' marks the gold label's column.
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == "male":
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob("male"), pdist.prob("female")))
    except NotImplementedError:
        # Classifier doesn't support probability output; skip this report.
        pass

    # Return the classifier
    return classifier
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def partial_names_demo(trainer, features=names_demo_features):
    """
    Demo a semi-supervised (positive/unlabeled) trainer on the names
    corpus.

    Male names are used as positive examples, a mix of male and female
    names as unlabeled examples, and a held-out labeled mix (label True
    for male) as the test set.

    :param trainer: Callable taking (positive_featuresets,
        unlabeled_featuresets) and returning a classifier.
    :param features: Feature-extraction function applied to each name.
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import names

    # Materialize as lists so random.shuffle can operate in place even if
    # the corpus reader returns a lazy, read-only sequence.
    male_names = list(names.words("male.txt"))
    female_names = list(names.words("female.txt"))

    # Fixed seed keeps the demo reproducible across runs.
    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]

    random.shuffle(test)

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
        print()
        print("Unseen Names P(Male) P(Female)\n" + "-" * 40)
        # Bug fix: zip() returns an iterator in Python 3, so it must be
        # materialized before slicing ([:5] raised TypeError before).
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = " %-15s *%6.4f %6.4f"
            else:
                fmt = " %-15s %6.4f *%6.4f"
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        # Classifier doesn't support probability output; skip this report.
        pass

    # Return the classifier
    return classifier
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
# Cache of labeled senseval instances keyed by target word, so repeated
# wsd_demo() calls don't re-read the corpus.
_inst_cache = {}
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
def wsd_demo(trainer, word, features, n=1000):
    """
    Train and test a word-sense-disambiguation classifier for ``word``
    using the senseval corpus, printing accuracy and (when supported)
    average log likelihood.

    :param trainer: Callable mapping a list of (featureset, label) pairs
        to a classifier.
    :param word: Senseval lexical item to disambiguate.
    :param features: Feature-extraction function applied to each instance.
    :param n: Maximum number of instances to use (80% train / 20% test).
    :return: The trained classifier.
    """
    import random

    from nltk.corpus import senseval

    # Get the instances.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        # Label each instance with its first listed sense.
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    # Copy so shuffling below doesn't disturb the cached order.
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list({l for (i, l) in instances})
    print(" Senses: " + " ".join(senses))

    # Randomly split the names into a test & train set.
    # Fixed seed keeps the demo reproducible across runs.
    print("Splitting into test & train...")
    random.seed(123456)
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print("Training classifier...")
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print("Testing classifier...")
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print("Accuracy: %6.4f" % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        # NOTE(review): the loop variable shadows the parameter ``n``;
        # harmless since ``n`` is not used again, but confusing.
        test_featuresets = [features(i) for (i, n) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test)))
    except NotImplementedError:
        # Classifier doesn't support probability output; skip this report.
        pass

    # Return the classifier
    return classifier
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def check_megam_config():
    """
    Check whether the MEGAM binary has been configured.

    Relies on a module-level ``_megam_bin`` name being injected by the
    megam configuration step; when it is absent, a NameError with a
    helpful message is raised (chained from the original lookup failure).
    """
    try:
        _megam_bin
    except NameError as e:
        message = (
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
        raise NameError(message) from e
|
.eggs/nltk-3.8-py3.10.egg/nltk/classify/weka.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Interface to Weka Classifiers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
Classifiers that make use of the external 'Weka' package.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import re
|
| 14 |
+
import subprocess
|
| 15 |
+
import tempfile
|
| 16 |
+
import time
|
| 17 |
+
import zipfile
|
| 18 |
+
from sys import stdin
|
| 19 |
+
|
| 20 |
+
from nltk.classify.api import ClassifierI
|
| 21 |
+
from nltk.internals import config_java, java
|
| 22 |
+
from nltk.probability import DictionaryProbDist
|
| 23 |
+
|
| 24 |
+
# Classpath of the located weka.jar; set lazily by config_weka().
_weka_classpath = None
# Directories searched (in order) for weka.jar; $WEKAHOME, when set, is
# consulted before these.
_weka_search = [
    ".",
    "/usr/share/weka",
    "/usr/local/share/weka",
    "/usr/lib/weka",
    "/usr/local/lib/weka",
]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def config_weka(classpath=None):
    """
    Configure NLTK's interface to Weka by locating ``weka.jar``.

    :param classpath: Explicit path to ``weka.jar``.  If omitted (and no
        path was configured previously), the WEKAHOME environment
        variable and a list of standard install directories are searched.
    :raises LookupError: If ``weka.jar`` cannot be found.
    """
    global _weka_classpath

    # Make sure java's configured first.
    config_java()

    if classpath is not None:
        _weka_classpath = classpath

    if _weka_classpath is None:
        # Bug fix: copy the search list; the original aliased the module
        # global, so every call permanently prepended $WEKAHOME to it.
        searchpath = list(_weka_search)
        if "WEKAHOME" in os.environ:
            searchpath.insert(0, os.environ["WEKAHOME"])

        for path in searchpath:
            if os.path.exists(os.path.join(path, "weka.jar")):
                _weka_classpath = os.path.join(path, "weka.jar")
                version = _check_weka_version(_weka_classpath)
                if version:
                    print(f"[Found Weka: {_weka_classpath} (version {version})]")
                else:
                    print("[Found Weka: %s]" % _weka_classpath)
                # (A redundant second _check_weka_version call was removed;
                # its result was discarded.)

    if _weka_classpath is None:
        raise LookupError(
            "Unable to find weka.jar! Use config_weka() "
            "or set the WEKAHOME environment variable. "
            "For more information about Weka, please see "
            "https://www.cs.waikato.ac.nz/ml/weka/"
        )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _check_weka_version(jar):
|
| 68 |
+
try:
|
| 69 |
+
zf = zipfile.ZipFile(jar)
|
| 70 |
+
except (SystemExit, KeyboardInterrupt):
|
| 71 |
+
raise
|
| 72 |
+
except:
|
| 73 |
+
return None
|
| 74 |
+
try:
|
| 75 |
+
try:
|
| 76 |
+
return zf.read("weka/core/version.txt")
|
| 77 |
+
except KeyError:
|
| 78 |
+
return None
|
| 79 |
+
finally:
|
| 80 |
+
zf.close()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class WekaClassifier(ClassifierI):
    """
    A classifier that shells out to the external Weka package (via the
    ``java`` command) for classification, using a previously trained and
    serialized Weka model.  Featuresets are exchanged with Weka as
    temporary ARFF files.
    """

    def __init__(self, formatter, model_filename):
        """
        :param formatter: An ``ARFF_Formatter`` describing the features
            and labels the model was trained with.
        :param model_filename: Path to the serialized Weka model.
        """
        self._formatter = formatter
        self._model = model_filename

    def prob_classify_many(self, featuresets):
        # "-distribution" asks Weka to emit a probability distribution per
        # instance (not supported by all Weka versions -- see below).
        return self._classify_many(featuresets, ["-p", "0", "-distribution"])

    def classify_many(self, featuresets):
        return self._classify_many(featuresets, ["-p", "0"])

    def _classify_many(self, featuresets, options):
        """Run the saved model over ``featuresets`` and parse Weka's output."""
        # Make sure we can find java & weka.
        config_weka()

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the test data file.
            test_filename = os.path.join(temp_dir, "test.arff")
            self._formatter.write(test_filename, featuresets)

            # Call weka to classify the data.
            # NOTE(review): the classifier class is hard-coded here even
            # though train() supports other classes; Weka only uses the
            # model loaded via -l, so this appears to work -- confirm.
            cmd = [
                "weka.classifiers.bayes.NaiveBayes",
                "-l",
                self._model,
                "-T",
                test_filename,
            ] + options
            (stdout, stderr) = java(
                cmd,
                classpath=_weka_classpath,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )

            # Check if something went wrong:
            if stderr and not stdout:
                if "Illegal options: -distribution" in stderr:
                    raise ValueError(
                        "The installed version of weka does "
                        "not support probability distribution "
                        "output."
                    )
                else:
                    raise ValueError("Weka failed to generate output:\n%s" % stderr)

            # Parse weka's output.
            return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n"))

        finally:
            # Always clean up the temporary ARFF file and directory.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)

    def parse_weka_distribution(self, s):
        """Parse a Weka distribution column (e.g. ``*0.9,0.1``) into a
        ``DictionaryProbDist`` over this formatter's labels."""
        probs = [float(v) for v in re.split("[*,]+", s) if v.strip()]
        probs = dict(zip(self._formatter.labels(), probs))
        return DictionaryProbDist(probs)

    def parse_weka_output(self, lines):
        """Extract predicted labels (or probability distributions) from
        Weka's stdout, raising ValueError on unrecognized formats."""
        # Strip unwanted text from stdout
        for i, line in enumerate(lines):
            if line.strip().startswith("inst#"):
                lines = lines[i:]
                break

        if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]:
            # Plain predictions: the "predicted" column is "index:label".
            return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()]
        elif lines[0].split() == [
            "inst#",
            "actual",
            "predicted",
            "error",
            "distribution",
        ]:
            return [
                self.parse_weka_distribution(line.split()[-1])
                for line in lines[1:]
                if line.strip()
            ]

        # is this safe:?
        elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]):
            return [line.split()[1] for line in lines if line.strip()]

        else:
            for line in lines[:10]:
                print(line)
            raise ValueError(
                "Unhandled output format -- your version "
                "of weka may not be supported.\n"
                " Header: %s" % lines[0]
            )

    # [xx] full list of classifiers (some may be abstract?):
    # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule,
    # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48,
    # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic,
    # LogisticBase, M5Base, MultilayerPerceptron,
    # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial,
    # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART,
    # PreConstructedLinearModel, Prism, RandomForest,
    # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor,
    # RuleNode, SimpleLinearRegression, SimpleLogistic,
    # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI,
    # VotedPerceptron, Winnow, ZeroR

    # Friendly aliases -> fully qualified Weka classifier class names.
    _CLASSIFIER_CLASS = {
        "naivebayes": "weka.classifiers.bayes.NaiveBayes",
        "C4.5": "weka.classifiers.trees.J48",
        "log_regression": "weka.classifiers.functions.Logistic",
        "svm": "weka.classifiers.functions.SMO",
        "kstar": "weka.classifiers.lazy.KStar",
        "ripper": "weka.classifiers.rules.JRip",
    }

    @classmethod
    def train(
        cls,
        model_filename,
        featuresets,
        classifier="naivebayes",
        options=None,
        quiet=True,
    ):
        """
        Train a Weka model on ``featuresets`` (a list of (featureset,
        label) pairs), saving the serialized model to ``model_filename``.

        :param classifier: Either an alias from ``_CLASSIFIER_CLASS`` or
            a fully qualified Weka classifier class name.
        :param options: Extra command-line options passed to the Weka
            trainer (default: none).
        :param quiet: Suppress Weka's training output.
        :return: A ``WekaClassifier`` wrapping the new model.
        """
        # Bug fix: avoid a mutable list as a default argument.
        if options is None:
            options = []

        # Make sure we can find java & weka.
        config_weka()

        # Build an ARFF formatter.
        formatter = ARFF_Formatter.from_train(featuresets)

        temp_dir = tempfile.mkdtemp()
        try:
            # Write the training data file.
            train_filename = os.path.join(temp_dir, "train.arff")
            formatter.write(train_filename, featuresets)

            if classifier in cls._CLASSIFIER_CLASS:
                javaclass = cls._CLASSIFIER_CLASS[classifier]
            elif classifier in cls._CLASSIFIER_CLASS.values():
                javaclass = classifier
            else:
                raise ValueError("Unknown classifier %s" % classifier)

            # Train the weka model.
            cmd = [javaclass, "-d", model_filename, "-t", train_filename]
            cmd += list(options)
            if quiet:
                stdout = subprocess.PIPE
            else:
                stdout = None
            java(cmd, classpath=_weka_classpath, stdout=stdout)

            # Return the new classifier.
            return WekaClassifier(formatter, model_filename)

        finally:
            # Always clean up the temporary ARFF file and directory.
            for f in os.listdir(temp_dir):
                os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
class ARFF_Formatter:
    """
    Converts featuresets and labeled featuresets to ARFF-formatted
    strings, appropriate for input into Weka.

    Features and classes can be specified manually in the constructor, or may
    be determined from data using ``from_train``.
    """

    def __init__(self, labels, features):
        """
        :param labels: A list of all class labels that can be generated.
        :param features: A list of feature specifications, where
            each feature specification is a tuple (fname, ftype);
            and ftype is an ARFF type string such as NUMERIC or
            STRING.
        """
        self._labels = labels
        self._features = features

    def format(self, tokens):
        """Returns a string representation of ARFF output for the given data."""
        return self.header_section() + self.data_section(tokens)

    def labels(self):
        """Returns the list of classes."""
        return list(self._labels)

    def write(self, outfile, tokens):
        """
        Writes ARFF data to a file for the given data.

        :param outfile: A filename, or any object with a ``write`` method.
            NOTE: the file is closed afterwards even when the caller
            supplied an already-open file object (pre-existing behavior).
        """
        if not hasattr(outfile, "write"):
            outfile = open(outfile, "w")
        try:
            outfile.write(self.format(tokens))
        finally:
            # Close even on error so the file handle isn't leaked.
            outfile.close()

    @staticmethod
    def from_train(tokens):
        """
        Constructs an ARFF_Formatter instance with class labels and feature
        types determined from the given data.  Handles boolean, numeric and
        string (note: not nominal) types.

        :param tokens: Labeled featuresets, i.e. (featureset, label) pairs.
        :raises ValueError: If a feature value has an unsupported type, or
            if a feature appears with inconsistent types.
        """
        # Find the set of all attested labels.
        labels = {label for (tok, label) in tokens}

        # Determine the types of all features.
        features = {}
        for tok, label in tokens:
            for (fname, fval) in tok.items():
                # Check bool before int/float: bool is a subclass of int.
                if issubclass(type(fval), bool):
                    ftype = "{True, False}"
                elif issubclass(type(fval), (int, float, bool)):
                    ftype = "NUMERIC"
                elif issubclass(type(fval), str):
                    ftype = "STRING"
                elif fval is None:
                    continue  # can't tell the type.
                else:
                    # Bug fix: report the offending value's type; the
                    # original interpolated ``ftype``, which is unbound
                    # (or stale) on this branch.
                    raise ValueError("Unsupported value type %r" % type(fval))

                if features.get(fname, ftype) != ftype:
                    raise ValueError("Inconsistent type for %s" % fname)
                features[fname] = ftype
        features = sorted(features.items())

        return ARFF_Formatter(labels, features)

    def header_section(self):
        """Returns an ARFF header as a string."""
        # Header comment.
        s = (
            "% Weka ARFF file\n"
            + "% Generated automatically by NLTK\n"
            + "%% %s\n\n" % time.ctime()
        )

        # Relation name
        s += "@RELATION rel\n\n"

        # Input attribute specifications
        for fname, ftype in self._features:
            s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype)

        # Label attribute specification
        s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels))

        return s

    def data_section(self, tokens, labeled=None):
        """
        Returns the ARFF data section for the given data.

        :param tokens: a list of featuresets (dicts) or labelled featuresets
            which are tuples (featureset, label).
        :param labeled: Indicates whether the given tokens are labeled
            or not.  If None, then the tokens will be assumed to be
            labeled if the first token's value is a tuple or list.
        """
        # Check if the tokens are labeled or unlabeled.  If unlabeled,
        # then use 'None' as each token's label.
        if labeled is None:
            labeled = tokens and isinstance(tokens[0], (tuple, list))
        if not labeled:
            tokens = [(tok, None) for tok in tokens]

        # Data section
        s = "\n@DATA\n"
        for (tok, label) in tokens:
            for fname, ftype in self._features:
                s += "%s," % self._fmt_arff_val(tok.get(fname))
            s += "%s\n" % self._fmt_arff_val(label)

        return s

    def _fmt_arff_val(self, fval):
        """Render a single feature value as an ARFF token ('?' = missing)."""
        if fval is None:
            return "?"
        elif isinstance(fval, (bool, int)):
            return "%s" % fval
        else:
            # Floats and strings both use repr (floats keep full
            # precision; strings are quoted).  The original had two
            # identical branches here; they are merged.
            return "%r" % fval
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
if __name__ == "__main__":
    # Demo: train a Weka C4.5 (J48) decision tree to classify names by
    # gender, using binary features over the NLTK names corpus.
    from nltk.classify.util import binary_names_demo_features, names_demo

    def make_classifier(featuresets):
        # Serialize the trained model to a fixed temp path.
        return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5")

    classifier = names_demo(make_classifier, binary_names_demo_features)
|
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/__init__.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Clusterers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
This module contains a number of basic clustering algorithms. Clustering
|
| 10 |
+
describes the task of discovering groups of similar items within a large
|
| 11 |
+
collection. It is also described as unsupervised machine learning, as the data
|
| 12 |
+
from which it learns is unannotated with class information, as is the case for
|
| 13 |
+
supervised learning. Annotated data is difficult and expensive to obtain in
|
| 14 |
+
the quantities required for the majority of supervised learning algorithms.
|
| 15 |
+
This problem, the knowledge acquisition bottleneck, is common to most natural
|
| 16 |
+
language processing tasks, thus fueling the need for quality unsupervised
|
| 17 |
+
approaches.
|
| 18 |
+
|
| 19 |
+
This module contains a k-means clusterer, E-M clusterer and a group average
|
| 20 |
+
agglomerative clusterer (GAAC). All these clusterers involve finding good
|
| 21 |
+
cluster groupings for a set of vectors in multi-dimensional space.
|
| 22 |
+
|
| 23 |
+
The K-means clusterer starts with k arbitrarily chosen means, then allocates each
|
| 24 |
+
vector to the cluster with the closest mean. It then recalculates the means of
|
| 25 |
+
each cluster as the centroid of the vectors in the cluster. This process
|
| 26 |
+
repeats until the cluster memberships stabilise. This is a hill-climbing
|
| 27 |
+
algorithm which may converge to a local maximum. Hence the clustering is
|
| 28 |
+
often repeated with random initial means and the most commonly occurring
|
| 29 |
+
output means are chosen.
|
| 30 |
+
|
| 31 |
+
The GAAC clusterer starts with each of the *N* vectors as singleton clusters.
|
| 32 |
+
It then iteratively merges pairs of clusters which have the closest centroids.
|
| 33 |
+
This continues until there is only one cluster. The order of merges gives rise
|
| 34 |
+
to a dendrogram - a tree with the earlier merges lower than later merges. The
|
| 35 |
+
membership of a given number of clusters *c*, *1 <= c <= N*, can be found by
|
| 36 |
+
cutting the dendrogram at depth *c*.
|
| 37 |
+
|
| 38 |
+
The Gaussian EM clusterer models the vectors as being produced by a mixture
|
| 39 |
+
of k Gaussian sources. The parameters of these sources (prior probability,
|
| 40 |
+
mean and covariance matrix) are then found to maximise the likelihood of the
|
| 41 |
+
given data. This is done with the expectation maximisation algorithm. It
|
| 42 |
+
starts with k arbitrarily chosen means, priors and covariance matrices. It
|
| 43 |
+
then calculates the membership probabilities for each vector in each of the
|
| 44 |
+
clusters - this is the 'E' step. The cluster parameters are then updated in
|
| 45 |
+
the 'M' step using the maximum likelihood estimate from the cluster membership
|
| 46 |
+
probabilities. This process continues until the likelihood of the data does
|
| 47 |
+
not significantly increase.
|
| 48 |
+
|
| 49 |
+
They all extend the ClusterI interface which defines common operations
|
| 50 |
+
available with each clusterer. These operations include:
|
| 51 |
+
|
| 52 |
+
- cluster: clusters a sequence of vectors
|
| 53 |
+
- classify: assign a vector to a cluster
|
| 54 |
+
- classification_probdist: give the probability distribution over cluster memberships
|
| 55 |
+
|
| 56 |
+
The current existing classifiers also extend cluster.VectorSpace, an
|
| 57 |
+
abstract class which allows for singular value decomposition (SVD) and vector
|
| 58 |
+
normalisation. SVD is used to reduce the dimensionality of the vector space in
|
| 59 |
+
such a manner as to preserve as much of the variation as possible, by
|
| 60 |
+
reparameterising the axes in order of variability and discarding all bar the
|
| 61 |
+
first d dimensions. Normalisation ensures that vectors fall in the unit
|
| 62 |
+
hypersphere.
|
| 63 |
+
|
| 64 |
+
Usage example (see also demo())::
|
| 65 |
+
|
| 66 |
+
from nltk import cluster
|
| 67 |
+
from nltk.cluster import euclidean_distance
|
| 68 |
+
from numpy import array
|
| 69 |
+
|
| 70 |
+
vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
|
| 71 |
+
|
| 72 |
+
# initialise the clusterer (will also assign the vectors to clusters)
|
| 73 |
+
clusterer = cluster.KMeansClusterer(2, euclidean_distance)
|
| 74 |
+
clusterer.cluster(vectors, True)
|
| 75 |
+
|
| 76 |
+
# classify a new vector
|
| 77 |
+
print(clusterer.classify(array([3, 3])))
|
| 78 |
+
|
| 79 |
+
Note that the vectors must use numpy array-like
|
| 80 |
+
objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for
|
| 81 |
+
efficiency when required.
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
from nltk.cluster.em import EMClusterer
|
| 85 |
+
from nltk.cluster.gaac import GAAClusterer
|
| 86 |
+
from nltk.cluster.kmeans import KMeansClusterer
|
| 87 |
+
from nltk.cluster.util import (
|
| 88 |
+
Dendrogram,
|
| 89 |
+
VectorSpaceClusterer,
|
| 90 |
+
cosine_distance,
|
| 91 |
+
euclidean_distance,
|
| 92 |
+
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/api.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Clusterer Interfaces
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# Porting: Steven Bird <stevenbird1@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
from abc import ABCMeta, abstractmethod
|
| 10 |
+
|
| 11 |
+
from nltk.probability import DictionaryProbDist
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ClusterI(metaclass=ABCMeta):
    """
    Interface covering basic clustering functionality.

    Concrete clusterers must implement ``cluster``, ``classify`` and
    ``num_clusters``; the remaining methods have sensible defaults built
    on top of those primitives.
    """

    @abstractmethod
    def cluster(self, vectors, assign_clusters=False):
        """
        Assigns the vectors to clusters, learning the clustering parameters
        from the data. Returns a cluster identifier for each vector.
        """

    @abstractmethod
    def classify(self, token):
        """
        Classifies the token into a cluster, setting the token's CLUSTER
        parameter to that cluster identifier.
        """

    def likelihood(self, vector, label):
        """
        Returns the likelihood (a float) of the token having the
        corresponding cluster.

        Default implementation is a hard assignment: probability 1.0 on
        the predicted cluster, 0.0 elsewhere.
        """
        return 1.0 if self.classify(vector) == label else 0.0

    def classification_probdist(self, vector):
        """
        Classifies the token into a cluster, returning
        a probability distribution over the cluster identifiers.

        NOTE(review): raises ZeroDivisionError when every cluster's
        likelihood is zero, matching the historical behaviour.
        """
        likelihoods = {
            name: self.likelihood(vector, name) for name in self.cluster_names()
        }
        # Normalise so the likelihoods form a proper distribution.
        # (Previously a local named `sum` shadowed the builtin.)
        total = sum(likelihoods.values())
        for name in likelihoods:
            likelihoods[name] /= total
        return DictionaryProbDist(likelihoods)

    @abstractmethod
    def num_clusters(self):
        """
        Returns the number of clusters.
        """

    def cluster_names(self):
        """
        Returns the names of the clusters.
        :rtype: list
        """
        return list(range(self.num_clusters()))

    def cluster_name(self, index):
        """
        Returns the names of the cluster at index.
        """
        return index
|
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/em.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Expectation Maximization Clusterer
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
import numpy
|
| 10 |
+
except ImportError:
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
from nltk.cluster.util import VectorSpaceClusterer
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class EMClusterer(VectorSpaceClusterer):
    """
    The Gaussian EM clusterer models the vectors as being produced by
    a mixture of k Gaussian sources. The parameters of these sources
    (prior probability, mean and covariance matrix) are then found to
    maximise the likelihood of the given data. This is done with the
    expectation maximisation algorithm. It starts with k arbitrarily
    chosen means, priors and covariance matrices. It then calculates
    the membership probabilities for each vector in each of the
    clusters; this is the 'E' step. The cluster parameters are then
    updated in the 'M' step using the maximum likelihood estimate from
    the cluster membership probabilities. This process continues until
    the likelihood of the data does not significantly increase.
    """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
        """
        Creates an EM clusterer with the given starting parameters,
        convergence threshold and vector mangling parameters.

        :param initial_means: the means of the gaussian cluster centers
        :type initial_means: [seq of] numpy array or seq of SparseArray
        :param priors: the prior probability for each cluster
        :type priors: numpy array or seq of float
        :param covariance_matrices: the covariance matrix for each cluster
        :type covariance_matrices: [seq of] numpy array
        :param conv_threshold: maximum change in likelihood before deemed
            convergent
        :type conv_threshold: int or float
        :param bias: variance bias used to ensure non-singular covariance
            matrices
        :type bias: float
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias

    def num_clusters(self):
        """Return the number of Gaussian components in the mixture."""
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        """Fit priors, means and covariances to ``vectors`` by iterating
        E and M steps until the log-likelihood stops improving."""
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            # default to a uniform prior over the clusters
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            # default to unit covariance for every cluster
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        last_likelihood = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print("iteration; loglikelihood", last_likelihood)
            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            likelihood = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(last_likelihood - likelihood) < self._conv_threshold:
                converged = True
            last_likelihood = likelihood

    def classify_vectorspace(self, vector):
        """Return the index of the most probable source cluster for
        ``vector`` under the fitted mixture."""
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        """Return the (unnormalised) probability of ``vector`` being
        generated by ``cluster``."""
        # Resolve the cluster name to its index; the previous code computed
        # cid but then indexed with the raw cluster name (they coincide only
        # because cluster_names() returns range(n)).
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )

    def _gaussian(self, mean, cvm, x):
        """Multivariate Gaussian density of ``x`` under (mean, cvm);
        returns 0 on numerical overflow."""
        m = len(mean)
        assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            # (removed stray debug print of dx and inv)
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        """Total log-likelihood of ``vectors`` under the current mixture."""
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return "<EMClusterer means=%s>" % list(self._means)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    samples = [numpy.array(point) for point in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    initial_means = [[4, 2], [4, 2.01]]

    # Fit the mixture (also assigns each sample to a cluster).
    em = cluster.EMClusterer(initial_means, bias=0.1)
    assignments = em.cluster(samples, True, trace=True)

    print("Clustered:", samples)
    print("As: ", assignments)
    print()

    # Report the learned parameters of each component.
    for cluster_id in range(2):
        print("Cluster:", cluster_id)
        print("Prior: ", em._priors[cluster_id])
        print("Mean: ", em._means[cluster_id])
        print("Covar: ", em._covariance_matrices[cluster_id])
        print()

    # classify a new vector
    query = numpy.array([2, 2])
    print("classify(%s):" % query, end=" ")
    print(em.classify(query))

    # show the classification probabilities
    query = numpy.array([2, 2])
    print("classification_probdist(%s):" % query)
    pdist = em.classification_probdist(query)
    for sample in pdist.samples():
        print(f"{sample} => {pdist.prob(sample) * 100:.0f}%")
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
if __name__ == "__main__":
    # Run the module demonstration when executed as a script.
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/gaac.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Group Average Agglomerative Clusterer
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
import numpy
|
| 10 |
+
except ImportError:
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative starts with each of the N vectors as singleton
    clusters. It then iteratively merges pairs of clusters which have the
    closest centroids. This continues until there is only one cluster. The
    order of merges gives rise to a dendrogram: a tree with the earlier merges
    lower than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendrogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        # normalise / svd_dimensions: vector preprocessing handled by the
        # VectorSpaceClusterer base class.
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendrogram = None  # merge history, built during cluster()
        self._groups_values = None  # NOTE(review): never read in this class

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """Record the vectors in a fresh dendrogram, then delegate the
        actual clustering to the base class."""
        # stores the merge order
        self._dendrogram = Dendrogram(
            [numpy.array(vector, numpy.float64) for vector in vectors]
        )
        return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge the closest pair of clusters until only
        ``self._num_clusters`` remain, recording each merge in the
        dendrogram, then recompute the centroids."""
        # variables describing the initial situation
        N = len(vectors)
        cluster_len = [1] * N  # number of vectors in each active cluster
        cluster_count = N
        # maps matrix rows to dendrogram item indices (shifts as rows retire)
        index_map = numpy.arange(N)

        # construct the similarity matrix (upper triangle only; unused
        # entries stay +inf so argmin ignores them)
        dims = (N, N)
        dist = numpy.ones(dims, dtype=float) * numpy.inf
        for i in range(N):
            for j in range(i + 1, N):
                dist[i, j] = cosine_distance(vectors[i], vectors[j])

        while cluster_count > max(self._num_clusters, 1):
            i, j = numpy.unravel_index(dist.argmin(), dims)
            if trace:
                print("merging %d and %d" % (i, j))

            # update similarities for merging i and j
            self._merge_similarities(dist, cluster_len, i, j)

            # remove j by masking its row and column with +inf
            dist[:, j] = numpy.inf
            dist[j, :] = numpy.inf

            # merge the clusters
            cluster_len[i] = cluster_len[i] + cluster_len[j]
            self._dendrogram.merge(index_map[i], index_map[j])
            cluster_count -= 1

            # update the index map to reflect the indexes if we
            # had removed j
            index_map[j + 1 :] -= 1
            index_map[j] = N

        self.update_clusters(self._num_clusters)

    def _merge_similarities(self, dist, cluster_len, i, j):
        """Fold cluster j's distances into cluster i's row/column of
        ``dist`` in place (group-average weighted update)."""
        # the new cluster i merged from i and j adopts the average of
        # i and j's similarity to each other cluster, weighted by the
        # number of points in the clusters i and j
        i_weight = cluster_len[i]
        j_weight = cluster_len[j]
        weight_sum = i_weight + j_weight

        # update for x<i
        dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
        dist[:i, i] /= weight_sum
        # update for i<x<j
        dist[i, i + 1 : j] = (
            dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
        )
        # update for i<j<x
        dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
        dist[i, i + 1 :] /= weight_sum

    def update_clusters(self, num_clusters):
        """Cut the dendrogram into ``num_clusters`` groups and recompute
        the centroid of each group (normalising first if configured)."""
        clusters = self._dendrogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            if self._should_normalise:
                centroid = self._normalise(cluster[0])
            else:
                centroid = numpy.array(cluster[0])
            for vector in cluster[1:]:
                if self._should_normalise:
                    centroid += self._normalise(vector)
                else:
                    centroid += vector
            centroid /= len(cluster)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def classify_vectorspace(self, vector):
        """Return the index of the centroid nearest to ``vector`` under
        cosine distance."""
        best = None
        for i in range(self._num_clusters):
            centroid = self._centroids[i]
            dist = cosine_distance(vector, centroid)
            if not best or dist < best[0]:
                best = (dist, i)
        return best[1]

    def dendrogram(self):
        """
        :return: The dendrogram representing the current clustering
        :rtype: Dendrogram
        """
        return self._dendrogram

    def num_clusters(self):
        # Reflects the number of centroids from the last update_clusters().
        return self._num_clusters

    def __repr__(self):
        return "<GroupAverageAgglomerative Clusterer n=%d>" % self._num_clusters
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk.cluster import GAAClusterer

    # A small collection of 2-D sample points.
    points = [
        numpy.array(coords)
        for coords in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]
    ]

    # Run group-average agglomerative clustering, asking for 4 clusters.
    gaac = GAAClusterer(4)
    assignments = gaac.cluster(points, True)

    print("Clusterer:", gaac)
    print("Clustered:", points)
    print("As:", assignments)
    print()

    # Display the merge history as a dendrogram.
    gaac.dendrogram().show()

    # Classify a previously seen vector against the learned centroids.
    query = numpy.array([3, 3])
    print("classify(%s):" % query, end=" ")
    print(gaac.classify(query))
    print()
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
if __name__ == "__main__":
    # Run the module demonstration when executed as a script.
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/cluster/kmeans.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: K-Means Clusterer
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import copy
|
| 9 |
+
import random
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
import numpy
|
| 14 |
+
except ImportError:
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
from nltk.cluster.util import VectorSpaceClusterer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class KMeansClusterer(VectorSpaceClusterer):
|
| 22 |
+
"""
|
| 23 |
+
The K-means clusterer starts with k arbitrary chosen means then allocates
|
| 24 |
+
each vector to the cluster with the closest mean. It then recalculates the
|
| 25 |
+
means of each cluster as the centroid of the vectors in the cluster. This
|
| 26 |
+
process repeats until the cluster memberships stabilise. This is a
|
| 27 |
+
hill-climbing algorithm which may converge to a local maximum. Hence the
|
| 28 |
+
clustering is often repeated with random initial means and the most
|
| 29 |
+
commonly occurring output means are chosen.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
    def __init__(
        self,
        num_means,
        distance,
        repeats=1,
        conv_test=1e-6,
        initial_means=None,
        normalise=False,
        svd_dimensions=None,
        rng=None,
        avoid_empty_clusters=False,
    ):

        """
        :param num_means: the number of means to use (may use fewer)
        :type num_means: int
        :param distance: measure of distance between two vectors
        :type distance: function taking two vectors and returning a float
        :param repeats: number of randomised clustering trials to use
        :type repeats: int
        :param conv_test: maximum variation in mean differences before
            deemed convergent
        :type conv_test: number
        :param initial_means: set of k initial means
        :type initial_means: sequence of vectors
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        :param rng: random number generator (or None)
        :type rng: Random
        :param avoid_empty_clusters: include current centroid in computation
            of next one; avoids undefined behavior
            when clusters become empty
        :type avoid_empty_clusters: boolean
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_means = num_means
        self._distance = distance
        self._max_difference = conv_test
        # NOTE: assert-based validation is stripped under ``python -O``.
        assert not initial_means or len(initial_means) == num_means
        self._means = initial_means
        assert repeats >= 1
        # Explicit initial means are incompatible with multiple random trials.
        assert not (initial_means and repeats > 1)
        self._repeats = repeats
        self._rng = rng if rng else random.Random()
        self._avoid_empty_clusters = avoid_empty_clusters
|
| 80 |
+
|
| 81 |
+
    def cluster_vectorspace(self, vectors, trace=False):
        """Run k-means ``self._repeats`` times and keep the set of means
        that is minimally different from the other trials' results."""
        if self._means and self._repeats > 1:
            print("Warning: means will be discarded for subsequent trials")

        meanss = []
        for trial in range(self._repeats):
            if trace:
                print("k-means trial", trial)
            # NOTE(review): `trial > 1` means trial 1 reuses trial 0's
            # converged means as its starting point -- possibly intended
            # to be `trial >= 1`; confirm before changing.
            if not self._means or trial > 1:
                self._means = self._rng.sample(list(vectors), self._num_means)
            self._cluster_vectorspace(vectors, trace)
            meanss.append(self._means)

        if len(meanss) > 1:
            # sort the means first (so that different cluster numbering won't
            # effect the distance comparison)
            for means in meanss:
                means.sort(key=sum)

            # find the set of means that's minimally different from the others
            min_difference = min_means = None
            for i in range(len(meanss)):
                d = 0
                for j in range(len(meanss)):
                    if i != j:
                        d += self._sum_distances(meanss[i], meanss[j])
                if min_difference is None or d < min_difference:
                    min_difference, min_means = d, meanss[i]

            # use the best means
            self._means = min_means
|
| 112 |
+
|
| 113 |
+
def _cluster_vectorspace(self, vectors, trace=False):
|
| 114 |
+
if self._num_means < len(vectors):
|
| 115 |
+
# perform k-means clustering
|
| 116 |
+
converged = False
|
| 117 |
+
while not converged:
|
| 118 |
+
# assign the tokens to clusters based on minimum distance to
|
| 119 |
+
# the cluster means
|
| 120 |
+
clusters = [[] for m in range(self._num_means)]
|
| 121 |
+
for vector in vectors:
|
| 122 |
+
index = self.classify_vectorspace(vector)
|
| 123 |
+
clusters[index].append(vector)
|
| 124 |
+
|
| 125 |
+
if trace:
|
| 126 |
+
print("iteration")
|
| 127 |
+
# for i in range(self._num_means):
|
| 128 |
+
# print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
|
| 129 |
+
|
| 130 |
+
# recalculate cluster means by computing the centroid of each cluster
|
| 131 |
+
new_means = list(map(self._centroid, clusters, self._means))
|
| 132 |
+
|
| 133 |
+
# measure the degree of change from the previous step for convergence
|
| 134 |
+
difference = self._sum_distances(self._means, new_means)
|
| 135 |
+
if difference < self._max_difference:
|
| 136 |
+
converged = True
|
| 137 |
+
|
| 138 |
+
# remember the new means
|
| 139 |
+
self._means = new_means
|
| 140 |
+
|
| 141 |
+
def classify_vectorspace(self, vector):
|
| 142 |
+
# finds the closest cluster centroid
|
| 143 |
+
# returns that cluster's index
|
| 144 |
+
best_distance = best_index = None
|
| 145 |
+
for index in range(len(self._means)):
|
| 146 |
+
mean = self._means[index]
|
| 147 |
+
dist = self._distance(vector, mean)
|
| 148 |
+
if best_distance is None or dist < best_distance:
|
| 149 |
+
best_index, best_distance = index, dist
|
| 150 |
+
return best_index
|
| 151 |
+
|
| 152 |
+
def num_clusters(self):
|
| 153 |
+
if self._means:
|
| 154 |
+
return len(self._means)
|
| 155 |
+
else:
|
| 156 |
+
return self._num_means
|
| 157 |
+
|
| 158 |
+
def means(self):
|
| 159 |
+
"""
|
| 160 |
+
The means used for clustering.
|
| 161 |
+
"""
|
| 162 |
+
return self._means
|
| 163 |
+
|
| 164 |
+
def _sum_distances(self, vectors1, vectors2):
|
| 165 |
+
difference = 0.0
|
| 166 |
+
for u, v in zip(vectors1, vectors2):
|
| 167 |
+
difference += self._distance(u, v)
|
| 168 |
+
return difference
|
| 169 |
+
|
| 170 |
+
def _centroid(self, cluster, mean):
|
| 171 |
+
if self._avoid_empty_clusters:
|
| 172 |
+
centroid = copy.copy(mean)
|
| 173 |
+
for vector in cluster:
|
| 174 |
+
centroid += vector
|
| 175 |
+
return centroid / (1 + len(cluster))
|
| 176 |
+
else:
|
| 177 |
+
if not len(cluster):
|
| 178 |
+
sys.stderr.write("Error: no centroid defined for empty cluster.\n")
|
| 179 |
+
sys.stderr.write(
|
| 180 |
+
"Try setting argument 'avoid_empty_clusters' to True\n"
|
| 181 |
+
)
|
| 182 |
+
assert False
|
| 183 |
+
centroid = copy.copy(cluster[0])
|
| 184 |
+
for vector in cluster[1:]:
|
| 185 |
+
centroid += vector
|
| 186 |
+
return centroid / len(cluster)
|
| 187 |
+
|
| 188 |
+
def __repr__(self):
|
| 189 |
+
return "<KMeansClusterer means=%s repeats=%d>" % (self._means, self._repeats)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
#################################################################################
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def demo():
    # example from figure 14.9, page 517, Manning and Schutze

    from nltk.cluster import KMeansClusterer, euclidean_distance

    # Two clusters seeded with fixed initial means (deterministic run).
    points = [numpy.array(p) for p in [[2, 1], [1, 3], [4, 7], [6, 7]]]
    seed_means = [[4, 3], [5, 5]]

    clusterer = KMeansClusterer(2, euclidean_distance, initial_means=seed_means)
    clusters = clusterer.cluster(points, True, trace=True)

    print("Clustered:", points)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    # Same metric, but 10 random restarts with the best trial kept.
    points = [numpy.array(p) for p in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

    clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
    clusters = clusterer.cluster(points, True)
    print("Clustered:", points)
    print("As:", clusters)
    print("Means:", clusterer.means())
    print()

    # classify a new vector
    vector = numpy.array([3, 3])
    print("classify(%s):" % vector, end=" ")
    print(clusterer.classify(vector))
    print()
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
# Run the k-means demonstration when executed as a script.
if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/__init__.py
ADDED
|
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Edward Loper <edloper@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
# TODO this docstring isn't up-to-date!
|
| 9 |
+
"""
|
| 10 |
+
NLTK corpus readers. The modules in this package provide functions
|
| 11 |
+
that can be used to read corpus files in a variety of formats. These
|
| 12 |
+
functions can be used to read both the corpus files that are
|
| 13 |
+
distributed in the NLTK corpus package, and corpus files that are part
|
| 14 |
+
of external corpora.
|
| 15 |
+
|
| 16 |
+
Available Corpora
|
| 17 |
+
=================
|
| 18 |
+
|
| 19 |
+
Please see https://www.nltk.org/nltk_data/ for a complete list.
|
| 20 |
+
Install corpora using nltk.download().
|
| 21 |
+
|
| 22 |
+
Corpus Reader Functions
|
| 23 |
+
=======================
|
| 24 |
+
Each corpus module defines one or more "corpus reader functions",
|
| 25 |
+
which can be used to read documents from that corpus. These functions
|
| 26 |
+
take an argument, ``item``, which is used to indicate which document
|
| 27 |
+
should be read from the corpus:
|
| 28 |
+
|
| 29 |
+
- If ``item`` is one of the unique identifiers listed in the corpus
|
| 30 |
+
module's ``items`` variable, then the corresponding document will
|
| 31 |
+
be loaded from the NLTK corpus package.
|
| 32 |
+
- If ``item`` is a filename, then that file will be read.
|
| 33 |
+
|
| 34 |
+
Additionally, corpus reader functions can be given lists of item
|
| 35 |
+
names; in which case, they will return a concatenation of the
|
| 36 |
+
corresponding documents.
|
| 37 |
+
|
| 38 |
+
Corpus reader functions are named based on the type of information
|
| 39 |
+
they return. Some common examples, and their return types, are:
|
| 40 |
+
|
| 41 |
+
- words(): list of str
|
| 42 |
+
- sents(): list of (list of str)
|
| 43 |
+
- paras(): list of (list of (list of str))
|
| 44 |
+
- tagged_words(): list of (str,str) tuple
|
| 45 |
+
- tagged_sents(): list of (list of (str,str))
|
| 46 |
+
- tagged_paras(): list of (list of (list of (str,str)))
|
| 47 |
+
- chunked_sents(): list of (Tree w/ (str,str) leaves)
|
| 48 |
+
- parsed_sents(): list of (Tree with str leaves)
|
| 49 |
+
- parsed_paras(): list of (list of (Tree with str leaves))
|
| 50 |
+
- xml(): A single xml ElementTree
|
| 51 |
+
- raw(): unprocessed corpus contents
|
| 52 |
+
|
| 53 |
+
For example, to read a list of the words in the Brown Corpus, use
|
| 54 |
+
``nltk.corpus.brown.words()``:
|
| 55 |
+
|
| 56 |
+
>>> from nltk.corpus import brown
|
| 57 |
+
>>> print(", ".join(brown.words())) # doctest: +ELLIPSIS
|
| 58 |
+
The, Fulton, County, Grand, Jury, said, ...
|
| 59 |
+
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
import re
|
| 63 |
+
|
| 64 |
+
from nltk.corpus.reader import *
|
| 65 |
+
from nltk.corpus.util import LazyCorpusLoader
|
| 66 |
+
from nltk.tokenize import RegexpTokenizer
|
| 67 |
+
|
| 68 |
+
abc: PlaintextCorpusReader = LazyCorpusLoader(
|
| 69 |
+
"abc",
|
| 70 |
+
PlaintextCorpusReader,
|
| 71 |
+
r"(?!\.).*\.txt",
|
| 72 |
+
encoding=[("science", "latin_1"), ("rural", "utf8")],
|
| 73 |
+
)
|
| 74 |
+
alpino: AlpinoCorpusReader = LazyCorpusLoader(
|
| 75 |
+
"alpino", AlpinoCorpusReader, tagset="alpino"
|
| 76 |
+
)
|
| 77 |
+
bcp47: BCP47CorpusReader = LazyCorpusLoader(
|
| 78 |
+
"bcp47", BCP47CorpusReader, r"(cldr|iana)/*"
|
| 79 |
+
)
|
| 80 |
+
brown: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
| 81 |
+
"brown",
|
| 82 |
+
CategorizedTaggedCorpusReader,
|
| 83 |
+
r"c[a-z]\d\d",
|
| 84 |
+
cat_file="cats.txt",
|
| 85 |
+
tagset="brown",
|
| 86 |
+
encoding="ascii",
|
| 87 |
+
)
|
| 88 |
+
cess_cat: BracketParseCorpusReader = LazyCorpusLoader(
|
| 89 |
+
"cess_cat",
|
| 90 |
+
BracketParseCorpusReader,
|
| 91 |
+
r"(?!\.).*\.tbf",
|
| 92 |
+
tagset="unknown",
|
| 93 |
+
encoding="ISO-8859-15",
|
| 94 |
+
)
|
| 95 |
+
cess_esp: BracketParseCorpusReader = LazyCorpusLoader(
|
| 96 |
+
"cess_esp",
|
| 97 |
+
BracketParseCorpusReader,
|
| 98 |
+
r"(?!\.).*\.tbf",
|
| 99 |
+
tagset="unknown",
|
| 100 |
+
encoding="ISO-8859-15",
|
| 101 |
+
)
|
| 102 |
+
cmudict: CMUDictCorpusReader = LazyCorpusLoader(
|
| 103 |
+
"cmudict", CMUDictCorpusReader, ["cmudict"]
|
| 104 |
+
)
|
| 105 |
+
comtrans: AlignedCorpusReader = LazyCorpusLoader(
|
| 106 |
+
"comtrans", AlignedCorpusReader, r"(?!\.).*\.txt"
|
| 107 |
+
)
|
| 108 |
+
comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader(
|
| 109 |
+
"comparative_sentences",
|
| 110 |
+
ComparativeSentencesCorpusReader,
|
| 111 |
+
r"labeledSentences\.txt",
|
| 112 |
+
encoding="latin-1",
|
| 113 |
+
)
|
| 114 |
+
conll2000: ConllChunkCorpusReader = LazyCorpusLoader(
|
| 115 |
+
"conll2000",
|
| 116 |
+
ConllChunkCorpusReader,
|
| 117 |
+
["train.txt", "test.txt"],
|
| 118 |
+
("NP", "VP", "PP"),
|
| 119 |
+
tagset="wsj",
|
| 120 |
+
encoding="ascii",
|
| 121 |
+
)
|
| 122 |
+
conll2002: ConllChunkCorpusReader = LazyCorpusLoader(
|
| 123 |
+
"conll2002",
|
| 124 |
+
ConllChunkCorpusReader,
|
| 125 |
+
r".*\.(test|train).*",
|
| 126 |
+
("LOC", "PER", "ORG", "MISC"),
|
| 127 |
+
encoding="utf-8",
|
| 128 |
+
)
|
| 129 |
+
conll2007: DependencyCorpusReader = LazyCorpusLoader(
|
| 130 |
+
"conll2007",
|
| 131 |
+
DependencyCorpusReader,
|
| 132 |
+
r".*\.(test|train).*",
|
| 133 |
+
encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")],
|
| 134 |
+
)
|
| 135 |
+
crubadan: CrubadanCorpusReader = LazyCorpusLoader(
|
| 136 |
+
"crubadan", CrubadanCorpusReader, r".*\.txt"
|
| 137 |
+
)
|
| 138 |
+
dependency_treebank: DependencyCorpusReader = LazyCorpusLoader(
|
| 139 |
+
"dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii"
|
| 140 |
+
)
|
| 141 |
+
extended_omw: CorpusReader = LazyCorpusLoader(
|
| 142 |
+
"extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8"
|
| 143 |
+
)
|
| 144 |
+
floresta: BracketParseCorpusReader = LazyCorpusLoader(
|
| 145 |
+
"floresta",
|
| 146 |
+
BracketParseCorpusReader,
|
| 147 |
+
r"(?!\.).*\.ptb",
|
| 148 |
+
"#",
|
| 149 |
+
tagset="unknown",
|
| 150 |
+
encoding="ISO-8859-15",
|
| 151 |
+
)
|
| 152 |
+
framenet15: FramenetCorpusReader = LazyCorpusLoader(
|
| 153 |
+
"framenet_v15",
|
| 154 |
+
FramenetCorpusReader,
|
| 155 |
+
[
|
| 156 |
+
"frRelation.xml",
|
| 157 |
+
"frameIndex.xml",
|
| 158 |
+
"fulltextIndex.xml",
|
| 159 |
+
"luIndex.xml",
|
| 160 |
+
"semTypes.xml",
|
| 161 |
+
],
|
| 162 |
+
)
|
| 163 |
+
framenet: FramenetCorpusReader = LazyCorpusLoader(
|
| 164 |
+
"framenet_v17",
|
| 165 |
+
FramenetCorpusReader,
|
| 166 |
+
[
|
| 167 |
+
"frRelation.xml",
|
| 168 |
+
"frameIndex.xml",
|
| 169 |
+
"fulltextIndex.xml",
|
| 170 |
+
"luIndex.xml",
|
| 171 |
+
"semTypes.xml",
|
| 172 |
+
],
|
| 173 |
+
)
|
| 174 |
+
gazetteers: WordListCorpusReader = LazyCorpusLoader(
|
| 175 |
+
"gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2"
|
| 176 |
+
)
|
| 177 |
+
genesis: PlaintextCorpusReader = LazyCorpusLoader(
|
| 178 |
+
"genesis",
|
| 179 |
+
PlaintextCorpusReader,
|
| 180 |
+
r"(?!\.).*\.txt",
|
| 181 |
+
encoding=[
|
| 182 |
+
("finnish|french|german", "latin_1"),
|
| 183 |
+
("swedish", "cp865"),
|
| 184 |
+
(".*", "utf_8"),
|
| 185 |
+
],
|
| 186 |
+
)
|
| 187 |
+
gutenberg: PlaintextCorpusReader = LazyCorpusLoader(
|
| 188 |
+
"gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
| 189 |
+
)
|
| 190 |
+
ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*")
|
| 191 |
+
inaugural: PlaintextCorpusReader = LazyCorpusLoader(
|
| 192 |
+
"inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1"
|
| 193 |
+
)
|
| 194 |
+
# [XX] This should probably just use TaggedCorpusReader:
|
| 195 |
+
indian: IndianCorpusReader = LazyCorpusLoader(
|
| 196 |
+
"indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8"
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
jeita: ChasenCorpusReader = LazyCorpusLoader(
|
| 200 |
+
"jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8"
|
| 201 |
+
)
|
| 202 |
+
knbc: KNBCorpusReader = LazyCorpusLoader(
|
| 203 |
+
"knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp"
|
| 204 |
+
)
|
| 205 |
+
lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader(
|
| 206 |
+
"lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp"
|
| 207 |
+
)
|
| 208 |
+
mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader(
|
| 209 |
+
"mac_morpho",
|
| 210 |
+
MacMorphoCorpusReader,
|
| 211 |
+
r"(?!\.).*\.txt",
|
| 212 |
+
tagset="unknown",
|
| 213 |
+
encoding="latin-1",
|
| 214 |
+
)
|
| 215 |
+
machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
| 216 |
+
"machado",
|
| 217 |
+
PortugueseCategorizedPlaintextCorpusReader,
|
| 218 |
+
r"(?!\.).*\.txt",
|
| 219 |
+
cat_pattern=r"([a-z]*)/.*",
|
| 220 |
+
encoding="latin-1",
|
| 221 |
+
)
|
| 222 |
+
masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader(
|
| 223 |
+
"masc_tagged",
|
| 224 |
+
CategorizedTaggedCorpusReader,
|
| 225 |
+
r"(spoken|written)/.*\.txt",
|
| 226 |
+
cat_file="categories.txt",
|
| 227 |
+
tagset="wsj",
|
| 228 |
+
encoding="utf-8",
|
| 229 |
+
sep="_",
|
| 230 |
+
)
|
| 231 |
+
movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
| 232 |
+
"movie_reviews",
|
| 233 |
+
CategorizedPlaintextCorpusReader,
|
| 234 |
+
r"(?!\.).*\.txt",
|
| 235 |
+
cat_pattern=r"(neg|pos)/.*",
|
| 236 |
+
encoding="ascii",
|
| 237 |
+
)
|
| 238 |
+
multext_east: MTECorpusReader = LazyCorpusLoader(
|
| 239 |
+
"mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8"
|
| 240 |
+
)
|
| 241 |
+
names: WordListCorpusReader = LazyCorpusLoader(
|
| 242 |
+
"names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii"
|
| 243 |
+
)
|
| 244 |
+
nps_chat: NPSChatCorpusReader = LazyCorpusLoader(
|
| 245 |
+
"nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj"
|
| 246 |
+
)
|
| 247 |
+
opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader(
|
| 248 |
+
"opinion_lexicon",
|
| 249 |
+
OpinionLexiconCorpusReader,
|
| 250 |
+
r"(\w+)\-words\.txt",
|
| 251 |
+
encoding="ISO-8859-2",
|
| 252 |
+
)
|
| 253 |
+
ppattach: PPAttachmentCorpusReader = LazyCorpusLoader(
|
| 254 |
+
"ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"]
|
| 255 |
+
)
|
| 256 |
+
product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader(
|
| 257 |
+
"product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
| 258 |
+
)
|
| 259 |
+
product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader(
|
| 260 |
+
"product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8"
|
| 261 |
+
)
|
| 262 |
+
pros_cons: ProsConsCorpusReader = LazyCorpusLoader(
|
| 263 |
+
"pros_cons",
|
| 264 |
+
ProsConsCorpusReader,
|
| 265 |
+
r"Integrated(Cons|Pros)\.txt",
|
| 266 |
+
cat_pattern=r"Integrated(Cons|Pros)\.txt",
|
| 267 |
+
encoding="ISO-8859-2",
|
| 268 |
+
)
|
| 269 |
+
ptb: CategorizedBracketParseCorpusReader = (
|
| 270 |
+
LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions
|
| 271 |
+
"ptb",
|
| 272 |
+
CategorizedBracketParseCorpusReader,
|
| 273 |
+
r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG",
|
| 274 |
+
cat_file="allcats.txt",
|
| 275 |
+
tagset="wsj",
|
| 276 |
+
)
|
| 277 |
+
)
|
| 278 |
+
qc: StringCategoryCorpusReader = LazyCorpusLoader(
|
| 279 |
+
"qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2"
|
| 280 |
+
)
|
| 281 |
+
reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader(
|
| 282 |
+
"reuters",
|
| 283 |
+
CategorizedPlaintextCorpusReader,
|
| 284 |
+
"(training|test).*",
|
| 285 |
+
cat_file="cats.txt",
|
| 286 |
+
encoding="ISO-8859-2",
|
| 287 |
+
)
|
| 288 |
+
rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml")
|
| 289 |
+
senseval: SensevalCorpusReader = LazyCorpusLoader(
|
| 290 |
+
"senseval", SensevalCorpusReader, r"(?!\.).*\.pos"
|
| 291 |
+
)
|
| 292 |
+
sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
| 293 |
+
"sentence_polarity",
|
| 294 |
+
CategorizedSentencesCorpusReader,
|
| 295 |
+
r"rt-polarity\.(neg|pos)",
|
| 296 |
+
cat_pattern=r"rt-polarity\.(neg|pos)",
|
| 297 |
+
encoding="utf-8",
|
| 298 |
+
)
|
| 299 |
+
sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader(
|
| 300 |
+
"sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8"
|
| 301 |
+
)
|
| 302 |
+
shakespeare: XMLCorpusReader = LazyCorpusLoader(
|
| 303 |
+
"shakespeare", XMLCorpusReader, r"(?!\.).*\.xml"
|
| 304 |
+
)
|
| 305 |
+
sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader(
|
| 306 |
+
"sinica_treebank",
|
| 307 |
+
SinicaTreebankCorpusReader,
|
| 308 |
+
["parsed"],
|
| 309 |
+
tagset="unknown",
|
| 310 |
+
encoding="utf-8",
|
| 311 |
+
)
|
| 312 |
+
state_union: PlaintextCorpusReader = LazyCorpusLoader(
|
| 313 |
+
"state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2"
|
| 314 |
+
)
|
| 315 |
+
stopwords: WordListCorpusReader = LazyCorpusLoader(
|
| 316 |
+
"stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
| 317 |
+
)
|
| 318 |
+
subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader(
|
| 319 |
+
"subjectivity",
|
| 320 |
+
CategorizedSentencesCorpusReader,
|
| 321 |
+
r"(quote.tok.gt9|plot.tok.gt9)\.5000",
|
| 322 |
+
cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]},
|
| 323 |
+
encoding="latin-1",
|
| 324 |
+
)
|
| 325 |
+
swadesh: SwadeshCorpusReader = LazyCorpusLoader(
|
| 326 |
+
"swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8"
|
| 327 |
+
)
|
| 328 |
+
swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
| 329 |
+
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8"
|
| 330 |
+
)
|
| 331 |
+
swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader(
|
| 332 |
+
"panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8"
|
| 333 |
+
)
|
| 334 |
+
switchboard: SwitchboardCorpusReader = LazyCorpusLoader(
|
| 335 |
+
"switchboard", SwitchboardCorpusReader, tagset="wsj"
|
| 336 |
+
)
|
| 337 |
+
timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader)
|
| 338 |
+
timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader(
|
| 339 |
+
"timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii"
|
| 340 |
+
)
|
| 341 |
+
toolbox: ToolboxCorpusReader = LazyCorpusLoader(
|
| 342 |
+
"toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)"
|
| 343 |
+
)
|
| 344 |
+
treebank: BracketParseCorpusReader = LazyCorpusLoader(
|
| 345 |
+
"treebank/combined",
|
| 346 |
+
BracketParseCorpusReader,
|
| 347 |
+
r"wsj_.*\.mrg",
|
| 348 |
+
tagset="wsj",
|
| 349 |
+
encoding="ascii",
|
| 350 |
+
)
|
| 351 |
+
treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader(
|
| 352 |
+
"treebank/tagged",
|
| 353 |
+
ChunkedCorpusReader,
|
| 354 |
+
r"wsj_.*\.pos",
|
| 355 |
+
sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True),
|
| 356 |
+
para_block_reader=tagged_treebank_para_block_reader,
|
| 357 |
+
tagset="wsj",
|
| 358 |
+
encoding="ascii",
|
| 359 |
+
)
|
| 360 |
+
treebank_raw: PlaintextCorpusReader = LazyCorpusLoader(
|
| 361 |
+
"treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2"
|
| 362 |
+
)
|
| 363 |
+
twitter_samples: TwitterCorpusReader = LazyCorpusLoader(
|
| 364 |
+
"twitter_samples", TwitterCorpusReader, r".*\.json"
|
| 365 |
+
)
|
| 366 |
+
udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader)
|
| 367 |
+
udhr2: PlaintextCorpusReader = LazyCorpusLoader(
|
| 368 |
+
"udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8"
|
| 369 |
+
)
|
| 370 |
+
universal_treebanks: ConllCorpusReader = LazyCorpusLoader(
|
| 371 |
+
"universal_treebanks_v20",
|
| 372 |
+
ConllCorpusReader,
|
| 373 |
+
r".*\.conll",
|
| 374 |
+
columntypes=(
|
| 375 |
+
"ignore",
|
| 376 |
+
"words",
|
| 377 |
+
"ignore",
|
| 378 |
+
"ignore",
|
| 379 |
+
"pos",
|
| 380 |
+
"ignore",
|
| 381 |
+
"ignore",
|
| 382 |
+
"ignore",
|
| 383 |
+
"ignore",
|
| 384 |
+
"ignore",
|
| 385 |
+
),
|
| 386 |
+
)
|
| 387 |
+
verbnet: VerbnetCorpusReader = LazyCorpusLoader(
|
| 388 |
+
"verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml"
|
| 389 |
+
)
|
| 390 |
+
webtext: PlaintextCorpusReader = LazyCorpusLoader(
|
| 391 |
+
"webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2"
|
| 392 |
+
)
|
| 393 |
+
wordnet: WordNetCorpusReader = LazyCorpusLoader(
|
| 394 |
+
"wordnet",
|
| 395 |
+
WordNetCorpusReader,
|
| 396 |
+
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
| 397 |
+
)
|
| 398 |
+
wordnet31: WordNetCorpusReader = LazyCorpusLoader(
|
| 399 |
+
"wordnet31",
|
| 400 |
+
WordNetCorpusReader,
|
| 401 |
+
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
| 402 |
+
)
|
| 403 |
+
wordnet2021: WordNetCorpusReader = LazyCorpusLoader(
|
| 404 |
+
"wordnet2021",
|
| 405 |
+
WordNetCorpusReader,
|
| 406 |
+
LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"),
|
| 407 |
+
)
|
| 408 |
+
wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader(
|
| 409 |
+
"wordnet_ic", WordNetICCorpusReader, r".*\.dat"
|
| 410 |
+
)
|
| 411 |
+
words: WordListCorpusReader = LazyCorpusLoader(
|
| 412 |
+
"words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii"
|
| 413 |
+
)
|
| 414 |
+
|
| 415 |
+
# defined after treebank
|
| 416 |
+
propbank: PropbankCorpusReader = LazyCorpusLoader(
|
| 417 |
+
"propbank",
|
| 418 |
+
PropbankCorpusReader,
|
| 419 |
+
"prop.txt",
|
| 420 |
+
r"frames/.*\.xml",
|
| 421 |
+
"verbs.txt",
|
| 422 |
+
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
| 423 |
+
treebank,
|
| 424 |
+
) # Must be defined *after* treebank corpus.
|
| 425 |
+
nombank: NombankCorpusReader = LazyCorpusLoader(
|
| 426 |
+
"nombank.1.0",
|
| 427 |
+
NombankCorpusReader,
|
| 428 |
+
"nombank.1.0",
|
| 429 |
+
r"frames/.*\.xml",
|
| 430 |
+
"nombank.1.0.words",
|
| 431 |
+
lambda filename: re.sub(r"^wsj/\d\d/", "", filename),
|
| 432 |
+
treebank,
|
| 433 |
+
) # Must be defined *after* treebank corpus.
|
| 434 |
+
propbank_ptb: PropbankCorpusReader = LazyCorpusLoader(
|
| 435 |
+
"propbank",
|
| 436 |
+
PropbankCorpusReader,
|
| 437 |
+
"prop.txt",
|
| 438 |
+
r"frames/.*\.xml",
|
| 439 |
+
"verbs.txt",
|
| 440 |
+
lambda filename: filename.upper(),
|
| 441 |
+
ptb,
|
| 442 |
+
) # Must be defined *after* ptb corpus.
|
| 443 |
+
nombank_ptb: NombankCorpusReader = LazyCorpusLoader(
|
| 444 |
+
"nombank.1.0",
|
| 445 |
+
NombankCorpusReader,
|
| 446 |
+
"nombank.1.0",
|
| 447 |
+
r"frames/.*\.xml",
|
| 448 |
+
"nombank.1.0.words",
|
| 449 |
+
lambda filename: filename.upper(),
|
| 450 |
+
ptb,
|
| 451 |
+
) # Must be defined *after* ptb corpus.
|
| 452 |
+
semcor: SemcorCorpusReader = LazyCorpusLoader(
|
| 453 |
+
"semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet
|
| 454 |
+
) # Must be defined *after* wordnet corpus.
|
| 455 |
+
|
| 456 |
+
nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader(
|
| 457 |
+
"nonbreaking_prefixes",
|
| 458 |
+
NonbreakingPrefixesCorpusReader,
|
| 459 |
+
r"(?!README|\.).*",
|
| 460 |
+
encoding="utf8",
|
| 461 |
+
)
|
| 462 |
+
perluniprops: UnicharsCorpusReader = LazyCorpusLoader(
|
| 463 |
+
"perluniprops",
|
| 464 |
+
UnicharsCorpusReader,
|
| 465 |
+
r"(?!README|\.).*",
|
| 466 |
+
nltk_data_subdir="misc",
|
| 467 |
+
encoding="utf8",
|
| 468 |
+
)
|
| 469 |
+
|
| 470 |
+
# mwa_ppdb = LazyCorpusLoader(
|
| 471 |
+
# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8')
|
| 472 |
+
|
| 473 |
+
# See https://github.com/nltk/nltk/issues/1579
|
| 474 |
+
# and https://github.com/nltk/nltk/issues/1716
|
| 475 |
+
#
|
| 476 |
+
# pl196x = LazyCorpusLoader(
|
| 477 |
+
# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml',
|
| 478 |
+
# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8')
|
| 479 |
+
#
|
| 480 |
+
# ipipan = LazyCorpusLoader(
|
| 481 |
+
# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml')
|
| 482 |
+
#
|
| 483 |
+
# nkjp = LazyCorpusLoader(
|
| 484 |
+
# 'nkjp', NKJPCorpusReader, r'', encoding='utf8')
|
| 485 |
+
#
|
| 486 |
+
# panlex_lite = LazyCorpusLoader(
|
| 487 |
+
# 'panlex_lite', PanLexLiteCorpusReader)
|
| 488 |
+
#
|
| 489 |
+
# ycoe = LazyCorpusLoader(
|
| 490 |
+
# 'ycoe', YCOECorpusReader)
|
| 491 |
+
#
|
| 492 |
+
# corpus not available with NLTK; these lines caused help(nltk.corpus) to break
|
| 493 |
+
# hebrew_treebank = LazyCorpusLoader(
|
| 494 |
+
# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt')
|
| 495 |
+
|
| 496 |
+
# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116
|
| 497 |
+
def demo():
    """Run the ``demo()`` of each corpus loader invoked below.

    NOTE(review): marked out-of-date upstream; some of these readers may no
    longer define a ``demo()`` — confirm before relying on this.
    """
    # This is out-of-date:
    abc.demo()
    brown.demo()
    # chat80.demo()
    cmudict.demo()
    conll2000.demo()
    conll2002.demo()
    genesis.demo()
    gutenberg.demo()
    ieer.demo()
    inaugural.demo()
    indian.demo()
    names.demo()
    ppattach.demo()
    senseval.demo()
    shakespeare.demo()
    sinica_treebank.demo()
    state_union.demo()
    stopwords.demo()
    timit.demo()
    toolbox.demo()
    treebank.demo()
    udhr.demo()
    webtext.demo()
    words.demo()
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
# ycoe.demo()
|
| 526 |
+
|
| 527 |
+
# Script entry point: demo() is intentionally disabled upstream (the demo
# code is out-of-date); running this module is a no-op.
if __name__ == "__main__":
    # demo()
    pass
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/europarl_raw.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Europarl Corpus Readers
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Nitin Madnani <nmadnani@umiacs.umd.edu>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
from nltk.corpus.reader import *
|
| 11 |
+
from nltk.corpus.util import LazyCorpusLoader
|
| 12 |
+
|
| 13 |
+
# Create a new corpus reader instance for each European language
# Each loader selects that language's "ep-*" files by two-letter filename
# extension (e.g. ".da" for Danish) and decodes them as UTF-8.
danish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/danish", EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8"
)

dutch: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8"
)

english: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8"
)

finnish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8"
)

french: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8"
)

german: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8"
)

greek: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8"
)

italian: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8"
)

portuguese: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8"
)

spanish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8"
)

swedish: EuroparlCorpusReader = LazyCorpusLoader(
    "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8"
)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/childes.py
ADDED
|
@@ -0,0 +1,630 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CHILDES XML Corpus Reader
|
| 2 |
+
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
|
| 5 |
+
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Corpus reader for the XML version of the CHILDES corpus.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
__docformat__ = "epytext en"
|
| 14 |
+
|
| 15 |
+
import re
|
| 16 |
+
from collections import defaultdict
|
| 17 |
+
|
| 18 |
+
from nltk.corpus.reader.util import concat
|
| 19 |
+
from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader
|
| 20 |
+
from nltk.util import LazyConcatenation, LazyMap, flatten
|
| 21 |
+
|
| 22 |
+
# to resolve the namespace issue
|
| 23 |
+
NS = "http://www.talkbank.org/ns/talkbank"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class CHILDESCorpusReader(XMLCorpusReader):
    """
    Corpus reader for the XML version of the CHILDES corpus.
    The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
    version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
    Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
    (``nltk_data/corpora/CHILDES/``).

    For access to the file text use the usual nltk functions,
    ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
    """

    def __init__(self, root, fileids, lazy=True):
        # When lazy is True, the accessor methods return lazy sequences
        # (LazyMap / LazyConcatenation) that parse each XML file on demand;
        # when False, all files are parsed eagerly into plain lists.
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        # sent=None / pos=False select the flat-word view in _get_words.
        sent = None
        pos = False
        if not self._lazy:
            # NOTE(review): the eager path returns one word-list per file
            # (list of lists), while the lazy path below returns a single
            # flattened sequence — confirm which shape callers rely on.
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def tagged_words(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=False,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        # Same as words(), but pos=True makes _get_words emit (word, tag) pairs.
        sent = None
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of sentences or utterances, each
            encoded as a list of word strings.
        :rtype: list(list(str))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        # sent=True groups words by utterance (<u> element) instead of
        # flattening them.
        sent = True
        pos = False
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def tagged_sents(
        self,
        fileids=None,
        speaker="ALL",
        stem=False,
        relation=None,
        strip_space=True,
        replace=False,
    ):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        # Combination of sents() grouping and tagged_words() tagging.
        sent = True
        pos = True
        if not self._lazy:
            return [
                self._get_words(
                    fileid, speaker, sent, stem, relation, pos, strip_space, replace
                )
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace
        )
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))

    def corpus(self, fileids=None):
        """
        :return: the given file(s) as a dict of ``(corpus_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_corpus, self.abspaths(fileids))

    def _get_corpus(self, fileid):
        # The corpus-level metadata is stored as XML attributes on the
        # document's root element; copy them into a plain dict.
        results = dict()
        xmldoc = ElementTree.parse(fileid).getroot()
        for key, value in xmldoc.items():
            results[key] = value
        return results

    def participants(self, fileids=None):
        """
        :return: the given file(s) as a dict of
            ``(participant_property_key, value)``
        :rtype: list(dict)
        """
        if not self._lazy:
            return [self._get_participants(fileid) for fileid in self.abspaths(fileids)]
        return LazyMap(self._get_participants, self.abspaths(fileids))

    def _get_participants(self, fileid):
        # Build a two-level mapping: participant id -> attribute name -> value.
        # dictOfDicts gives an arbitrarily nested autovivifying defaultdict.
        def dictOfDicts():
            return defaultdict(dictOfDicts)

        xmldoc = ElementTree.parse(fileid).getroot()
        # getting participants' data from the <Participants> header element
        pat = dictOfDicts()
        for participant in xmldoc.findall(
            f".//{{{NS}}}Participants/{{{NS}}}participant"
        ):
            for (key, value) in participant.items():
                pat[participant.get("id")][key] = value
        return pat

    def age(self, fileids=None, speaker="CHI", month=False):
        """
        :return: the given file(s) as string or int
        :rtype: list or int

        :param month: If true, return months instead of year-month-date
        """
        if not self._lazy:
            return [
                self._get_age(fileid, speaker, month)
                for fileid in self.abspaths(fileids)
            ]
        get_age = lambda fileid: self._get_age(fileid, speaker, month)
        return LazyMap(get_age, self.abspaths(fileids))

    def _get_age(self, fileid, speaker, month):
        # Scan the participant headers for the requested speaker and read
        # its "age" attribute (ISO-8601 duration, e.g. "P2Y6M14D").
        # Returns None implicitly when the speaker has no usable age data
        # or is not present in the file.
        xmldoc = ElementTree.parse(fileid).getroot()
        for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"):
            try:
                if pat.get("id") == speaker:
                    age = pat.get("age")
                    if month:
                        age = self.convert_age(age)
                    return age
            # some files don't have age data
            except (TypeError, AttributeError) as e:
                return None

    def convert_age(self, age_year):
        "Calculate age in months from a string in CHILDES format"
        # Format is an ISO-8601-style duration: P<years>Y<months>M<days>D,
        # with the day component optional.
        m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
        age_month = int(m.group(1)) * 12 + int(m.group(2))
        try:
            # Round up to the next month when more than 15 days have passed.
            if int(m.group(3)) > 15:
                age_month += 1
        # some corpora don't have day information (group(3) is empty)
        except ValueError as e:
            pass
        return age_month

    def MLU(self, fileids=None, speaker="CHI"):
        """
        :return: the given file(s) as a floating number
        :rtype: list(float)
        """
        # Mean Length of Utterance: average number of morphemes per
        # utterance for the given speaker, one value per file.
        if not self._lazy:
            return [
                self._getMLU(fileid, speaker=speaker)
                for fileid in self.abspaths(fileids)
            ]
        get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker)
        return LazyMap(get_MLU, self.abspaths(fileids))

    def _getMLU(self, fileid, speaker):
        # Compute MLU over stemmed, POS-tagged, replacement-resolved
        # sentences, excluding unintelligible, empty and repeated utterances.
        sents = self._get_words(
            fileid,
            speaker=speaker,
            sent=True,
            stem=True,
            relation=False,
            pos=True,
            strip_space=True,
            replace=True,
        )
        results = []
        lastSent = []
        numFillers = 0
        sentDiscount = 0
        for sent in sents:
            posList = [pos for (word, pos) in sent]
            # skip any sentence containing unintelligible material (POS "unk")
            if any(pos == "unk" for pos in posList):
                continue
            # if the sentence is null
            elif sent == []:
                continue
            # if the sentence is the same as the last sent
            elif sent == lastSent:
                continue
            else:
                results.append([word for (word, pos) in sent])
                # count number of fillers (communicators tagged "co" and
                # untagged tokens); sentences containing them are discounted
                if len({"co", None}.intersection(posList)) > 0:
                    numFillers += posList.count("co")
                    numFillers += posList.count(None)
                    sentDiscount += 1
            lastSent = sent
        try:
            thisWordList = flatten(results)
            # count number of morphemes
            # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
            numWords = (
                len(flatten([word.split("-") for word in thisWordList])) - numFillers
            )
            numSents = len(results) - sentDiscount
            mlu = numWords / numSents
        except ZeroDivisionError:
            # no countable sentences: report an MLU of 0 rather than failing
            mlu = 0
        # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents}
        return mlu

    def _get_words(
        self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
    ):
        """Parse one CHILDES XML file and return its tokens.

        Workhorse behind words()/sents()/tagged_words()/tagged_sents()/MLU():
        the flags select the output shape (flat vs. per-utterance, plain
        strings vs. (word, tag) vs. relation tuples).
        """
        if (
            isinstance(speaker, str) and speaker != "ALL"
        ):  # ensure we have a list of speakers
            speaker = [speaker]
        xmldoc = ElementTree.parse(fileid).getroot()
        # processing each xml doc: one <u> element per utterance
        results = []
        for xmlsent in xmldoc.findall(".//{%s}u" % NS):
            sents = []
            # select speakers
            if speaker == "ALL" or xmlsent.get("who") in speaker:
                for xmlword in xmlsent.findall(".//{%s}w" % NS):
                    infl = None
                    suffixStem = None
                    suffixTag = None
                    # getting replaced words
                    # NOTE(review): Element truthiness tests length, not
                    # presence; an empty matching element is falsy — confirm
                    # this is the intended find() usage.
                    if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"):
                        xmlword = xmlsent.find(
                            f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w"
                        )
                    elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"):
                        xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk")
                    # get text
                    if xmlword.text:
                        word = xmlword.text
                    else:
                        word = ""
                    # strip tailing space
                    if strip_space:
                        word = word.strip()
                    # stem: replace the surface form with the <stem> text,
                    # when present
                    if relation or stem:
                        try:
                            xmlstem = xmlword.find(".//{%s}stem" % NS)
                            word = xmlstem.text
                        except AttributeError as e:
                            # no <stem> child: keep the surface form
                            pass
                        # if there is an inflection, append it as "-INFL"
                        try:
                            xmlinfl = xmlword.find(
                                f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk"
                            )
                            word += "-" + xmlinfl.text
                        except:
                            pass
                        # if there is a suffix (clitic), append it as "~SUFFIX"
                        try:
                            xmlsuffix = xmlword.find(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem"
                                % (NS, NS, NS, NS)
                            )
                            suffixStem = xmlsuffix.text
                        except AttributeError:
                            suffixStem = ""
                        if suffixStem:
                            word += "~" + suffixStem
                    # pos: build the tag, optionally with a subcategory
                    # ("c:s") and a clitic suffix tag ("~suffix")
                    if relation or pos:
                        try:
                            xmlpos = xmlword.findall(".//{%s}c" % NS)
                            xmlpos2 = xmlword.findall(".//{%s}s" % NS)
                            if xmlpos2 != []:
                                tag = xmlpos[0].text + ":" + xmlpos2[0].text
                            else:
                                tag = xmlpos[0].text
                        except (AttributeError, IndexError) as e:
                            tag = ""
                        try:
                            xmlsuffixpos = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c"
                                % (NS, NS, NS, NS, NS)
                            )
                            xmlsuffixpos2 = xmlword.findall(
                                ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s"
                                % (NS, NS, NS, NS, NS)
                            )
                            if xmlsuffixpos2:
                                suffixTag = (
                                    xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                                )
                            else:
                                suffixTag = xmlsuffixpos[0].text
                        except:
                            pass
                        if suffixTag:
                            tag += "~" + suffixTag
                        word = (word, tag)
                    # relational
                    # the gold standard is stored in
                    # <mor></mor><mor type="trn"><gra type="grt">
                    # Non-gold <gra> extends (word, tag) to a 3-tuple;
                    # gold ("grt") extends it further to a 6-tuple.
                    if relation == True:
                        for xmlstem_rel in xmlword.findall(
                            f".//{{{NS}}}mor/{{{NS}}}gra"
                        ):
                            if not xmlstem_rel.get("type") == "grt":
                                word = (
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                            else:
                                word = (
                                    word[0],
                                    word[1],
                                    word[2],
                                    word[0],
                                    word[1],
                                    xmlstem_rel.get("index")
                                    + "|"
                                    + xmlstem_rel.get("head")
                                    + "|"
                                    + xmlstem_rel.get("relation"),
                                )
                        try:
                            # same treatment for the clitic suffix relation
                            for xmlpost_rel in xmlword.findall(
                                f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra"
                            ):
                                if not xmlpost_rel.get("type") == "grt":
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                                else:
                                    suffixStem = (
                                        suffixStem[0],
                                        suffixStem[1],
                                        suffixStem[2],
                                        suffixStem[0],
                                        suffixStem[1],
                                        xmlpost_rel.get("index")
                                        + "|"
                                        + xmlpost_rel.get("head")
                                        + "|"
                                        + xmlpost_rel.get("relation"),
                                    )
                        except:
                            pass
                    sents.append(word)
            # group tokens per utterance for sents()/tagged_sents()/relation
            # output; otherwise flatten into one token stream
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
        return LazyMap(lambda x: x, results)

    # Ready-to-use browser opener

    """
    The base URL for viewing files on the childes website. This
    shouldn't need to be changed, unless CHILDES changes the configuration
    of their server or unless the user sets up their own corpus webserver.
    """
    childes_url_base = r"https://childes.talkbank.org/browser/index.php?url="

    def webview_file(self, fileid, urlbase=None):
        """Map a corpus file to its web version on the CHILDES website,
        and open it in a web browser.

        The complete URL to be used is:
            childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

        If no urlbase is passed, we try to calculate it. This
        requires that the childes corpus was set up to mirror the
        folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.:
        nltk_data/corpora/childes/Eng-USA/Cornell/??? or
        nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

        The function first looks (as a special case) if "Eng-USA" is
        on the path consisting of <corpus root>+fileid; then if
        "childes", possibly followed by "data-xml", appears. If neither
        one is found, we use the unmodified fileid and hope for the best.
        If this is not right, specify urlbase explicitly, e.g., if the
        corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.
        """

        import webbrowser

        if urlbase:
            path = urlbase + "/" + fileid
        else:
            full = self.root + "/" + fileid
            # normalize Windows path separators before matching
            full = re.sub(r"\\", "/", full)
            if "/childes/" in full.lower():
                # Discard /data-xml/ if present
                path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0]
            elif "eng-usa" in full.lower():
                # NOTE(review): the inline "(?i)" appearing mid-pattern is
                # rejected by re on Python 3.11+ — confirm/move to the front.
                path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0]
            else:
                path = fileid

        # Strip ".xml" and add ".cha", as necessary:
        if path.endswith(".xml"):
            path = path[:-4]

        if not path.endswith(".cha"):
            path = path + ".cha"

        url = self.childes_url_base + path

        webbrowser.open_new_tab(url)
        print("Opening in browser:", url)
        # Pausing is a good idea, but it's up to the user...
        # raw_input("Hit Return to continue")
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        # Fall back to the default install location inside nltk_data.
        from nltk.data import find

        corpus_root = find("corpora/childes/data-xml/Eng-USA/")

    try:
        childes = CHILDESCorpusReader(corpus_root, ".*.xml")
        # Describe the first few files of the corpus.
        for fileid in childes.fileids()[:5]:
            # Corpus-level metadata is a dict of XML root attributes.
            header = childes.corpus(fileid)[0]
            corpus = header.get("Corpus", "")
            corpus_id = header.get("Id", "")
            print("Reading", corpus, corpus_id, " .....")
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker="MOT")[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker="CHI")[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for participant, attributes in childes.participants(fileid)[0].items():
                for attr_name, attr_value in attributes.items():
                    print("\tparticipant", participant, attr_name, ":", attr_value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
        downloaded from https://childes.talkbank.org/data-xml/ and saved at
        [NLTK_Data_Dir]/corpora/childes/
        Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
        demo('/path/to/childes/data-xml/Eng-USA/")
        """
        )
        # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip')
        # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read()))
        ##this fails
        # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist())


if __name__ == "__main__":
    demo()
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/chunked.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Chunked Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
A reader for corpora that contain chunked (and optionally tagged)
|
| 11 |
+
documents.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import codecs
|
| 15 |
+
import os.path
|
| 16 |
+
|
| 17 |
+
import nltk
|
| 18 |
+
from nltk.chunk import tagstr2tree
|
| 19 |
+
from nltk.corpus.reader.api import *
|
| 20 |
+
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
|
| 21 |
+
from nltk.corpus.reader.util import *
|
| 22 |
+
from nltk.tokenize import *
|
| 23 |
+
from nltk.tree import Tree
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.

    Paragraphs are split using a block reader, then tokenized into
    sentences with a sentence tokenizer, and finally each sentence is
    parsed into a chunk tree by a string-to-chunktree conversion
    function.  Every step accepts a custom callable; the defaults
    split paragraphs on blank lines, treat one line as one sentence,
    and parse sentences with ``nltk.chunk.tagstr2tree``.
    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        # Arguments for corpus views generated by this corpus: a tuple
        # (str2chunktree, sent_tokenizer, para_block_reader, tagset).
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)

    def _views(self, fileids, tagged, group_by_sent, group_by_para, chunked, tagset=None):
        # Shared helper: build one ChunkedCorpusView per file and
        # concatenate them.  The four flags select which structure the
        # view yields (tags kept?, grouped by sentence?, grouped by
        # paragraph?, chunk trees kept?).
        return concat(
            [
                ChunkedCorpusView(
                    fileid,
                    enc,
                    tagged,
                    group_by_sent,
                    group_by_para,
                    chunked,
                    *self._cv_args,
                    target_tagset=tagset,
                )
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, 0, 0, 0, 0)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, 0, 1, 0, 0)

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return self._views(fileids, 0, 1, 1, 0)

    def tagged_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        """
        return self._views(fileids, 1, 0, 0, 0, tagset)

    def tagged_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))
        """
        return self._views(fileids, 1, 1, 0, 0, tagset)

    def tagged_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        """
        return self._views(fileids, 1, 1, 1, 0, tagset)

    def chunked_words(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        """
        return self._views(fileids, 1, 0, 0, 1, tagset)

    def chunked_sents(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has
            no tags).
        :rtype: list(Tree)
        """
        return self._views(fileids, 1, 1, 0, 1, tagset)

    def chunked_paras(self, fileids=None, tagset=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        """
        return self._views(fileids, 1, 1, 1, 1, tagset)

    def _read_block(self, stream):
        # One block is one blank-line-separated paragraph, parsed into
        # a list of chunk trees.
        return [tagstr2tree(chunk_str) for chunk_str in read_blankline_block(stream)]
| 203 |
+
|
| 204 |
+
|
| 205 |
+
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    A stream-backed corpus view for chunked corpora.  Four boolean
    flags (``tagged``, ``group_by_sent``, ``group_by_para``,
    ``chunked``) select how each paragraph block is flattened or
    nested before being yielded.
    """

    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        # Output-shape flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        # Parsing callables and tagset mapping arguments.
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        """Read one paragraph block and shape it per the view flags."""
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                tree = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # Drop the tags unless a tagged view was requested.
                if not self._tagged:
                    tree = self._untag(tree)

                # Keep the chunk tree, or flatten to its leaves.
                sent = tree if self._chunked else tree.leaves()

                # Nested sentence, or splice words directly into para.
                (para.append if self._group_by_sent else para.extend)(sent)

            # Nested paragraph, or splice contents directly into block.
            (block.append if self._group_by_para else block.extend)(para)

        return block

    def _untag(self, tree):
        """Strip tags in place, replacing each ``(word, tag)`` leaf with ``word``."""
        for idx in range(len(tree)):
            node = tree[idx]
            if isinstance(node, Tree):
                self._untag(node)
            elif isinstance(node, tuple):
                tree[idx] = node[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/cmudict.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6]
|
| 10 |
+
ftp://ftp.cs.cmu.edu/project/speech/dict/
|
| 11 |
+
Copyright 1998 Carnegie Mellon University
|
| 12 |
+
|
| 13 |
+
File Format: Each line consists of an uppercased word, a counter
|
| 14 |
+
(for alternative pronunciations), and a transcription. Vowels are
|
| 15 |
+
marked for stress (1=primary, 2=secondary, 0=no stress). E.g.:
|
| 16 |
+
NATURAL 1 N AE1 CH ER0 AH0 L
|
| 17 |
+
|
| 18 |
+
The dictionary contains 127069 entries. Of these, 119400 words are assigned
|
| 19 |
+
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
|
| 20 |
+
three or more pronunciations. Many of these are fast-speech variants.
|
| 21 |
+
|
| 22 |
+
Phonemes: There are 39 phonemes, as shown below:
|
| 23 |
+
|
| 24 |
+
Phoneme Example Translation Phoneme Example Translation
|
| 25 |
+
------- ------- ----------- ------- ------- -----------
|
| 26 |
+
AA odd AA D AE at AE T
|
| 27 |
+
AH hut HH AH T AO ought AO T
|
| 28 |
+
AW cow K AW AY hide HH AY D
|
| 29 |
+
B be B IY CH cheese CH IY Z
|
| 30 |
+
D dee D IY DH thee DH IY
|
| 31 |
+
EH Ed EH D ER hurt HH ER T
|
| 32 |
+
EY ate EY T F fee F IY
|
| 33 |
+
G green G R IY N HH he HH IY
|
| 34 |
+
IH it IH T IY eat IY T
|
| 35 |
+
JH gee JH IY K key K IY
|
| 36 |
+
L lee L IY M me M IY
|
| 37 |
+
N knee N IY NG ping P IH NG
|
| 38 |
+
OW oat OW T OY toy T OY
|
| 39 |
+
P pee P IY R read R IY D
|
| 40 |
+
S sea S IY SH she SH IY
|
| 41 |
+
T tea T IY TH theta TH EY T AH
|
| 42 |
+
UH hood HH UH D UW two T UW
|
| 43 |
+
V vee V IY W we W IY
|
| 44 |
+
Y yield Y IY L D Z zee Z IY
|
| 45 |
+
ZH seizure S IY ZH ER
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
from nltk.corpus.reader.api import *
|
| 49 |
+
from nltk.corpus.reader.util import *
|
| 50 |
+
from nltk.util import Index
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class CMUDictCorpusReader(CorpusReader):
    """Corpus reader for the CMU Pronouncing Dictionary."""

    def entries(self):
        """
        :return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        views = [
            StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc)
            for fileid, enc in self.abspaths(None, True)
        ]
        return concat(views)

    def words(self):
        """
        :return: a list of all words defined in the cmudict lexicon.
        """
        return [entry[0].lower() for entry in self.entries()]

    def dict(self):
        """
        :return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        return dict(Index(self.entries()))
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def read_cmudict_block(stream):
    """Read up to 100 cmudict entries from *stream*.

    Each entry is a ``(word, phonemes)`` tuple where *word* is the
    lowercased headword and *phonemes* is the list of phoneme strings.
    The second column of each line (the alternative-pronunciation
    counter) is discarded.
    """
    entries = []
    # Cap each block at 100 entries so the stream-backed view stays lazy.
    while len(entries) < 100:
        line = stream.readline()
        if not line:
            break  # end of file
        fields = line.split()
        entries.append((fields[0].lower(), fields[2:]))
    return entries
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/comparative_sents.py
ADDED
|
@@ -0,0 +1,309 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Comparative Sentence Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
|
| 5 |
+
# URL: <https://www.nltk.org/>
|
| 6 |
+
# For license information, see LICENSE.TXT
|
| 7 |
+
|
| 8 |
+
"""
|
| 9 |
+
CorpusReader for the Comparative Sentence Dataset.
|
| 10 |
+
|
| 11 |
+
- Comparative Sentence Dataset information -
|
| 12 |
+
|
| 13 |
+
Annotated by: Nitin Jindal and Bing Liu, 2006.
|
| 14 |
+
Department of Computer Science
|
| 15 |
+
University of Illinois at Chicago
|
| 16 |
+
|
| 17 |
+
Contact: Nitin Jindal, njindal@cs.uic.edu
|
| 18 |
+
Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
|
| 19 |
+
|
| 20 |
+
Distributed with permission.
|
| 21 |
+
|
| 22 |
+
Related papers:
|
| 23 |
+
|
| 24 |
+
- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
|
| 25 |
+
Proceedings of the ACM SIGIR International Conference on Information Retrieval
|
| 26 |
+
(SIGIR-06), 2006.
|
| 27 |
+
|
| 28 |
+
- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
|
| 29 |
+
Proceedings of Twenty First National Conference on Artificial Intelligence
|
| 30 |
+
(AAAI-2006), 2006.
|
| 31 |
+
|
| 32 |
+
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
|
| 33 |
+
Proceedings of the 22nd International Conference on Computational Linguistics
|
| 34 |
+
(Coling-2008), Manchester, 18-22 August, 2008.
|
| 35 |
+
"""
|
| 36 |
+
import re
|
| 37 |
+
|
| 38 |
+
from nltk.corpus.reader.api import *
|
| 39 |
+
from nltk.tokenize import *
|
| 40 |
+
|
| 41 |
+
# Regular expressions for dataset components.
# Section separator: a line of asterisks (used to skip header sections
# in _read_sent_block).
STARS = re.compile(r"^\*+$")
# Opening tag for any of the four comparison types.
COMPARISON = re.compile(r"<cs-[1234]>")
# Matching closing tag.
CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
# Gradable comparisons (types 1-3) carry entity/feature relation lines.
GRAD_COMPARISON = re.compile(r"<cs-[123]>")
# Non-gradable comparison (type 4) has no relation line.
NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
# "N_text" annotations on a relation line; N selects the field
# (1=entity_1, 2=entity_2, 3=feature -- see _read_comparison_block).
ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
# Trailing "(keyword)" at the end of a relation line.
KEYWORD = re.compile(r"\(([^\(]*)\)$")
| 50 |
+
|
| 51 |
+
class Comparison:
    """
    A Comparison represents a comparative sentence and its constituents.
    """

    def __init__(
        self,
        text=None,
        comp_type=None,
        entity_1=None,
        entity_2=None,
        feature=None,
        keyword=None,
    ):
        """
        :param text: a string (optionally tokenized) containing a comparison.
        :param comp_type: an integer defining the type of comparison expressed.
            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
            4 (Non-gradable).
        :param entity_1: the first entity considered in the comparison relation.
        :param entity_2: the second entity considered in the comparison relation.
        :param feature: the feature considered in the comparison relation.
        :param keyword: the word or phrase which is used for that comparative relation.
        """
        self.text = text
        self.comp_type = comp_type
        self.entity_1 = entity_1
        self.entity_2 = entity_2
        self.feature = feature
        self.keyword = keyword

    def __repr__(self):
        # Note: every field (including None and ints for entities/feature)
        # is interpolated inside double quotes except comp_type.
        return (
            f'Comparison(text="{self.text}", comp_type={self.comp_type}, '
            f'entity_1="{self.entity_1}", entity_2="{self.entity_2}", '
            f'feature="{self.feature}", keyword="{self.keyword}")'
        )
| 95 |
+
|
| 96 |
+
class ComparativeSentencesCorpusReader(CorpusReader):
    """
    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).

    >>> from nltk.corpus import comparative_sentences
    >>> comparison = comparative_sentences.comparisons()[0]
    >>> comparison.text # doctest: +NORMALIZE_WHITESPACE
    ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly',
    'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve",
    'had', '.']
    >>> comparison.entity_2
    'models'
    >>> (comparison.feature, comparison.keyword)
    ('rewind', 'more')
    >>> len(comparative_sentences.comparisons())
    853
    """

    # View class used for all lazy file access; subclasses may override.
    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param fileids: a list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: tokenizer for breaking sentences or paragraphs
            into words. Default: `WhitespaceTokenizer`
        :param sent_tokenizer: tokenizer for breaking paragraphs into sentences.
        :param encoding: the encoding that should be used to read the corpus.
        """

        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._readme = "README.txt"

    def comparisons(self, fileids=None):
        """
        Return all comparisons in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            comparisons have to be returned.
        :return: the given file(s) as a list of Comparison objects.
        :rtype: list(Comparison)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_comparison_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def keywords(self, fileids=None):
        """
        Return a set of all keywords used in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            keywords have to be returned.
        :return: the set of keywords and comparative phrases used in the corpus.
        :rtype: set(str)
        """
        all_keywords = concat(
            [
                self.CorpusView(path, self._read_keyword_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

        # Drop None keywords (non-gradable comparisons have no keyword)
        # and normalize case.
        keywords_set = {keyword.lower() for keyword in all_keywords if keyword}
        return keywords_set

    def keywords_readme(self):
        """
        Return the list of words and constituents considered as clues of a
        comparison (from listOfkeywords.txt).
        """
        keywords = []
        with self.open("listOfkeywords.txt") as fp:
            raw_text = fp.read()
        # Skip blank lines and "//" comment lines in the keyword file.
        for line in raw_text.split("\n"):
            if not line or line.startswith("//"):
                continue
            keywords.append(line.strip())
        return keywords

    def sents(self, fileids=None):
        """
        Return all sentences in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: all sentences of the corpus as lists of tokens (or as plain
            strings, if no word tokenizer is specified).
        :rtype: list(list(str)) or list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None):
        """
        Return all words and punctuation symbols in the corpus.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_comparison_block(self, stream):
        # Scan forward to the next annotated sentence.  The on-disk layout
        # (as this parser reads it) is: a line with <cs-N> tags, then the
        # sentence text, then the closing tags, then -- for gradable
        # comparisons only -- one relation line per <cs-N> tag.
        while True:
            line = stream.readline()
            if not line:
                return []  # end of file.
            comparison_tags = re.findall(COMPARISON, line)
            if comparison_tags:
                grad_comparisons = re.findall(GRAD_COMPARISON, line)
                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
                # Advance to the next line (it contains the comparative sentence)
                comparison_text = stream.readline().strip()
                if self._word_tokenizer:
                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
                # Skip the next line (it contains closing comparison tags)
                stream.readline()
                # If gradable comparisons are found, create Comparison instances
                # and populate their fields
                comparison_bundle = []
                if grad_comparisons:
                    # Each comparison tag has its own relations on a separate line
                    for comp in grad_comparisons:
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        line = stream.readline()
                        entities_feats = ENTITIES_FEATS.findall(line)
                        if entities_feats:
                            # Codes 1/2/3 select entity_1/entity_2/feature.
                            for (code, entity_feat) in entities_feats:
                                if code == "1":
                                    comparison.entity_1 = entity_feat.strip()
                                elif code == "2":
                                    comparison.entity_2 = entity_feat.strip()
                                elif code == "3":
                                    comparison.feature = entity_feat.strip()
                        keyword = KEYWORD.findall(line)
                        if keyword:
                            comparison.keyword = keyword[0]
                        comparison_bundle.append(comparison)
                # If non-gradable comparisons are found, create a simple Comparison
                # instance for each one
                if non_grad_comparisons:
                    for comp in non_grad_comparisons:
                        # comp_type in this case should always be 4.
                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
                        comparison = Comparison(
                            text=comparison_text, comp_type=comp_type
                        )
                        comparison_bundle.append(comparison)
                # Flatten the list of comparisons before returning them
                # return concat([comparison_bundle])
                return comparison_bundle

    def _read_keyword_block(self, stream):
        # One block of keywords = the keywords of one comparison block.
        keywords = []
        for comparison in self._read_comparison_block(stream):
            keywords.append(comparison.keyword)
        return keywords

    def _read_sent_block(self, stream):
        # Yield plain sentence lines, skipping "*****"-delimited sections
        # and every annotation line (tags, relations, closing tags).
        while True:
            line = stream.readline()
            if re.match(STARS, line):
                # Skip everything up to and including the next stars line.
                while True:
                    line = stream.readline()
                    if re.match(STARS, line):
                        break
                continue
            if (
                not re.findall(COMPARISON, line)
                and not ENTITIES_FEATS.findall(line)
                and not re.findall(CLOSE_COMPARISON, line)
            ):
                if self._sent_tokenizer:
                    return [
                        self._word_tokenizer.tokenize(sent)
                        for sent in self._sent_tokenizer.tokenize(line)
                    ]
                else:
                    return [self._word_tokenizer.tokenize(line)]

    def _read_word_block(self, stream):
        # Flatten one sentence block into a single token list.
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/conll.py
ADDED
|
@@ -0,0 +1,579 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: CONLL Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Steven Bird <stevenbird1@gmail.com>
|
| 5 |
+
# Edward Loper <edloper@gmail.com>
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Read CoNLL-style chunk fileids.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import textwrap
|
| 14 |
+
|
| 15 |
+
from nltk.corpus.reader.api import *
|
| 16 |
+
from nltk.corpus.reader.util import *
|
| 17 |
+
from nltk.tag import map_tag
|
| 18 |
+
from nltk.tree import Tree
|
| 19 |
+
from nltk.util import LazyConcatenation, LazyMap
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class ConllCorpusReader(CorpusReader):
    """
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``\'\t\'``).


    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    """

    # /////////////////////////////////////////////////////////////////
    # Column Types
    # /////////////////////////////////////////////////////////////////

    WORDS = "words"  #: column type for words
    POS = "pos"  #: column type for part-of-speech tags
    TREE = "tree"  #: column type for parse trees
    CHUNK = "chunk"  #: column type for chunk structures
    NE = "ne"  #: column type for named entities
    SRL = "srl"  #: column type for semantic role labels
    IGNORE = "ignore"  #: column type for column that should be ignored

    #: A list of all column types supported by the conll corpus reader.
    COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE)

    # /////////////////////////////////////////////////////////////////
    # Constructor
    # /////////////////////////////////////////////////////////////////

    def __init__(
        self,
        root,
        fileids,
        columntypes,
        chunk_types=None,
        root_label="S",
        pos_in_tree=False,
        srl_includes_roleset=True,
        encoding="utf8",
        tree_class=Tree,
        tagset=None,
        separator=None,
    ):
        """
        :param root: the root directory of the corpus.
        :param fileids: the files that make up the corpus.
        :param columntypes: sequence of column-type names (drawn from
            ``COLUMN_TYPES``) giving the meaning of each column.
        :param chunk_types: chunk type(s) to keep; a single string is
            wrapped in a list.
        :param root_label: node label used for the root of chunk trees.
        :param pos_in_tree: whether parse trees keep POS tags as nodes.
        :param srl_includes_roleset: whether the SRL column block starts
            with a roleset column.
        :param tree_class: class used to build parse trees from strings.
        :param separator: string to split columns on (``None`` splits on
            consecutive whitespace).
        :raises ValueError: if any column type is not in ``COLUMN_TYPES``.
        """
        for columntype in columntypes:
            if columntype not in self.COLUMN_TYPES:
                raise ValueError("Bad column type %r" % columntype)
        if isinstance(chunk_types, str):
            chunk_types = [chunk_types]
        self._chunk_types = chunk_types
        # Map each column-type name to its column index in the grid.
        self._colmap = {c: i for (i, c) in enumerate(columntypes)}
        self._pos_in_tree = pos_in_tree
        self._root_label = root_label  # for chunks
        self._srl_includes_roleset = srl_includes_roleset
        self._tree_class = tree_class
        CorpusReader.__init__(self, root, fileids, encoding)
        self._tagset = tagset
        self.sep = separator

    # /////////////////////////////////////////////////////////////////
    # Data Access Methods
    # /////////////////////////////////////////////////////////////////

    def words(self, fileids=None):
        """Return a (lazy) flat list of all words."""
        self._require(self.WORDS)
        return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids)))

    def sents(self, fileids=None):
        """Return a (lazy) list of sentences, each a list of words."""
        self._require(self.WORDS)
        return LazyMap(self._get_words, self._grids(fileids))

    def tagged_words(self, fileids=None, tagset=None):
        """Return a (lazy) flat list of (word, pos) tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            # Capture ``tagset`` as a local so the LazyMap callable
            # carries the requested mapping.
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))

    def tagged_sents(self, fileids=None, tagset=None):
        """Return a (lazy) list of sentences of (word, pos) tuples."""
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyMap(get_tagged_words, self._grids(fileids))

    def chunked_words(self, fileids=None, chunk_types=None, tagset=None):
        """Return a (lazy) flat list of chunk trees and (word, pos) leaves."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids)))

    def chunked_sents(self, fileids=None, chunk_types=None, tagset=None):
        """Return a (lazy) list of chunk trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.CHUNK)
        if chunk_types is None:
            chunk_types = self._chunk_types

        def get_chunked_words(grid):  # capture chunk_types as local var
            return self._get_chunked_words(grid, chunk_types, tagset)

        return LazyMap(get_chunked_words, self._grids(fileids))

    def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None):
        """Return a (lazy) list of parse trees, one per sentence."""
        self._require(self.WORDS, self.POS, self.TREE)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_parsed_sent(grid):  # capture pos_in_tree as local var
            return self._get_parsed_sent(grid, pos_in_tree, tagset)

        return LazyMap(get_parsed_sent, self._grids(fileids))

    def srl_spans(self, fileids=None):
        """Return a (lazy) list of SRL span lists, one per sentence."""
        self._require(self.SRL)
        return LazyMap(self._get_srl_spans, self._grids(fileids))

    def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
        """Return SRL instances; flattened to one list unless ``flatten``
        is false, in which case one ``ConllSRLInstanceList`` per sentence."""
        self._require(self.WORDS, self.POS, self.TREE, self.SRL)
        if pos_in_tree is None:
            pos_in_tree = self._pos_in_tree

        def get_srl_instances(grid):  # capture pos_in_tree as local var
            return self._get_srl_instances(grid, pos_in_tree)

        result = LazyMap(get_srl_instances, self._grids(fileids))
        if flatten:
            result = LazyConcatenation(result)
        return result

    def iob_words(self, fileids=None, tagset=None):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))

    def iob_sents(self, fileids=None, tagset=None):
        """
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset)

        return LazyMap(get_iob_words, self._grids(fileids))

    # /////////////////////////////////////////////////////////////////
    # Grid Reading
    # /////////////////////////////////////////////////////////////////

    def _grids(self, fileids=None):
        """Return a (lazy) concatenated view of per-sentence grids."""
        # n.b.: we could cache the object returned here (keyed on
        # fileids), which would let us reuse the same corpus view for
        # different things (eg srl and parse trees).
        return concat(
            [
                StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def _read_grid_block(self, stream):
        """Read one blank-line-delimited block from ``stream`` and return
        its sentences as grids (lists of column-value rows).

        :raises ValueError: if rows in a sentence have differing widths.
        """
        grids = []
        for block in read_blankline_block(stream):
            block = block.strip()
            if not block:
                continue

            grid = [line.split(self.sep) for line in block.split("\n")]

            # If there's a docstart row, then discard. ([xx] eventually it
            # would be good to actually use it)
            if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-":
                del grid[0]

            # Check that the grid is consistent.
            for row in grid:
                if len(row) != len(grid[0]):
                    raise ValueError("Inconsistent number of columns:\n%s" % block)
            grids.append(grid)
        return grids

    # /////////////////////////////////////////////////////////////////
    # Transforms
    # /////////////////////////////////////////////////////////////////
    # given a grid, transform it into some representation (e.g.,
    # a list of words or a parse tree).

    def _get_words(self, grid):
        # Extract the word column from a single sentence grid.
        return self._get_column(grid, self._colmap["words"])

    def _get_tagged_words(self, grid, tagset=None):
        """Return the sentence as (word, pos) tuples, mapping tags to
        ``tagset`` when it differs from the corpus tagset."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags))

    def _get_iob_words(self, grid, tagset=None):
        """Return the sentence as (word, pos, chunk-IOB) tuples."""
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        return list(
            zip(
                self._get_column(grid, self._colmap["words"]),
                pos_tags,
                self._get_column(grid, self._colmap["chunk"]),
            )
        )

    def _get_chunked_words(self, grid, chunk_types, tagset=None):
        """Build a shallow chunk tree for one sentence from its IOB tags.

        Chunk types not in ``chunk_types`` are flattened into the root.
        """
        # n.b.: this method is very similar to conllstr2tree.
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        chunk_tags = self._get_column(grid, self._colmap["chunk"])

        # stack[0] is the root; stack[-1] is the chunk currently open
        # (the stack never grows deeper than two levels).
        stack = [Tree(self._root_label, [])]

        for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags):
            if chunk_tag == "O":
                state, chunk_type = "O", ""
            else:
                (state, chunk_type) = chunk_tag.split("-")
            # If it's a chunk we don't care about, treat it as O.
            if chunk_types is not None and chunk_type not in chunk_types:
                state = "O"
            # Treat a mismatching I like a B.
            if state == "I" and chunk_type != stack[-1].label():
                state = "B"
            # For B or I: close any open chunks
            if state in "BO" and len(stack) == 2:
                stack.pop()
            # For B: start a new chunk.
            if state == "B":
                new_chunk = Tree(chunk_type, [])
                stack[-1].append(new_chunk)
                stack.append(new_chunk)
            # Add the word token.
            stack[-1].append((word, pos_tag))

        return stack[0]

    def _get_parsed_sent(self, grid, pos_in_tree, tagset=None):
        """Assemble a parse tree for one sentence from its tree column.

        When ``pos_in_tree`` is false, single-word POS subtrees are
        replaced by (word, pos) leaf tuples.
        """
        words = self._get_column(grid, self._colmap["words"])
        pos_tags = self._get_column(grid, self._colmap["pos"])
        if tagset and tagset != self._tagset:
            pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags]
        parse_tags = self._get_column(grid, self._colmap["tree"])

        treestr = ""
        for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags):
            # Escape literal parentheses so they don't break the
            # bracketed tree string.
            if word == "(":
                word = "-LRB-"
            if word == ")":
                word = "-RRB-"
            if pos_tag == "(":
                pos_tag = "-LRB-"
            if pos_tag == ")":
                pos_tag = "-RRB-"
            (left, right) = parse_tag.split("*")
            right = right.count(")") * ")"  # only keep ')'.
            treestr += f"{left} ({pos_tag} {word}) {right}"
        try:
            tree = self._tree_class.fromstring(treestr)
        except (ValueError, IndexError):
            # Fall back to wrapping the fragment under an explicit root.
            tree = self._tree_class.fromstring(f"({self._root_label} {treestr})")

        if not pos_in_tree:
            for subtree in tree.subtrees():
                for i, child in enumerate(subtree):
                    if (
                        isinstance(child, Tree)
                        and len(child) == 1
                        and isinstance(child[0], str)
                    ):
                        subtree[i] = (child[0], child.label())

        return tree

    def _get_srl_spans(self, grid):
        """
        list of list of (start, end), tag) tuples
        """
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            start_col = self._colmap["srl"] + 2
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            start_col = self._colmap["srl"] + 1

        # Count how many predicates there are.  This tells us how many
        # columns to expect for SRL data.
        num_preds = len([p for p in predicates if p != "-"])

        spanlists = []
        for i in range(num_preds):
            col = self._get_column(grid, start_col + i)
            spanlist = []
            stack = []
            for wordnum, srl_tag in enumerate(col):
                # Each cell is "(TAG1(TAG2*)…": opens before '*', closes after.
                (left, right) = srl_tag.split("*")
                for tag in left.split("("):
                    if tag:
                        stack.append((tag, wordnum))
                for i in range(right.count(")")):
                    (tag, start) = stack.pop()
                    spanlist.append(((start, wordnum + 1), tag))
            spanlists.append(spanlist)

        return spanlists

    def _get_srl_instances(self, grid, pos_in_tree):
        """Build a ``ConllSRLInstanceList`` for one sentence, pairing each
        predicate with the span list whose V/C-V span covers it.

        :raises ValueError: if no span list matches a predicate.
        """
        tree = self._get_parsed_sent(grid, pos_in_tree)
        spanlists = self._get_srl_spans(grid)
        if self._srl_includes_roleset:
            predicates = self._get_column(grid, self._colmap["srl"] + 1)
            rolesets = self._get_column(grid, self._colmap["srl"])
        else:
            predicates = self._get_column(grid, self._colmap["srl"])
            rolesets = [None] * len(predicates)

        instances = ConllSRLInstanceList(tree)
        for wordnum, predicate in enumerate(predicates):
            if predicate == "-":
                continue
            # Decide which spanlist to use.  Don't assume that they're
            # sorted in the same order as the predicates (even though
            # they usually are).
            for spanlist in spanlists:
                for (start, end), tag in spanlist:
                    if wordnum in range(start, end) and tag in ("V", "C-V"):
                        break
                else:
                    continue
                break
            else:
                raise ValueError("No srl column found for %r" % predicate)
            instances.append(
                ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)
            )

        return instances

    # /////////////////////////////////////////////////////////////////
    # Helper Methods
    # /////////////////////////////////////////////////////////////////

    def _require(self, *columntypes):
        """Raise ``ValueError`` unless every given column type is present."""
        for columntype in columntypes:
            if columntype not in self._colmap:
                raise ValueError(
                    "This corpus does not contain a %s " "column." % columntype
                )

    @staticmethod
    def _get_column(grid, column_index):
        """Return one column of ``grid`` as a list of cell values."""
        return [grid[i][column_index] for i in range(len(grid))]
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
class ConllSRLInstance:
    """
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    """

    # [xx] add inst.core_arguments, inst.argm_arguments?

    def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans):
        """
        :param tree: the parse tree of the sentence containing the instance.
        :param verb_head: word index of the verb's head word.
        :param verb_stem: the stem of the predicate verb.
        :param roleset: the PropBank-style roleset identifier (may be None).
        :param tagged_spans: list of ``((start, end), tag)`` tuples covering
            both the verb pieces (tags "V"/"C-V") and the arguments.
        """
        self.verb = []
        """A list of the word indices of the words that compose the
        verb whose arguments are identified by this instance.
        This will contain multiple word indices when multi-word
        verbs are used (e.g. 'turn on')."""

        self.verb_head = verb_head
        """The word index of the head word of the verb whose arguments
        are identified by this instance.  E.g., for a sentence that
        uses the verb 'turn on,' ``verb_head`` will be the word index
        of the word 'turn'."""

        self.verb_stem = verb_stem

        self.roleset = roleset

        self.arguments = []
        """A list of ``(argspan, argid)`` tuples, specifying the location
        and type for each of the arguments identified by this
        instance.  ``argspan`` is a tuple ``start, end``, indicating
        that the argument consists of the ``words[start:end]``."""

        self.tagged_spans = tagged_spans
        """A list of ``(span, id)`` tuples, specifying the location and
        type for each of the arguments, as well as the verb pieces,
        that make up this instance."""

        self.tree = tree
        """The parse tree for the sentence containing this instance."""

        self.words = tree.leaves()
        """A list of the words in the sentence containing this
        instance."""

        # Fill in the self.verb and self.arguments values.
        # Spans tagged V/C-V are verb pieces; everything else is an argument.
        for (start, end), tag in tagged_spans:
            if tag in ("V", "C-V"):
                self.verb += list(range(start, end))
            else:
                self.arguments.append(((start, end), tag))

    def __repr__(self):
        # Pluralize "argument" when there is not exactly one.
        plural = "s" if len(self.arguments) != 1 else ""
        return "<ConllSRLInstance for %r with %d argument%s>" % (
            (self.verb_stem, len(self.arguments), plural)
        )

    def pprint(self):
        """Return a human-readable rendering of the instance: the sentence
        with the verb marked ``<<…>>`` and each argument bracketed and
        labelled with its argument id."""
        verbstr = " ".join(self.words[i][0] for i in self.verb)
        hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n"
        s = ""
        for i, word in enumerate(self.words):
            # Leaves may be (word, pos) tuples; render just the word.
            if isinstance(word, tuple):
                word = word[0]
            for (start, end), argid in self.arguments:
                if i == start:
                    s += "[%s " % argid
                if i == end:
                    s += "] "
            if i in self.verb:
                word = "<<%s>>" % word
            s += word + " "
        return hdr + textwrap.fill(
            s.replace(" ]", "]"), initial_indent="    ", subsequent_indent="    "
        )
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
class ConllSRLInstanceList(list):
    """
    Set of instances for a single sentence
    """

    def __init__(self, tree, instances=()):
        """
        :param tree: the parse tree shared by every instance in the list.
        :param instances: optional initial sequence of ``ConllSRLInstance``.
        """
        self.tree = tree
        list.__init__(self, instances)

    def __str__(self):
        return self.pprint()

    def pprint(self, include_tree=False):
        """Return a CoNLL-style column rendering of all instances.

        :param include_tree: if true, prepend word / POS / syntax columns
            reconstructed from ``self.tree``.
        :raises ValueError: if any instance's tree differs from ``self.tree``.
        """
        # Sanity check: trees should be the same
        for inst in self:
            if inst.tree != self.tree:
                raise ValueError("Tree mismatch!")

        # The word list drives the row loop below, so it must be bound
        # unconditionally.  (Previously it was only assigned inside the
        # ``include_tree`` branch, so pprint(include_tree=False) raised
        # NameError.)
        words = self.tree.leaves()

        # If desired, add trees:
        if include_tree:
            pos = [None] * len(words)
            synt = ["*"] * len(words)
            self._tree2conll(self.tree, 0, words, pos, synt)

        s = ""
        for i in range(len(words)):
            # optional tree columns
            if include_tree:
                s += "%-20s " % words[i]
                s += "%-8s " % pos[i]
                s += "%15s*%-8s " % tuple(synt[i].split("*"))

            # verb head column
            for inst in self:
                if i == inst.verb_head:
                    s += "%-20s " % inst.verb_stem
                    break
            else:
                s += "%-20s " % "-"
            # Remaining columns: one span column per instance.
            for inst in self:
                argstr = "*"
                for (start, end), argid in inst.tagged_spans:
                    if i == start:
                        argstr = f"({argid}{argstr}"
                    if i == (end - 1):
                        argstr += ")"
                s += "%-12s " % argstr
            s += "\n"
        return s

    def _tree2conll(self, tree, wordnum, words, pos, synt):
        """Walk ``tree`` depth-first, filling the parallel ``pos`` and
        ``synt`` column lists in place; return the word index just past
        this subtree."""
        assert isinstance(tree, Tree)
        if len(tree) == 1 and isinstance(tree[0], str):
            pos[wordnum] = tree.label()
            assert words[wordnum] == tree[0]
            return wordnum + 1
        elif len(tree) == 1 and isinstance(tree[0], tuple):
            assert len(tree[0]) == 2
            # Leaf stored as a (word, pos) tuple.  (Previously this
            # unpacked into ``pos[wordnum], pos[wordnum]``, discarding
            # the word and leaving words[wordnum] as the raw tuple.)
            words[wordnum], pos[wordnum] = tree[0]
            return wordnum + 1
        else:
            synt[wordnum] = f"({tree.label()}{synt[wordnum]}"
            for child in tree:
                wordnum = self._tree2conll(child, wordnum, words, pos, synt)
            synt[wordnum - 1] += ")"
            return wordnum
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
class ConllChunkCorpusReader(ConllCorpusReader):
    """
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    """

    def __init__(
        self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None
    ):
        """Initialize the reader with a fixed (words, pos, chunk) layout.

        :param root: the root directory of the corpus.
        :param fileids: the files that make up the corpus.
        :param chunk_types: chunk type(s) to extract from the chunk column.
        :param separator: column separator (``None`` = whitespace).
        """
        super().__init__(
            root,
            fileids,
            ("words", "pos", "chunk"),
            chunk_types=chunk_types,
            encoding=encoding,
            tagset=tagset,
            separator=separator,
        )
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/crubadan.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: An Crubadan N-grams Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Avital Pekker <avital.pekker@utoronto.ca>
|
| 5 |
+
#
|
| 6 |
+
# URL: <https://www.nltk.org/>
|
| 7 |
+
# For license information, see LICENSE.TXT
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
An NLTK interface for the n-gram statistics gathered from
|
| 11 |
+
the corpora for each language using An Crubadan.
|
| 12 |
+
|
| 13 |
+
There are multiple potential applications for the data but
|
| 14 |
+
this reader was created with the goal of using it in the
|
| 15 |
+
context of language identification.
|
| 16 |
+
|
| 17 |
+
For details about An Crubadan, this data, and its potential uses, see:
|
| 18 |
+
http://borel.slu.edu/crubadan/index.html
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import re
|
| 22 |
+
from os import path
|
| 23 |
+
|
| 24 |
+
from nltk.corpus.reader import CorpusReader
|
| 25 |
+
from nltk.data import ZipFilePathPointer
|
| 26 |
+
from nltk.probability import FreqDist
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class CrubadanCorpusReader(CorpusReader):
    """
    A corpus reader used to access language An Crubadan n-gram files.
    """

    _LANG_MAPPER_FILE = "table.txt"
    # Kept for backward compatibility; the per-language FreqDist cache is
    # now created per instance in __init__ (a class-level dict would be
    # shared, mutably, by every reader instance).
    _all_lang_freq = {}

    def __init__(self, root, fileids, encoding="utf8", tagset=None):
        """
        :param root: the root directory of the corpus.
        :param fileids: the files that make up the corpus.
        :param encoding: file encoding passed through to ``CorpusReader``.
        :param tagset: accepted for signature compatibility; unused here.
        """
        # Forward the caller's encoding instead of hard-coding "utf8".
        super().__init__(root, fileids, encoding=encoding)
        # Per-instance cache: ISO 639-3 code -> FreqDist of 3-grams.
        self._all_lang_freq = {}
        self._lang_mapping_data = []
        self._load_lang_mapping_data()

    def lang_freq(self, lang):
        """Return n-gram FreqDist for a specific language
        given ISO 639-3 language code"""

        if lang not in self._all_lang_freq:
            self._all_lang_freq[lang] = self._load_lang_ngrams(lang)

        return self._all_lang_freq[lang]

    def langs(self):
        """Return a list of supported languages as ISO 639-3 codes"""
        return [row[1] for row in self._lang_mapping_data]

    def iso_to_crubadan(self, lang):
        """Return internal Crubadan code based on ISO 639-3 code
        (or None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[1].lower() == lang.lower():
                return i[0]

    def crubadan_to_iso(self, lang):
        """Return ISO 639-3 code given internal Crubadan code
        (or None if the code is unknown)."""
        for i in self._lang_mapping_data:
            if i[0].lower() == lang.lower():
                return i[1]

    def _load_lang_mapping_data(self):
        """Load language mappings between codes and description from table.txt

        :raises RuntimeError: if the corpus is zipped or the mapper file
            is missing.
        """
        if isinstance(self.root, ZipFilePathPointer):
            raise RuntimeError(
                "Please install the 'crubadan' corpus first, use nltk.download()"
            )

        mapper_file = path.join(self.root, self._LANG_MAPPER_FILE)
        if self._LANG_MAPPER_FILE not in self.fileids():
            raise RuntimeError("Could not find language mapper file: " + mapper_file)

        with open(mapper_file, encoding="utf-8") as raw:
            strip_raw = raw.read().strip()

        # Each row of table.txt is tab-separated: (crubadan_code, iso_code, ...)
        self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")]

    def _load_lang_ngrams(self, lang):
        """Load single n-gram language file given the ISO 639-3 language code
        and return its FreqDist

        :raises RuntimeError: if the language or its n-gram file is missing.
        """

        if lang not in self.langs():
            raise RuntimeError("Unsupported language.")

        crubadan_code = self.iso_to_crubadan(lang)
        ngram_file = path.join(self.root, crubadan_code + "-3grams.txt")

        if not path.isfile(ngram_file):
            raise RuntimeError("No N-gram file found for requested language.")

        counts = FreqDist()
        with open(ngram_file, encoding="utf-8") as f:
            for line in f:
                # Each line: "<frequency> <ngram>"
                data = line.split(" ")

                ngram = data[1].strip("\n")
                freq = int(data[0])

                counts[ngram] = freq

        return counts
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/dependency.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Natural Language Toolkit: Dependency Corpus Reader
|
| 2 |
+
#
|
| 3 |
+
# Copyright (C) 2001-2022 NLTK Project
|
| 4 |
+
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
|
| 5 |
+
# Iker Manterola <returntothehangar@hotmail.com>
|
| 6 |
+
#
|
| 7 |
+
# URL: <https://www.nltk.org/>
|
| 8 |
+
# For license information, see LICENSE.TXT
|
| 9 |
+
|
| 10 |
+
from nltk.corpus.reader.api import *
|
| 11 |
+
from nltk.corpus.reader.util import *
|
| 12 |
+
from nltk.parse import DependencyGraph
|
| 13 |
+
from nltk.tokenize import *
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DependencyCorpusReader(SyntaxCorpusReader):
    """Corpus reader for dependency-annotated corpora, exposing plain,
    tagged, and dependency-parsed views of the data."""

    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=TabTokenizer(),
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
    ):
        # NOTE(review): only root/fileids/encoding are forwarded; the
        # word_tokenizer, sent_tokenizer and para_block_reader arguments
        # are accepted but never stored or used by this reader.
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)

    #########################################################

    def words(self, fileids=None):
        """Return a concatenated view of all words (untagged, flat)."""
        return concat(
            [
                DependencyCorpusView(fileid, False, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_words(self, fileids=None):
        """Return a concatenated flat view of (word, tag) tuples."""
        return concat(
            [
                DependencyCorpusView(fileid, True, False, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def sents(self, fileids=None):
        """Return a view of sentences, each a list of words."""
        return concat(
            [
                DependencyCorpusView(fileid, False, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def tagged_sents(self, fileids=None):
        """Return a view of sentences, each a list of (word, tag) tuples."""
        return concat(
            [
                DependencyCorpusView(fileid, True, True, False, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )

    def parsed_sents(self, fileids=None):
        """Return a list of ``DependencyGraph`` objects, one per sentence.

        Note: unlike the other accessors, this materializes the whole
        corpus eagerly (one graph per raw sentence block).
        """
        sents = concat(
            [
                DependencyCorpusView(fileid, False, True, True, encoding=enc)
                for fileid, enc in self.abspaths(fileids, include_encoding=True)
            ]
        )
        return [DependencyGraph(sent) for sent in sents]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class DependencyCorpusView(StreamBackedCorpusView):
    """Stream-backed view over a dependency corpus file, yielding words,
    (word, tag) tuples, sentences, or raw dependency blocks depending on
    the constructor flags."""

    _DOCSTART = "-DOCSTART- -DOCSTART- O\n"  # marks the start of a document

    def __init__(
        self,
        corpus_file,
        tagged,
        group_by_sent,
        dependencies,
        chunk_types=None,
        encoding="utf8",
    ):
        # tagged: keep POS tags; group_by_sent: yield whole sentences;
        # dependencies: yield the raw sentence text for DependencyGraph.
        self._tagged = tagged
        self._dependencies = dependencies
        self._group_by_sent = group_by_sent
        self._chunk_types = chunk_types
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Read and convert the next sentence from ``stream``.

        :raises ValueError: if a row has an unexpected number of fields.
        """
        # Read the next sentence.
        sent = read_blankline_block(stream)[0].strip()
        # Strip off the docstart marker, if present.
        if sent.startswith(self._DOCSTART):
            sent = sent[len(self._DOCSTART) :].lstrip()

        # extract word and tag from any of the formats
        if not self._dependencies:
            lines = [line.split("\t") for line in sent.split("\n")]
            if len(lines[0]) == 3 or len(lines[0]) == 4:
                # 3/4-column format: word and tag are the first two fields.
                sent = [(line[0], line[1]) for line in lines]
            elif len(lines[0]) == 10:
                # 10-column (CoNLL-style) format: form and fine-grained
                # tag are in columns 2 and 5.
                sent = [(line[1], line[4]) for line in lines]
            else:
                raise ValueError("Unexpected number of fields in dependency tree file")

            # discard tags if they weren't requested
            if not self._tagged:
                sent = [word for (word, tag) in sent]

        # Return the result.
        if self._group_by_sent:
            return [sent]
        else:
            return list(sent)
|
.eggs/nltk-3.8-py3.10.egg/nltk/corpus/reader/framenet.py
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|