varox34 committed on
Commit
366b225
·
verified ·
1 Parent(s): 56c2140

Upload 64 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. README.md +67 -0
  3. UD_Tamil-TTB-master/.gitignore +0 -0
  4. UD_Tamil-TTB-master/CONTRIBUTING.md +8 -0
  5. UD_Tamil-TTB-master/LICENSE.txt +7 -0
  6. UD_Tamil-TTB-master/README.md +103 -0
  7. UD_Tamil-TTB-master/eval.log +99 -0
  8. UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt +0 -0
  9. UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt +0 -0
  10. UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt +0 -0
  11. UD_Tamil-TTB-master/stats.xml +110 -0
  12. UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu +0 -0
  13. UD_Tamil-TTB-master/ta_ttb-ud-test.conllu +0 -0
  14. UD_Tamil-TTB-master/ta_ttb-ud-train.conllu +0 -0
  15. app.py +86 -0
  16. arc_eager.py +93 -0
  17. best_mapping.pth +3 -0
  18. biaffine-parser-master/.gitignore +22 -0
  19. biaffine-parser-master/.travis.yml +36 -0
  20. biaffine-parser-master/README.md +226 -0
  21. biaffine-parser-master/config.ini +30 -0
  22. biaffine-parser-master/data/naive3.conllx +7 -0
  23. biaffine-parser-master/data/ptb/tamdev.conllx +0 -0
  24. biaffine-parser-master/data/ptb/tamtest.conllx +0 -0
  25. biaffine-parser-master/data/ptb/tamtrain.conllx +0 -0
  26. biaffine-parser-master/exp/ptb/fields +3 -0
  27. biaffine-parser-master/exp/ptb/model +3 -0
  28. biaffine-parser-master/parser/__init__.py +5 -0
  29. biaffine-parser-master/parser/cmds/__init__.py +7 -0
  30. biaffine-parser-master/parser/cmds/cmd.py +151 -0
  31. biaffine-parser-master/parser/cmds/evaluate.py +49 -0
  32. biaffine-parser-master/parser/cmds/predict.py +49 -0
  33. biaffine-parser-master/parser/cmds/train.py +113 -0
  34. biaffine-parser-master/parser/config.py +41 -0
  35. biaffine-parser-master/parser/model.py +140 -0
  36. biaffine-parser-master/parser/modules/__init__.py +11 -0
  37. biaffine-parser-master/parser/modules/bert.py +62 -0
  38. biaffine-parser-master/parser/modules/biaffine.py +43 -0
  39. biaffine-parser-master/parser/modules/bilstm.py +126 -0
  40. biaffine-parser-master/parser/modules/char_lstm.py +30 -0
  41. biaffine-parser-master/parser/modules/dropout.py +60 -0
  42. biaffine-parser-master/parser/modules/mlp.py +28 -0
  43. biaffine-parser-master/parser/modules/scalar_mix.py +32 -0
  44. biaffine-parser-master/parser/utils/__init__.py +8 -0
  45. biaffine-parser-master/parser/utils/alg.py +143 -0
  46. biaffine-parser-master/parser/utils/common.py +6 -0
  47. biaffine-parser-master/parser/utils/corpus.py +88 -0
  48. biaffine-parser-master/parser/utils/data.py +110 -0
  49. biaffine-parser-master/parser/utils/embedding.py +41 -0
  50. biaffine-parser-master/parser/utils/field.py +172 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ biaffine-parser-master/exp/ptb/fields filter=lfs diff=lfs merge=lfs -text
37
+ biaffine-parser-master/exp/ptb/model filter=lfs diff=lfs merge=lfs -text
38
+ models/tnt_pos_tagger_hin.dill filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependency-parser
2
+ ## Dependencies :
3
+ * OS : Ubuntu 22.04.3 LTS
4
+ * Python 3.7
5
+ * flask 1.1.1
6
+ * flask-wtf 0.14.2
7
+ * flask-markdown 0.3
8
+ * nltk 3.4.5
9
+ * pygraphviz 1.7
10
+ * conllu 2.2.2
11
+ * scikit-learn 0.22.1
12
+ * dill 0.3.1.1
13
+ * transformers 2.1.1
14
+
15
+ ```bash
16
+ > python3.7 -m pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
17
+ ```
18
+
19
+ ## Instructions for web app :
20
+
21
+ Run the following to host the app at localhost:5000
22
+ ```bash
23
+ > python3.7 app.py
24
+ ```
25
+ ## Results :
26
+ Trained a model using <b>BERT</b> and parser implemented from <b>Deep Biaffine Attention for Neural Dependency Parsing</b> on Tamil UD Treebank dataset
27
+
28
+ <b>
29
+ train: 400 sentences
30
+ </b>
31
+ <br>
32
+
33
+ <b>
34
+ dev: 80 sentences
35
+ </b>
36
+ <br>
37
+
38
+ <b>
39
+ test: 120 sentences
40
+ </b>
41
+
42
+
43
+ ### Train
44
+ Training the model
45
+
46
+ ![parameter](https://github.com/varunlmxd/Dependency-Parser/assets/104298930/304c0f93-a377-4a9f-a9cd-34f9d756ba3a)
47
+
48
+ ### Evaluate
49
+
50
+ Evaluation score after testing with Test Dataset
51
+
52
+ ![evaluate](https://github.com/varunlmxd/Dependency-Parser/assets/104298930/5c245c8a-2fe6-4e6a-b3f1-63b5503b8140)
53
+ ### Prediction
54
+
55
+
56
+ #### Entering the sentence
57
+
58
+ ![before](https://github.com/varunlmxd/Dependency-Parser/assets/104298930/9c725d7e-40d9-4caf-8666-86bf4dc06419)
59
+
60
+ #### Final Result
61
+
62
+ ![finalres](https://github.com/varunlmxd/Dependency-Parser/assets/104298930/d6cd1260-3e8d-4352-96f6-97061a829d58)
63
+
64
+ #### Original Values
65
+
66
+ ![original](https://github.com/varunlmxd/Dependency-Parser/assets/104298930/84391246-41f5-469d-90f9-d86e18494999)
67
+
UD_Tamil-TTB-master/.gitignore ADDED
File without changes
UD_Tamil-TTB-master/CONTRIBUTING.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Contributing
2
+
3
+ Please do not make pull requests against master, any such pull requests will be
4
+ closed. Pull requests against the dev branch are accepted in some treebanks but
5
+ not in others - check the Contributing line in the README file!
6
+
7
+ For full details on the branch policy see
8
+ [here](http://universaldependencies.org/release_checklist.html#repository-branches).
UD_Tamil-TTB-master/LICENSE.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ This work is licensed under the Creative Commons Attribution-NonCommercial-
2
+ ShareAlike 3.0 Unported License. To view a copy of this license, visit
3
+
4
+ http://creativecommons.org/licenses/by-nc-sa/3.0/
5
+
6
+ or send a letter to
7
+ Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
UD_Tamil-TTB-master/README.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Summary
2
+
3
+ The UD Tamil treebank is based on the Tamil Dependency Treebank created at the
4
+ Charles University in Prague by Loganathan Ramasamy.
5
+
6
+
7
+ # Introduction
8
+
9
+ The treebank was part of
10
+ HamleDT, a collection of treebanks converted to the Prague dependency style
11
+ (since 2011). Later versions of HamleDT added a conversion to the Stanford
12
+ dependencies (2014) and to Universal Dependencies (HamleDT 3.0, 2015). The
13
+ first release of Universal Dependencies that includes this treebank is UD v1.2
14
+ in November 2015. It is essentially the HamleDT conversion but the data is not
15
+ identical to HamleDT 3.0 because the conversion procedure has been further
16
+ improved.
17
+
18
+ ## References:
19
+
20
+ * [TamilTB](http://ufal.mff.cuni.cz/~ramasamy/tamiltb/0.1/)
21
+ * [HamleDT](http://ufal.mff.cuni.cz/hamledt)
22
+ * [Treex](http://ufal.mff.cuni.cz/treex) is the software used for conversion
23
+ * [Interset](http://ufal.mff.cuni.cz/interset) was used to convert POS tags and features
24
+ * Loganathan Ramasamy, Zdeněk Žabokrtský. 2012.
25
+ [Prague Dependency Style Treebank for Tamil](http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html).
26
+ In: *Proceedings of Eighth International Conference on Language Resources and Evaluation (LREC 2012),*
27
+ İstanbul, Turkey, ISBN 978-2-9517408-7-7, pp. 1888–1894.
28
+
29
+ <pre>
30
+ @inproceedings{ta,
31
+ author = {Ramasamy, Loganathan and \v{Z}abokrtsk\'{y}, Zden\v{e}k},
32
+ year = {2012},
33
+ title = {Prague Dependency Style Treebank for {Tamil}},
34
+ booktitle = {Proceedings of Eighth International Conference on Language Resources and Evaluation ({LREC} 2012)},
35
+ address = {\.{I}stanbul, Turkey},
36
+ editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
37
+ isbn = {978-2-9517408-7-7},
38
+ pages = {1888--1894},
39
+ url = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html}
40
+ }
41
+ </pre>
42
+
43
+
44
+ # Source of annotations
45
+
46
+ This table summarizes the origins and checking of the various columns of the CoNLL-U data.
47
+
48
+ | Column | Status |
49
+ | ------ | ------ |
50
+ | ID | Sentence segmentation and tokenization (including cutting off certain suffixes that constitute independent syntactic words) was automatically done and then hand-corrected. |
51
+ | FORM | Identical to TamilTB form. |
52
+ | LEMMA | Gold (preprocessed and then manually corrected). |
53
+ | UPOSTAG | Converted automatically from XPOSTAG (via [Interset](https://ufal.mff.cuni.cz/interset)). |
54
+ | XPOSTAG | Gold (preprocessed and then manually corrected). |
55
+ | FEATS | Converted automatically from XPOSTAG (via Interset). |
56
+ | HEAD | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
57
+ | DEPREL | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
58
+ | DEPS | &mdash; (currently unused) |
59
+ | MISC | Information about token spacing restored using heuristics. Mapping between multi-word tokens and syntactic words verified against the source text. |
60
+
61
+
62
+ # Changelog
63
+
64
+ * 2023-11-15 v2.13
65
+ * Fixed: no Gender for numerals and particles.
66
+ * 2021-05-15 v2.8
67
+ * Distinguished acl:relcl from other acl.
68
+ * Added enhanced dependencies for relative clauses.
69
+ * 2020-05-15 v2.6
70
+ * Added enhanced relations with case information.
71
+ * 2019-05-15 v2.4
72
+ * Fixed some annotation errors in the original treebank, re-run the conversion.
73
+ * Dative and instrumental objects are now treated as oblique arguments.
74
+ * 2018-04-15 v2.2
75
+ * Repository renamed from UD_Tamil to UD_Tamil-TTB.
76
+ * Added enhanced representation of dependencies propagated across coordination.
77
+ The distinction of shared and private dependents is derived deterministically from the original Prague annotation.
78
+ * 2017-03-01 v2.0
79
+ * Converted to UD v2 guidelines.
80
+ * Reconsidered PRON vs. DET distinction.
81
+ * Improved advmod vs. obl distinction.
82
+ * 2016-05-15 v1.3
83
+ * Added Latin transliteration of lemmas and full sentences.
84
+ * Added orthographic words (surface tokens) and their mapping to nodes.
85
+ * Improved conversion of AuxY.
86
+
87
+
88
+ <pre>
89
+ === Machine-readable metadata (DO NOT REMOVE!) ================================
90
+ Data available since: UD v1.2
91
+ License: CC BY-NC-SA 3.0
92
+ Includes text: yes
93
+ Genre: news
94
+ Lemmas: converted from manual
95
+ UPOS: converted from manual
96
+ XPOS: manual native
97
+ Features: converted from manual
98
+ Relations: converted from manual
99
+ Contributors: Ramasamy, Loganathan; Zeman, Daniel
100
+ Contributing: elsewhere
101
+ Contact: zeman@ufal.mff.cuni.cz
102
+ ===============================================================================
103
+ </pre>
UD_Tamil-TTB-master/eval.log ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Running the following version of UD tools:
2
+ commit e9726a6a7d6913193d90edb45a4cb549235c5b16
3
+ Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
4
+ Date: Sat Nov 4 17:10:55 2023 +0100
5
+ Evaluating the following revision of UD_Tamil-TTB:
6
+ commit c1739c0397fd034200edaf4e403c2e4c9923dd75
7
+ Merge: 1392fa0 fbea79c
8
+ Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
9
+ Size: counted 9581 of 9581 words (nodes).
10
+ Size: min(0, log((N/1000)**2)) = 4.51956394133747.
11
+ Size: maximum value 13.815511 is for 1000000 words or more.
12
+ Split: Did not find more than 10000 training words.
13
+ Split: Did not find at least 10000 development words.
14
+ Split: Did not find at least 10000 test words.
15
+ Lemmas: source of annotation (from README) factor is 0.8.
16
+ Universal POS tags: 14 out of 17 found in the corpus.
17
+ Universal POS tags: source of annotation (from README) factor is 0.8.
18
+ Features: 8280 out of 9581 total words have one or more features.
19
+ Features: source of annotation (from README) factor is 0.8.
20
+ Universal relations: 25 out of 37 found in the corpus.
21
+ Universal relations: source of annotation (from README) factor is 0.8.
22
+ Udapi:
23
+ TOTAL 205
24
+ Udapi: found 205 bugs.
25
+ Udapi: worst expected case (threshold) is one bug per 10 words. There are 9581 words.
26
+ Genres: found 1 out of 17 known.
27
+ /net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-dev.conllu
28
+ [Line 9 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '11:obl:dat'
29
+
30
+ The following 63 enhanced relations are currently permitted in language [ta]:
31
+ acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
32
+ See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
33
+
34
+ [Line 10 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '7:obl:dat'
35
+ [Line 11 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இல்:nom' in '11:obl:இல்:nom'
36
+ [Line 32 Sent dev-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '10:obl:com'
37
+ [Line 45 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
38
+ [Line 48 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
39
+ [Line 50 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '10:nmod:nom'
40
+ [Line 58 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '18:nmod:nom'
41
+ [Line 68 Sent dev-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '23:obl:loc'
42
+ ...suppressing further errors regarding Enhanced
43
+ Enhanced errors: 351
44
+ *** FAILED *** with 351 errors
45
+ Exit code: 1
46
+ /net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-test.conllu
47
+ [Line 6 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இலிருந்து:nom' in '4:nmod:இலிருந்து:nom'
48
+
49
+ The following 63 enhanced relations are currently permitted in language [ta]:
50
+ acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
51
+ See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
52
+
53
+ [Line 13 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '9:obl:dat'
54
+ [Line 28 Sent test-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இலிருந்து:nom' in '9:obl:இலிருந்து:nom'
55
+ [Line 42 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '2:nmod:nom'
56
+ [Line 43 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '5:obl:loc'
57
+ [Line 44 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
58
+ [Line 49 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '9:nmod:dat'
59
+ [Line 54 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:arg:இடம்:gen' in '15:obl:arg:இடம்:gen'
60
+ [Line 66 Sent test-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '5:obl:com'
61
+ ...suppressing further errors regarding Enhanced
62
+ [Line 2738 Sent test-s118 Node 7]: [L3 Syntax too-many-subjects] Multiple subjects [4, 6] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
63
+ Enhanced errors: 483
64
+ Syntax errors: 1
65
+ *** FAILED *** with 484 errors
66
+ Exit code: 1
67
+ /net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-train.conllu
68
+ [Line 5 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:அருகே:nom' in '18:obl:அருகே:nom'
69
+
70
+ The following 63 enhanced relations are currently permitted in language [ta]:
71
+ acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
72
+ See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
73
+
74
+ [Line 7 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
75
+ [Line 8 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
76
+ [Line 9 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '6:nmod:nom'
77
+ [Line 10 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '11:nmod:nom'
78
+ [Line 16 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '12:nmod:dat'
79
+ [Line 19 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '15:nmod:dat'
80
+ [Line 20 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இல்:nom' in '17:nmod:இல்:nom'
81
+ [Line 22 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
82
+ ...suppressing further errors regarding Enhanced
83
+ [Line 4427 Sent train-s192 Node 25]: [L3 Syntax too-many-subjects] Multiple subjects [11, 17] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
84
+ Enhanced errors: 1922
85
+ Syntax errors: 1
86
+ *** FAILED *** with 1923 errors
87
+ Exit code: 1
88
+ Validity: 0.01
89
+ (weight=0.0769230769230769) * (score{features}=0.8) = 0.0615384615384615
90
+ (weight=0.0769230769230769) * (score{genres}=0.0588235294117647) = 0.00452488687782805
91
+ (weight=0.0769230769230769) * (score{lemmas}=0.8) = 0.0615384615384615
92
+ (weight=0.256410256410256) * (score{size}=0.327136946721963) = 0.0838812683902469
93
+ (weight=0.0512820512820513) * (score{split}=0.01) = 0.000512820512820513
94
+ (weight=0.0769230769230769) * (score{tags}=0.658823529411765) = 0.0506787330316742
95
+ (weight=0.307692307692308) * (score{udapi}=0.786034860661726) = 0.241856880203608
96
+ (weight=0.0769230769230769) * (score{udeprels}=0.540540540540541) = 0.0415800415800416
97
+ (TOTAL score=0.546111553673142) * (availability=1) * (validity=0.01) = 0.00546111553673142
98
+ STARS = 0
99
+ UD_Tamil-TTB 0.00546111553673142 0
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt ADDED
Binary file (71.5 kB). View file
 
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt ADDED
Binary file (71.1 kB). View file
 
UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt ADDED
Binary file (39.2 kB). View file
 
UD_Tamil-TTB-master/stats.xml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <treebank>
3
+ <!-- tokens means "surface tokens", e.g. Spanish "vámonos" counts as one token
4
+ words means "syntactic words", e.g. Spanish "vámonos" is split to two words, "vamos" and "nos"
5
+ fused is the number of tokens that are split to two or more syntactic words
6
+ The words and fused elements can be omitted if no token is split to smaller syntactic words. -->
7
+ <size>
8
+ <total><sentences>600</sentences><tokens>8635</tokens><words>9581</words><fused>835</fused></total>
9
+ <train><sentences>400</sentences><tokens>5734</tokens><words>6329</words><fused>520</fused></train>
10
+ <dev><sentences>80</sentences><tokens>1129</tokens><words>1263</words><fused>121</fused></dev>
11
+ <test><sentences>120</sentences><tokens>1772</tokens><words>1989</words><fused>194</fused></test>
12
+ </size>
13
+ <lemmas unique="2024" /><!-- ., ,, உள், உம், என், படு, இரு, ஆகு, அவர், இந்தியா, தெரிவி, செய், ஆக, ஆன, நாடு -->
14
+ <forms unique="3584" /><!-- ., ,, உம், ஆக, உள்ளது, ஆன, என்று, உள்ள, அவர், வேண்டும், இந்த, பட்ட், மற்றும், அரசு, நாடுகள் -->
15
+ <fusions unique="620" /><!-- என்றும், இடையிலான, செயலாளராக, தெரிவிக்கப்பட்டுள்ளது, தெரிவித்துள்ளது, தெரிவித்துள்ளார், இடத்தையும், குறிப்பிடத்தக்கது, வெளியிட்டுள்ள, Kஉறித்தும், பேருக்கும், ஆதரவாக, காரணமாக, குறிப்பிடப்பட்டுள்ளது, கூறியுள்ளார் -->
16
+ <!-- Statistics of universal POS tags. The comments show the most frequent lemmas. -->
17
+ <tags unique="14">
18
+ <tag name="ADJ">557</tag><!-- உள், மத்திய, இரு, ஒரு, நடைபெறு, உள்ளிடு, புதிய, கடந்த, முன்னாள், வரு -->
19
+ <tag name="ADP">293</tag><!-- ஆகு, உடன், குறி, இலிருந்து, மீது, உள், இரு, மூலம், சார்பு, போல் -->
20
+ <tag name="ADV">384</tag><!-- ஆகு, இன்று, மேலும், ஆனால், பின்னர், இதனால், இது, இதுகுறித்து, ஏற்கெனவே, இதுவரை -->
21
+ <tag name="AUX">634</tag><!-- உள், படு, வேண்டு, இரு, வரு, கொள், இல், செய், விடு, வா -->
22
+ <tag name="CCONJ">46</tag><!-- மற்றும், அல்லது -->
23
+ <tag name="DET">120</tag><!-- இந்த, அந்த, எந்த, மிக, அதிகம், மிகவும், முழுவதும், அந்தந்த, ஒரு, குறைவு -->
24
+ <tag name="NOUN">2758</tag><!-- அரசு, நாடு, ஆண்டு, தலைவர், மக்கள், இடம், பேர், கட்சி, செயலாளர், முதல்வர் -->
25
+ <tag name="NUM">274</tag><!-- இரு, ஆயிரம், 2, லட்சம், மூன்று, 10, ஒன்று, 20, இரண்டு, ஒரு -->
26
+ <tag name="PART">654</tag><!-- உம், என், ஆன, ஆக, ஆகு, போது, தான், ஏ, ஆவது, ஓ -->
27
+ <tag name="PRON">236</tag><!-- அவர், இது, அது, தன், அனைவர், என், யார், நான், நாம், இவர் -->
28
+ <tag name="PROPN">1370</tag><!-- இந்தியா, அமெரிக்கா, இலங்கை, பாகிஸ்தான், சென்னை, தமிழகம், ஒபாமா, அதிமுக, காங்கிரஸ், ஜெயலலிதா -->
29
+ <tag name="PUNCT">1000</tag><!-- ., ,, -, :, (, ), ", ரூ, ரூ., ; -->
30
+ <tag name="VERB">1254</tag><!-- தெரிவி, செய், கூறு, இரு, செல், பெறு, வழங்கு, நடைபெறு, குறிப்பிடு, ஏற்படு -->
31
+ <tag name="X">1</tag><!-- என் -->
32
+ </tags>
33
+ <!-- Statistics of features and values. The comments show the most frequent word forms. -->
34
+ <feats unique="41">
35
+ <feat name="AdpType" value="Post" upos="ADP">288</feat><!-- ஆக, உடன், இலிருந்து, குறித்து, மீது, சார்பில், மூலம், இடம், இருந்து, இடையில் -->
36
+ <feat name="Animacy" value="Anim" upos="AUX,NOUN,PRON,PROPN,VERB">420</feat><!-- உள்ளனர், பேர், மக்கள், அதிகாரிகள், அனைவரும், அவர்கள், தனது, புலிகள், போலீஸார், நான் -->
37
+ <feat name="Case" value="Acc" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">340</feat><!-- அதை, அவர்களை, இடத்தைய், இதை, நிலையங்களை, வெடிகளை, அணையை, அரசை, ஆட்சியை, உத்தரவை -->
38
+ <feat name="Case" value="Com" upos="NOUN">13</feat><!-- மனிதாபிமானத்தோடு, அமைப்புடன், அவருடன், உயிருடன், சிலருடன், தங்கப்பதக்கத்துடன், துணையோடு, நம்பிக்கையோடு, நேயத்துடன், நேயத்தோடு -->
39
+ <feat name="Case" value="Dat" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">262</feat><!-- இந்தியாவுக்கு, மணிக்கு, அவர்களுக்கு, ஆண்டுக்கு, பேருக்க், மக்களுக்கு, அரசுக்கு, அவருக்கு, ஆண்டுகளுக்குப், கொள்வதற்க் -->
40
+ <feat name="Case" value="Gen" upos="NOUN,PRON,PROPN,VERB">177</feat><!-- அரசின், தனது, அவரது, ஒபாமாவின், நாடுகளின், நாட்டின், அமெரிக்காவின், அவர்களது, இதன், இந்தியாவின் -->
41
+ <feat name="Case" value="Ins" upos="AUX,NOUN,PART,PRON,PROPN,VERB">24</feat><!-- உள்ளதால், என்பதால், காரணத்தால், அளிக்காததால், அவர்களால், ஆகியதால், ஆனதால், இல்லாததால், எங்களால், ஒப்பந்தத்தால் -->
42
+ <feat name="Case" value="Loc" upos="NOUN,NUM,PRON,PROPN,VERB">487</feat><!-- நிலையில், அறிக்கையில், பகுதியில், வகையில், இந்தியாவில், கவுன்சிலில், தலைமையில், அளவில், சிறையில், சென்னையில் -->
43
+ <feat name="Case" value="Nom" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">2929</feat><!-- அவர், அரசு, நாடுகள், இந்தியா, தலைவர், செயலாளர், முதல்வர், பேர், ஆண்டு, மக்கள் -->
44
+ <feat name="Gender" value="Com" upos="AUX,NOUN,PRON,PROPN,VERB">1217</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், உள்ளனர், தெரிவித்தார், முதல்வர், பேர், மக்கள், என்றார் -->
45
+ <feat name="Gender" value="Masc" upos="NOUN">2</feat><!-- அற்றவன், ஆடவனின் -->
46
+ <feat name="Gender" value="Neut" upos="AUX,NOUN,PRON,PROPN,VERB">4042</feat><!-- உள்ளது, வேண்டும், அரசு, நாடுகள், இந்தியா, இல்லை, பட்டது, இந்திய, ஆண்டு, அமெரிக்க -->
47
+ <feat name="Mood" value="Cnd" upos="AUX,PART,VERB">28</feat><!-- இருந்தால், விட்டால், ஆனால், உடைத்தால், ஏற்பட்டால், கட்டினால், பட்டால், பெற்றால், வந்தால், இருப்பின் -->
48
+ <feat name="Mood" value="Imp" upos="VERB">1</feat><!-- இருங்கள் -->
49
+ <feat name="Mood" value="Ind" upos="AUX,VERB">718</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
50
+ <feat name="Number" value="Plur" upos="AUX,NOUN,PRON,PROPN,VERB">909</feat><!-- நாடுகள், இல்லை, உள்ளனர், பேர், மக்கள், உள்ளன, அதிகாரிகள், வருகின்றனர், அனைவரும், அவர்கள் -->
51
+ <feat name="Number" value="Sing" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">4395</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, இந்தியா, உள்ளார், தலைவர், பட்டது, செயலாளர், தெரிவித்தார் -->
52
+ <feat name="NumForm" value="Digit" upos="NUM">149</feat><!-- 2, 10, 20, 3, 80, 16, 4, 5, 50, 6 -->
53
+ <feat name="NumType" value="Card" upos="DET,NUM">282</feat><!-- இரு, 2, ஆயிரம், மூன்று, லட்சம், 10, 20, 3, 80, இரண்டு -->
54
+ <feat name="NumType" value="Ord" upos="ADJ,NUM">52</feat><!-- முதல், இரண்டாவது, 1992-ம், 1-ம், 12-ம், 125-ம், 15-ம், 21-ம், 11வது, 12வது -->
55
+ <feat name="Person" value="1" upos="AUX,PRON,VERB">68</feat><!-- நான், உள்ளோம், எனது, நாம், உள்ளேன், தனக்கு, நாங்கள், போராடுவோம், எனக்கு, கொள்கிறேன் -->
56
+ <feat name="Person" value="2" upos="AUX,PRON,VERB">12</feat><!-- நீங்கள், அஞ்சுகிறீர்கள், இருக்கிறீர்கள், இருங்கள், உங்களுக்கு, உங்களைச், உங்கள், கவலைப்படாதீர்கள், வருகிறீர்கள், விரும்புகிறீர்கள் -->
57
+ <feat name="Person" value="3" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">5224</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, நாடுகள், இந்தியா, உள்ளார், இல்லை, தலைவர், பட்டது -->
58
+ <feat name="Polarity" value="Neg" upos="ADJ,AUX,VERB">35</feat><!-- முடியாது, கூடாது, இல்லாமல், செயல்படாமல், செய்யாத, தரா, நிறைவேற்றா, மாட்டாது, அல்லாத, எடுக்காத -->
59
+ <feat name="Polarity" value="Pos" upos="ADJ,ADP,ADV,AUX,NOUN,NUM,PART,VERB">2294</feat><!-- உள்ளது, என்று, உள்ள, வேண்டும், பட்ட், உள்ளார், இல்லை, என, என்ற், பட்டது -->
60
+ <feat name="Polite" value="Form" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">798</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், தெரிவித்தார், முதல்வர், என்றார், அமைச்சர், அதிபர், உறுப்பினர் -->
61
+ <feat name="PronType" value="Ind" upos="PRON">8</feat><!-- யாரும், எதுவும், யாருக்கும், யாரையும் -->
62
+ <feat name="PronType" value="Int" upos="PRON">6</feat><!-- யார், எத்தகையது, ஏத், யாருடைய -->
63
+ <feat name="PronType" value="Prs" upos="PRON">222</feat><!-- அவர், இது, அனைவரும், அவர்கள், தனது, அது, அதை, நான், அவரது, அவர்களை -->
64
+ <feat name="PunctType" value="Comm" upos="PUNCT">400</feat><!-- ,, -, :, (, ), ", ரூ, ரூ., ;, ’ -->
65
+ <feat name="PunctType" value="Peri" upos="PUNCT">600</feat><!-- . -->
66
+ <feat name="Reflex" value="Yes" upos="PRON">16</feat><!-- தனது, தனக்கு, தங்களது, தங்களின், தன்னைப், தமக்கு -->
67
+ <feat name="Tense" value="Fut" upos="ADJ,ADV,AUX,NOUN,PART,VERB">356</feat><!-- வேண்டும், நடைபெறும், ப்படும், வரும், இருக்கும், என்பது, இருப்பத், ஏற்படும், சேர்ந்தவர்கள், படும் -->
68
+ <feat name="Tense" value="Past" upos="ADJ,AUX,NOUN,PART,VERB">518</feat><!-- பட்டது, தெரிவித்தார், என்ற, என்றார், உள்ளிட்ட, இருந்தது, இருந்த, நடைபெற்ற, வந்த, இருந்தார் -->
69
+ <feat name="Tense" value="Pres" upos="ADJ,AUX,NOUN,PART,VERB">123</feat><!-- வருகின்றனர், வருகிறது, படுகிறது, தெரிகிறது, வருகின்றன, இருக்கிறது, என்கிற, தெரிவிக்கிறது, படுகின்றனர், இருக்கிற -->
70
+ <feat name="VerbForm" value="Fin" upos="AUX,PART,VERB">747</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
71
+ <feat name="VerbForm" value="Ger" upos="AUX,PART,VERB">210</feat><!-- உள்ளத், என்பது, இருப்பத், விட்டத், கூறியத், பட்டது, இருந்தத், உயிரிழந்தத், உயிரிழப்பத், உள்ளதால் -->
72
+ <feat name="VerbForm" value="Inf" upos="AUX,PART,VERB">476</feat><!-- என்று, என, என்ற், செய்யப், தெரிவிக்கப், செய்ய, வழங்கப், நியமிக்கப், ப்பட, குறிப்பிடத் -->
73
+ <feat name="VerbForm" value="Part" upos="ADJ,ADP,ADV,AUX,NOUN,PART,VERB">882</feat><!-- உள்ள, பட்ட், பட்டு, கொண்டு, தெரிவித்த், செய்து, என்ற, உள்ளிட்ட, இருந்த, நடைபெற்ற -->
74
+ <feat name="Voice" value="Act" upos="AUX,VERB">1616</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், கொண்டு, தெரிவித்த், செய்து, உள்ளன -->
75
+ <feat name="Voice" value="Pass" upos="AUX,VERB">155</feat><!-- பட்ட், பட்டது, பட்டு, ப்படும், படுகிறது, ப்பட, படவ், படுகின்றனர், பட்டதற்கு, பட்டனர் -->
76
+ </feats>
77
+ <!-- Statistics of universal dependency relations. -->
78
+ <deps unique="30">
79
+ <dep name="acl">63</dep>
80
+ <dep name="acl:relcl">69</dep>
81
+ <dep name="advcl">358</dep>
82
+ <dep name="advmod">401</dep>
83
+ <dep name="advmod:emph">231</dep>
84
+ <dep name="amod">549</dep>
85
+ <dep name="aux">608</dep>
86
+ <dep name="case">270</dep>
87
+ <dep name="cc">103</dep>
88
+ <dep name="ccomp">166</dep>
89
+ <dep name="compound">13</dep>
90
+ <dep name="compound:prt">1</dep>
91
+ <dep name="conj">236</dep>
92
+ <dep name="cop">1</dep>
93
+ <dep name="csubj">11</dep>
94
+ <dep name="dep">1</dep>
95
+ <dep name="det">114</dep>
96
+ <dep name="iobj">27</dep>
97
+ <dep name="mark">280</dep>
98
+ <dep name="nmod">2024</dep>
99
+ <dep name="nsubj">664</dep>
100
+ <dep name="nsubj:pass">1</dep>
101
+ <dep name="nummod">239</dep>
102
+ <dep name="obj">537</dep>
103
+ <dep name="obl">888</dep>
104
+ <dep name="obl:arg">89</dep>
105
+ <dep name="parataxis">5</dep>
106
+ <dep name="punct">1000</dep>
107
+ <dep name="root">600</dep>
108
+ <dep name="xcomp">32</dep>
109
+ </deps>
110
+ </treebank>
UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu ADDED
The diff for this file is too large to render. See raw diff
 
UD_Tamil-TTB-master/ta_ttb-ud-test.conllu ADDED
The diff for this file is too large to render. See raw diff
 
UD_Tamil-TTB-master/ta_ttb-ud-train.conllu ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
try:
    # Web framework, form handling, and the parsing pipeline entry point.
    # The original file imported ``Flask`` and ``session`` twice; the
    # duplicates are removed here.
    from flask import (Flask, request, render_template, redirect, url_for,
                       session, send_file)
    from flask_wtf import FlaskForm, RecaptchaField
    from wtforms import (StringField, SubmitField, RadioField, DateTimeField,
                         SelectField, TextAreaField)
    from wtforms.validators import DataRequired
    from flaskext.markdown import Markdown
    import os
    from arc_eager import Process
except Exception as e:
    # NOTE(review): execution deliberately continues after a failed import
    # (matching the original behavior); the failure will resurface later as
    # a NameError. Consider re-raising instead.
    print(e)
    print("Some Modules are Missing")

app = Flask(__name__)
Markdown(app)  # enables the ``markdown`` template filter
# Prefer an environment-provided secret; fall back to the original default
# so existing local setups keep working.
app.config["SECRET_KEY"] = os.environ.get("SECRET_KEY", "mysecretkey")
23
+
24
+
25
class Widgets(FlaskForm):
    """Single-field form shown on the home page for entering a sentence."""

    # Free-text sentence typed by the user; read in ``home`` as
    # ``form.Statement.data``.
    Statement = StringField(label="STATEMENT")

    submit = SubmitField(label="Submit")
30
+
31
+
32
def foo(value):
    """Placeholder processing hook; currently only announces pending work."""
    pending_note = "Work to be done"
    print(pending_note)
34
+
35
+
36
@app.after_request
def add_header(r):
    """
    Disable client-side caching so every page load reflects the latest
    parse result (the rendered graph image is overwritten per request).
    """
    # Bug fix: the original set a strict no-cache policy and then
    # immediately overwrote it with 'public, max-age=0', defeating the
    # 'no-store' directive. Keep the strict policy only.
    r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
    r.headers["Pragma"] = "no-cache"
    r.headers["Expires"] = "0"
    return r
47
+
48
+ @app.route("/", methods=["GET", "POST"])
49
+ def home():
50
+
51
+ form = Widgets()
52
+ if request.method == 'POST':
53
+ if (form.validate_on_submit()):
54
+ val = form.Statement.data
55
+ print(val)
56
+ session['data'] = val
57
+ # return render_template('home.html', form=form)
58
+ return redirect('/thanks')
59
+ return render_template('home.html', form=form)
60
+
61
+
62
+ @app.route("/thanks", methods=["GET", "POST"])
63
+ def thanks():
64
+ val = session['data']
65
+ txt, err = Process(val)
66
+ txt = txt.split('\n')
67
+ # newval = foo(val)
68
+ ex = [{
69
+ "words": [
70
+ {"text": "This", "tag": "DT"},
71
+ {"text": "is", "tag": "VBZ"},
72
+ {"text": "a", "tag": "DT"},
73
+ {"text": "sentence", "tag": "NN"}
74
+ ],
75
+ "arcs": [
76
+ {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
77
+ {"start": 2, "end": 3, "label": "det", "dir": "left"},
78
+ {"start": 1, "end": 3, "label": "attr", "dir": "right"}
79
+ ]
80
+ }]
81
+
82
+ return render_template('thanks.html',user_image='/static/process.png',text=txt,show= not err)
83
+
84
+
85
+ if __name__ == "__main__":
86
+ app.run(debug=True)
arc_eager.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from conllu import parse_incr, parse
2
+ from nltk.parse import DependencyGraph, DependencyEvaluator
3
+ from nltk.parse.transitionparser import TransitionParser
4
+ import pickle
5
+ import pygraphviz as pgv
6
+ from test_hn_pos import test_fn
7
+ import os
8
+
9
def Process(sentence: str):
    """Tag *sentence*, run the biaffine parser on it via a subprocess, and
    render the predicted dependency tree to static/process.png.

    Returns:
        (txt, err): ``txt`` is the first CoNLL block of the parser output
        (with an error note appended on failure); ``err`` is True when the
        graph image could not be generated.
    """
    # '|' is normalized to the Devanagari danda before whitespace tokenizing.
    words = sentence.replace('|','।').split()
    tags = test_fn(words)  # POS tags; currently unused below
    text = []
    i = 0
    for word, tag in zip(words,tags):
        i += 1
        fill = '_'
        # 10-column CoNLL-X row: only ID and FORM are filled in.
        text.append('\t'.join([str(i),word,fill,fill,fill,fill,fill,fill,fill,fill]))
    dg = DependencyGraph('\n'.join(text))  # NOTE(review): unused result
    text = '\n'.join(text)
    # The sentence is duplicated — presumably because the parser pipeline
    # needs at least two sentence blocks; TODO confirm.
    text = text + '\n\n' + text
    with open('biaffine-parser-master/data/naive3.conllx','w') as f:
        f.write(text)
    # Run the pretrained parser in-place over the file just written.
    # NOTE(review): os.chdir mutates process-global state and the python3.7
    # binary is hard-coded; os.system's exit status is not checked.
    os.chdir('biaffine-parser-master')
    os.system('python3.7 run.py predict --feat=bert --fdata=data/naive3.conllx --fpred=data/naive3.conllx')
    txt = ''
    os.chdir('..')
    with open('biaffine-parser-master/data/naive3.conllx','r') as f:
        # Keep only the first sentence block of the (duplicated) output.
        txt = f.read().split('\n\n')[0]

    # parser = TransitionParser('arc-eager')
    # with open('models/parser.pkl','rb') as in_file:
    #     parser = pickle.load(in_file)
    # predictions = parser.parse([dg],'models/arc_eager.model')
    # txt = predictions[0].to_conll(4)
    err = False
    try:
        # Render the predicted tree with graphviz via pygraphviz.
        out = DependencyGraph(txt)
        out_dot = out.to_dot()
        G = pgv.AGraph(out_dot)
        G.layout(prog='dot')  # use dot
        G.draw('static/process.png')
    except:
        # Any failure (malformed output, missing graphviz) is reported to
        # the caller instead of crashing the web request.
        err = True
        txt += '''Error generating graph.\n'''
    return txt, err
46
+
47
+
48
+
49
+
50
def DepGraphList(sentenceList):
    """Convert CoNLL-U sentences (``conllu`` token lists) into NLTK
    ``DependencyGraph`` objects, as expected by ``TransitionParser``.

    Sentences that NLTK cannot parse are skipped; the converted/skipped
    counts are printed for diagnostics.

    Args:
        sentenceList: iterable of ``conllu.TokenList`` objects.

    Returns:
        list of ``nltk.parse.DependencyGraph``.
    """
    dgList = []
    converted, skipped = 0, 0
    for sentence in sentenceList:
        rows = [' '.join([token['form'], token['upostag'],
                          str(token['head']), token['deprel'].upper()])
                for token in sentence]
        try:
            dg = DependencyGraph('\n'.join(rows))
        # Bug fix: the original bare ``except:`` also swallowed
        # KeyboardInterrupt/SystemExit; catch only real errors.
        except Exception:
            skipped += 1
            continue
        converted += 1
        dgList.append(dg)
    print(converted, skipped)
    return dgList
68
+
69
def main():
    """Smoke-test entry point: runs ``Process`` on a sample Hindi sentence.

    The commented-out code below is the original arc-eager training and
    evaluation pipeline, kept for reference.
    """
    #data_file = open('data/test.conllu','r',encoding='utf-8')
    #sentence_iter = parse_incr(data_file)
    #sentences = []
    #for s in sentence_iter:
    #    sentences.append(s)
    #training_set = DepGraphList(sentences[len(sentences)//4:])
    #test_set = DepGraphList(sentences[0:len(sentences)//4])

    #parser = TransitionParser('arc-eager')
    ## Training
    # parser.train(training_set,'models/arc_eager.model')
    # with open('models/parser2.pkl','wb') as out:
    #     pickle.dump(parser,out)
    # # ## Evaluation
    # with open('models/parser2.pkl','rb') as in_file:
    #     parser = pickle.load(in_file)
    # predictions = parser.parse(test_set,'models/arc_eager.model')
    # de = DependencyEvaluator(predictions,test_set)
    # print(de.eval())
    Process('राम अच्छा पुरुष है |')
    return
91
+
92
# Run the smoke test only when executed as a script, not on import.
if __name__=='__main__':
    main()
best_mapping.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52585379a81d5f8275ba347b5ddee6d69c196bc24fb0012713a63ec173b6312b
3
+ size 3523565
biaffine-parser-master/.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore data files
2
+ data
3
+
4
+ # ignore bash scripts
5
+ *.sh
6
+
7
+ # ignore experimental results
8
+ exp
9
+ results
10
+
11
+ # ignore log files
12
+ log*
13
+
14
+ # ignore pycache
15
+ __pycache__
16
+
17
+ # ignore saved model
18
+ *.pkl
19
+ *.pt
20
+
21
+ # ignore vscode
22
+ .vscode
biaffine-parser-master/.travis.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ language: python
2
+
3
+ # Setup anaconda
4
+ before_install:
5
+ - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
6
+ - chmod +x miniconda.sh
7
+ - ./miniconda.sh -b -p $HOME/miniconda
8
+ - export PATH=$HOME/miniconda/bin:$PATH
9
+ - conda update --yes --quiet conda
10
+ - conda config --set always_yes yes
11
+ - conda create --quiet -n py37 python=3.7
12
+ - source activate py37
13
+
14
+ # Install packages
15
+ install:
16
+ - conda install --quiet pytorch=1.3.0 -c pytorch
17
+ - conda install --quiet flake8
18
+ - pip install -r requirements.txt
19
+
20
+ script:
21
+ - flake8 .
22
+
23
+
24
+ # NOTE: the lines below were stray shell commands accidentally appended to
+ # this file; they duplicate the `before_install` steps above and make the
+ # YAML invalid, so they are preserved here only as comments.
+ # wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+ # chmod +x miniconda.sh
+ # ./miniconda.sh -b -p $HOME/miniconda
+ # export PATH=$HOME/miniconda/bin:$PATH
+ # conda update --yes --quiet conda
+ # conda config --set always_yes yes
+ # conda create --quiet -n py37 python=3.7
+ # source activate py37
+ #
+ # chmod +x miniconda.sh \
+ # ./miniconda.sh -b -p $HOME/miniconda \
+ # export PATH=$HOME/miniconda/bin:$PATH \
+ # source activate py37
biaffine-parser-master/README.md ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Biaffine Parser
2
+
3
+ [![Travis](https://img.shields.io/travis/zysite/biaffine-parser.svg)](https://travis-ci.org/zysite/biaffine-parser)
4
+ [![LICENSE](https://img.shields.io/github/license/zysite/biaffine-parser.svg)](https://github.com/zysite/biaffine-parser/blob/master/LICENSE)
5
+ [![GitHub stars](https://img.shields.io/github/stars/zysite/biaffine-parser.svg)](https://github.com/zysite/biaffine-parser/stargazers)
6
+ [![GitHub forks](https://img.shields.io/github/forks/zysite/biaffine-parser.svg)](https://github.com/zysite/biaffine-parser/network/members)
7
+
8
+ An implementation of "Deep Biaffine Attention for Neural Dependency Parsing".
9
+
10
+ Details and [hyperparameter choices](#Hyperparameters) are almost identical to those described in the paper,
11
+ except that we provide the Eisner rather than MST algorithm to ensure well-formedness.
12
+ Practically, projective decoding like Eisner is the best choice since PTB contains mostly (99.9%) projective trees.
13
+
14
+ Besides the basic implementations, we also provide other features to replace the POS tags (TAG),
15
+ i.e., character-level embeddings (CHAR) and BERT.
16
+
17
+ ## Requirements
18
+
19
+ * `python`: 3.7.0
20
+ * [`pytorch`](https://github.com/pytorch/pytorch): 1.3.0
21
+ * [`transformers`](https://github.com/huggingface/transformers): 2.1.1
22
+
23
+ ## Datasets
24
+
25
+ The model is evaluated on the Stanford Dependency conversion ([v3.3.0](https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip)) of the English Penn Treebank with POS tags predicted by [Stanford POS tagger](https://nlp.stanford.edu/software/stanford-postagger-full-2018-10-16.zip).
26
+
27
+ For all datasets, we follow the conventional data splits:
28
+
29
+ * Train: 02-21 (39,832 sentences)
30
+ * Dev: 22 (1,700 sentences)
31
+ * Test: 23 (2,416 sentences)
32
+
33
+ ## Performance
34
+
35
+ | FEAT | UAS | LAS | Speed (Sents/s) |
36
+ | ------------- | :---: | :---: | :-------------: |
37
+ | TAG | 95.90 | 94.25 | 1696.22 |
38
+ | TAG + Eisner | 95.93 | 94.28 | 350.46 |
39
+ | CHAR | 95.99 | 94.38 | 1464.59 |
40
+ | CHAR + Eisner | 96.02 | 94.41 | 323.73 |
41
+ | BERT | 96.64 | 95.11 | 438.72 |
42
+ | BERT + Eisner | 96.65 | 95.12 | 214.68 |
43
+
44
+ Note that punctuation is ignored in all evaluation metrics for PTB.
45
+
46
+ Aside from using consistent hyperparameters, there are some keypoints that significantly affect the performance:
47
+
48
+ - Dividing the pretrained embedding by its standard-deviation
49
+ - Applying the same dropout mask at every recurrent timestep
50
+ - Jointly dropping the word and additional feature representations
51
+
52
+ For the above reasons, we may have to give up some native modules in pytorch (e.g., `LSTM` and `Dropout`),
53
+ and use custom ones instead.
54
+
55
+ As shown above, our results have outperformed the [official implementation](https://github.com/tdozat/Parser-v1) (95.74 and 94.08).
56
+ Incorporating character-level features or external embeddings like BERT can further improve the performance of the model.
57
+
58
+ ## Usage
59
+
60
+ You can start the training, evaluation and prediction process by using subcommands registered in `parser.cmds`.
61
+
62
+ ```sh
63
+ $ python run.py -h
64
+ usage: run.py [-h] {evaluate,predict,train} ...
65
+
66
+ Create the Biaffine Parser model.
67
+
68
+ optional arguments:
69
+ -h, --help show this help message and exit
70
+
71
+ Commands:
72
+ {evaluate,predict,train}
73
+ evaluate Evaluate the specified model and dataset.
74
+ predict Use a trained model to make predictions.
75
+ train Train a model.
76
+ ```
77
+
78
+ Before triggering the subcommands, please make sure that the data files must be in CoNLL-X format.
79
+ If some fields are missing, you can use underscores as placeholders.
80
+ Below are some examples:
81
+
82
+ ```sh
83
+ $ python run.py train -p -d=0 -f=exp/ptb.char --feat=char \
84
+ --ftrain=data/ptb/train.conllx \
85
+ --fdev=data/ptb/dev.conllx \
86
+ --ftest=data/ptb/test.conllx \
87
+ --fembed=data/glove.6B.100d.txt \
88
+ --unk=unk
89
+
90
+ $ python run.py evaluate -d=0 -f=exp/ptb.char --feat=char --tree \
91
+ --fdata=data/ptb/test.conllx
92
+
93
+ $ cat data/naive.conllx
94
+ 1 Too _ _ _ _ _ _ _ _
95
+ 2 young _ _ _ _ _ _ _ _
96
+ 3 too _ _ _ _ _ _ _ _
97
+ 4 simple _ _ _ _ _ _ _ _
98
+ 5 , _ _ _ _ _ _ _ _
99
+ 6 sometimes _ _ _ _ _ _ _ _
100
+ 7 naive _ _ _ _ _ _ _ _
101
+ 8 . _ _ _ _ _ _ _ _
102
+
103
+ $ python run.py predict -d=0 -f=exp/ptb.char --feat=char --tree \
104
+ --fdata=data/naive.conllx \
105
+ --fpred=naive.conllx
106
+
107
+ $ cat naive.conllx
108
+ 1 Too _ _ _ _ 2 advmod _ _
109
+ 2 young _ _ _ _ 0 root _ _
110
+ 3 too _ _ _ _ 4 advmod _ _
111
+ 4 simple _ _ _ _ 2 dep _ _
112
+ 5 , _ _ _ _ 2 punct _ _
113
+ 6 sometimes _ _ _ _ 7 advmod _ _
114
+ 7 naive _ _ _ _ 2 dep _ _
115
+ 8 . _ _ _ _ 2 punct _ _
116
+
117
+ ```
118
+
119
+ All the optional arguments of the subcommands are as follows:
120
+
121
+ ```sh
122
+ $ python run.py train -h
123
+ usage: run.py train [-h] [--buckets BUCKETS] [--punct] [--ftrain FTRAIN]
124
+ [--fdev FDEV] [--ftest FTEST] [--fembed FEMBED]
125
+ [--unk UNK] [--conf CONF] [--file FILE] [--preprocess]
126
+ [--device DEVICE] [--seed SEED] [--threads THREADS]
127
+ [--tree] [--feat {tag,char,bert}]
128
+
129
+ optional arguments:
130
+ -h, --help show this help message and exit
131
+ --buckets BUCKETS max num of buckets to use
132
+ --punct whether to include punctuation
133
+ --ftrain FTRAIN path to train file
134
+ --fdev FDEV path to dev file
135
+ --ftest FTEST path to test file
136
+ --fembed FEMBED path to pretrained embeddings
137
+ --unk UNK unk token in pretrained embeddings
138
+ --conf CONF, -c CONF path to config file
139
+ --file FILE, -f FILE path to saved files
140
+ --preprocess, -p whether to preprocess the data first
141
+ --device DEVICE, -d DEVICE
142
+ ID of GPU to use
143
+ --seed SEED, -s SEED seed for generating random numbers
144
+ --threads THREADS, -t THREADS
145
+ max num of threads
146
+ --tree whether to ensure well-formedness
147
+ --feat {tag,char,bert}
148
+ choices of additional features
149
+
150
+ $ python run.py evaluate -h
151
+ usage: run.py evaluate [-h] [--batch-size BATCH_SIZE] [--buckets BUCKETS]
152
+ [--punct] [--fdata FDATA] [--conf CONF] [--file FILE]
153
+ [--preprocess] [--device DEVICE] [--seed SEED]
154
+ [--threads THREADS] [--tree] [--feat {tag,char,bert}]
155
+
156
+ optional arguments:
157
+ -h, --help show this help message and exit
158
+ --batch-size BATCH_SIZE
159
+ batch size
160
+ --buckets BUCKETS max num of buckets to use
161
+ --punct whether to include punctuation
162
+ --fdata FDATA path to dataset
163
+ --conf CONF, -c CONF path to config file
164
+ --file FILE, -f FILE path to saved files
165
+ --preprocess, -p whether to preprocess the data first
166
+ --device DEVICE, -d DEVICE
167
+ ID of GPU to use
168
+ --seed SEED, -s SEED seed for generating random numbers
169
+ --threads THREADS, -t THREADS
170
+ max num of threads
171
+ --tree whether to ensure well-formedness
172
+ --feat {tag,char,bert}
173
+ choices of additional features
174
+
175
+ $ python run.py predict -h
176
+ usage: run.py predict [-h] [--batch-size BATCH_SIZE] [--fdata FDATA]
177
+ [--fpred FPRED] [--conf CONF] [--file FILE]
178
+ [--preprocess] [--device DEVICE] [--seed SEED]
179
+ [--threads THREADS] [--tree] [--feat {tag,char,bert}]
180
+
181
+ optional arguments:
182
+ -h, --help show this help message and exit
183
+ --batch-size BATCH_SIZE
184
+ batch size
185
+ --fdata FDATA path to dataset
186
+ --fpred FPRED path to predicted result
187
+ --conf CONF, -c CONF path to config file
188
+ --file FILE, -f FILE path to saved files
189
+ --preprocess, -p whether to preprocess the data first
190
+ --device DEVICE, -d DEVICE
191
+ ID of GPU to use
192
+ --seed SEED, -s SEED seed for generating random numbers
193
+ --threads THREADS, -t THREADS
194
+ max num of threads
195
+ --tree whether to ensure well-formedness
196
+ --feat {tag,char,bert}
197
+ choices of additional features
198
+ ```
199
+
200
+ ## Hyperparameters
201
+
202
+ | Param | Description | Value |
203
+ | :------------ | :----------------------------------------------------------- | :--------------------------------------------------------------------: |
204
+ | n_embed | dimension of embeddings | 100 |
205
+ | n_char_embed | dimension of char embeddings | 50 |
206
+ | n_bert_layers | number of bert layers to use | 4 |
207
+ | embed_dropout | dropout ratio of embeddings | 0.33 |
208
+ | n_lstm_hidden | dimension of lstm hidden states | 400 |
209
+ | n_lstm_layers | number of lstm layers | 3 |
210
+ | lstm_dropout | dropout ratio of lstm | 0.33 |
211
+ | n_mlp_arc | arc mlp size | 500 |
212
+ | n_mlp_rel | label mlp size | 100 |
213
+ | mlp_dropout | dropout ratio of mlp | 0.33 |
214
+ | lr | starting learning rate of training | 2e-3 |
215
+ | betas | hyperparameters of momentum and L2 norm | (0.9, 0.9) |
216
+ | epsilon | stability constant | 1e-12 |
217
+ | annealing | formula of learning rate annealing | <img src="https://latex.codecogs.com/gif.latex?.75^{\frac{t}{5000}}"/> |
218
+ | batch_size | approximate number of tokens per training update | 5000 |
219
+ | epochs | max number of epochs | 50000 |
220
+ | patience | patience for early stop | 100 |
221
+ | min_freq | minimum frequency of words in the training set not discarded | 2 |
222
+ | fix_len | fixed length of a word | 20 |
223
+
224
+ ## References
225
+
226
+ * [Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734)
biaffine-parser-master/config.ini ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [Data]
2
+ bert_model = 'bert-base-multilingual-uncased'
3
+
4
+ [Network]
5
+ n_embed = 768
6
+ n_char_embed = 50
7
+ n_bert_layers = 4
8
+ embed_dropout = .33
9
+ n_lstm_hidden = 400
10
+ n_lstm_layers = 3
11
+ lstm_dropout = .33
12
+ n_mlp_arc = 500
13
+ n_mlp_rel = 100
14
+ mlp_dropout = .33
15
+
16
+ [Optimizer]
17
+ lr = 2e-3
18
+ mu = .9
19
+ nu = .9
20
+ epsilon = 1e-12
21
+ clip = 5.0
22
+ decay = .75
23
+ decay_steps = 5000
24
+
25
+ [Run]
26
+ batch_size = 1000
27
+ epochs = 300
28
+ patience = 30
29
+ min_freq = 2
30
+ fix_len = 20
biaffine-parser-master/data/naive3.conllx ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 1 அதற்கு _ _ _ _ 2 obl _ _
2
+ 2 கடந்த _ _ _ _ 4 amod _ _
3
+ 3 சட்டப்பேரவைத் _ _ _ _ 4 nmod _ _
4
+ 4 தேர்தலில் _ _ _ _ 5 obl _ _
5
+ 5 பலன்கிட்டியது _ _ _ _ 0 root _ _
6
+ 6 . _ _ _ _ 5 punct _ _
7
+
biaffine-parser-master/data/ptb/tamdev.conllx ADDED
The diff for this file is too large to render. See raw diff
 
biaffine-parser-master/data/ptb/tamtest.conllx ADDED
The diff for this file is too large to render. See raw diff
 
biaffine-parser-master/data/ptb/tamtrain.conllx ADDED
The diff for this file is too large to render. See raw diff
 
biaffine-parser-master/exp/ptb/fields ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333059b6bc4af3b1eca7d03cdeeef162912b6e2e5d96a0d7373d05c0beab614d
3
+ size 3189679
biaffine-parser-master/exp/ptb/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:626032b06dd8f45c5755707990069dc0451e00a0c2e7992bfbd62d05a078dc0e
3
+ size 736388858
biaffine-parser-master/parser/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .model import Model
4
+
5
+ __all__ = ['Model']
biaffine-parser-master/parser/cmds/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .evaluate import Evaluate
4
+ from .predict import Predict
5
+ from .train import Train
6
+
7
+ __all__ = ['Evaluate', 'Predict', 'Train']
biaffine-parser-master/parser/cmds/cmd.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import os
4
+ from parser.utils import Embedding
5
+ from parser.utils.alg import eisner
6
+ from parser.utils.common import bos, pad, unk
7
+ from parser.utils.corpus import CoNLL, Corpus
8
+ from parser.utils.field import BertField, CharField, Field
9
+ from parser.utils.fn import ispunct
10
+ from parser.utils.metric import Metric
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ from transformers import AutoTokenizer, BertTokenizer
15
+
16
+
17
class CMD(object):
    """Shared machinery for the train/evaluate/predict subcommands: builds
    or loads the data fields, and provides the train/evaluate/predict loops
    plus loss computation and tree decoding."""

    def __call__(self, args):
        """Prepare (or reload) the ``Field`` objects and derived vocabulary
        sizes, storing them on ``args`` for model construction."""
        self.args = args
        if not os.path.exists(args.file):
            os.mkdir(args.file)
        if not os.path.exists(args.fields) or args.preprocess:
            print("Preprocess the data")
            self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
            if args.feat == 'char':
                # Character-level feature field; words are tokenized to
                # character lists of at most ``fix_len`` characters.
                self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                      fix_len=args.fix_len, tokenize=list)
            elif args.feat == 'bert':
                # BERT subword features; the tokenizer's encode produces ids.
                tokenizer = BertTokenizer.from_pretrained(args.bert_model)
                #tokenizer = AutoTokenizer.from_pretrained("sailen7/finetuning-sentiment-model-3000-samples")
                self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                      tokenize=tokenizer.encode)
            else:
                # Default: gold POS tags as the additional feature.
                self.FEAT = Field('tags', bos=bos)
            self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
            self.REL = Field('rels', bos=bos)
            if args.feat in ('char', 'bert'):
                # char/bert features share the FORM column with words.
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.HEAD, DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                    HEAD=self.HEAD, DEPREL=self.REL)

            train = Corpus.load(args.ftrain, self.fields)
            # if args.fembed:
            #     embed = Embedding.load(args.fembed, args.unk)
            # else:
            # Pretrained-embedding loading is disabled; vocab only.
            embed = None
            self.WORD.build(train, args.min_freq, embed)
            self.FEAT.build(train)
            self.REL.build(train)
            torch.save(self.fields, args.fields)
        else:
            # Reuse the fields saved by a previous preprocessing run.
            self.fields = torch.load(args.fields)
            if args.feat in ('char', 'bert'):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
        # Vocabulary ids of punctuation tokens, used to exclude punctuation
        # from evaluation metrics when --punct is not set.
        self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(args.device)
        self.criterion = nn.CrossEntropyLoss()

        print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
        args.update({
            'n_words': self.WORD.vocab.n_init,
            'n_feats': len(self.FEAT.vocab),
            'n_rels': len(self.REL.vocab),
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index
        })

    def train(self, loader):
        """Run one optimization epoch over *loader*."""
        self.model.train()

        for words, feats, arcs, rels in loader:
            self.optimizer.zero_grad()

            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.clip)
            self.optimizer.step()
            self.scheduler.step()

    @torch.no_grad()
    def evaluate(self, loader):
        """Compute mean loss and attachment metrics over *loader*."""
        self.model.eval()

        loss, metric = 0, Metric()

        for words, feats, arcs, rels in loader:
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss += self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
        loss /= len(loader)

        return loss, metric

    @torch.no_grad()
    def predict(self, loader):
        """Decode head indices and relation labels for every sentence in
        *loader*; returns (all_arcs, all_rels) as plain Python lists."""
        self.model.eval()

        all_arcs, all_rels = [], []
        for words, feats in loader:
            print("words ->", words, " ", "features -> ",feats )
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            arc_scores, rel_scores = self.model(words, feats)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # Flatten the batch and re-split per sentence length.
            all_arcs.extend(arc_preds[mask].split(lens))
            all_rels.extend(rel_preds[mask].split(lens))
        all_arcs = [seq.tolist() for seq in all_arcs]
        all_rels = [self.REL.vocab.id2token(seq.tolist()) for seq in all_rels]

        return all_arcs, all_rels

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask):
        """Cross-entropy over gold heads plus cross-entropy over gold
        relations (scored at the gold head position), on masked tokens."""
        arc_scores, arcs = arc_scores[mask], arcs[mask]
        rel_scores, rels = rel_scores[mask], rels[mask]
        # Select each token's relation scores at its gold head.
        rel_scores = rel_scores[torch.arange(len(arcs)), arcs]
        arc_loss = self.criterion(arc_scores, arcs)
        rel_loss = self.criterion(rel_scores, rels)
        loss = arc_loss + rel_loss

        return loss

    def decode(self, arc_scores, rel_scores, mask):
        """Pick heads (Eisner when --tree, else greedy argmax) and then the
        best relation label for each chosen head."""
        if self.args.tree:
            arc_preds = eisner(arc_scores, mask)
        else:
            arc_preds = arc_scores.argmax(-1)
        rel_preds = rel_scores.argmax(-1)
        rel_preds = rel_preds.gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)

        return arc_preds, rel_preds
biaffine-parser-master/parser/cmds/evaluate.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from datetime import datetime
4
+ from parser import Model
5
+ from parser.cmds.cmd import CMD
6
+ from parser.utils.corpus import Corpus
7
+ from parser.utils.data import TextDataset, batchify
8
+
9
+
10
class Evaluate(CMD):
    """Subcommand: evaluate a trained model on a CoNLL-X dataset."""

    def add_subparser(self, name, parser):
        """Register the ``evaluate`` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Evaluate the specified model and dataset.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')

        return subparser

    def __call__(self, args):
        """Load fields via CMD, then the dataset and model, and print the
        loss, attachment metrics, and sentences-per-second throughput."""
        super(Evaluate, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        dataset = TextDataset(corpus, self.fields, args.buckets)
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches, "
              f"{len(dataset.buckets)} buckets")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Evaluate the dataset")
        start = datetime.now()
        loss, metric = self.evaluate(dataset.loader)
        total_time = datetime.now() - start
        print(f"Loss: {loss:.4f} {metric}")
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
biaffine-parser-master/parser/cmds/predict.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from datetime import datetime
4
+ from parser import Model
5
+ from parser.cmds.cmd import CMD
6
+ from parser.utils.corpus import Corpus
7
+ from parser.utils.data import TextDataset, batchify
8
+
9
+
10
class Predict(CMD):
    """Subcommand: run a trained model over a dataset and save predictions."""

    def add_subparser(self, name, parser):
        """Register the ``predict`` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Use a trained model to make predictions.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')
        subparser.add_argument('--fpred', default='pred.conllx',
                               help='path to predicted result')

        return subparser

    def __call__(self, args):
        """Load fields via CMD, predict heads/relations for every sentence,
        and write the annotated corpus to ``args.fpred``."""
        super(Predict, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        # Only word and feature fields are needed for inference (no gold
        # heads/relations).
        dataset = TextDataset(corpus, [self.WORD, self.FEAT])
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)

        print(type(dataset))
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Make predictions on the dataset")
        start = datetime.now()
        corpus.heads, corpus.rels = self.predict(dataset.loader)
        print(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        total_time = datetime.now() - start
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
biaffine-parser-master/parser/cmds/train.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from datetime import datetime, timedelta
4
+ from parser import Model
5
+ from parser.cmds.cmd import CMD
6
+ from parser.utils.corpus import Corpus
7
+ from parser.utils.data import TextDataset, batchify
8
+ from parser.utils.metric import Metric
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.optim import Adam
13
+ from torch.optim.lr_scheduler import ExponentialLR
14
+
15
+
16
class Train(CMD):
    """Subcommand: train the biaffine parser with early stopping on dev."""

    def add_subparser(self, name, parser):
        """Register the ``train`` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Train a model.'
        )
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--ftrain', default='data/ptb/tamtrain.conllx',
                               help='path to train file')
        subparser.add_argument('--fdev', default='data/ptb/tamdev.conllx',
                               help='path to dev file')
        subparser.add_argument('--ftest', default='data/ptb/tamtest.conllx',
                               help='path to test file')
        # subparser.add_argument('--fembed', default='data/tam.txt',
        #                        help='path to pretrained embeddings')
        subparser.add_argument('--unk', default='unk',
                               help='unk token in pretrained embeddings')

        return subparser

    def __call__(self, args):
        """Build datasets and model, run the epoch loop with checkpointing
        on dev improvement, then report the final test score."""
        super(Train, self).__call__(args)

        train = Corpus.load(args.ftrain, self.fields)
        dev = Corpus.load(args.fdev, self.fields)
        test = Corpus.load(args.ftest, self.fields)

        train = TextDataset(train, self.fields, args.buckets)
        dev = TextDataset(dev, self.fields, args.buckets)
        test = TextDataset(test, self.fields, args.buckets)
        # set the data loaders
        train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        test.loader = batchify(test, args.batch_size)
        # NOTE(review): the dev/test lines below print ``train.buckets`` —
        # presumably a copy-paste slip; confirm before relying on the counts.
        print(f"{'train:':6} {len(train):5} sentences, "
              f"{len(train.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        print(f"{'dev:':6} {len(dev):5} sentences, "
              f"{len(dev.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        print(f"{'test:':6} {len(test):5} sentences, "
              f"{len(test.loader):3} batches, "
              f"{len(train.buckets)} buckets")

        print("Create the model")
        self.model = Model(args).load_pretrained(self.WORD.embed)
        print(f"{self.model}\n")
        self.model = self.model.to(args.device)
        if torch.cuda.device_count() > 1:
            print("GPU")
            self.model = nn.DataParallel(self.model)
        self.optimizer = Adam(self.model.parameters(),
                              args.lr,
                              (args.mu, args.nu),
                              args.epsilon)
        # Exponential decay tuned so the rate decays by ``decay`` every
        # ``decay_steps`` scheduler steps.
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1/args.decay_steps))

        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            self.train(train.loader)

            print(f"Epoch {epoch} / {args.epochs}:")
            loss, train_metric = self.evaluate(train.loader)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = self.evaluate(dev.loader)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = self.evaluate(test.loader)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            # NOTE(review): ``epoch > args.patience`` also delays the first
            # checkpoint until after ``patience`` epochs — confirm intended.
            if dev_metric > best_metric and epoch > args.patience:
                best_e, best_metric = epoch, dev_metric
                if hasattr(self.model, 'module'):
                    # DataParallel wraps the real model in ``.module``.
                    self.model.module.save(args.model)
                else:
                    self.model.save(args.model)
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            # Early stopping: no dev improvement for ``patience`` epochs.
            if epoch - best_e >= args.patience:
                break
        self.model = Model.load(args.model)
        loss, metric = self.evaluate(test.loader)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
biaffine-parser-master/parser/config.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from ast import literal_eval
4
+ from configparser import ConfigParser
5
+ from argparse import Namespace
6
+
7
+
8
class Config(ConfigParser):
    """ConfigParser subclass exposing every option as an attribute.

    All options from all sections of the .ini file are parsed with
    `ast.literal_eval` and stored on an internal `Namespace`, so a value
    declared as `n_embed = 100` is available as `config.n_embed`.
    """

    def __init__(self, path):
        super(Config, self).__init__()

        self.read(path)
        self.namespace = Namespace()
        # flatten all sections into one attribute namespace, turning each
        # raw string into the Python literal it spells
        parsed = {}
        for section in self.sections():
            for key, raw in self.items(section):
                parsed[key] = literal_eval(raw)
        self.update(parsed)

    def __repr__(self):
        divider = "-" * 15 + "-+-" + "-" * 25 + "\n"
        rows = [divider, f"{'Param':15} | {'Value':^25}\n", divider]
        rows += [f"{key:15} | {str(val):^25}\n"
                 for key, val in vars(self.namespace).items()]
        rows.append(divider)

        return "".join(rows)

    def __getattr__(self, attr):
        # unknown attributes fall back to the parsed namespace
        return getattr(self.namespace, attr)

    def __getstate__(self):
        return vars(self)

    def __setstate__(self, state):
        self.__dict__.update(state)

    def update(self, kwargs):
        """Copy `kwargs` onto the namespace; return self for chaining."""
        for key, val in kwargs.items():
            setattr(self.namespace, key, val)

        return self
biaffine-parser-master/parser/model.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from parser.modules import CHAR_LSTM, MLP, BertEmbedding, Biaffine, BiLSTM
4
+ from parser.modules.dropout import IndependentDropout, SharedDropout
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence,
9
+ pad_sequence)
10
+
11
+
12
class Model(nn.Module):
    """Biaffine dependency parser (Dozat & Manning, 2017).

    A sentence is encoded by concatenated word + feature embeddings and a
    BiLSTM; four MLPs project the states into head/dependent spaces and two
    biaffine layers score every candidate arc and every arc label.
    """

    def __init__(self, args):
        super(Model, self).__init__()

        self.args = args
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                       embedding_dim=args.n_embed)
        # the feature embedding: char-LSTM, BERT, or a plain tag table
        if args.feat == 'char':
            self.feat_embed = CHAR_LSTM(n_chars=args.n_feats,
                                        n_embed=args.n_char_embed,
                                        n_out=args.n_embed)
        elif args.feat == 'bert':
            self.feat_embed = BertEmbedding(model=args.bert_model,
                                            n_layers=args.n_bert_layers,
                                            n_out=args.n_embed)
        else:
            self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                           embedding_dim=args.n_embed)
        self.embed_dropout = IndependentDropout(p=args.embed_dropout)

        # the word-lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed*2,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

        # the MLP layers projecting LSTM states into arc/rel spaces
        self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)
        self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=args.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                                 n_out=args.n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.pad_index = args.pad_index
        self.unk_index = args.unk_index

    def load_pretrained(self, embed=None):
        """Attach a frozen pretrained embedding table (if given).

        The trainable word embedding is zeroed so it starts out as a pure
        delta on top of the pretrained vectors, which are concatenated in
        forward.
        """
        if embed is not None:
            self.pretrained = nn.Embedding.from_pretrained(embed)
            nn.init.zeros_(self.word_embed.weight)

        return self

    def forward(self, words, feats):
        """Score all arcs and labels for a padded batch.

        Returns:
            s_arc: [batch_size, seq_len, seq_len] arc scores.
            s_rel: [batch_size, seq_len, seq_len, n_rels] label scores.
        """
        batch_size, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        lens = mask.sum(dim=1)
        # set the indices larger than num_embeddings to unk_index
        ext_mask = words.ge(self.word_embed.num_embeddings)
        ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed = torch.cat((word_embed, self.pretrained(words)), dim=2)
        if self.args.feat == 'char':
            feat_embed = self.feat_embed(feats[mask])
            feat_embed = pad_sequence(feat_embed.split(lens.tolist()), True)
        elif self.args.feat == 'bert':
            feat_embed = self.feat_embed(*feats)
        else:
            feat_embed = self.feat_embed(feats)
        word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), dim=-1)

        # pack_padded_sequence requires the lengths on the CPU
        lens = lens.to('cpu')
        x = pack_padded_sequence(embed, lens, True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
        # set the scores that exceed the length of each sentence to -inf
        s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))

        return s_arc, s_rel

    @classmethod
    def load(cls, path):
        """Restore a saved model (args + weights + pretrained table)."""
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state = torch.load(path, map_location=device)
        model = cls(state['args'])
        model.load_pretrained(state['pretrained'])
        # strict=False: the pretrained table is stored separately
        model.load_state_dict(state['state_dict'], False)
        model.to(device)

        return model

    def save(self, path):
        """Persist args + weights; the frozen pretrained table is split out."""
        state_dict, pretrained = self.state_dict(), None
        if hasattr(self, 'pretrained'):
            pretrained = state_dict.pop('pretrained.weight')
        state = {
            'args': self.args,
            'state_dict': state_dict,
            'pretrained': pretrained
        }
        torch.save(state, path)
biaffine-parser-master/parser/modules/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from . import dropout
4
+ from .bert import BertEmbedding
5
+ from .biaffine import Biaffine
6
+ from .bilstm import BiLSTM
7
+ from .char_lstm import CHAR_LSTM
8
+ from .mlp import MLP
9
+
10
+ __all__ = ['CHAR_LSTM', 'MLP', 'BertEmbedding',
11
+ 'Biaffine', 'BiLSTM', 'dropout']
biaffine-parser-master/parser/modules/bert.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import BertModel
6
+
7
+ from .scalar_mix import ScalarMix
8
+
9
+
10
class BertEmbedding(nn.Module):
    """Wraps a pretrained BERT model to produce word-level embeddings.

    Subword pieces are run through BERT once, the hidden states of the last
    `n_layers` layers are combined with a learned ScalarMix, subword vectors
    are mean-pooled back to word level and projected to `n_out` dimensions.
    """

    def __init__(self, model, n_layers, n_out, requires_grad=False):
        super(BertEmbedding, self).__init__()

        # output_hidden_states=True so all layer outputs are returned
        self.bert = BertModel.from_pretrained(model, output_hidden_states=True)
        self.bert = self.bert.requires_grad_(requires_grad)
        self.n_layers = n_layers
        self.n_out = n_out
        self.requires_grad = requires_grad
        self.hidden_size = self.bert.config.hidden_size

        self.scalar_mix = ScalarMix(n_layers)
        self.projection = nn.Linear(self.hidden_size, n_out, False)

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += f"n_layers={self.n_layers}, n_out={self.n_out}"
        if self.requires_grad:
            s += f", requires_grad={self.requires_grad}"
        s += ')'

        return s

    def forward(self, subwords, bert_lens, bert_mask):
        """Embed a batch of subword id sequences at word granularity.

        bert_lens[b, w] = number of subword pieces of word w (0 = padding);
        bert_mask marks real subword positions.
        """
        batch_size, seq_len = bert_lens.shape
        # words that have at least one subword piece
        mask = bert_lens.gt(0)

        if not self.requires_grad:
            self.bert.eval()
        # a single BERT pass (the original code ran the model twice and
        # discarded the first result)
        # NOTE(review): tuple unpacking assumes BERT returns tuples
        # (return_dict=False in transformers >= 4) — verify against the
        # installed transformers version
        _, _, bert = self.bert(subwords, attention_mask=bert_mask)
        # mix the hidden states of the last n_layers layers
        bert = bert[-self.n_layers:]
        bert = self.scalar_mix(bert)
        # regroup subword vectors by word and mean-pool each group
        bert = bert[bert_mask].split(bert_lens[mask].tolist())
        bert = torch.stack([i.mean(0) for i in bert])
        # scatter pooled vectors back into [batch, seq_len, hidden]
        bert_embed = bert.new_zeros(batch_size, seq_len, self.hidden_size)
        bert_embed = bert_embed.masked_scatter_(mask.unsqueeze(-1), bert)
        bert_embed = self.projection(bert_embed)

        return bert_embed
biaffine-parser-master/parser/modules/biaffine.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
class Biaffine(nn.Module):
    """Biaffine attention scorer: s = x^T W y with optional bias rows.

    With bias_x/bias_y enabled, a constant 1 is appended to x/y so the
    bilinear form also captures linear and constant terms.
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super(Biaffine, self).__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # one (n_in+bias_x) x (n_in+bias_y) matrix per output label
        self.weight = nn.Parameter(torch.Tensor(n_out,
                                                n_in + bias_x,
                                                n_in + bias_y))
        self.reset_parameters()

    def extra_repr(self):
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")

        return ", ".join(parts)

    def reset_parameters(self):
        # start from all-zero scores
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        if self.bias_x:
            ones = torch.ones_like(x[..., :1])
            x = torch.cat((x, ones), -1)
        if self.bias_y:
            ones = torch.ones_like(y[..., :1])
            y = torch.cat((y, ones), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # drop the label dimension when scoring plain arcs (n_out == 1)
        return scores.squeeze(1)
biaffine-parser-master/parser/modules/bilstm.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from parser.modules.dropout import SharedDropout
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.modules.rnn import apply_permutation
8
+ from torch.nn.utils.rnn import PackedSequence
9
+
10
+
11
class BiLSTM(nn.Module):
    """Multi-layer bidirectional LSTM over PackedSequence inputs.

    Unlike nn.LSTM, this implementation applies SharedDropout to the
    recurrent hidden states (the same mask reused at every timestep),
    which is why it is built from individual LSTMCells.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
        super(BiLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        # one forward and one backward cell per layer
        self.f_cells = nn.ModuleList()
        self.b_cells = nn.ModuleList()
        for _ in range(self.num_layers):
            self.f_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            self.b_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            # deeper layers consume the concatenated fwd+bwd states
            input_size = hidden_size * 2

        self.reset_parameters()

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += f"{self.input_size}, {self.hidden_size}"
        if self.num_layers > 1:
            s += f", num_layers={self.num_layers}"
        if self.dropout > 0:
            s += f", dropout={self.dropout}"
        s += ')'

        return s

    def reset_parameters(self):
        for param in self.parameters():
            # apply orthogonal_ to weight
            if len(param.shape) > 1:
                nn.init.orthogonal_(param)
            # apply zeros_ to bias
            else:
                nn.init.zeros_(param)

    def permute_hidden(self, hx, permutation):
        # reorder (h, c) to/from the PackedSequence's sorted order
        if permutation is None:
            return hx
        h = apply_permutation(hx[0], permutation)
        c = apply_permutation(hx[1], permutation)

        return h, c

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        # Run one direction of one layer over the packed timesteps.
        # batch_sizes shrinks over time, so shorter sequences drop out of
        # (or, when reversed, join) the active part of the batch.
        hx_0 = hx_i = hx
        hx_n, output = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            # one recurrent dropout mask, reused at every timestep
            hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

        for t in steps:
            last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
            if last_batch_size < batch_size:
                # sequences joining the batch start from the initial state
                hx_i = [torch.cat((h, ih[last_batch_size:batch_size]))
                        for h, ih in zip(hx_i, hx_0)]
            else:
                # sequences that just ended: stash their final states
                hx_n.append([h[batch_size:] for h in hx_i])
                hx_i = [h[:batch_size] for h in hx_i]
            hx_i = [h for h in cell(x[t], hx_i)]
            output.append(hx_i[0])
            if self.training:
                hx_i[0] = hx_i[0] * hid_mask[:batch_size]
        if reverse:
            # in reverse the full batch is active at the first timestep,
            # so hx_i already holds every final state
            hx_n = hx_i
            output.reverse()
        else:
            hx_n.append(hx_i)
            hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
        output = torch.cat(output)

        return output, hx_n

    def forward(self, sequence, hx=None):
        """sequence: a PackedSequence; returns (PackedSequence, (h_n, c_n))."""
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            ih = x.new_zeros(self.num_layers * 2, batch_size, self.hidden_size)
            h, c = ih, ih
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        # [num_layers, directions, batch, hidden]
        h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
        c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

        for i in range(self.num_layers):
            # split the packed data back into per-timestep chunks
            x = torch.split(x, batch_sizes)
            if self.training:
                # input dropout shared across timesteps within the layer
                # (NOTE: the comprehension variable shadows the layer index)
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [i * mask[:len(i)] for i in x]
            x_f, (h_f, c_f) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                 hx=(h[i, 1], c[i, 1]),
                                                 cell=self.b_cells[i],
                                                 batch_sizes=batch_sizes,
                                                 reverse=True)
            # concatenate both directions as the next layer's input
            x = torch.cat((x_f, x_b), -1)
            h_n.append(torch.stack((h_f, h_b)))
            c_n.append(torch.stack((c_f, c_b)))
        x = PackedSequence(x,
                           sequence.batch_sizes,
                           sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx
biaffine-parser-master/parser/modules/char_lstm.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn.utils.rnn import pack_padded_sequence
6
+
7
+
8
class CHAR_LSTM(nn.Module):
    """Character-level BiLSTM producing one vector per word.

    Each word arrives as a (padded) sequence of character ids; the final
    hidden states of both LSTM directions are concatenated into a single
    n_out-dimensional word representation.
    """

    def __init__(self, n_chars, n_embed, n_out):
        super(CHAR_LSTM, self).__init__()

        # the embedding layer
        self.embed = nn.Embedding(num_embeddings=n_chars,
                                  embedding_dim=n_embed)
        # the lstm layer
        self.lstm = nn.LSTM(input_size=n_embed,
                            hidden_size=n_out//2,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        """x: [n_words, max_word_len] character ids, 0 = padding."""
        mask = x.gt(0)
        lens = mask.sum(dim=1)

        # pack_padded_sequence requires the lengths on the CPU
        # (consistent with Model.forward); enforce_sorted=False
        x = pack_padded_sequence(self.embed(x), lens.cpu(), True, False)
        x, (hidden, _) = self.lstm(x)
        # concatenate the final states of the two directions
        hidden = torch.cat(torch.unbind(hidden), dim=-1)

        return hidden
biaffine-parser-master/parser/modules/dropout.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
class SharedDropout(nn.Module):
    """Dropout that shares a single mask across the sequence dimension.

    One Bernoulli mask is drawn from the first timestep's slice and
    broadcast over all timesteps, so the same units are dropped at every
    position (a.k.a. variational / locked dropout).
    """

    def __init__(self, p=0.5, batch_first=True):
        super(SharedDropout, self).__init__()

        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        desc = f"p={self.p}"
        if self.batch_first:
            desc += f", batch_first={self.batch_first}"

        return desc

    def forward(self, x):
        # identity at evaluation time
        if not self.training:
            return x
        template = x[:, 0] if self.batch_first else x[0]
        mask = self.get_mask(template, self.p)
        if self.batch_first:
            mask = mask.unsqueeze(1)
        x *= mask

        return x

    @staticmethod
    def get_mask(x, p):
        # inverted dropout: kept units are scaled by 1/(1-p)
        return x.new_empty(x.shape).bernoulli_(1 - p) / (1 - p)
38
+
39
+
40
class IndependentDropout(nn.Module):
    """Drops whole embedding vectors independently per input stream.

    When several streams are dropped at the same position, the survivors
    are rescaled so the expected total magnitude is preserved
    (Dozat & Manning, 2017).
    """

    def __init__(self, p=0.5):
        super(IndependentDropout, self).__init__()

        self.p = p

    def extra_repr(self):
        return f"p={self.p}"

    def forward(self, *items):
        # identity at evaluation time
        if not self.training:
            return items
        # one [batch, seq_len] keep-mask per input stream
        masks = [x.new_empty(x.shape[:2]).bernoulli_(1 - self.p)
                 for x in items]
        total = sum(masks)
        # rescale positions where some streams were dropped; the
        # denominator is clamped at 1 to avoid division by zero
        scale = len(items) / torch.max(total, torch.ones_like(total))
        return [item * (mask * scale).unsqueeze(dim=-1)
                for item, mask in zip(items, masks)]
biaffine-parser-master/parser/modules/mlp.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from parser.modules.dropout import SharedDropout
4
+
5
+ import torch.nn as nn
6
+
7
+
8
class MLP(nn.Module):
    """Single-layer perceptron: linear -> LeakyReLU -> shared dropout."""

    def __init__(self, n_in, n_hidden, dropout=0):
        super(MLP, self).__init__()

        self.linear = nn.Linear(n_in, n_hidden)
        self.activation = nn.LeakyReLU(negative_slope=0.1)
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def reset_parameters(self):
        # orthogonal weights, zero bias
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        # project, apply the nonlinearity, then drop shared units
        return self.dropout(self.activation(self.linear(x)))
biaffine-parser-master/parser/modules/scalar_mix.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+
7
+
8
class ScalarMix(nn.Module):
    """Learned softmax-weighted average of n_layers tensors, scaled by gamma.

    Used to combine the hidden states of several BERT layers into a single
    tensor (as in ELMo, Peters et al., 2018).
    """

    def __init__(self, n_layers, dropout=0):
        super(ScalarMix, self).__init__()

        self.n_layers = n_layers

        # one scalar logit per layer; softmax turns them into weights
        self.weights = nn.Parameter(torch.zeros(n_layers))
        self.gamma = nn.Parameter(torch.tensor([1.0]))
        self.dropout = nn.Dropout(dropout)

    def extra_repr(self):
        desc = f"n_layers={self.n_layers}"
        if self.dropout.p > 0:
            desc += f", dropout={self.dropout.p}"

        return desc

    def forward(self, tensors):
        # normalized (optionally dropped) per-layer weights
        normed_weights = self.dropout(self.weights.softmax(-1))
        mixed = sum(w * h for w, h in zip(normed_weights, tensors))

        return self.gamma * mixed
biaffine-parser-master/parser/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from . import corpus, data, field, fn, metric
4
+ from .embedding import Embedding
5
+ from .vocab import Vocab
6
+
7
+ __all__ = ['Corpus', 'Embedding', 'Vocab',
8
+ 'corpus', 'data', 'field', 'fn', 'metric']
biaffine-parser-master/parser/utils/alg.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+ from torch.nn.utils.rnn import pad_sequence
5
+
6
+
7
def kmeans(x, k):
    """1-D k-means over (integer) sentence lengths.

    Returns (centroids, clusters) where clusters[i] lists the indices of
    the datapoints assigned to centroids[i]; empty clusters are dropped.
    Initialization is random, so repeated calls may bucket differently.
    """
    x = torch.tensor(x, dtype=torch.float)
    # count the frequency of each datapoint
    d, indices, f = x.unique(return_inverse=True, return_counts=True)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = d[torch.randperm(len(d))[:k]], None
    # assign labels to each datapoint based on centroids
    dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not c.equal(old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that the empty one
        for i in range(k):
            if not y.eq(i).any():
                mask = y.eq(torch.arange(k).unsqueeze(-1))
                lens = mask.sum(dim=-1)
                biggest = mask[lens.argmax()].nonzero().view(-1)
                farthest = dists[biggest].argmax()
                y[biggest[farthest]] = i
        mask = y.eq(torch.arange(k).unsqueeze(-1))
        # update the centroids
        c, old = (total * mask).sum(-1) / (f * mask).sum(-1), c
        # re-assign all datapoints to clusters
        dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, assigned = y[indices], y.unique().tolist()
    # get the centroids of the assigned clusters
    centroids = c[assigned].tolist()
    # map all values of datapoints to buckets
    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]

    return centroids, clusters
45
+
46
+
47
def eisner(scores, mask):
    """Decode the best projective tree per sentence with Eisner's algorithm.

    scores: [batch_size, seq_len, seq_len] arc scores; mask marks real
    tokens. Returns padded head indices per sentence, position 0 holding
    the root.
    """
    lens = mask.sum(1)
    batch_size, seq_len, _ = scores.shape
    # rearrange for diagonal-stripe updates over the charts
    scores = scores.permute(2, 1, 0)
    # span-score charts: s_i incomplete spans, s_c complete spans
    s_i = torch.full_like(scores, float('-inf'))
    s_c = torch.full_like(scores, float('-inf'))
    # backpointer charts
    p_i = scores.new_zeros(seq_len, seq_len, batch_size).long()
    p_c = scores.new_zeros(seq_len, seq_len, batch_size).long()
    s_c.diagonal().fill_(0)

    for w in range(1, seq_len):
        # n spans of width w start at positions 0..n-1
        n = seq_len - w
        starts = p_i.new_tensor(range(n)).unsqueeze(0)
        # ilr = C(i->r) + C(j->r+1)
        ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
        # [batch_size, n, w]
        ilr = ilr.permute(2, 0, 1)
        il = ilr + scores.diagonal(-w).unsqueeze(-1)
        # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j
        il_span, il_path = il.max(-1)
        s_i.diagonal(-w).copy_(il_span)
        p_i.diagonal(-w).copy_(il_path + starts)
        ir = ilr + scores.diagonal(w).unsqueeze(-1)
        # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j
        ir_span, ir_path = ir.max(-1)
        s_i.diagonal(w).copy_(ir_span)
        p_i.diagonal(w).copy_(ir_path + starts)

        # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
        cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
        cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
        s_c.diagonal(-w).copy_(cl_span)
        p_c.diagonal(-w).copy_(cl_path + starts)
        # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
        cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
        cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
        s_c.diagonal(w).copy_(cr_span)
        # the root-spanning chart entry is only valid for sentences of
        # exactly this length
        s_c[0, w][lens.ne(w)] = float('-inf')
        p_c.diagonal(w).copy_(cr_path + starts + 1)

    # follow the backpointers on the CPU to recover each tree
    predicts = []
    p_c = p_c.permute(2, 0, 1).cpu()
    p_i = p_i.permute(2, 0, 1).cpu()
    for i, length in enumerate(lens.tolist()):
        heads = p_c.new_ones(length + 1, dtype=torch.long)
        backtrack(p_i[i], p_c[i], heads, 0, length, True)
        predicts.append(heads.to(mask.device))

    return pad_sequence(predicts, True)
96
+
97
+
98
def backtrack(p_i, p_c, heads, i, j, complete):
    """Recursively recover head assignments from the Eisner backpointers.

    p_i / p_c are the backpointer tables of incomplete / complete spans;
    `heads` is filled in place; `complete` selects which table to follow
    for the current span (i, j).
    """
    if i == j:
        return
    if complete:
        r = p_c[i, j]
        backtrack(p_i, p_c, heads, i, r, False)
        backtrack(p_i, p_c, heads, r, j, True)
    else:
        # an incomplete span fixes the dependency: j's head is i
        r, heads[j] = p_i[i, j], i
        i, j = sorted((i, j))
        backtrack(p_i, p_c, heads, i, r, True)
        backtrack(p_i, p_c, heads, j, r + 1, True)
110
+
111
+
112
def stripe(x, n, w, offset=(0, 0), dim=1):
    r'''Return a diagonal stripe of the tensor as a zero-copy strided view.

    Parameters:
        x (Tensor): the input tensor with 2 or more dims.
        n (int): the length of the stripe.
        w (int): the width of the stripe.
        offset (tuple): the offset of the first two dims.
        dim (int): 0 if returns a horizontal stripe; 1 else.

    Example::
        >>> x = torch.arange(25).view(5, 5)
        >>> x
        tensor([[ 0,  1,  2,  3,  4],
                [ 5,  6,  7,  8,  9],
                [10, 11, 12, 13, 14],
                [15, 16, 17, 18, 19],
                [20, 21, 22, 23, 24]])
        >>> stripe(x, 2, 3, (1, 1))
        tensor([[ 6,  7,  8],
                [12, 13, 14]])
        >>> stripe(x, 2, 3, dim=0)
        tensor([[ 0,  5, 10],
                [ 6, 11, 16]])
    '''
    x = x.contiguous()
    seq_len = x.size(1)
    # number of scalar elements per cell of the leading 2-D grid
    numel = x[0, 0].numel()
    # step down the main diagonal between stripe rows; within a row, step
    # right (dim=1) or down (dim=0)
    new_stride = list(x.stride())
    new_stride[0] = (seq_len + 1) * numel
    new_stride[1] = (seq_len if dim == 0 else 1) * numel
    start = (offset[0] * seq_len + offset[1]) * numel
    return x.as_strided(size=(n, w, *x.shape[2:]),
                        stride=new_stride,
                        storage_offset=start)
biaffine-parser-master/parser/utils/common.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
# Special-token constants shared by all fields and vocabularies.
pad = '<pad>'  # padding token
unk = '<unk>'  # out-of-vocabulary token
bos = '<bos>'  # beginning-of-sentence token
eos = '<eos>'  # end-of-sentence token
biaffine-parser-master/parser/utils/corpus.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from collections import namedtuple
4
+ from collections.abc import Iterable
5
+ from parser.utils.field import Field
6
+
7
+ CoNLL = namedtuple(typename='CoNLL',
8
+ field_names=['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
9
+ 'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL'],
10
+ defaults=[None]*10)
11
+
12
+
13
class Sentence(object):
    """One sentence: a value column per field, addressable by field name."""

    def __init__(self, fields, values):
        for field, value in zip(fields, values):
            # a tuple of sub-fields shares the same raw column value
            if isinstance(field, Iterable):
                for sub in field:
                    setattr(self, sub.name, value)
            else:
                setattr(self, field.name, value)
        self.fields = fields

    @property
    def values(self):
        """Yield one column of values per (group of) field(s)."""
        for field in self.fields:
            name = field[0].name if isinstance(field, Iterable) else field.name
            yield getattr(self, name)

    def __len__(self):
        return len(next(iter(self.values)))

    def __repr__(self):
        # CoNLL-style: one tab-separated line per token
        rows = ['\t'.join(map(str, line)) for line in zip(*self.values)]
        return '\n'.join(rows) + '\n'
38
+
39
+
40
class Corpus(object):
    """A list of Sentences read from / written to a CoNLL-style file."""

    def __init__(self, fields, sentences):
        super(Corpus, self).__init__()

        self.fields = fields
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __repr__(self):
        return '\n'.join(str(sentence) for sentence in self)

    def __getitem__(self, index):
        return self.sentences[index]

    def __getattr__(self, name):
        # delegate unknown attributes to the sentences, yielding one value
        # per sentence (a generator, consumed by the Field/Dataset API)
        if not hasattr(self.sentences[0], name):
            raise AttributeError
        for sentence in self.sentences:
            yield getattr(sentence, name)

    def __setattr__(self, name, value):
        # `fields`/`sentences` are real attributes; anything else is
        # broadcast element-wise onto the sentences
        if name in ['fields', 'sentences']:
            self.__dict__[name] = value
        else:
            for i, sentence in enumerate(self.sentences):
                setattr(sentence, name, value[i])

    @classmethod
    def load(cls, path, fields):
        """Parse a CoNLL file into a Corpus; blank lines split sentences."""
        start, sentences = 0, []
        # unnamed columns get placeholder fields so indices still line up
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]
        with open(path, 'r') as f:
            lines = [line.strip() for line in f]
        for i, line in enumerate(lines):
            if not line:
                # transpose token rows into per-column value tuples
                values = list(zip(*[l.split('\t') for l in lines[start:i]]))
                sentences.append(Sentence(fields, values))
                start = i + 1

        return cls(fields, sentences)

    def save(self, path):
        with open(path, 'w') as f:
            f.write(f"{self}\n")
biaffine-parser-master/parser/utils/data.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from collections.abc import Iterable
4
+ from itertools import chain
5
+ from parser.utils.alg import kmeans
6
+
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from torch.utils.data import DataLoader, Dataset, Sampler
10
+
11
+
12
class TextDataLoader(DataLoader):
    """DataLoader that pads each raw batch field-wise and moves it to device."""

    def __init__(self, *args, **kwargs):
        super(TextDataLoader, self).__init__(*args, **kwargs)

        # keep the field list around for padding/indexing below
        self.fields = self.dataset.fields

    def __iter__(self):
        for raw_batch in super(TextDataLoader, self).__iter__():
            batch, device = [], 'cuda' if torch.cuda.is_available() else 'cpu'
            for data, field in zip(raw_batch, self.fields):
                if isinstance(data[0], torch.Tensor):
                    # flat field: pad to the longest sequence in the batch
                    data = pad_sequence(data, True, field.pad_index).to(device)
                elif isinstance(data[0], Iterable):
                    # composite field (e.g. BERT subwords): pad each part
                    data = [pad_sequence(f, True, field.pad_index).to(device)
                            for f in zip(*data)]
                batch.append(data)
            yield batch
30
+
31
+
32
class TextDataset(Dataset):
    """Numericalized view of a Corpus, bucketed by sentence length."""

    def __init__(self, corpus, fields, n_buckets=1):
        super(TextDataset, self).__init__()

        self.corpus = corpus
        # flatten nested field groups into a single list
        self.fields = list(chain(*[
            field if isinstance(field, Iterable) else [field]
            for field in fields if field is not None
        ]))
        for field in self.fields:
            value = field.numericalize(getattr(corpus, field.name))
            setattr(self, field.name, value)
        # NOTE: the final bucket count is roughly equal to n_buckets
        # effective length counts both the bos and eos tokens (the
        # original expression summed bool(field.bos) twice by mistake);
        # `field` deliberately refers to the last field of the loop above
        self.lengths = [len(i) + sum([bool(field.bos), bool(field.eos)])
                        for i in corpus]
        self.buckets = dict(zip(*kmeans(self.lengths, n_buckets)))

    def __getitem__(self, index):
        # yields one numericalized value per field for the indexed sentence
        for field in self.fields:
            yield getattr(self, field.name)[index]

    def __len__(self):
        return len(self.corpus)

    @property
    def loader(self):
        # set externally via `batchify`; AttributeError until then
        if hasattr(self, 'data_loader'):
            return self.data_loader
        else:
            raise AttributeError

    @loader.setter
    def loader(self, data_loader):
        self.data_loader = data_loader

    @classmethod
    def collate_fn(cls, batch):
        # regroup per-sentence generators into per-field tuples
        return (field for field in zip(*batch))
71
+
72
+
73
class TextSampler(Sampler):
    """Batch sampler drawing each batch from a single length bucket.

    Each bucket is cut into enough chunks that a batch holds roughly
    `batch_size` tokens (bucket size x sentence count / batch_size).
    """

    def __init__(self, buckets, batch_size, shuffle=False):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sizes, self.buckets = zip(*buckets.items())
        # the number of chunks per bucket, clipped to [1, len(bucket)]
        self.chunks = [
            min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
            for size, bucket in zip(self.sizes, self.buckets)
        ]

    def __iter__(self):
        # with shuffle on, randomize both bucket order and in-bucket order
        order = torch.randperm if self.shuffle else torch.arange
        for i in order(len(self.buckets)).tolist():
            bucket, n_chunks = self.buckets[i], self.chunks[i]
            # near-equal chunk sizes; `torch.chunk` may produce the wrong
            # number of chunks, hence the manual split sizes
            split_sizes = [(len(bucket) - j - 1) // n_chunks + 1
                           for j in range(n_chunks)]
            for batch in order(len(bucket)).split(split_sizes):
                yield [bucket[j] for j in batch.tolist()]

    def __len__(self):
        return sum(self.chunks)
100
+
101
+
102
def batchify(dataset, batch_size, shuffle=False):
    """Wrap a TextDataset in a bucketed TextDataLoader."""
    sampler = TextSampler(buckets=dataset.buckets,
                          batch_size=batch_size,
                          shuffle=shuffle)
    return TextDataLoader(dataset=dataset,
                          batch_sampler=sampler,
                          collate_fn=dataset.collate_fn)
biaffine-parser-master/parser/utils/embedding.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import torch
4
+
5
+
6
class Embedding(object):
    """Pretrained token-embedding table loaded from a whitespace text file."""

    def __init__(self, tokens, vectors, unk=None):
        super(Embedding, self).__init__()
        self.tokens = tokens
        # NOTE(review): only the FIRST component of each vector is kept,
        # so `self.vectors` is 1-D — verify against how WORD.embed is
        # consumed downstream before relying on this
        self.vectors = torch.tensor([v[0] for v in vectors])
        self.pretrained = {w: v for w, v in zip(tokens, vectors)}
        # NOTE(review): the `unk` argument is ignored and hard-coded to
        # '[UNK]' — confirm this matches the vocabulary in use
        self.unk = '[UNK]'

    def __len__(self):
        return len(self.tokens)

    def __contains__(self, token):
        return token in self.pretrained

    @property
    def dim(self):
        # with the 1-D `vectors` above this is the token count,
        # not the per-vector dimensionality
        return self.vectors.size(0)

    @property
    def unk_index(self):
        if self.unk is not None:
            return self.tokens.index(self.unk)
        else:
            raise AttributeError

    @classmethod
    def load(cls, path, unk=None):
        """Read `token v1 v2 ...` lines into an Embedding."""
        with open(path, 'r') as f:
            lines = [line for line in f]
        splits = [line.split() for line in lines]
        tokens, vectors = zip(*[(s[0], list(map(float, s[1:])))
                                for s in splits])

        return cls(tokens, vectors, unk=unk)
biaffine-parser-master/parser/utils/field.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from collections import Counter
4
+ from parser.utils.vocab import Vocab
5
+
6
+ import torch
7
+
8
+
9
class Field(object):
    """Describes how one annotation column is converted into tensors.

    A field optionally tokenizes, lower-cases and maps raw sequences
    through a vocabulary, prepending/appending special tokens on the way.
    """

    def __init__(self, name, pad=None, unk=None, bos=None, eos=None,
                 lower=False, use_vocab=True, tokenize=None, fn=None):
        self.name = name
        self.pad = pad
        self.unk = unk
        self.bos = bos
        self.eos = eos
        self.lower = lower
        self.use_vocab = use_vocab
        self.tokenize = tokenize
        self.fn = fn

        # The declared special tokens, in pad/unk/bos/eos order; their
        # positions here define the *_index properties below.
        self.specials = [special for special in (pad, unk, bos, eos)
                         if special is not None]

    def __repr__(self):
        params = []
        for attr in ('pad', 'unk', 'bos', 'eos'):
            value = getattr(self, attr)
            if value is not None:
                params.append(f"{attr}={value}")
        if self.lower:
            params.append(f"lower={self.lower}")
        if not self.use_vocab:
            params.append(f"use_vocab={self.use_vocab}")

        return f"({self.name}): {self.__class__.__name__}(" \
               + ", ".join(params) + ")"

    @property
    def pad_index(self):
        # Falls back to 0 when no pad token was declared.
        if self.pad is None:
            return 0
        return self.specials.index(self.pad)

    @property
    def unk_index(self):
        # Falls back to 0 when no unk token was declared.
        if self.unk is None:
            return 0
        return self.specials.index(self.unk)

    @property
    def bos_index(self):
        return self.specials.index(self.bos)

    @property
    def eos_index(self):
        return self.specials.index(self.eos)

    def transform(self, sequence):
        """Apply tokenization, lower-casing and `fn` to one sequence."""
        tokens = sequence if self.tokenize is None else self.tokenize(sequence)
        if self.lower:
            tokens = [token.lower() for token in tokens]
        if self.fn is not None:
            tokens = [self.fn(token) for token in tokens]

        return tokens

    def build(self, corpus, min_freq=1, embed=None):
        """Build the vocabulary (and optionally the embedding matrix)."""
        sequences = getattr(corpus, self.name)
        counter = Counter()
        for sequence in sequences:
            counter.update(self.transform(sequence))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
            return
        tokens = self.transform(embed.tokens)
        # if the `unk` token already exists in the pretrained vocabulary,
        # replace it with the field's own unk symbol
        if embed.unk:
            tokens[embed.unk_index] = self.unk
        self.vocab.extend(tokens)
        self.embed = torch.zeros(len(self.vocab), embed.dim)
        self.embed[self.vocab.token2id(tokens)] = embed.vectors
        # rescale so the pretrained weights have unit standard deviation
        self.embed /= torch.std(self.embed)

    def numericalize(self, sequences):
        """Turn raw sequences into a list of 1-D LongTensors of ids."""
        ids = [self.transform(sequence) for sequence in sequences]
        if self.use_vocab:
            ids = [self.vocab.token2id(sequence) for sequence in ids]
        if self.bos:
            ids = [[self.bos_index] + sequence for sequence in ids]
        if self.eos:
            ids = [sequence + [self.eos_index] for sequence in ids]

        return [torch.tensor(sequence) for sequence in ids]
104
+
105
+
106
class CharField(Field):
    """A Field over the characters of each token.

    Produces, for every sentence, a ``(n_tokens, fix_len)`` tensor of
    character ids, padded/truncated to ``fix_len`` characters per token.
    """

    def __init__(self, *args, **kwargs):
        # Maximum characters kept per token; -1 means "infer from the first
        # batch passed to numericalize". `dict.pop` with a default replaces
        # the former `if 'fix_len' in kwargs` double lookup.
        self.fix_len = kwargs.pop('fix_len', -1)
        super(CharField, self).__init__(*args, **kwargs)

    def build(self, corpus, min_freq=1, embed=None):
        """Build a character vocabulary (and optional char embeddings)."""
        sequences = getattr(corpus, self.name)
        counter = Counter(char for sequence in sequences for token in sequence
                          for char in self.transform(token))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
        else:
            tokens = self.transform(embed.tokens)
            # if the `unk` token already exists in the pretrained vocabulary,
            # replace it with the field's own unk symbol
            if embed.unk:
                tokens[embed.unk_index] = self.unk
            self.vocab.extend(tokens)
            self.embed = torch.zeros(len(self.vocab), embed.dim)
            self.embed[self.vocab.token2id(tokens)] = embed.vectors

    def numericalize(self, sequences):
        """Map each token of each sequence to a fixed-length id row."""
        sequences = [[self.transform(token) for token in sequence]
                     for sequence in sequences]
        if self.fix_len <= 0:
            # NOTE: sticky side effect — the length inferred here is stored
            # on the field and reused by every later call, so tokens longer
            # than the first batch's maximum get truncated afterwards.
            self.fix_len = max(len(token) for sequence in sequences
                               for token in sequence)
        if self.use_vocab:
            sequences = [[self.vocab.token2id(token) for token in sequence]
                         for sequence in sequences]
        if self.bos:
            sequences = [[self.vocab.token2id(self.bos)] + sequence
                         for sequence in sequences]
        if self.eos:
            sequences = [sequence + [self.vocab.token2id(self.eos)]
                         for sequence in sequences]
        # pad (with id 0, assumed to be the pad index) or truncate every
        # token's character ids to exactly fix_len entries
        sequences = [
            torch.tensor([ids[:self.fix_len] + [0] * (self.fix_len - len(ids))
                          for ids in sequence])
            for sequence in sequences
        ]

        return sequences
153
+
154
+
155
class BertField(Field):
    """A Field whose `transform` splits each token into subword pieces.

    For every sequence, yields a tuple of (flat subword-id tensor,
    per-token piece-count tensor, all-True mask over the subwords).
    """

    def numericalize(self, sequences):
        batch = []
        for sequence in sequences:
            tokens = list(sequence)
            if self.bos:
                tokens = [self.bos] + tokens
            if self.eos:
                tokens = tokens + [self.eos]
            pieces = [self.transform(token) for token in tokens]
            # a token with no pieces falls back to the pad token's pieces
            pieces = [piece or self.transform(self.pad) for piece in pieces]
            flat = sum(pieces, [])
            subword_ids = torch.tensor(flat)
            piece_lens = torch.tensor([len(piece) for piece in pieces])
            mask = torch.ones(len(flat)).ge(0)
            batch.append((subword_ids, piece_lens, mask))

        return batch