Upload 64 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- README.md +67 -0
- UD_Tamil-TTB-master/.gitignore +0 -0
- UD_Tamil-TTB-master/CONTRIBUTING.md +8 -0
- UD_Tamil-TTB-master/LICENSE.txt +7 -0
- UD_Tamil-TTB-master/README.md +103 -0
- UD_Tamil-TTB-master/eval.log +99 -0
- UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt +0 -0
- UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt +0 -0
- UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt +0 -0
- UD_Tamil-TTB-master/stats.xml +110 -0
- UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu +0 -0
- UD_Tamil-TTB-master/ta_ttb-ud-test.conllu +0 -0
- UD_Tamil-TTB-master/ta_ttb-ud-train.conllu +0 -0
- app.py +86 -0
- arc_eager.py +93 -0
- best_mapping.pth +3 -0
- biaffine-parser-master/.gitignore +22 -0
- biaffine-parser-master/.travis.yml +36 -0
- biaffine-parser-master/README.md +226 -0
- biaffine-parser-master/config.ini +30 -0
- biaffine-parser-master/data/naive3.conllx +7 -0
- biaffine-parser-master/data/ptb/tamdev.conllx +0 -0
- biaffine-parser-master/data/ptb/tamtest.conllx +0 -0
- biaffine-parser-master/data/ptb/tamtrain.conllx +0 -0
- biaffine-parser-master/exp/ptb/fields +3 -0
- biaffine-parser-master/exp/ptb/model +3 -0
- biaffine-parser-master/parser/__init__.py +5 -0
- biaffine-parser-master/parser/cmds/__init__.py +7 -0
- biaffine-parser-master/parser/cmds/cmd.py +151 -0
- biaffine-parser-master/parser/cmds/evaluate.py +49 -0
- biaffine-parser-master/parser/cmds/predict.py +49 -0
- biaffine-parser-master/parser/cmds/train.py +113 -0
- biaffine-parser-master/parser/config.py +41 -0
- biaffine-parser-master/parser/model.py +140 -0
- biaffine-parser-master/parser/modules/__init__.py +11 -0
- biaffine-parser-master/parser/modules/bert.py +62 -0
- biaffine-parser-master/parser/modules/biaffine.py +43 -0
- biaffine-parser-master/parser/modules/bilstm.py +126 -0
- biaffine-parser-master/parser/modules/char_lstm.py +30 -0
- biaffine-parser-master/parser/modules/dropout.py +60 -0
- biaffine-parser-master/parser/modules/mlp.py +28 -0
- biaffine-parser-master/parser/modules/scalar_mix.py +32 -0
- biaffine-parser-master/parser/utils/__init__.py +8 -0
- biaffine-parser-master/parser/utils/alg.py +143 -0
- biaffine-parser-master/parser/utils/common.py +6 -0
- biaffine-parser-master/parser/utils/corpus.py +88 -0
- biaffine-parser-master/parser/utils/data.py +110 -0
- biaffine-parser-master/parser/utils/embedding.py +41 -0
- biaffine-parser-master/parser/utils/field.py +172 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
biaffine-parser-master/exp/ptb/fields filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
biaffine-parser-master/exp/ptb/model filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/tnt_pos_tagger_hin.dill filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dependency-parser
|
| 2 |
+
## Dependencies :
|
| 3 |
+
* OS : Ubuntu 22.04.3 LTS
|
| 4 |
+
* Python 3.7
|
| 5 |
+
* flask 1.1.1
|
| 6 |
+
* flask-wtf 0.14.2
|
| 7 |
+
* flask-markdown 0.3
|
| 8 |
+
* nltk 3.4.5
|
| 9 |
+
* pygraphviz 1.7
|
| 10 |
+
* conllu 2.2.2
|
| 11 |
+
* scikit-learn 0.22.1
|
| 12 |
+
* dill 0.3.1.1
|
| 13 |
+
* transformers 2.1.1
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
> python3.7 -m pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Instructions for web app :
|
| 20 |
+
|
| 21 |
+
Run the following to host the app at localhost:5000
|
| 22 |
+
```bash
|
| 23 |
+
> python3.7 app.py
|
| 24 |
+
```
|
| 25 |
+
## Results :
|
| 26 |
+
Trained a model using <b>BERT</b> and parser implemented from <b>Deep Biaffine Attention for Neural Dependency Parsing</b> on Telugu UD Treebank dataset
|
| 27 |
+
|
| 28 |
+
<b>
|
| 29 |
+
train: 400 sentences
|
| 30 |
+
</b>
|
| 31 |
+
<br>
|
| 32 |
+
|
| 33 |
+
<b>
|
| 34 |
+
dev: 80 sentences
|
| 35 |
+
</b>
|
| 36 |
+
<br>
|
| 37 |
+
|
| 38 |
+
<b>
|
| 39 |
+
test: 120 sentences
|
| 40 |
+
</b>
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
### Train
|
| 44 |
+
Training the model
|
| 45 |
+
|
| 46 |
+

|
| 47 |
+
|
| 48 |
+
### Evaluate
|
| 49 |
+
|
| 50 |
+
Evaluation score after testing with Test Dataset
|
| 51 |
+
|
| 52 |
+

|
| 53 |
+
### Prediction
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
#### Entering the sentence
|
| 57 |
+
|
| 58 |
+

|
| 59 |
+
|
| 60 |
+
#### Final Result
|
| 61 |
+
|
| 62 |
+

|
| 63 |
+
|
| 64 |
+
#### Original Values
|
| 65 |
+
|
| 66 |
+

|
| 67 |
+
|
UD_Tamil-TTB-master/.gitignore
ADDED
|
File without changes
|
UD_Tamil-TTB-master/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing
|
| 2 |
+
|
| 3 |
+
Please do not make pull requests against master, any such pull requests will be
|
| 4 |
+
closed. Pull requests against the dev branch are accepted in some treebanks but
|
| 5 |
+
not in others - check the Contributing line in the README file!
|
| 6 |
+
|
| 7 |
+
For full details on the branch policy see
|
| 8 |
+
[here](http://universaldependencies.org/release_checklist.html#repository-branches).
|
UD_Tamil-TTB-master/LICENSE.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This work is licensed under the Creative Commons Attribution-NonCommercial-
|
| 2 |
+
ShareAlike 3.0 Unported License. To view a copy of this license, visit
|
| 3 |
+
|
| 4 |
+
http://creativecommons.org/licenses/by-nc-sa/3.0/
|
| 5 |
+
|
| 6 |
+
or send a letter to
|
| 7 |
+
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
|
UD_Tamil-TTB-master/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Summary
|
| 2 |
+
|
| 3 |
+
The UD Tamil treebank is based on the Tamil Dependency Treebank created at the
|
| 4 |
+
Charles University in Prague by Loganathan Ramasamy.
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Introduction
|
| 8 |
+
|
| 9 |
+
The treebank was part of
|
| 10 |
+
HamleDT, a collection of treebanks converted to the Prague dependency style
|
| 11 |
+
(since 2011). Later versions of HamleDT added a conversion to the Stanford
|
| 12 |
+
dependencies (2014) and to Universal Dependencies (HamleDT 3.0, 2015). The
|
| 13 |
+
first release of Universal Dependencies that includes this treebank is UD v1.2
|
| 14 |
+
in November 2015. It is essentially the HamleDT conversion but the data is not
|
| 15 |
+
identical to HamleDT 3.0 because the conversion procedure has been further
|
| 16 |
+
improved.
|
| 17 |
+
|
| 18 |
+
## References:
|
| 19 |
+
|
| 20 |
+
* [TamilTB](http://ufal.mff.cuni.cz/~ramasamy/tamiltb/0.1/)
|
| 21 |
+
* [HamleDT](http://ufal.mff.cuni.cz/hamledt)
|
| 22 |
+
* [Treex](http://ufal.mff.cuni.cz/treex) is the software used for conversion
|
| 23 |
+
* [Interset](http://ufal.mff.cuni.cz/interset) was used to convert POS tags and features
|
| 24 |
+
* Loganathan Ramasamy, Zdeněk Žabokrtský. 2012.
|
| 25 |
+
[Prague Dependency Style Treebank for Tamil](http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html).
|
| 26 |
+
In: *Proceedings of Eighth International Conference on Language Resources and Evaluation (LREC 2012),*
|
| 27 |
+
İstanbul, Turkey, ISBN 978-2-9517408-7-7, pp. 1888–1894.
|
| 28 |
+
|
| 29 |
+
<pre>
|
| 30 |
+
@inproceedings{ta,
|
| 31 |
+
author = {Ramasamy, Loganathan and \v{Z}abokrtsk\'{y}, Zden\v{e}k},
|
| 32 |
+
year = {2012},
|
| 33 |
+
title = {Prague Dependency Style Treebank for {Tamil}},
|
| 34 |
+
booktitle = {Proceedings of Eighth International Conference on Language Resources and Evaluation ({LREC} 2012)},
|
| 35 |
+
address = {\.{I}stanbul, Turkey},
|
| 36 |
+
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
|
| 37 |
+
isbn = {978-2-9517408-7-7},
|
| 38 |
+
pages = {1888--1894},
|
| 39 |
+
url = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html}
|
| 40 |
+
}
|
| 41 |
+
</pre>
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Source of annotations
|
| 45 |
+
|
| 46 |
+
This table summarizes the origins and checking of the various columns of the CoNLL-U data.
|
| 47 |
+
|
| 48 |
+
| Column | Status |
|
| 49 |
+
| ------ | ------ |
|
| 50 |
+
| ID | Sentence segmentation and tokenization (including cutting off certain suffixes that constitute independent syntactic words) was automatically done and then hand-corrected. |
|
| 51 |
+
| FORM | Identical to TamilTB form. |
|
| 52 |
+
| LEMMA | Gold (preprocessed and then manually corrected). |
|
| 53 |
+
| UPOSTAG | Converted automatically from XPOSTAG (via [Interset](https://ufal.mff.cuni.cz/interset)). |
|
| 54 |
+
| XPOSTAG | Gold (preprocessed and then manually corrected). |
|
| 55 |
+
| FEATS | Converted automatically from XPOSTAG (via Interset). |
|
| 56 |
+
| HEAD | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
|
| 57 |
+
| DEPREL | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
|
| 58 |
+
| DEPS | — (currently unused) |
|
| 59 |
+
| MISC | Information about token spacing restored using heuristics. Mapping between multi-word tokens and syntactic words verified against the source text. |
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Changelog
|
| 63 |
+
|
| 64 |
+
* 2023-11-15 v2.13
|
| 65 |
+
* Fixed: no Gender for numerals and particles.
|
| 66 |
+
* 2021-05-15 v2.8
|
| 67 |
+
* Distinguished acl:relcl from other acl.
|
| 68 |
+
* Added enhanced dependencies for relative clauses.
|
| 69 |
+
* 2020-05-15 v2.6
|
| 70 |
+
* Added enhanced relations with case information.
|
| 71 |
+
* 2019-05-15 v2.4
|
| 72 |
+
* Fixed some annotation errors in the original treebank, re-run the conversion.
|
| 73 |
+
* Dative and instrumental objects are now treated as oblique arguments.
|
| 74 |
+
* 2018-04-15 v2.2
|
| 75 |
+
* Repository renamed from UD_Tamil to UD_Tamil-TTB.
|
| 76 |
+
* Added enhanced representation of dependencies propagated across coordination.
|
| 77 |
+
The distinction of shared and private dependents is derived deterministically from the original Prague annotation.
|
| 78 |
+
* 2017-03-01 v2.0
|
| 79 |
+
* Converted to UD v2 guidelines.
|
| 80 |
+
* Reconsidered PRON vs. DET distinction.
|
| 81 |
+
* Improved advmod vs. obl distinction.
|
| 82 |
+
* 2016-05-15 v1.3
|
| 83 |
+
* Added Latin transliteration of lemmas and full sentences.
|
| 84 |
+
* Added orthographic words (surface tokens) and their mapping to nodes.
|
| 85 |
+
* Improved conversion of AuxY.
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
<pre>
|
| 89 |
+
=== Machine-readable metadata (DO NOT REMOVE!) ================================
|
| 90 |
+
Data available since: UD v1.2
|
| 91 |
+
License: CC BY-NC-SA 3.0
|
| 92 |
+
Includes text: yes
|
| 93 |
+
Genre: news
|
| 94 |
+
Lemmas: converted from manual
|
| 95 |
+
UPOS: converted from manual
|
| 96 |
+
XPOS: manual native
|
| 97 |
+
Features: converted from manual
|
| 98 |
+
Relations: converted from manual
|
| 99 |
+
Contributors: Ramasamy, Loganathan; Zeman, Daniel
|
| 100 |
+
Contributing: elsewhere
|
| 101 |
+
Contact: zeman@ufal.mff.cuni.cz
|
| 102 |
+
===============================================================================
|
| 103 |
+
</pre>
|
UD_Tamil-TTB-master/eval.log
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Running the following version of UD tools:
|
| 2 |
+
commit e9726a6a7d6913193d90edb45a4cb549235c5b16
|
| 3 |
+
Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
|
| 4 |
+
Date: Sat Nov 4 17:10:55 2023 +0100
|
| 5 |
+
Evaluating the following revision of UD_Tamil-TTB:
|
| 6 |
+
commit c1739c0397fd034200edaf4e403c2e4c9923dd75
|
| 7 |
+
Merge: 1392fa0 fbea79c
|
| 8 |
+
Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
|
| 9 |
+
Size: counted 9581 of 9581 words (nodes).
|
| 10 |
+
Size: min(0, log((N/1000)**2)) = 4.51956394133747.
|
| 11 |
+
Size: maximum value 13.815511 is for 1000000 words or more.
|
| 12 |
+
Split: Did not find more than 10000 training words.
|
| 13 |
+
Split: Did not find at least 10000 development words.
|
| 14 |
+
Split: Did not find at least 10000 test words.
|
| 15 |
+
Lemmas: source of annotation (from README) factor is 0.8.
|
| 16 |
+
Universal POS tags: 14 out of 17 found in the corpus.
|
| 17 |
+
Universal POS tags: source of annotation (from README) factor is 0.8.
|
| 18 |
+
Features: 8280 out of 9581 total words have one or more features.
|
| 19 |
+
Features: source of annotation (from README) factor is 0.8.
|
| 20 |
+
Universal relations: 25 out of 37 found in the corpus.
|
| 21 |
+
Universal relations: source of annotation (from README) factor is 0.8.
|
| 22 |
+
Udapi:
|
| 23 |
+
TOTAL 205
|
| 24 |
+
Udapi: found 205 bugs.
|
| 25 |
+
Udapi: worst expected case (threshold) is one bug per 10 words. There are 9581 words.
|
| 26 |
+
Genres: found 1 out of 17 known.
|
| 27 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-dev.conllu
|
| 28 |
+
[Line 9 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '11:obl:dat'
|
| 29 |
+
|
| 30 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 31 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 32 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 33 |
+
|
| 34 |
+
[Line 10 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '7:obl:dat'
|
| 35 |
+
[Line 11 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இல்:nom' in '11:obl:இல்:nom'
|
| 36 |
+
[Line 32 Sent dev-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '10:obl:com'
|
| 37 |
+
[Line 45 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
|
| 38 |
+
[Line 48 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
|
| 39 |
+
[Line 50 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '10:nmod:nom'
|
| 40 |
+
[Line 58 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '18:nmod:nom'
|
| 41 |
+
[Line 68 Sent dev-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '23:obl:loc'
|
| 42 |
+
...suppressing further errors regarding Enhanced
|
| 43 |
+
Enhanced errors: 351
|
| 44 |
+
*** FAILED *** with 351 errors
|
| 45 |
+
Exit code: 1
|
| 46 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-test.conllu
|
| 47 |
+
[Line 6 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இலிருந்து:nom' in '4:nmod:இலிருந்து:nom'
|
| 48 |
+
|
| 49 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 50 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 51 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 52 |
+
|
| 53 |
+
[Line 13 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '9:obl:dat'
|
| 54 |
+
[Line 28 Sent test-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இலிருந்து:nom' in '9:obl:இலிருந்து:nom'
|
| 55 |
+
[Line 42 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '2:nmod:nom'
|
| 56 |
+
[Line 43 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '5:obl:loc'
|
| 57 |
+
[Line 44 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
|
| 58 |
+
[Line 49 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '9:nmod:dat'
|
| 59 |
+
[Line 54 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:arg:இடம்:gen' in '15:obl:arg:இடம்:gen'
|
| 60 |
+
[Line 66 Sent test-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '5:obl:com'
|
| 61 |
+
...suppressing further errors regarding Enhanced
|
| 62 |
+
[Line 2738 Sent test-s118 Node 7]: [L3 Syntax too-many-subjects] Multiple subjects [4, 6] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
|
| 63 |
+
Enhanced errors: 483
|
| 64 |
+
Syntax errors: 1
|
| 65 |
+
*** FAILED *** with 484 errors
|
| 66 |
+
Exit code: 1
|
| 67 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-train.conllu
|
| 68 |
+
[Line 5 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:அருகே:nom' in '18:obl:அருகே:nom'
|
| 69 |
+
|
| 70 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 71 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 72 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 73 |
+
|
| 74 |
+
[Line 7 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
|
| 75 |
+
[Line 8 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
|
| 76 |
+
[Line 9 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '6:nmod:nom'
|
| 77 |
+
[Line 10 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '11:nmod:nom'
|
| 78 |
+
[Line 16 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '12:nmod:dat'
|
| 79 |
+
[Line 19 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '15:nmod:dat'
|
| 80 |
+
[Line 20 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இல்:nom' in '17:nmod:இல்:nom'
|
| 81 |
+
[Line 22 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
|
| 82 |
+
...suppressing further errors regarding Enhanced
|
| 83 |
+
[Line 4427 Sent train-s192 Node 25]: [L3 Syntax too-many-subjects] Multiple subjects [11, 17] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
|
| 84 |
+
Enhanced errors: 1922
|
| 85 |
+
Syntax errors: 1
|
| 86 |
+
*** FAILED *** with 1923 errors
|
| 87 |
+
Exit code: 1
|
| 88 |
+
Validity: 0.01
|
| 89 |
+
(weight=0.0769230769230769) * (score{features}=0.8) = 0.0615384615384615
|
| 90 |
+
(weight=0.0769230769230769) * (score{genres}=0.0588235294117647) = 0.00452488687782805
|
| 91 |
+
(weight=0.0769230769230769) * (score{lemmas}=0.8) = 0.0615384615384615
|
| 92 |
+
(weight=0.256410256410256) * (score{size}=0.327136946721963) = 0.0838812683902469
|
| 93 |
+
(weight=0.0512820512820513) * (score{split}=0.01) = 0.000512820512820513
|
| 94 |
+
(weight=0.0769230769230769) * (score{tags}=0.658823529411765) = 0.0506787330316742
|
| 95 |
+
(weight=0.307692307692308) * (score{udapi}=0.786034860661726) = 0.241856880203608
|
| 96 |
+
(weight=0.0769230769230769) * (score{udeprels}=0.540540540540541) = 0.0415800415800416
|
| 97 |
+
(TOTAL score=0.546111553673142) * (availability=1) * (validity=0.01) = 0.00546111553673142
|
| 98 |
+
STARS = 0
|
| 99 |
+
UD_Tamil-TTB 0.00546111553673142 0
|
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt
ADDED
|
Binary file (71.5 kB). View file
|
|
|
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt
ADDED
|
Binary file (71.1 kB). View file
|
|
|
UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt
ADDED
|
Binary file (39.2 kB). View file
|
|
|
UD_Tamil-TTB-master/stats.xml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<treebank>
|
| 3 |
+
<!-- tokens means "surface tokens", e.g. Spanish "vámonos" counts as one token
|
| 4 |
+
words means "syntactic words", e.g. Spanish "vámonos" is split to two words, "vamos" and "nos"
|
| 5 |
+
fused is the number of tokens that are split to two or more syntactic words
|
| 6 |
+
The words and fused elements can be omitted if no token is split to smaller syntactic words. -->
|
| 7 |
+
<size>
|
| 8 |
+
<total><sentences>600</sentences><tokens>8635</tokens><words>9581</words><fused>835</fused></total>
|
| 9 |
+
<train><sentences>400</sentences><tokens>5734</tokens><words>6329</words><fused>520</fused></train>
|
| 10 |
+
<dev><sentences>80</sentences><tokens>1129</tokens><words>1263</words><fused>121</fused></dev>
|
| 11 |
+
<test><sentences>120</sentences><tokens>1772</tokens><words>1989</words><fused>194</fused></test>
|
| 12 |
+
</size>
|
| 13 |
+
<lemmas unique="2024" /><!-- ., ,, உள், உம், என், படு, இரு, ஆகு, அவர், இந்தியா, தெரிவி, செய், ஆக, ஆன, நாடு -->
|
| 14 |
+
<forms unique="3584" /><!-- ., ,, உம், ஆக, உள்ளது, ஆன, என்று, உள்ள, அவர், வேண்டும், இந்த, பட்ட், மற்றும், அரசு, நாடுகள் -->
|
| 15 |
+
<fusions unique="620" /><!-- என்றும், இடையிலான, செயலாளராக, தெரிவிக்கப்பட்டுள்ளது, தெரிவித்துள்ளது, தெரிவித்துள்ளார், இடத்தையும், குறிப்பிடத்தக்கது, வெளியிட்டுள்ள, Kஉறித்தும், பேருக்கும், ஆதரவாக, காரணமாக, குறிப்பிடப்பட்டுள்ளது, கூறியுள்ளார் -->
|
| 16 |
+
<!-- Statistics of universal POS tags. The comments show the most frequent lemmas. -->
|
| 17 |
+
<tags unique="14">
|
| 18 |
+
<tag name="ADJ">557</tag><!-- உள், மத்திய, இரு, ஒரு, நடைபெறு, உள்ளிடு, புதிய, கடந்த, முன்னாள், வரு -->
|
| 19 |
+
<tag name="ADP">293</tag><!-- ஆகு, உடன், குறி, இலிருந்து, மீது, உள், இரு, மூலம், சார்பு, போல் -->
|
| 20 |
+
<tag name="ADV">384</tag><!-- ஆகு, இன்று, மேலும், ஆனால், பின்னர், இதனால், இது, இதுகுறித்து, ஏற்கெனவே, இதுவரை -->
|
| 21 |
+
<tag name="AUX">634</tag><!-- உள், படு, வேண்டு, இரு, வரு, கொள், இல், செய், விடு, வா -->
|
| 22 |
+
<tag name="CCONJ">46</tag><!-- மற்றும், அல்லது -->
|
| 23 |
+
<tag name="DET">120</tag><!-- இந்த, அந்த, எந்த, மிக, அதிகம், மிகவும், முழுவதும், அந்தந்த, ஒரு, குறைவு -->
|
| 24 |
+
<tag name="NOUN">2758</tag><!-- அரசு, நாடு, ஆண்டு, தலைவர், மக்கள், இடம், பேர், கட்சி, செயலாளர், முதல்வர் -->
|
| 25 |
+
<tag name="NUM">274</tag><!-- இரு, ஆயிரம், 2, லட்சம், மூன்று, 10, ஒன்று, 20, இரண்டு, ஒரு -->
|
| 26 |
+
<tag name="PART">654</tag><!-- உம், என், ஆன, ஆக, ஆகு, போது, தான், ஏ, ஆவது, ஓ -->
|
| 27 |
+
<tag name="PRON">236</tag><!-- அவர், இது, அது, தன், அனைவர், என், யார், நான், நாம், இவர் -->
|
| 28 |
+
<tag name="PROPN">1370</tag><!-- இந்தியா, அமெரிக்கா, இலங்கை, பாகிஸ்தான், சென்னை, தமிழகம், ஒபாமா, அதிமுக, காங்கிரஸ், ஜெயலலிதா -->
|
| 29 |
+
<tag name="PUNCT">1000</tag><!-- ., ,, -, :, (, ), ", ரூ, ரூ., ; -->
|
| 30 |
+
<tag name="VERB">1254</tag><!-- தெரிவி, செய், கூறு, இரு, செல், பெறு, வழங்கு, நடைபெறு, குறிப்பிடு, ஏற்படு -->
|
| 31 |
+
<tag name="X">1</tag><!-- என் -->
|
| 32 |
+
</tags>
|
| 33 |
+
<!-- Statistics of features and values. The comments show the most frequent word forms. -->
|
| 34 |
+
<feats unique="41">
|
| 35 |
+
<feat name="AdpType" value="Post" upos="ADP">288</feat><!-- ஆக, உடன், இலிருந்து, குறித்து, மீது, சார்பில், மூலம், இடம், இருந்து, இடையில் -->
|
| 36 |
+
<feat name="Animacy" value="Anim" upos="AUX,NOUN,PRON,PROPN,VERB">420</feat><!-- உள்ளனர், பேர், மக்கள், அதிகாரிகள், அனைவரும், அவர்கள், தனது, புலிகள், போலீஸார், நான் -->
|
| 37 |
+
<feat name="Case" value="Acc" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">340</feat><!-- அதை, அவர்களை, இடத்தைய், இதை, நிலையங்களை, வெடிகளை, அணையை, அரசை, ஆட்சியை, உத்தரவை -->
|
| 38 |
+
<feat name="Case" value="Com" upos="NOUN">13</feat><!-- மனிதாபிமானத்தோடு, அமைப்புடன், அவருடன், உயிருடன், சிலருடன், தங்கப்பதக்கத்துடன், துணையோடு, நம்பிக்கையோடு, நேயத்துடன், நேயத்தோடு -->
|
| 39 |
+
<feat name="Case" value="Dat" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">262</feat><!-- இந்தியாவுக்கு, மணிக்கு, அவர்களுக்கு, ஆண்டுக்கு, பேருக்க், மக்களுக்கு, அரசுக்கு, அவருக்கு, ஆண்டுகளுக்குப், கொள்வதற்க் -->
|
| 40 |
+
<feat name="Case" value="Gen" upos="NOUN,PRON,PROPN,VERB">177</feat><!-- அரசின், தனது, அவரது, ஒபாமாவின், நாடுகளின், நாட்டின், அமெரிக்காவின், அவர்களது, இதன், இந்தியாவின் -->
|
| 41 |
+
<feat name="Case" value="Ins" upos="AUX,NOUN,PART,PRON,PROPN,VERB">24</feat><!-- உள்ளதால், என்பதால், காரணத்தால், அளிக்காததால், அவர்களால், ஆகியதால், ஆனதால், இல்லாததால், எங்களால், ஒப்பந்தத்தால் -->
|
| 42 |
+
<feat name="Case" value="Loc" upos="NOUN,NUM,PRON,PROPN,VERB">487</feat><!-- நிலையில், அறிக்கையில், பகுதியில், வகையில், இந்தியாவில், கவுன்சிலில், தலைமையில், அளவில், சிறையில், சென்னையில் -->
|
| 43 |
+
<feat name="Case" value="Nom" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">2929</feat><!-- அவர், அரசு, நாடுகள், இந்தியா, தலைவர், செயலாளர், முதல்வர், பேர், ஆண்டு, மக்கள் -->
|
| 44 |
+
<feat name="Gender" value="Com" upos="AUX,NOUN,PRON,PROPN,VERB">1217</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், உள்ளனர், தெரிவித்தார், முதல்வர், பேர், மக்கள், என்றார் -->
|
| 45 |
+
<feat name="Gender" value="Masc" upos="NOUN">2</feat><!-- அற்றவன், ஆடவனின் -->
|
| 46 |
+
<feat name="Gender" value="Neut" upos="AUX,NOUN,PRON,PROPN,VERB">4042</feat><!-- உள்ளது, வேண்டும், அரசு, நாடுகள், இந்தியா, இல்லை, பட்டது, இந்திய, ஆண்டு, அமெரிக்க -->
|
| 47 |
+
<feat name="Mood" value="Cnd" upos="AUX,PART,VERB">28</feat><!-- இருந்தால், விட்டால், ஆனால், உடைத்தால், ஏற்பட்டால், கட்டினால், பட்டால், பெற்றால், வந்தால், இருப்பின் -->
|
| 48 |
+
<feat name="Mood" value="Imp" upos="VERB">1</feat><!-- இருங்கள் -->
|
| 49 |
+
<feat name="Mood" value="Ind" upos="AUX,VERB">718</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
|
| 50 |
+
<feat name="Number" value="Plur" upos="AUX,NOUN,PRON,PROPN,VERB">909</feat><!-- நாடுகள், இல்லை, உள்ளனர், பேர், மக்கள், உள்ளன, அதிகாரிகள், வருகின்றனர், அனைவரும், அவர்கள் -->
|
| 51 |
+
<feat name="Number" value="Sing" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">4395</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, இந்தியா, உள்ளார், தலைவர், பட்டது, செயலாளர், தெரிவித்தார் -->
|
| 52 |
+
<feat name="NumForm" value="Digit" upos="NUM">149</feat><!-- 2, 10, 20, 3, 80, 16, 4, 5, 50, 6 -->
|
| 53 |
+
<feat name="NumType" value="Card" upos="DET,NUM">282</feat><!-- இரு, 2, ஆயிரம், மூன்று, லட்சம், 10, 20, 3, 80, இரண்டு -->
|
| 54 |
+
<feat name="NumType" value="Ord" upos="ADJ,NUM">52</feat><!-- முதல், இரண்டாவது, 1992-ம், 1-ம், 12-ம், 125-ம், 15-ம், 21-ம், 11வத��, 12வது -->
|
| 55 |
+
<feat name="Person" value="1" upos="AUX,PRON,VERB">68</feat><!-- நான், உள்ளோம், எனது, நாம், உள்ளேன், தனக்கு, நாங்கள், போராடுவோம், எனக்கு, கொள்கிறேன் -->
|
| 56 |
+
<feat name="Person" value="2" upos="AUX,PRON,VERB">12</feat><!-- நீங்கள், அஞ்சுகிறீர்கள், இருக்கிறீர்கள், இருங்கள், உங்களுக்கு, உங்களைச், உங்கள், கவலைப்படாதீர்கள், வருகிறீர்கள், விரும்புகிறீர்கள் -->
|
| 57 |
+
<feat name="Person" value="3" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">5224</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, நாடுகள், இந்தியா, உள்ளார், இல்லை, தலைவர், பட்டது -->
|
| 58 |
+
<feat name="Polarity" value="Neg" upos="ADJ,AUX,VERB">35</feat><!-- முடியாது, கூடாது, இல்லாமல், செயல்படாமல், செய்யாத, தரா, நிறைவேற்றா, மாட்டாது, அல்லாத, எடுக்காத -->
|
| 59 |
+
<feat name="Polarity" value="Pos" upos="ADJ,ADP,ADV,AUX,NOUN,NUM,PART,VERB">2294</feat><!-- உள்ளது, என்று, உள்ள, வேண்டும், பட்ட், உள்ளார், இல்லை, என, என்ற், பட்டது -->
|
| 60 |
+
<feat name="Polite" value="Form" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">798</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், தெரிவித்தார், முதல்வர், என்றார், அமைச்சர், அதிபர், உறுப்பினர் -->
|
| 61 |
+
<feat name="PronType" value="Ind" upos="PRON">8</feat><!-- யாரும், எதுவும், யாருக்கும், யாரையும் -->
|
| 62 |
+
<feat name="PronType" value="Int" upos="PRON">6</feat><!-- யார், எத்தகையது, ஏத், யாருடைய -->
|
| 63 |
+
<feat name="PronType" value="Prs" upos="PRON">222</feat><!-- அவர், இது, அனைவரும், அவர்கள், தனது, அது, அதை, நான், அவரது, அவர்களை -->
|
| 64 |
+
<feat name="PunctType" value="Comm" upos="PUNCT">400</feat><!-- ,, -, :, (, ), ", ரூ, ரூ., ;, ’ -->
|
| 65 |
+
<feat name="PunctType" value="Peri" upos="PUNCT">600</feat><!-- . -->
|
| 66 |
+
<feat name="Reflex" value="Yes" upos="PRON">16</feat><!-- தனது, தனக்கு, தங்களது, தங்களின், தன்னைப், தமக்கு -->
|
| 67 |
+
<feat name="Tense" value="Fut" upos="ADJ,ADV,AUX,NOUN,PART,VERB">356</feat><!-- வேண்டும், நடைபெறும், ப்படும், வரும், இருக்கும், என்பது, இருப்பத், ஏற்படும், சேர்ந்தவர்கள், படும் -->
|
| 68 |
+
<feat name="Tense" value="Past" upos="ADJ,AUX,NOUN,PART,VERB">518</feat><!-- பட்டது, தெரிவித்தார், என்ற, என்றார், உள்ளிட்ட, இருந்தது, இருந்த, நடைபெற்ற, வந்த, இருந்தார் -->
|
| 69 |
+
<feat name="Tense" value="Pres" upos="ADJ,AUX,NOUN,PART,VERB">123</feat><!-- வருகின்றனர், வருகிறது, படுகிறது, தெரிகிறது, வருகின்றன, இருக்கிறது, என்கிற, தெரிவிக்கிறது, படுகின்றனர், இருக்கிற -->
|
| 70 |
+
<feat name="VerbForm" value="Fin" upos="AUX,PART,VERB">747</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
|
| 71 |
+
<feat name="VerbForm" value="Ger" upos="AUX,PART,VERB">210</feat><!-- உள்ளத், என்பது, இருப்பத், விட்டத், கூறியத், பட்டது, இருந்தத், உயிரிழந்தத், உயிரிழப்பத், உள்ளதால் -->
|
| 72 |
+
<feat name="VerbForm" value="Inf" upos="AUX,PART,VERB">476</feat><!-- என்று, என, என்ற், செய்யப், தெரிவிக்கப், செய்ய, வழங்கப், நியமிக்கப், ப்பட, குறிப்பிடத் -->
|
| 73 |
+
<feat name="VerbForm" value="Part" upos="ADJ,ADP,ADV,AUX,NOUN,PART,VERB">882</feat><!-- உள்ள, பட்ட், பட்டு, கொண்டு, தெரிவித்த், செய்து, என்ற, உள்ளிட்ட, இருந்த, நடைபெற்ற -->
|
| 74 |
+
<feat name="Voice" value="Act" upos="AUX,VERB">1616</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், கொண்டு, தெரிவித்த், செய்து, உள்ளன -->
|
| 75 |
+
<feat name="Voice" value="Pass" upos="AUX,VERB">155</feat><!-- பட்ட், பட்டது, பட்டு, ப்படும், படுகிறது, ப்பட, படவ், படுகின்றனர், பட்டதற்கு, பட்டனர் -->
|
| 76 |
+
</feats>
|
| 77 |
+
<!-- Statistics of universal dependency relations. -->
|
| 78 |
+
<deps unique="30">
|
| 79 |
+
<dep name="acl">63</dep>
|
| 80 |
+
<dep name="acl:relcl">69</dep>
|
| 81 |
+
<dep name="advcl">358</dep>
|
| 82 |
+
<dep name="advmod">401</dep>
|
| 83 |
+
<dep name="advmod:emph">231</dep>
|
| 84 |
+
<dep name="amod">549</dep>
|
| 85 |
+
<dep name="aux">608</dep>
|
| 86 |
+
<dep name="case">270</dep>
|
| 87 |
+
<dep name="cc">103</dep>
|
| 88 |
+
<dep name="ccomp">166</dep>
|
| 89 |
+
<dep name="compound">13</dep>
|
| 90 |
+
<dep name="compound:prt">1</dep>
|
| 91 |
+
<dep name="conj">236</dep>
|
| 92 |
+
<dep name="cop">1</dep>
|
| 93 |
+
<dep name="csubj">11</dep>
|
| 94 |
+
<dep name="dep">1</dep>
|
| 95 |
+
<dep name="det">114</dep>
|
| 96 |
+
<dep name="iobj">27</dep>
|
| 97 |
+
<dep name="mark">280</dep>
|
| 98 |
+
<dep name="nmod">2024</dep>
|
| 99 |
+
<dep name="nsubj">664</dep>
|
| 100 |
+
<dep name="nsubj:pass">1</dep>
|
| 101 |
+
<dep name="nummod">239</dep>
|
| 102 |
+
<dep name="obj">537</dep>
|
| 103 |
+
<dep name="obl">888</dep>
|
| 104 |
+
<dep name="obl:arg">89</dep>
|
| 105 |
+
<dep name="parataxis">5</dep>
|
| 106 |
+
<dep name="punct">1000</dep>
|
| 107 |
+
<dep name="root">600</dep>
|
| 108 |
+
<dep name="xcomp">32</dep>
|
| 109 |
+
</deps>
|
| 110 |
+
</treebank>
|
UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
UD_Tamil-TTB-master/ta_ttb-ud-test.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
UD_Tamil-TTB-master/ta_ttb-ud-train.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
try:
    # Consolidated: the original imported Flask and session twice.
    from flask import (Flask, request, render_template, redirect, url_for,
                       session, send_file)
    from flask_wtf import FlaskForm, RecaptchaField
    from wtforms import (StringField, SubmitField, RadioField, DateTimeField,
                         SelectField, TextAreaField)
    from wtforms.validators import DataRequired
    from flaskext.markdown import Markdown
    from arc_eager import Process
    import os
except ImportError as e:
    # Narrowed from `except Exception`: only missing modules are expected
    # here; anything else should propagate.  Execution still continues (as
    # before), so a later NameError will follow if Flask itself is missing.
    print(e)
    print("Some Modules are Missing")
| 19 |
+
|
| 20 |
+
app = Flask(__name__)
# Enable the `markdown` Jinja filter in templates.
Markdown(app)
# NOTE(review): hard-coded secret key is fine for a demo but should come
# from the environment in any real deployment.
app.config["SECRET_KEY"] = 'mysecretkey'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Widgets(FlaskForm):
    """Single-field form for the sentence the user wants parsed."""

    # Free-text sentence input; no validators, so empty submissions validate.
    Statement = StringField(label="STATEMENT")

    submit = SubmitField(label="Submit")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def foo(value):
    """Placeholder hook; currently unused (see the commented call in thanks())."""
    print("Work to be done")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@app.after_request
|
| 37 |
+
def add_header(r):
    """
    Add headers that force browsers and proxies not to cache responses,
    so a re-submitted sentence always shows the freshly generated image.

    Fix: the original set ``Cache-Control`` twice; the later assignment
    ('public, max-age=0') silently overwrote the stricter no-store policy
    the docstring promised.  Only the strict policy is kept.
    """
    r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
    r.headers["Pragma"] = "no-cache"
    r.headers["Expires"] = "0"
    return r
|
| 47 |
+
|
| 48 |
+
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the input form; on a valid POST, stash the sentence and redirect."""
    form = Widgets()
    if request.method == 'POST' and form.validate_on_submit():
        statement = form.Statement.data
        print(statement)
        # Hand the sentence to /thanks via the session.
        session['data'] = statement
        return redirect('/thanks')
    return render_template('home.html', form=form)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@app.route("/thanks", methods=["GET", "POST"])
def thanks():
    """Parse the sentence stored in the session and show the result page.

    Fixes:
    - Guard against a missing session value (opening /thanks directly used
      to raise KeyError and return a 500); redirect home instead.
    - Removed the dead ``ex`` example structure that was never used.
    """
    val = session.get('data')
    if val is None:
        return redirect('/')
    txt, err = Process(val)
    lines = txt.split('\n')
    # `show` hides the image when graph generation failed.
    return render_template('thanks.html', user_image='/static/process.png',
                           text=lines, show=not err)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
    # debug=True enables the reloader and interactive debugger;
    # do not leave this enabled in production.
    app.run(debug=True)
|
arc_eager.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from conllu import parse_incr, parse
|
| 2 |
+
from nltk.parse import DependencyGraph, DependencyEvaluator
|
| 3 |
+
from nltk.parse.transitionparser import TransitionParser
|
| 4 |
+
import pickle
|
| 5 |
+
import pygraphviz as pgv
|
| 6 |
+
from test_hn_pos import test_fn
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
def Process(sentence):
    """Run the full parsing pipeline on one raw sentence.

    The sentence is tokenised and POS-tagged, written out as a CoNLL-X
    skeleton, parsed by the pretrained biaffine parser (invoked as a
    subprocess), and the resulting tree is rendered to static/process.png.

    Returns:
        (txt, err): the parsed CoNLL-X text for the sentence, and a flag
        that is True when the graph image could not be generated.
    """
    # Normalise the ASCII pipe to the Devanagari danda sentence terminator.
    words = sentence.replace('|', '।').split()
    tags = test_fn(words)
    fill = '_'
    rows = []
    # zip() deliberately truncates to the shorter of words/tags, matching
    # the original behavior; `tag` itself is not written into the skeleton.
    for idx, (word, tag) in enumerate(zip(words, tags), start=1):
        # 10 CoNLL-X columns: ID, FORM, then underscore placeholders.
        rows.append('\t'.join([str(idx), word] + [fill] * 8))
    text = '\n'.join(rows)
    # The parser's loader expects more than one sentence, so duplicate it.
    text = text + '\n\n' + text
    with open('biaffine-parser-master/data/naive3.conllx', 'w') as f:
        f.write(text)
    # Run the parser in its own working directory; restore the cwd even if
    # the call raises (the original left the process stranded on failure).
    os.chdir('biaffine-parser-master')
    try:
        os.system('python3.7 run.py predict --feat=bert --fdata=data/naive3.conllx --fpred=data/naive3.conllx')
    finally:
        os.chdir('..')
    with open('biaffine-parser-master/data/naive3.conllx', 'r') as f:
        txt = f.read().split('\n\n')[0]

    err = False
    try:
        out = DependencyGraph(txt)
        G = pgv.AGraph(out.to_dot())
        G.layout(prog='dot')  # use dot
        G.draw('static/process.png')
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate.  Rendering can fail on malformed parser output.
        err = True
        txt += '''Error generating graph.\n'''
    return txt, err
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
## creates dependency graph list according to nltk library specification
|
| 51 |
+
def DepGraphList(sentenceList):
    """Convert conllu-parsed sentences into NLTK DependencyGraph objects.

    Sentences that NLTK cannot turn into a graph are skipped.  Prints the
    number of parsed and skipped sentences before returning.

    Fix: the bare ``except:`` is narrowed to ``except Exception`` so that
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    dgList = []
    parsed, skipped = 0, 0
    for sentence in sentenceList:
        # 4-column format understood by DependencyGraph:
        # FORM UPOS HEAD DEPREL (deprel upper-cased per NLTK convention).
        lines = [' '.join([token['form'], token['upostag'],
                           str(token['head']), token['deprel'].upper()])
                 for token in sentence]
        try:
            dg = DependencyGraph('\n'.join(lines))
        except Exception:
            skipped += 1
            continue
        parsed += 1
        dgList.append(dg)
    print(parsed, skipped)
    return dgList
|
| 68 |
+
|
| 69 |
+
def main():
    """Entry point: run the pipeline on a sample Hindi sentence.

    The commented lines below preserve the earlier NLTK transition-parser
    workflow (data loading, training, pickling and evaluation) for
    reference; the live path is the single Process() call.
    """
    # data_file = open('data/test.conllu', 'r', encoding='utf-8')
    # sentences = list(parse_incr(data_file))
    # training_set = DepGraphList(sentences[len(sentences)//4:])
    # test_set = DepGraphList(sentences[0:len(sentences)//4])
    # parser = TransitionParser('arc-eager')
    # parser.train(training_set, 'models/arc_eager.model')
    # with open('models/parser2.pkl', 'wb') as out:
    #     pickle.dump(parser, out)
    # with open('models/parser2.pkl', 'rb') as in_file:
    #     parser = pickle.load(in_file)
    # predictions = parser.parse(test_set, 'models/arc_eager.model')
    # print(DependencyEvaluator(predictions, test_set).eval())
    Process('राम अच्छा पुरुष है |')
    return
|
| 91 |
+
|
| 92 |
+
if __name__=='__main__':
    # Script entry point: runs the demo parse in main().
    main()
|
best_mapping.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52585379a81d5f8275ba347b5ddee6d69c196bc24fb0012713a63ec173b6312b
|
| 3 |
+
size 3523565
|
biaffine-parser-master/.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ignore data files
|
| 2 |
+
data
|
| 3 |
+
|
| 4 |
+
# ignore bash scripts
|
| 5 |
+
*.sh
|
| 6 |
+
|
| 7 |
+
# ignore experimental results
|
| 8 |
+
exp
|
| 9 |
+
results
|
| 10 |
+
|
| 11 |
+
# ignore log files
|
| 12 |
+
log*
|
| 13 |
+
|
| 14 |
+
# ignore pycache
|
| 15 |
+
__pycache__
|
| 16 |
+
|
| 17 |
+
# ignore saved model
|
| 18 |
+
*.pkl
|
| 19 |
+
*.pt
|
| 20 |
+
|
| 21 |
+
# ignore vscode
|
| 22 |
+
.vscode
|
biaffine-parser-master/.travis.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
language: python
|
| 2 |
+
|
| 3 |
+
# Setup anaconda
|
| 4 |
+
before_install:
|
| 5 |
+
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
|
| 6 |
+
- chmod +x miniconda.sh
|
| 7 |
+
- ./miniconda.sh -b -p $HOME/miniconda
|
| 8 |
+
- export PATH=$HOME/miniconda/bin:$PATH
|
| 9 |
+
- conda update --yes --quiet conda
|
| 10 |
+
- conda config --set always_yes yes
|
| 11 |
+
- conda create --quiet -n py37 python=3.7
|
| 12 |
+
- source activate py37
|
| 13 |
+
|
| 14 |
+
# Install packages
|
| 15 |
+
install:
|
| 16 |
+
- conda install --quiet pytorch=1.3.0 -c pytorch
|
| 17 |
+
- conda install --quiet flake8
|
| 18 |
+
- pip install -r requirements.txt
|
| 19 |
+
|
| 20 |
+
script:
|
| 21 |
+
- flake8 .
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
|
| 25 |
+
chmod +x miniconda.sh
|
| 26 |
+
./miniconda.sh -b -p $HOME/miniconda
|
| 27 |
+
export PATH=$HOME/miniconda/bin:$PATH
|
| 28 |
+
conda update --yes --quiet conda
|
| 29 |
+
conda config --set always_yes yes
|
| 30 |
+
conda create --quiet -n py37 python=3.7
|
| 31 |
+
source activate py37
|
| 32 |
+
|
| 33 |
+
chmod +x miniconda.sh \
|
| 34 |
+
./miniconda.sh -b -p $HOME/miniconda \
|
| 35 |
+
export PATH=$HOME/miniconda/bin:$PATH \
|
| 36 |
+
source activate py37
|
biaffine-parser-master/README.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Biaffine Parser
|
| 2 |
+
|
| 3 |
+
[](https://travis-ci.org/zysite/biaffine-parser)
|
| 4 |
+
[](https://github.com/zysite/biaffine-parser/blob/master/LICENSE)
|
| 5 |
+
[](https://github.com/zysite/biaffine-parser/stargazers)
|
| 6 |
+
[](https://github.com/zysite/biaffine-parser/network/members)
|
| 7 |
+
|
| 8 |
+
An implementation of "Deep Biaffine Attention for Neural Dependency Parsing".
|
| 9 |
+
|
| 10 |
+
Details and [hyperparameter choices](#Hyperparameters) are almost identical to those described in the paper,
|
| 11 |
+
except that we provide the Eisner rather than MST algorithm to ensure well-formedness.
|
| 12 |
+
Practically, projective decoding like Eisner is the best choice since PTB contains mostly (99.9%) projective trees.
|
| 13 |
+
|
| 14 |
+
Besides the basic implementations, we also provide other features to replace the POS tags (TAG),
|
| 15 |
+
i.e., character-level embeddings (CHAR) and BERT.
|
| 16 |
+
|
| 17 |
+
## Requirements
|
| 18 |
+
|
| 19 |
+
* `python`: 3.7.0
|
| 20 |
+
* [`pytorch`](https://github.com/pytorch/pytorch): 1.3.0
|
| 21 |
+
* [`transformers`](https://github.com/huggingface/transformers): 2.1.1
|
| 22 |
+
|
| 23 |
+
## Datasets
|
| 24 |
+
|
| 25 |
+
The model is evaluated on the Stanford Dependency conversion ([v3.3.0](https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip)) of the English Penn Treebank with POS tags predicted by [Stanford POS tagger](https://nlp.stanford.edu/software/stanford-postagger-full-2018-10-16.zip).
|
| 26 |
+
|
| 27 |
+
For all datasets, we follow the conventional data splits:
|
| 28 |
+
|
| 29 |
+
* Train: 02-21 (39,832 sentences)
|
| 30 |
+
* Dev: 22 (1,700 sentences)
|
| 31 |
+
* Test: 23 (2,416 sentences)
|
| 32 |
+
|
| 33 |
+
## Performance
|
| 34 |
+
|
| 35 |
+
| FEAT | UAS | LAS | Speed (Sents/s) |
|
| 36 |
+
| ------------- | :---: | :---: | :-------------: |
|
| 37 |
+
| TAG | 95.90 | 94.25 | 1696.22 |
|
| 38 |
+
| TAG + Eisner | 95.93 | 94.28 | 350.46 |
|
| 39 |
+
| CHAR | 95.99 | 94.38 | 1464.59 |
|
| 40 |
+
| CHAR + Eisner | 96.02 | 94.41 | 323.73 |
|
| 41 |
+
| BERT | 96.64 | 95.11 | 438.72 |
|
| 42 |
+
| BERT + Eisner | 96.65 | 95.12 | 214.68 |
|
| 43 |
+
|
| 44 |
+
Note that punctuation is ignored in all evaluation metrics for PTB.
|
| 45 |
+
|
| 46 |
+
Aside from using consistent hyperparameters, there are some keypoints that significantly affect the performance:
|
| 47 |
+
|
| 48 |
+
- Dividing the pretrained embedding by its standard-deviation
|
| 49 |
+
- Applying the same dropout mask at every recurrent timestep
|
| 50 |
+
- Jointly dropping the word and additional feature representations
|
| 51 |
+
|
| 52 |
+
For the above reasons, we may have to give up some native modules in pytorch (e.g., `LSTM` and `Dropout`),
|
| 53 |
+
and use custom ones instead.
|
| 54 |
+
|
| 55 |
+
As shown above, our results have outperformed the [official implementation](https://github.com/tdozat/Parser-v1) (95.74 and 94.08).
|
| 56 |
+
Incorporating character-level features or external embeddings like BERT can further improve the performance of the model.
|
| 57 |
+
|
| 58 |
+
## Usage
|
| 59 |
+
|
| 60 |
+
You can start the training, evaluation and prediction process by using subcommands registered in `parser.cmds`.
|
| 61 |
+
|
| 62 |
+
```sh
|
| 63 |
+
$ python run.py -h
|
| 64 |
+
usage: run.py [-h] {evaluate,predict,train} ...
|
| 65 |
+
|
| 66 |
+
Create the Biaffine Parser model.
|
| 67 |
+
|
| 68 |
+
optional arguments:
|
| 69 |
+
-h, --help show this help message and exit
|
| 70 |
+
|
| 71 |
+
Commands:
|
| 72 |
+
{evaluate,predict,train}
|
| 73 |
+
evaluate Evaluate the specified model and dataset.
|
| 74 |
+
predict Use a trained model to make predictions.
|
| 75 |
+
train Train a model.
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Before triggering the subcommands, please make sure that the data files are in CoNLL-X format.
|
| 79 |
+
If some fields are missing, you can use underscores as placeholders.
|
| 80 |
+
Below are some examples:
|
| 81 |
+
|
| 82 |
+
```sh
|
| 83 |
+
$ python run.py train -p -d=0 -f=exp/ptb.char --feat=char \
|
| 84 |
+
--ftrain=data/ptb/train.conllx \
|
| 85 |
+
--fdev=data/ptb/dev.conllx \
|
| 86 |
+
--ftest=data/ptb/test.conllx \
|
| 87 |
+
--fembed=data/glove.6B.100d.txt \
|
| 88 |
+
--unk=unk
|
| 89 |
+
|
| 90 |
+
$ python run.py evaluate -d=0 -f=exp/ptb.char --feat=char --tree \
|
| 91 |
+
--fdata=data/ptb/test.conllx
|
| 92 |
+
|
| 93 |
+
$ cat data/naive.conllx
|
| 94 |
+
1 Too _ _ _ _ _ _ _ _
|
| 95 |
+
2 young _ _ _ _ _ _ _ _
|
| 96 |
+
3 too _ _ _ _ _ _ _ _
|
| 97 |
+
4 simple _ _ _ _ _ _ _ _
|
| 98 |
+
5 , _ _ _ _ _ _ _ _
|
| 99 |
+
6 sometimes _ _ _ _ _ _ _ _
|
| 100 |
+
7 naive _ _ _ _ _ _ _ _
|
| 101 |
+
8 . _ _ _ _ _ _ _ _
|
| 102 |
+
|
| 103 |
+
$ python run.py predict -d=0 -f=exp/ptb.char --feat=char --tree \
|
| 104 |
+
--fdata=data/naive.conllx \
|
| 105 |
+
--fpred=naive.conllx
|
| 106 |
+
|
| 107 |
+
$ cat naive.conllx
|
| 108 |
+
1 Too _ _ _ _ 2 advmod _ _
|
| 109 |
+
2 young _ _ _ _ 0 root _ _
|
| 110 |
+
3 too _ _ _ _ 4 advmod _ _
|
| 111 |
+
4 simple _ _ _ _ 2 dep _ _
|
| 112 |
+
5 , _ _ _ _ 2 punct _ _
|
| 113 |
+
6 sometimes _ _ _ _ 7 advmod _ _
|
| 114 |
+
7 naive _ _ _ _ 2 dep _ _
|
| 115 |
+
8 . _ _ _ _ 2 punct _ _
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
All the optional arguments of the subcommands are as follows:
|
| 120 |
+
|
| 121 |
+
```sh
|
| 122 |
+
$ python run.py train -h
|
| 123 |
+
usage: run.py train [-h] [--buckets BUCKETS] [--punct] [--ftrain FTRAIN]
|
| 124 |
+
[--fdev FDEV] [--ftest FTEST] [--fembed FEMBED]
|
| 125 |
+
[--unk UNK] [--conf CONF] [--file FILE] [--preprocess]
|
| 126 |
+
[--device DEVICE] [--seed SEED] [--threads THREADS]
|
| 127 |
+
[--tree] [--feat {tag,char,bert}]
|
| 128 |
+
|
| 129 |
+
optional arguments:
|
| 130 |
+
-h, --help show this help message and exit
|
| 131 |
+
--buckets BUCKETS max num of buckets to use
|
| 132 |
+
--punct whether to include punctuation
|
| 133 |
+
--ftrain FTRAIN path to train file
|
| 134 |
+
--fdev FDEV path to dev file
|
| 135 |
+
--ftest FTEST path to test file
|
| 136 |
+
--fembed FEMBED path to pretrained embeddings
|
| 137 |
+
--unk UNK unk token in pretrained embeddings
|
| 138 |
+
--conf CONF, -c CONF path to config file
|
| 139 |
+
--file FILE, -f FILE path to saved files
|
| 140 |
+
--preprocess, -p whether to preprocess the data first
|
| 141 |
+
--device DEVICE, -d DEVICE
|
| 142 |
+
ID of GPU to use
|
| 143 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 144 |
+
--threads THREADS, -t THREADS
|
| 145 |
+
max num of threads
|
| 146 |
+
--tree whether to ensure well-formedness
|
| 147 |
+
--feat {tag,char,bert}
|
| 148 |
+
choices of additional features
|
| 149 |
+
|
| 150 |
+
$ python run.py evaluate -h
|
| 151 |
+
usage: run.py evaluate [-h] [--batch-size BATCH_SIZE] [--buckets BUCKETS]
|
| 152 |
+
[--punct] [--fdata FDATA] [--conf CONF] [--file FILE]
|
| 153 |
+
[--preprocess] [--device DEVICE] [--seed SEED]
|
| 154 |
+
[--threads THREADS] [--tree] [--feat {tag,char,bert}]
|
| 155 |
+
|
| 156 |
+
optional arguments:
|
| 157 |
+
-h, --help show this help message and exit
|
| 158 |
+
--batch-size BATCH_SIZE
|
| 159 |
+
batch size
|
| 160 |
+
--buckets BUCKETS max num of buckets to use
|
| 161 |
+
--punct whether to include punctuation
|
| 162 |
+
--fdata FDATA path to dataset
|
| 163 |
+
--conf CONF, -c CONF path to config file
|
| 164 |
+
--file FILE, -f FILE path to saved files
|
| 165 |
+
--preprocess, -p whether to preprocess the data first
|
| 166 |
+
--device DEVICE, -d DEVICE
|
| 167 |
+
ID of GPU to use
|
| 168 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 169 |
+
--threads THREADS, -t THREADS
|
| 170 |
+
max num of threads
|
| 171 |
+
--tree whether to ensure well-formedness
|
| 172 |
+
--feat {tag,char,bert}
|
| 173 |
+
choices of additional features
|
| 174 |
+
|
| 175 |
+
$ python run.py predict -h
|
| 176 |
+
usage: run.py predict [-h] [--batch-size BATCH_SIZE] [--fdata FDATA]
|
| 177 |
+
[--fpred FPRED] [--conf CONF] [--file FILE]
|
| 178 |
+
[--preprocess] [--device DEVICE] [--seed SEED]
|
| 179 |
+
[--threads THREADS] [--tree] [--feat {tag,char,bert}]
|
| 180 |
+
|
| 181 |
+
optional arguments:
|
| 182 |
+
-h, --help show this help message and exit
|
| 183 |
+
--batch-size BATCH_SIZE
|
| 184 |
+
batch size
|
| 185 |
+
--fdata FDATA path to dataset
|
| 186 |
+
--fpred FPRED path to predicted result
|
| 187 |
+
--conf CONF, -c CONF path to config file
|
| 188 |
+
--file FILE, -f FILE path to saved files
|
| 189 |
+
--preprocess, -p whether to preprocess the data first
|
| 190 |
+
--device DEVICE, -d DEVICE
|
| 191 |
+
ID of GPU to use
|
| 192 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 193 |
+
--threads THREADS, -t THREADS
|
| 194 |
+
max num of threads
|
| 195 |
+
--tree whether to ensure well-formedness
|
| 196 |
+
--feat {tag,char,bert}
|
| 197 |
+
choices of additional features
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
## Hyperparameters
|
| 201 |
+
|
| 202 |
+
| Param | Description | Value |
|
| 203 |
+
| :------------ | :----------------------------------------------------------- | :--------------------------------------------------------------------: |
|
| 204 |
+
| n_embed | dimension of embeddings | 100 |
|
| 205 |
+
| n_char_embed | dimension of char embeddings | 50 |
|
| 206 |
+
| n_bert_layers | number of bert layers to use | 4 |
|
| 207 |
+
| embed_dropout | dropout ratio of embeddings | 0.33 |
|
| 208 |
+
| n_lstm_hidden | dimension of lstm hidden states | 400 |
|
| 209 |
+
| n_lstm_layers | number of lstm layers | 3 |
|
| 210 |
+
| lstm_dropout | dropout ratio of lstm | 0.33 |
|
| 211 |
+
| n_mlp_arc | arc mlp size | 500 |
|
| 212 |
+
| n_mlp_rel | label mlp size | 100 |
|
| 213 |
+
| mlp_dropout | dropout ratio of mlp | 0.33 |
|
| 214 |
+
| lr | starting learning rate of training | 2e-3 |
|
| 215 |
+
| betas | hyperparameters of momentum and L2 norm | (0.9, 0.9) |
|
| 216 |
+
| epsilon | stability constant | 1e-12 |
|
| 217 |
+
| annealing | formula of learning rate annealing | <img src="https://latex.codecogs.com/gif.latex?.75^{\frac{t}{5000}}"/> |
|
| 218 |
+
| batch_size | approximate number of tokens per training update | 5000 |
|
| 219 |
+
| epochs | max number of epochs | 50000 |
|
| 220 |
+
| patience | patience for early stop | 100 |
|
| 221 |
+
| min_freq | minimum frequency of words in the training set not discarded | 2 |
|
| 222 |
+
| fix_len | fixed length of a word | 20 |
|
| 223 |
+
|
| 224 |
+
## References
|
| 225 |
+
|
| 226 |
+
* [Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734)
|
biaffine-parser-master/config.ini
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Data]
|
| 2 |
+
bert_model = 'bert-base-multilingual-uncased'
|
| 3 |
+
|
| 4 |
+
[Network]
|
| 5 |
+
n_embed = 768
|
| 6 |
+
n_char_embed = 50
|
| 7 |
+
n_bert_layers = 4
|
| 8 |
+
embed_dropout = .33
|
| 9 |
+
n_lstm_hidden = 400
|
| 10 |
+
n_lstm_layers = 3
|
| 11 |
+
lstm_dropout = .33
|
| 12 |
+
n_mlp_arc = 500
|
| 13 |
+
n_mlp_rel = 100
|
| 14 |
+
mlp_dropout = .33
|
| 15 |
+
|
| 16 |
+
[Optimizer]
|
| 17 |
+
lr = 2e-3
|
| 18 |
+
mu = .9
|
| 19 |
+
nu = .9
|
| 20 |
+
epsilon = 1e-12
|
| 21 |
+
clip = 5.0
|
| 22 |
+
decay = .75
|
| 23 |
+
decay_steps = 5000
|
| 24 |
+
|
| 25 |
+
[Run]
|
| 26 |
+
batch_size = 1000
|
| 27 |
+
epochs = 300
|
| 28 |
+
patience = 30
|
| 29 |
+
min_freq = 2
|
| 30 |
+
fix_len = 20
|
biaffine-parser-master/data/naive3.conllx
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1 அதற்கு _ _ _ _ 2 obl _ _
|
| 2 |
+
2 கடந்த _ _ _ _ 4 amod _ _
|
| 3 |
+
3 சட்டப்பேரவைத் _ _ _ _ 4 nmod _ _
|
| 4 |
+
4 தேர்தலில் _ _ _ _ 5 obl _ _
|
| 5 |
+
5 பலன்கிட்டியது _ _ _ _ 0 root _ _
|
| 6 |
+
6 . _ _ _ _ 5 punct _ _
|
| 7 |
+
|
biaffine-parser-master/data/ptb/tamdev.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/data/ptb/tamtest.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/data/ptb/tamtrain.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/exp/ptb/fields
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:333059b6bc4af3b1eca7d03cdeeef162912b6e2e5d96a0d7373d05c0beab614d
|
| 3 |
+
size 3189679
|
biaffine-parser-master/exp/ptb/model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:626032b06dd8f45c5755707990069dc0451e00a0c2e7992bfbd62d05a078dc0e
|
| 3 |
+
size 736388858
|
biaffine-parser-master/parser/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from .model import Model
|
| 4 |
+
|
| 5 |
+
__all__ = ['Model']
|
biaffine-parser-master/parser/cmds/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from .evaluate import Evaluate
|
| 4 |
+
from .predict import Predict
|
| 5 |
+
from .train import Train
|
| 6 |
+
|
| 7 |
+
__all__ = ['Evaluate', 'Predict', 'Train']
|
biaffine-parser-master/parser/cmds/cmd.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from parser.utils import Embedding
|
| 5 |
+
from parser.utils.alg import eisner
|
| 6 |
+
from parser.utils.common import bos, pad, unk
|
| 7 |
+
from parser.utils.corpus import CoNLL, Corpus
|
| 8 |
+
from parser.utils.field import BertField, CharField, Field
|
| 9 |
+
from parser.utils.fn import ispunct
|
| 10 |
+
from parser.utils.metric import Metric
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
from transformers import AutoTokenizer, BertTokenizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class CMD(object):
    """Shared base for the train/evaluate/predict subcommands.

    Calling an instance with parsed CLI ``args`` builds (or reloads) the
    vocabulary fields and the loss criterion; subclasses then reuse the
    train/evaluate/predict helpers defined here.
    """

    def __call__(self, args):
        """Prepare fields, vocabularies and the loss criterion from ``args``."""
        self.args = args
        if not os.path.exists(args.file):
            os.mkdir(args.file)
        if not os.path.exists(args.fields) or args.preprocess:
            # Build the fields from scratch and cache them at args.fields.
            print("Preprocess the data")
            self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
            if args.feat == 'char':
                self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                      fix_len=args.fix_len, tokenize=list)
            elif args.feat == 'bert':
                tokenizer = BertTokenizer.from_pretrained(args.bert_model)
                # tokenizer = AutoTokenizer.from_pretrained("sailen7/finetuning-sentiment-model-3000-samples")
                self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                      tokenize=tokenizer.encode)
            else:
                self.FEAT = Field('tags', bos=bos)
            self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
            self.REL = Field('rels', bos=bos)
            if args.feat in ('char', 'bert'):
                # char/bert features share the FORM column with words.
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.HEAD, DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                    HEAD=self.HEAD, DEPREL=self.REL)

            train = Corpus.load(args.ftrain, self.fields)
            # Pretrained-embedding loading is disabled; kept for reference:
            # if args.fembed:
            #     embed = Embedding.load(args.fembed, args.unk)
            # else:
            embed = None
            self.WORD.build(train, args.min_freq, embed)
            self.FEAT.build(train)
            self.REL.build(train)
            torch.save(self.fields, args.fields)
        else:
            # Reload the fields cached by a previous preprocessing run.
            self.fields = torch.load(args.fields)
            if args.feat in ('char', 'bert'):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
        # Vocabulary ids of punctuation tokens, used to mask evaluation.
        self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(args.device)
        self.criterion = nn.CrossEntropyLoss()

        print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
        # Expose vocab sizes and special indices to the model config.
        args.update({
            'n_words': self.WORD.vocab.n_init,
            'n_feats': len(self.FEAT.vocab),
            'n_rels': len(self.REL.vocab),
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index
        })

    def train(self, loader):
        """Run one training epoch over ``loader`` (batches of padded tensors)."""
        self.model.train()

        for words, feats, arcs, rels in loader:
            self.optimizer.zero_grad()

            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence (the BOS/root slot)
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.clip)
            self.optimizer.step()
            self.scheduler.step()

    @torch.no_grad()
    def evaluate(self, loader):
        """Return (mean loss, attachment Metric) over ``loader``."""
        self.model.eval()

        loss, metric = 0, Metric()

        for words, feats, arcs, rels in loader:
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss += self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
        loss /= len(loader)

        return loss, metric

    @torch.no_grad()
    def predict(self, loader):
        """Return (all_arcs, all_rels) predictions, one list per sentence."""
        self.model.eval()

        all_arcs, all_rels = [], []
        for words, feats in loader:
            print("words ->", words, " ", "features -> ", feats)
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            arc_scores, rel_scores = self.model(words, feats)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # Split the flat masked predictions back into per-sentence lists.
            all_arcs.extend(arc_preds[mask].split(lens))
            all_rels.extend(rel_preds[mask].split(lens))
        all_arcs = [seq.tolist() for seq in all_arcs]
        all_rels = [self.REL.vocab.id2token(seq.tolist()) for seq in all_rels]

        return all_arcs, all_rels

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask):
        """Cross-entropy of arc heads plus labels, restricted to ``mask``."""
        arc_scores, arcs = arc_scores[mask], arcs[mask]
        rel_scores, rels = rel_scores[mask], rels[mask]
        # Select the label scores at each token's gold head position.
        rel_scores = rel_scores[torch.arange(len(arcs)), arcs]
        arc_loss = self.criterion(arc_scores, arcs)
        rel_loss = self.criterion(rel_scores, rels)
        loss = arc_loss + rel_loss

        return loss

    def decode(self, arc_scores, rel_scores, mask):
        """Greedy (or Eisner, if --tree) head decoding plus label argmax."""
        if self.args.tree:
            # Eisner guarantees a well-formed projective tree.
            arc_preds = eisner(arc_scores, mask)
        else:
            arc_preds = arc_scores.argmax(-1)
        rel_preds = rel_scores.argmax(-1)
        # Pick the label scored against each token's predicted head.
        rel_preds = rel_preds.gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)

        return arc_preds, rel_preds
|
biaffine-parser-master/parser/cmds/evaluate.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Evaluate(CMD):
    """Subcommand that scores a trained parser on a held-out dataset."""

    def add_subparser(self, name, parser):
        """Register the `evaluate` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Evaluate the specified model and dataset.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')

        return subparser

    def __call__(self, args):
        """Load data and model, run evaluation, and report loss/metric."""
        super(Evaluate, self).__call__(args)

        print("Load the dataset")
        eval_corpus = Corpus.load(args.fdata, self.fields)
        eval_set = TextDataset(eval_corpus, self.fields, args.buckets)
        # attach a batched loader to the dataset
        eval_set.loader = batchify(eval_set, args.batch_size)
        print(f"{len(eval_set)} sentences, "
              f"{len(eval_set.loader)} batches, "
              f"{len(eval_set.buckets)} buckets")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Evaluate the dataset")
        tic = datetime.now()
        loss, metric = self.evaluate(eval_set.loader)
        elapsed = datetime.now() - tic
        print(f"Loss: {loss:.4f} {metric}")
        print(f"{elapsed}s elapsed, "
              f"{len(eval_set) / elapsed.total_seconds():.2f} Sents/s")
|
biaffine-parser-master/parser/cmds/predict.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Predict(CMD):
    """Subcommand that runs a trained parser over a dataset and writes
    the predicted heads and relation labels to a CoNLL-X file."""

    def add_subparser(self, name, parser):
        """Register the `predict` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Use a trained model to make predictions.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')
        subparser.add_argument('--fpred', default='pred.conllx',
                               help='path to predicted result')

        return subparser

    def __call__(self, args):
        """Load data and model, predict, and save results to args.fpred."""
        super(Predict, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        # only word and feature fields are needed for prediction
        dataset = TextDataset(corpus, [self.WORD, self.FEAT])
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        # fix: removed leftover debug print(type(dataset))
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Make predictions on the dataset")
        start = datetime.now()
        corpus.heads, corpus.rels = self.predict(dataset.loader)
        print(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        total_time = datetime.now() - start
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
|
biaffine-parser-master/parser/cmds/train.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
from parser.utils.metric import Metric
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
from torch.optim import Adam
|
| 13 |
+
from torch.optim.lr_scheduler import ExponentialLR
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Train(CMD):
    """Subcommand that trains the parser, keeping the checkpoint with the
    best dev metric, and finally reports its score on the test set."""

    def add_subparser(self, name, parser):
        """Register the `train` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Train a model.'
        )
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--ftrain', default='data/ptb/tamtrain.conllx',
                               help='path to train file')
        subparser.add_argument('--fdev', default='data/ptb/tamdev.conllx',
                               help='path to dev file')
        subparser.add_argument('--ftest', default='data/ptb/tamtest.conllx',
                               help='path to test file')
        # subparser.add_argument('--fembed', default='data/tam.txt',
        #                        help='path to pretrained embeddings')
        subparser.add_argument('--unk', default='unk',
                               help='unk token in pretrained embeddings')

        return subparser

    def __call__(self, args):
        """Full training loop with early stopping on dev performance."""
        super(Train, self).__call__(args)

        train = Corpus.load(args.ftrain, self.fields)
        dev = Corpus.load(args.fdev, self.fields)
        test = Corpus.load(args.ftest, self.fields)

        train = TextDataset(train, self.fields, args.buckets)
        dev = TextDataset(dev, self.fields, args.buckets)
        test = TextDataset(test, self.fields, args.buckets)
        # set the data loaders; only the train loader shuffles
        train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        test.loader = batchify(test, args.batch_size)
        print(f"{'train:':6} {len(train):5} sentences, "
              f"{len(train.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        # fix: dev/test lines previously reported len(train.buckets)
        print(f"{'dev:':6} {len(dev):5} sentences, "
              f"{len(dev.loader):3} batches, "
              f"{len(dev.buckets)} buckets")
        print(f"{'test:':6} {len(test):5} sentences, "
              f"{len(test.loader):3} batches, "
              f"{len(test.buckets)} buckets")

        print("Create the model")
        self.model = Model(args).load_pretrained(self.WORD.embed)
        print(f"{self.model}\n")
        self.model = self.model.to(args.device)
        if torch.cuda.device_count() > 1:
            print("GPU")
            self.model = nn.DataParallel(self.model)
        self.optimizer = Adam(self.model.parameters(),
                              args.lr,
                              (args.mu, args.nu),
                              args.epsilon)
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1/args.decay_steps))

        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            self.train(train.loader)

            print(f"Epoch {epoch} / {args.epochs}:")
            loss, train_metric = self.evaluate(train.loader)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = self.evaluate(dev.loader)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = self.evaluate(test.loader)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience:
                best_e, best_metric = epoch, dev_metric
                # unwrap nn.DataParallel before saving, if wrapped
                if hasattr(self.model, 'module'):
                    self.model.module.save(args.model)
                else:
                    self.model.save(args.model)
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            # early stopping: no dev improvement for `patience` epochs
            if epoch - best_e >= args.patience:
                break
        self.model = Model.load(args.model)
        loss, metric = self.evaluate(test.loader)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
|
biaffine-parser-master/parser/config.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from ast import literal_eval
|
| 4 |
+
from configparser import ConfigParser
|
| 5 |
+
from argparse import Namespace
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Config(ConfigParser):
    """ConfigParser wrapper that exposes every option as an attribute.

    All option values in all sections are parsed with
    ``ast.literal_eval`` and stored on an internal ``Namespace``, so
    ``cfg.n_embed`` returns a Python value, not a string.
    """

    def __init__(self, path):
        super(Config, self).__init__()

        self.read(path)
        self.namespace = Namespace()
        parsed = {}
        for section in self.sections():
            for option, raw in self.items(section):
                # literal_eval turns "100" -> 100, "'char'" -> 'char', ...
                parsed[option] = literal_eval(raw)
        self.update(parsed)

    def __repr__(self):
        divider = "-" * 15 + "-+-" + "-" * 25 + "\n"
        rows = [divider, f"{'Param':15} | {'Value':^25}\n", divider]
        for option, value in vars(self.namespace).items():
            rows.append(f"{option:15} | {str(value):^25}\n")
        rows.append(divider)

        return "".join(rows)

    def __getattr__(self, attr):
        # fall back to the namespace for unknown attributes
        return getattr(self.namespace, attr)

    def __getstate__(self):
        return vars(self)

    def __setstate__(self, state):
        self.__dict__.update(state)

    def update(self, kwargs):
        """Store every (name, value) pair on the namespace; returns self."""
        for option, value in kwargs.items():
            setattr(self.namespace, option, value)

        return self
|
biaffine-parser-master/parser/model.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules import CHAR_LSTM, MLP, BertEmbedding, Biaffine, BiLSTM
|
| 4 |
+
from parser.modules.dropout import IndependentDropout, SharedDropout
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence,
|
| 9 |
+
pad_sequence)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Model(nn.Module):
    """Biaffine-attention dependency parser.

    The encoder concatenates word embeddings with feature embeddings
    (char-LSTM, BERT subwords, or a plain embedding table), runs a
    BiLSTM, and two biaffine layers score all head-dependent pairs
    (arcs) and all labels per pair (relations).
    """

    def __init__(self, args):
        super(Model, self).__init__()

        self.args = args
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                       embedding_dim=args.n_embed)
        # feature embeddings: characters, BERT subwords, or tag indices
        if args.feat == 'char':
            self.feat_embed = CHAR_LSTM(n_chars=args.n_feats,
                                        n_embed=args.n_char_embed,
                                        n_out=args.n_embed)
        elif args.feat == 'bert':
            self.feat_embed = BertEmbedding(model=args.bert_model,
                                            n_layers=args.n_bert_layers,
                                            n_out=args.n_embed)
        else:
            self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                           embedding_dim=args.n_embed)
        self.embed_dropout = IndependentDropout(p=args.embed_dropout)

        # the word-lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed*2,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

        # MLPs project LSTM states into separate arc/rel spaces for
        # heads (h) and dependents (d)
        self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)
        self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=args.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                                 n_out=args.n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.pad_index = args.pad_index
        self.unk_index = args.unk_index

    def load_pretrained(self, embed=None):
        """Attach pretrained word embeddings.

        The learned word table is zero-initialized so training starts
        from the pretrained values alone.
        NOTE(review): `forward` concatenates `pretrained(words)` to the
        learned embedding (dim=2), which changes the encoder input width
        — confirm the pretrained dim matches what BiLSTM expects.
        """
        if embed is not None:
            self.pretrained = nn.Embedding.from_pretrained(embed)
            nn.init.zeros_(self.word_embed.weight)

        return self

    def forward(self, words, feats):
        """Score arcs and relations for a batch.

        Args:
            words: [batch_size, seq_len] padded word indices.
            feats: feature input; layout depends on ``args.feat``
                (char index matrix, BERT triple, or tag indices).

        Returns:
            s_arc: [batch_size, seq_len, seq_len] head scores
                (padded head positions set to -inf).
            s_rel: [batch_size, seq_len, seq_len, n_rels] label scores.
        """
        batch_size, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        lens = mask.sum(dim=1)
        # set the indices larger than num_embeddings to unk_index
        ext_mask = words.ge(self.word_embed.num_embeddings)
        ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed = torch.cat((word_embed, self.pretrained(words)), dim=2)
        if self.args.feat == 'char':
            # fix: removed leftover debug print(mask.shape)
            feat_embed = self.feat_embed(feats[mask])
            feat_embed = pad_sequence(feat_embed.split(lens.tolist()), True)
        elif self.args.feat == 'bert':
            feat_embed = self.feat_embed(*feats)
        else:
            feat_embed = self.feat_embed(feats)
        word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), dim=-1)

        # pack_padded_sequence requires lengths on the CPU
        lens = lens.to('cpu')
        x = pack_padded_sequence(embed, lens, True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
        # set the scores that exceed the length of each sentence to -inf
        s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))

        return s_arc, s_rel

    @classmethod
    def load(cls, path):
        """Restore a model (and its pretrained embeddings) from `path`."""
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state = torch.load(path, map_location=device)
        model = cls(state['args'])
        model.load_pretrained(state['pretrained'])
        # strict=False: pretrained embeddings are restored separately above
        model.load_state_dict(state['state_dict'], False)
        model.to(device)

        return model

    def save(self, path):
        """Save args and weights; the (frozen) pretrained embedding table
        is stored separately from state_dict."""
        state_dict, pretrained = self.state_dict(), None
        if hasattr(self, 'pretrained'):
            pretrained = state_dict.pop('pretrained.weight')
        state = {
            'args': self.args,
            'state_dict': state_dict,
            'pretrained': pretrained
        }
        torch.save(state, path)
|
biaffine-parser-master/parser/modules/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from . import dropout
|
| 4 |
+
from .bert import BertEmbedding
|
| 5 |
+
from .biaffine import Biaffine
|
| 6 |
+
from .bilstm import BiLSTM
|
| 7 |
+
from .char_lstm import CHAR_LSTM
|
| 8 |
+
from .mlp import MLP
|
| 9 |
+
|
| 10 |
+
__all__ = ['CHAR_LSTM', 'MLP', 'BertEmbedding',
|
| 11 |
+
'Biaffine', 'BiLSTM', 'dropout']
|
biaffine-parser-master/parser/modules/bert.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from transformers import BertModel
|
| 6 |
+
|
| 7 |
+
from .scalar_mix import ScalarMix
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BertEmbedding(nn.Module):
    """Word-level embeddings from a (by default frozen) BERT encoder.

    The last `n_layers` hidden states are combined with a ScalarMix,
    subword vectors are mean-pooled per word, and the result is
    projected to `n_out` dimensions.
    """

    def __init__(self, model, n_layers, n_out, requires_grad=False):
        super(BertEmbedding, self).__init__()

        # fix: removed an unused hard-coded `proxies` dict
        # output_hidden_states=True makes the model also return the
        # per-layer activations consumed by ScalarMix below
        self.bert = BertModel.from_pretrained(model, output_hidden_states=True)

        self.bert = self.bert.requires_grad_(requires_grad)
        self.n_layers = n_layers
        self.n_out = n_out
        self.requires_grad = requires_grad
        self.hidden_size = self.bert.config.hidden_size

        self.scalar_mix = ScalarMix(n_layers)
        self.projection = nn.Linear(self.hidden_size, n_out, False)

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += f"n_layers={self.n_layers}, n_out={self.n_out}"
        if self.requires_grad:
            s += f", requires_grad={self.requires_grad}"
        s += ')'

        return s

    def forward(self, subwords, bert_lens, bert_mask):
        """Embed a batch of subword sequences at the word level.

        Args:
            subwords: subword id tensor fed to BERT.
            bert_lens: [batch_size, seq_len] number of subwords per word
                (0 marks padding words).
            bert_mask: attention mask over `subwords`.

        Returns:
            [batch_size, seq_len, n_out] word embeddings.
        """
        batch_size, seq_len = bert_lens.shape
        mask = bert_lens.gt(0)

        if not self.requires_grad:
            self.bert.eval()
        # fix: a single forward pass through BERT — the previous version
        # ran the encoder twice and discarded the first result
        # NOTE(review): assumes the legacy tuple return format
        # (sequence_output, pooled_output, hidden_states); confirm for
        # the installed transformers version
        _, _, bert = self.bert(subwords, attention_mask=bert_mask)
        bert = bert[-self.n_layers:]
        bert = self.scalar_mix(bert)
        # regroup subword vectors by word and mean-pool each group
        bert = bert[bert_mask].split(bert_lens[mask].tolist())
        bert = torch.stack([i.mean(0) for i in bert])
        bert_embed = bert.new_zeros(batch_size, seq_len, self.hidden_size)
        bert_embed = bert_embed.masked_scatter_(mask.unsqueeze(-1), bert)
        bert_embed = self.projection(bert_embed)

        return bert_embed
|
biaffine-parser-master/parser/modules/biaffine.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Biaffine(nn.Module):
    """Biaffine scorer: s(x, y) = x^T W y with optional bias features.

    With ``bias_x``/``bias_y`` a constant 1 is appended to the inputs,
    turning the bilinear form into a full (bi)affine one.
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super(Biaffine, self).__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # one (n_in+bias_x) x (n_in+bias_y) bilinear form per channel
        self.weight = nn.Parameter(torch.Tensor(n_out,
                                                n_in + bias_x,
                                                n_in + bias_y))
        self.reset_parameters()

    def extra_repr(self):
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")

        return ", ".join(parts)

    def reset_parameters(self):
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        if self.bias_x:
            ones = torch.ones_like(x[..., :1])
            x = torch.cat((x, ones), -1)
        if self.bias_y:
            ones = torch.ones_like(y[..., :1])
            y = torch.cat((y, ones), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # drop the channel dimension when n_out == 1
        return scores.squeeze(1)
|
biaffine-parser-master/parser/modules/bilstm.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules.dropout import SharedDropout
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from torch.nn.modules.rnn import apply_permutation
|
| 8 |
+
from torch.nn.utils.rnn import PackedSequence
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BiLSTM(nn.Module):
    """Multi-layer bidirectional LSTM over ``PackedSequence`` input.

    Built from explicit ``LSTMCell``s so that a variational (shared)
    dropout mask can be applied to the recurrent hidden states, which
    ``nn.LSTM`` does not support.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
        super(BiLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.f_cells = nn.ModuleList()
        self.b_cells = nn.ModuleList()
        for _ in range(self.num_layers):
            self.f_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            self.b_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            # layers above the first consume both directions' outputs
            input_size = hidden_size * 2

        self.reset_parameters()

    def __repr__(self):
        desc = f"{self.input_size}, {self.hidden_size}"
        if self.num_layers > 1:
            desc += f", num_layers={self.num_layers}"
        if self.dropout > 0:
            desc += f", dropout={self.dropout}"

        return f"{self.__class__.__name__}({desc})"

    def reset_parameters(self):
        for param in self.parameters():
            if len(param.shape) > 1:
                # weight matrices: orthogonal init
                nn.init.orthogonal_(param)
            else:
                # bias vectors: zero init
                nn.init.zeros_(param)

    def permute_hidden(self, hx, permutation):
        """Reorder (h, c) along the batch dim; no-op when unsorted."""
        if permutation is None:
            return hx

        return (apply_permutation(hx[0], permutation),
                apply_permutation(hx[1], permutation))

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        """Run one direction of one layer over a packed batch.

        `x` is the packed data split per time step; `batch_sizes` gives
        the (non-increasing) number of live sequences at each step.
        """
        init = state = hx
        finals, outputs = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            # one recurrent dropout mask per sequence, reused every step
            drop_mask = SharedDropout.get_mask(init[0], self.dropout)

        for t in steps:
            prev_size, cur_size = len(state[0]), batch_sizes[t]
            if prev_size < cur_size:
                # batch grows (reverse pass): append fresh initial states
                state = [torch.cat((h, ih[prev_size:cur_size]))
                         for h, ih in zip(state, init)]
            else:
                # batch shrinks: retire states of finished sequences
                finals.append([h[cur_size:] for h in state])
                state = [h[:cur_size] for h in state]
            state = list(cell(x[t], state))
            outputs.append(state[0])
            if self.training:
                state[0] = state[0] * drop_mask[:cur_size]
        if reverse:
            finals = state
            outputs.reverse()
        else:
            finals.append(state)
            finals = [torch.cat(h) for h in zip(*reversed(finals))]
        outputs = torch.cat(outputs)

        return outputs, finals

    def forward(self, sequence, hx=None):
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            init = x.new_zeros(self.num_layers * 2, batch_size,
                               self.hidden_size)
            h, c = init, init
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        # [num_layers, directions, batch, hidden]
        h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
        c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

        for i in range(self.num_layers):
            x = torch.split(x, batch_sizes)
            if self.training:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [chunk * mask[:len(chunk)] for chunk in x]
            x_f, (h_f, c_f) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                 hx=(h[i, 1], c[i, 1]),
                                                 cell=self.b_cells[i],
                                                 batch_sizes=batch_sizes,
                                                 reverse=True)
            x = torch.cat((x_f, x_b), -1)
            h_n.append(torch.stack((h_f, h_b)))
            c_n.append(torch.stack((c_f, c_b)))
        x = PackedSequence(x,
                           sequence.batch_sizes,
                           sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx
|
biaffine-parser-master/parser/modules/char_lstm.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn.utils.rnn import pack_padded_sequence
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CHAR_LSTM(nn.Module):
    """Word representations from a character-level bidirectional LSTM.

    Each word's characters are embedded and run through a one-layer
    BiLSTM; the final forward and backward states are concatenated to
    give an ``n_out``-dimensional word vector.
    """

    def __init__(self, n_chars, n_embed, n_out):
        super(CHAR_LSTM, self).__init__()

        # character embedding lookup (index 0 is treated as padding)
        self.embed = nn.Embedding(num_embeddings=n_chars,
                                  embedding_dim=n_embed)
        # each direction contributes n_out // 2 dimensions
        self.lstm = nn.LSTM(input_size=n_embed,
                            hidden_size=n_out//2,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        # positions holding real characters (0 marks padding)
        char_mask = x.gt(0)
        word_lens = char_mask.sum(dim=1)

        packed = pack_padded_sequence(self.embed(x), word_lens, True, False)
        _, (hidden, _) = self.lstm(packed)
        # concatenate the final forward and backward hidden states
        return torch.cat(torch.unbind(hidden), dim=-1)
|
biaffine-parser-master/parser/modules/dropout.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SharedDropout(nn.Module):
    """Dropout with a single mask shared across the time dimension.

    One Bernoulli mask is sampled per sequence (from the first time
    step) and applied, rescaled, to every step — the variational
    dropout of Gal & Ghahramani.
    """

    def __init__(self, p=0.5, batch_first=True):
        super(SharedDropout, self).__init__()

        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        desc = f"p={self.p}"
        if self.batch_first:
            desc += f", batch_first={self.batch_first}"

        return desc

    def forward(self, x):
        if not self.training:
            return x
        # sample the mask from a single time step ...
        template = x[:, 0] if self.batch_first else x[0]
        mask = self.get_mask(template, self.p)
        # ... and broadcast it (in place) over all steps
        x *= mask.unsqueeze(1) if self.batch_first else mask

        return x

    @staticmethod
    def get_mask(x, p):
        # keep-mask rescaled by 1/(1-p) to preserve the expectation
        return x.new_empty(x.shape).bernoulli_(1 - p) / (1 - p)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class IndependentDropout(nn.Module):
    """Token-level dropout over several aligned inputs with rescaling.

    Each input gets its own per-token mask; where only some inputs survive,
    the survivors are scaled up so the expected total stays constant.
    """

    def __init__(self, p=0.5):
        super(IndependentDropout, self).__init__()

        self.p = p

    def extra_repr(self):
        return f"p={self.p}"

    def forward(self, *items):
        if self.training:
            masks = [item.new_empty(item.shape[:2]).bernoulli_(1 - self.p)
                     for item in items]
            total = sum(masks)
            # scale survivors; max with ones avoids division by zero
            scale = len(items) / total.max(torch.ones_like(total))
            items = [item * (mask * scale).unsqueeze(dim=-1)
                     for item, mask in zip(items, masks)]

        return items
|
biaffine-parser-master/parser/modules/mlp.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules.dropout import SharedDropout
|
| 4 |
+
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class MLP(nn.Module):
    """One linear projection followed by LeakyReLU and shared dropout."""

    def __init__(self, n_in, n_hidden, dropout=0):
        super(MLP, self).__init__()

        self.linear = nn.Linear(n_in, n_hidden)
        self.activation = nn.LeakyReLU(negative_slope=0.1)
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def reset_parameters(self):
        # orthogonal weights keep activations well-conditioned at init
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        return self.dropout(self.activation(self.linear(x)))
|
biaffine-parser-master/parser/modules/scalar_mix.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ScalarMix(nn.Module):
    """Learned scalar mixture of layer representations (ELMo-style).

    Computes ``gamma * sum_k softmax(w)_k * h_k`` over a list of equally
    shaped tensors, with optional dropout on the normalized weights.
    """

    def __init__(self, n_layers, dropout=0):
        super(ScalarMix, self).__init__()

        self.n_layers = n_layers
        # one learnable weight per layer plus a global scale
        self.weights = nn.Parameter(torch.zeros(n_layers))
        self.gamma = nn.Parameter(torch.tensor([1.0]))
        # fixed: the raw float was first stored in self.dropout and then
        # immediately overwritten with the module; the dead store is removed
        self.dropout = nn.Dropout(dropout)

    def extra_repr(self):
        s = f"n_layers={self.n_layers}"
        if self.dropout.p > 0:
            s += f", dropout={self.dropout.p}"

        return s

    def forward(self, tensors):
        """Mix `tensors` (an iterable of same-shaped tensors) into one."""
        normed_weights = self.dropout(self.weights.softmax(-1))
        weighted_sum = sum(w * h for w, h in zip(normed_weights, tensors))

        return self.gamma * weighted_sum
|
biaffine-parser-master/parser/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from . import corpus, data, field, fn, metric
|
| 4 |
+
from .embedding import Embedding
|
| 5 |
+
from .vocab import Vocab
|
| 6 |
+
|
| 7 |
+
__all__ = ['Corpus', 'Embedding', 'Vocab',
|
| 8 |
+
'corpus', 'data', 'field', 'fn', 'metric']
|
biaffine-parser-master/parser/utils/alg.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def kmeans(x, k):
    """Cluster the values of `x` into `k` buckets with 1-d k-means.

    Args:
        x: an iterable of numbers (here: sentence lengths).
        k: the desired number of clusters.

    Returns:
        centroids: the mean value of each non-empty cluster.
        clusters: for each centroid, the indices into `x` assigned to it.
    """
    x = torch.tensor(x, dtype=torch.float)
    # count the frequency of each datapoint
    d, indices, f = x.unique(return_inverse=True, return_counts=True)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = d[torch.randperm(len(d))[:k]], None
    # assign labels to each datapoint based on centroids
    dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not c.equal(old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that to the empty one
        for i in range(k):
            if not y.eq(i).any():
                mask = y.eq(torch.arange(k).unsqueeze(-1))
                lens = mask.sum(dim=-1)
                biggest = mask[lens.argmax()].nonzero().view(-1)
                farthest = dists[biggest].argmax()
                y[biggest[farthest]] = i
        mask = y.eq(torch.arange(k).unsqueeze(-1))
        # update the centroids
        c, old = (total * mask).sum(-1) / (f * mask).sum(-1), c
        # re-assign all datapoints to clusters
        dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, assigned = y[indices], y.unique().tolist()
    # get the centroids of the assigned clusters
    centroids = c[assigned].tolist()
    # map all values of datapoints to buckets
    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]

    return centroids, clusters
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def eisner(scores, mask):
    """First-order Eisner algorithm for projective dependency decoding.

    Args:
        scores: [batch_size, seq_len, seq_len] arc scores.
            NOTE(review): after the permute below they are indexed on the
            two token dimensions first and batch last; the exact arc
            orientation should be confirmed against the biaffine output.
        mask: [batch_size, seq_len] mask of real tokens.

    Returns:
        a [batch_size, max_len] tensor of predicted head positions,
        padded with `pad_sequence`.
    """
    lens = mask.sum(1)
    batch_size, seq_len, _ = scores.shape
    scores = scores.permute(2, 1, 0)
    # s_i/s_c: scores of incomplete/complete spans; p_i/p_c: backpointers
    s_i = torch.full_like(scores, float('-inf'))
    s_c = torch.full_like(scores, float('-inf'))
    p_i = scores.new_zeros(seq_len, seq_len, batch_size).long()
    p_c = scores.new_zeros(seq_len, seq_len, batch_size).long()
    s_c.diagonal().fill_(0)

    # bottom-up over span widths w
    for w in range(1, seq_len):
        # n: the number of spans of width w
        n = seq_len - w
        starts = p_i.new_tensor(range(n)).unsqueeze(0)
        # ilr = C(i->r) + C(j->r+1)
        ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
        # [batch_size, n, w]
        ilr = ilr.permute(2, 0, 1)
        il = ilr + scores.diagonal(-w).unsqueeze(-1)
        # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j
        il_span, il_path = il.max(-1)
        s_i.diagonal(-w).copy_(il_span)
        p_i.diagonal(-w).copy_(il_path + starts)
        ir = ilr + scores.diagonal(w).unsqueeze(-1)
        # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j
        ir_span, ir_path = ir.max(-1)
        s_i.diagonal(w).copy_(ir_span)
        p_i.diagonal(w).copy_(ir_path + starts)

        # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
        cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
        cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
        s_c.diagonal(-w).copy_(cl_span)
        p_c.diagonal(-w).copy_(cl_path + starts)
        # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
        cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
        cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
        s_c.diagonal(w).copy_(cr_span)
        # forbid attaching to ROOT unless the span covers the whole sentence
        s_c[0, w][lens.ne(w)] = float('-inf')
        p_c.diagonal(w).copy_(cr_path + starts + 1)

    # recover per-sentence head arrays from the backpointer tables
    predicts = []
    p_c = p_c.permute(2, 0, 1).cpu()
    p_i = p_i.permute(2, 0, 1).cpu()
    for i, length in enumerate(lens.tolist()):
        heads = p_c.new_ones(length + 1, dtype=torch.long)
        backtrack(p_i[i], p_c[i], heads, 0, length, True)
        predicts.append(heads.to(mask.device))

    return pad_sequence(predicts, True)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def backtrack(p_i, p_c, heads, i, j, complete):
    """Recursively recover head assignments from Eisner backpointers.

    `p_i`/`p_c` are the incomplete/complete backpointer tables; `heads`
    is filled in place with the head of each position.
    """
    if i == j:
        return
    if complete:
        # split the complete span at its recorded backpointer
        split = p_c[i, j]
        backtrack(p_i, p_c, heads, i, split, False)
        backtrack(p_i, p_c, heads, split, j, True)
        return
    # an incomplete span fixes the arc i -> j
    split = p_i[i, j]
    heads[j] = i
    lo, hi = sorted((i, j))
    backtrack(p_i, p_c, heads, lo, split, True)
    backtrack(p_i, p_c, heads, hi, split + 1, True)
|
| 111 |
+
|
| 112 |
+
def stripe(x, n, w, offset=(0, 0), dim=1):
    r'''Returns a diagonal stripe of the tensor without copying.

    Parameters:
        x (Tensor): the input tensor with 2 or more dims.
        n (int): the length of the stripe.
        w (int): the width of the stripe.
        offset (tuple): the offset of the first two dims.
        dim (int): 0 if returns a horizontal stripe; 1 else.

    Example::
    >>> x = torch.arange(25).view(5, 5)
    >>> x
    tensor([[ 0,  1,  2,  3,  4],
            [ 5,  6,  7,  8,  9],
            [10, 11, 12, 13, 14],
            [15, 16, 17, 18, 19],
            [20, 21, 22, 23, 24]])
    >>> stripe(x, 2, 3, (1, 1))
    tensor([[ 6,  7,  8],
            [12, 13, 14]])
    >>> stripe(x, 2, 3, dim=0)
    tensor([[ 0,  5, 10],
            [ 6, 11, 16]])
    '''
    x = x.contiguous()
    seq_len = x.size(1)
    # size (in elements) of one trailing cell, e.g. the batch dim of s_c
    numel = x[0, 0].numel()
    stride = list(x.stride())
    # stepping one row advances down the main diagonal
    stride[0] = (seq_len + 1) * numel
    # stepping one column moves right (dim=1) or down (dim=0)
    stride[1] = (seq_len if dim == 0 else 1) * numel
    row_off, col_off = offset
    return x.as_strided(size=(n, w, *x.shape[2:]),
                        stride=stride,
                        storage_offset=(row_off * seq_len + col_off) * numel)
|
biaffine-parser-master/parser/utils/common.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-

# Special tokens shared by the parser's vocabularies.
pad = '<pad>'  # padding token
unk = '<unk>'  # unknown / out-of-vocabulary token
bos = '<bos>'  # beginning-of-sentence token
eos = '<eos>'  # end-of-sentence token
|
biaffine-parser-master/parser/utils/corpus.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections import namedtuple
|
| 4 |
+
from collections.abc import Iterable
|
| 5 |
+
from parser.utils.field import Field
|
| 6 |
+
|
| 7 |
+
# One CoNLL-X row: the ten standard columns; unspecified columns are None.
CoNLL = namedtuple(typename='CoNLL',
                   field_names=['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                                'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL'],
                   defaults=[None]*10)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Sentence(object):
    """A single sentence whose per-field value columns live as attributes."""

    def __init__(self, fields, values):
        for field, value in zip(fields, values):
            if isinstance(field, Iterable):
                # a group of sub-fields shares one column of raw values
                for sub in field:
                    setattr(self, sub.name, value)
            else:
                setattr(self, field.name, value)
        self.fields = fields

    @property
    def values(self):
        """Yield the raw value column of every field, in field order."""
        for field in self.fields:
            if isinstance(field, Iterable):
                yield getattr(self, field[0].name)
            else:
                yield getattr(self, field.name)

    def __len__(self):
        return len(next(iter(self.values)))

    def __repr__(self):
        rows = ('\t'.join(map(str, token)) for token in zip(*self.values))
        return '\n'.join(rows) + '\n'
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class Corpus(object):
    """A list of Sentences plus the fields that describe their columns."""

    def __init__(self, fields, sentences):
        super(Corpus, self).__init__()

        self.fields = fields
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __repr__(self):
        return '\n'.join(str(sentence) for sentence in self)

    def __getitem__(self, index):
        return self.sentences[index]

    def __getattr__(self, name):
        # Fixed: removed a leftover debug `print(name)`.
        # NOTE: this is a generator function, so `corpus.attr` yields the
        # attribute lazily, one sentence at a time.
        if not hasattr(self.sentences[0], name):
            raise AttributeError
        for sentence in self.sentences:
            yield getattr(sentence, name)

    def __setattr__(self, name, value):
        # 'fields'/'sentences' are real attributes; any other assignment is
        # scattered element-wise over the sentences
        if name in ['fields', 'sentences']:
            self.__dict__[name] = value
        else:
            for i, sentence in enumerate(self.sentences):
                setattr(sentence, name, value[i])

    @classmethod
    def load(cls, path, fields):
        """Read a CoNLL-style file; blank lines separate sentences."""
        start, sentences = 0, []
        # placeholder columns get an anonymous Field named by position
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]
        with open(path, 'r') as f:
            lines = [line.strip() for line in f]
            for i, line in enumerate(lines):
                if not line:
                    values = list(zip(*[l.split('\t') for l in lines[start:i]]))
                    sentences.append(Sentence(fields, values))
                    start = i + 1

        return cls(fields, sentences)

    def save(self, path):
        """Write the corpus back out in its textual form."""
        with open(path, 'w') as f:
            f.write(f"{self}\n")
|
biaffine-parser-master/parser/utils/data.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections.abc import Iterable
|
| 4 |
+
from itertools import chain
|
| 5 |
+
from parser.utils.alg import kmeans
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 9 |
+
from torch.utils.data import DataLoader, Dataset, Sampler
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TextDataLoader(DataLoader):
    """DataLoader that pads each field's batch and moves it to the device."""

    def __init__(self, *args, **kwargs):
        super(TextDataLoader, self).__init__(*args, **kwargs)

        self.fields = self.dataset.fields

    def __iter__(self):
        # Fixed: the device was re-queried for every single batch;
        # resolve it once per epoch instead.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        for raw_batch in super(TextDataLoader, self).__iter__():
            batch = []
            for data, field in zip(raw_batch, self.fields):
                if isinstance(data[0], torch.Tensor):
                    data = pad_sequence(data, True, field.pad_index).to(device)
                elif isinstance(data[0], Iterable):
                    # e.g. BertField items are (subwords, lens, mask) triples
                    data = [pad_sequence(f, True, field.pad_index).to(device)
                            for f in zip(*data)]
                batch.append(data)
            yield batch
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TextDataset(Dataset):
    """Dataset over a Corpus, numericalized per field and bucketed by length."""

    def __init__(self, corpus, fields, n_buckets=1):
        super(TextDataset, self).__init__()

        self.corpus = corpus
        # flatten nested field tuples and drop placeholder (None) columns
        self.fields = list(chain(*[
            field if isinstance(field, Iterable) else [field]
            for field in fields if field is not None
        ]))
        for field in self.fields:
            value = field.numericalize(getattr(corpus, field.name))
            setattr(self, field.name, value)
        # NOTE: the final bucket count is roughly equal to n_buckets
        # Fixed: the original summed `bool(field.bos)` twice instead of
        # counting bos and eos once each.
        # (`field` here is the last field of the loop above.)
        self.lengths = [len(i) + sum([bool(field.bos), bool(field.eos)])
                        for i in corpus]
        self.buckets = dict(zip(*kmeans(self.lengths, n_buckets)))

    def __getitem__(self, index):
        for field in self.fields:
            yield getattr(self, field.name)[index]

    def __len__(self):
        return len(self.corpus)

    @property
    def loader(self):
        # raises until a loader has been attached via the setter
        if hasattr(self, 'data_loader'):
            return self.data_loader
        else:
            raise AttributeError

    @loader.setter
    def loader(self, data_loader):
        self.data_loader = data_loader

    @classmethod
    def collate_fn(cls, batch):
        # transpose a batch of per-sentence tuples into per-field columns
        return (field for field in zip(*batch))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class TextSampler(Sampler):
    """Yields batches of sentence indices bucket by bucket.

    `buckets` maps a centroid length to the indices of sentences of
    similar length; each bucket is split into roughly
    ``size * len(bucket) / batch_size`` chunks.
    """

    def __init__(self, buckets, batch_size, shuffle=False):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sizes, self.buckets = zip(*buckets.items())
        # the number of chunks in each bucket, clipped to [1, len(bucket)]
        self.chunks = [
            min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
            for size, bucket in zip(self.sizes, self.buckets)
        ]

    def __iter__(self):
        # shuffling permutes both the bucket order and indices inside buckets
        order = torch.randperm if self.shuffle else torch.arange
        for b in order(len(self.buckets)).tolist():
            bucket, n_chunks = self.buckets[b], self.chunks[b]
            split_sizes = [(len(bucket) - k - 1) // n_chunks + 1
                           for k in range(n_chunks)]
            # DON'T use `torch.chunk`, which may return a wrong chunk count
            for positions in order(len(bucket)).split(split_sizes):
                yield [bucket[p] for p in positions.tolist()]

    def __len__(self):
        return sum(self.chunks)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def batchify(dataset, batch_size, shuffle=False):
    """Wrap a TextDataset in a TextDataLoader with bucketed batching."""
    sampler = TextSampler(buckets=dataset.buckets,
                          batch_size=batch_size,
                          shuffle=shuffle)

    return TextDataLoader(dataset=dataset,
                          batch_sampler=sampler,
                          collate_fn=dataset.collate_fn)
|
biaffine-parser-master/parser/utils/embedding.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Embedding(object):
    """Pretrained token embeddings loaded from a plain-text file."""

    def __init__(self, tokens, vectors, unk=None):
        super(Embedding, self).__init__()
        self.tokens = tokens
        # Fixed: the original kept only the first component of every
        # vector (`v[0]`) and left a debug print behind.
        self.vectors = torch.tensor(vectors)
        self.pretrained = {w: v for w, v in zip(tokens, vectors)}
        # Fixed: the `unk` argument was silently ignored in favor of a
        # hardcoded '[UNK]', which broke `unk_index` for non-BERT vocabs.
        self.unk = unk

    def __len__(self):
        return len(self.tokens)

    def __contains__(self, token):
        return token in self.pretrained

    @property
    def dim(self):
        # Fixed: size(0) is the vocabulary size, not the embedding width.
        return self.vectors.size(1)

    @property
    def unk_index(self):
        if self.unk is not None:
            return self.tokens.index(self.unk)
        else:
            raise AttributeError

    @classmethod
    def load(cls, path, unk=None):
        """Parse a whitespace-separated `token v1 v2 ...` file."""
        with open(path, 'r') as f:
            lines = [line for line in f]
        splits = [line.split() for line in lines]
        tokens, vectors = zip(*[(s[0], list(map(float, s[1:])))
                                for s in splits])

        return cls(tokens, vectors, unk=unk)
|
biaffine-parser-master/parser/utils/field.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections import Counter
|
| 4 |
+
from parser.utils.vocab import Vocab
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Field(object):
    """Describes how one corpus column is preprocessed and indexed."""

    def __init__(self, name, pad=None, unk=None, bos=None, eos=None,
                 lower=False, use_vocab=True, tokenize=None, fn=None):
        self.name = name
        self.pad = pad
        self.unk = unk
        self.bos = bos
        self.eos = eos
        self.lower = lower
        self.use_vocab = use_vocab
        self.tokenize = tokenize
        self.fn = fn

        # the defined special tokens, in a fixed order the *_index
        # properties below rely on
        self.specials = [token for token in [pad, unk, bos, eos]
                         if token is not None]

    def __repr__(self):
        params = []
        if self.pad is not None:
            params.append(f"pad={self.pad}")
        if self.unk is not None:
            params.append(f"unk={self.unk}")
        if self.bos is not None:
            params.append(f"bos={self.bos}")
        if self.eos is not None:
            params.append(f"eos={self.eos}")
        if self.lower:
            params.append(f"lower={self.lower}")
        if not self.use_vocab:
            params.append(f"use_vocab={self.use_vocab}")

        return (f"({self.name}): {self.__class__.__name__}("
                + ", ".join(params) + ")")

    @property
    def pad_index(self):
        if self.pad is None:
            return 0
        return self.specials.index(self.pad)

    @property
    def unk_index(self):
        if self.unk is None:
            return 0
        return self.specials.index(self.unk)

    @property
    def bos_index(self):
        return self.specials.index(self.bos)

    @property
    def eos_index(self):
        return self.specials.index(self.eos)

    def transform(self, sequence):
        """Apply tokenization, lowercasing and `fn` to one raw sequence."""
        if self.tokenize is not None:
            sequence = self.tokenize(sequence)
        if self.lower:
            sequence = [str.lower(token) for token in sequence]
        if self.fn is not None:
            sequence = [self.fn(token) for token in sequence]

        return sequence

    def build(self, corpus, min_freq=1, embed=None):
        """Build the vocab from `corpus` (optionally with pretrained vectors)."""
        sequences = getattr(corpus, self.name)
        counter = Counter(token for sequence in sequences
                          for token in self.transform(sequence))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
        else:
            tokens = self.transform(embed.tokens)
            # if the `unk` token has existed in the pretrained,
            # then replace it with a self-defined one
            if embed.unk:
                tokens[embed.unk_index] = self.unk

            self.vocab.extend(tokens)
            self.embed = torch.zeros(len(self.vocab), embed.dim)
            self.embed[self.vocab.token2id(tokens)] = embed.vectors
            # normalize so pretrained vectors have unit std overall
            self.embed /= torch.std(self.embed)

    def numericalize(self, sequences):
        """Map raw sequences to tensors of ids (with bos/eos if configured)."""
        sequences = [self.transform(sequence) for sequence in sequences]
        if self.use_vocab:
            sequences = [self.vocab.token2id(sequence)
                         for sequence in sequences]
        if self.bos:
            sequences = [[self.bos_index] + sequence for sequence in sequences]
        if self.eos:
            sequences = [sequence + [self.eos_index] for sequence in sequences]

        return [torch.tensor(sequence) for sequence in sequences]
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class CharField(Field):
|
| 107 |
+
|
| 108 |
+
def __init__(self, *args, **kwargs):
|
| 109 |
+
self.fix_len = kwargs.pop('fix_len') if 'fix_len' in kwargs else -1
|
| 110 |
+
super(CharField, self).__init__(*args, **kwargs)
|
| 111 |
+
|
| 112 |
+
def build(self, corpus, min_freq=1, embed=None):
|
| 113 |
+
sequences = getattr(corpus, self.name)
|
| 114 |
+
counter = Counter(char for sequence in sequences for token in sequence
|
| 115 |
+
for char in self.transform(token))
|
| 116 |
+
self.vocab = Vocab(counter, min_freq, self.specials)
|
| 117 |
+
|
| 118 |
+
if not embed:
|
| 119 |
+
self.embed = None
|
| 120 |
+
else:
|
| 121 |
+
tokens = self.transform(embed.tokens)
|
| 122 |
+
# if the `unk` token has existed in the pretrained,
|
| 123 |
+
# then replace it with a self-defined one
|
| 124 |
+
if embed.unk:
|
| 125 |
+
tokens[embed.unk_index] = self.unk
|
| 126 |
+
|
| 127 |
+
self.vocab.extend(tokens)
|
| 128 |
+
self.embed = torch.zeros(len(self.vocab), embed.dim)
|
| 129 |
+
self.embed[self.vocab.token2id(tokens)] = embed.vectors
|
| 130 |
+
|
| 131 |
+
def numericalize(self, sequences):
|
| 132 |
+
sequences = [[self.transform(token) for token in sequence]
|
| 133 |
+
for sequence in sequences]
|
| 134 |
+
if self.fix_len <= 0:
|
| 135 |
+
self.fix_len = max(len(token) for sequence in sequences
|
| 136 |
+
for token in sequence)
|
| 137 |
+
if self.use_vocab:
|
| 138 |
+
sequences = [[self.vocab.token2id(token) for token in sequence]
|
| 139 |
+
for sequence in sequences]
|
| 140 |
+
if self.bos:
|
| 141 |
+
sequences = [[self.vocab.token2id(self.bos)] + sequence
|
| 142 |
+
for sequence in sequences]
|
| 143 |
+
if self.eos:
|
| 144 |
+
sequences = [sequence + [self.vocab.token2id(self.eos)]
|
| 145 |
+
for sequence in sequences]
|
| 146 |
+
sequences = [
|
| 147 |
+
torch.tensor([ids[:self.fix_len] + [0] * (self.fix_len - len(ids))
|
| 148 |
+
for ids in sequence])
|
| 149 |
+
for sequence in sequences
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
return sequences
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class BertField(Field):
|
| 156 |
+
|
| 157 |
+
def numericalize(self, sequences):
|
| 158 |
+
subwords, lens = [], []
|
| 159 |
+
sequences = [([self.bos] if self.bos else []) + list(sequence) +
|
| 160 |
+
([self.eos] if self.eos else [])
|
| 161 |
+
for sequence in sequences]
|
| 162 |
+
|
| 163 |
+
for sequence in sequences:
|
| 164 |
+
sequence = [self.transform(token) for token in sequence]
|
| 165 |
+
sequence = [piece if piece else self.transform(self.pad)
|
| 166 |
+
for piece in sequence]
|
| 167 |
+
subwords.append(sum(sequence, []))
|
| 168 |
+
lens.append(torch.tensor([len(piece) for piece in sequence]))
|
| 169 |
+
subwords = [torch.tensor(pieces) for pieces in subwords]
|
| 170 |
+
mask = [torch.ones(len(pieces)).ge(0) for pieces in subwords]
|
| 171 |
+
|
| 172 |
+
return list(zip(subwords, lens, mask))
|