Upload 64 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +3 -0
- README.md +67 -0
- UD_Tamil-TTB-master/.gitignore +0 -0
- UD_Tamil-TTB-master/CONTRIBUTING.md +8 -0
- UD_Tamil-TTB-master/LICENSE.txt +7 -0
- UD_Tamil-TTB-master/README.md +103 -0
- UD_Tamil-TTB-master/eval.log +99 -0
- UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt +0 -0
- UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt +0 -0
- UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt +0 -0
- UD_Tamil-TTB-master/stats.xml +110 -0
- UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu +0 -0
- UD_Tamil-TTB-master/ta_ttb-ud-test.conllu +0 -0
- UD_Tamil-TTB-master/ta_ttb-ud-train.conllu +0 -0
- app.py +86 -0
- arc_eager.py +93 -0
- best_mapping.pth +3 -0
- biaffine-parser-master/.gitignore +22 -0
- biaffine-parser-master/.travis.yml +36 -0
- biaffine-parser-master/README.md +226 -0
- biaffine-parser-master/config.ini +30 -0
- biaffine-parser-master/data/naive3.conllx +7 -0
- biaffine-parser-master/data/ptb/tamdev.conllx +0 -0
- biaffine-parser-master/data/ptb/tamtest.conllx +0 -0
- biaffine-parser-master/data/ptb/tamtrain.conllx +0 -0
- biaffine-parser-master/exp/ptb/fields +3 -0
- biaffine-parser-master/exp/ptb/model +3 -0
- biaffine-parser-master/parser/__init__.py +5 -0
- biaffine-parser-master/parser/cmds/__init__.py +7 -0
- biaffine-parser-master/parser/cmds/cmd.py +151 -0
- biaffine-parser-master/parser/cmds/evaluate.py +49 -0
- biaffine-parser-master/parser/cmds/predict.py +49 -0
- biaffine-parser-master/parser/cmds/train.py +113 -0
- biaffine-parser-master/parser/config.py +41 -0
- biaffine-parser-master/parser/model.py +140 -0
- biaffine-parser-master/parser/modules/__init__.py +11 -0
- biaffine-parser-master/parser/modules/bert.py +62 -0
- biaffine-parser-master/parser/modules/biaffine.py +43 -0
- biaffine-parser-master/parser/modules/bilstm.py +126 -0
- biaffine-parser-master/parser/modules/char_lstm.py +30 -0
- biaffine-parser-master/parser/modules/dropout.py +60 -0
- biaffine-parser-master/parser/modules/mlp.py +28 -0
- biaffine-parser-master/parser/modules/scalar_mix.py +32 -0
- biaffine-parser-master/parser/utils/__init__.py +8 -0
- biaffine-parser-master/parser/utils/alg.py +143 -0
- biaffine-parser-master/parser/utils/common.py +6 -0
- biaffine-parser-master/parser/utils/corpus.py +88 -0
- biaffine-parser-master/parser/utils/data.py +110 -0
- biaffine-parser-master/parser/utils/embedding.py +41 -0
- biaffine-parser-master/parser/utils/field.py +172 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
biaffine-parser-master/exp/ptb/fields filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
biaffine-parser-master/exp/ptb/model filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/tnt_pos_tagger_hin.dill filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dependency-parser
|
| 2 |
+
## Dependencies :
|
| 3 |
+
* OS : Ubuntu 22.04.3 LTS
|
| 4 |
+
* Python 3.7
|
| 5 |
+
* flask 1.1.1
|
| 6 |
+
* flask-wtf 0.14.2
|
| 7 |
+
* flask-markdown 0.3
|
| 8 |
+
* nltk 3.4.5
|
| 9 |
+
* pygraphviz 1.7
|
| 10 |
+
* conllu 2.2.2
|
| 11 |
+
* scikit-learn 0.22.1
|
| 12 |
+
* dill 0.3.1.1
|
| 13 |
+
* transformers 2.1.1
|
| 14 |
+
|
| 15 |
+
```bash
|
| 16 |
+
> python3.7 -m pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
## Instructions for web app :
|
| 20 |
+
|
| 21 |
+
Run the following to host the app at localhost:5000
|
| 22 |
+
```bash
|
| 23 |
+
> python3.7 app.py
|
| 24 |
+
```
|
| 25 |
+
## Results :
|
| 26 |
+
Trained a model using <b>BERT</b> and parser implemented from <b>Deep Biaffine Attention for Neural Dependency Parsing</b> on Telugu UD Treebank dataset
|
| 27 |
+
|
| 28 |
+
<b>
|
| 29 |
+
train: 400 sentences
|
| 30 |
+
</b>
|
| 31 |
+
<br>
|
| 32 |
+
|
| 33 |
+
<b>
|
| 34 |
+
dev: 80 sentences
|
| 35 |
+
</b>
|
| 36 |
+
<br>
|
| 37 |
+
|
| 38 |
+
<b>
|
| 39 |
+
test: 120 sentences
|
| 40 |
+
</b>
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
### Train
|
| 44 |
+
Training the model
|
| 45 |
+
|
| 46 |
+

|
| 47 |
+
|
| 48 |
+
### Evaluate
|
| 49 |
+
|
| 50 |
+
Evaluation score after testing with Test Dataset
|
| 51 |
+
|
| 52 |
+

|
| 53 |
+
### Prediction
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
#### Entering the sentence
|
| 57 |
+
|
| 58 |
+

|
| 59 |
+
|
| 60 |
+
#### Final Result
|
| 61 |
+
|
| 62 |
+

|
| 63 |
+
|
| 64 |
+
#### Original Values
|
| 65 |
+
|
| 66 |
+

|
| 67 |
+
|
UD_Tamil-TTB-master/.gitignore
ADDED
|
File without changes
|
UD_Tamil-TTB-master/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing
|
| 2 |
+
|
| 3 |
+
Please do not make pull requests against master, any such pull requests will be
|
| 4 |
+
closed. Pull requests against the dev branch are accepted in some treebanks but
|
| 5 |
+
not in others - check the Contributing line in the README file!
|
| 6 |
+
|
| 7 |
+
For full details on the branch policy see
|
| 8 |
+
[here](http://universaldependencies.org/release_checklist.html#repository-branches).
|
UD_Tamil-TTB-master/LICENSE.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This work is licensed under the Creative Commons Attribution-NonCommercial-
|
| 2 |
+
ShareAlike 3.0 Unported License. To view a copy of this license, visit
|
| 3 |
+
|
| 4 |
+
http://creativecommons.org/licenses/by-nc-sa/3.0/
|
| 5 |
+
|
| 6 |
+
or send a letter to
|
| 7 |
+
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
|
UD_Tamil-TTB-master/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Summary
|
| 2 |
+
|
| 3 |
+
The UD Tamil treebank is based on the Tamil Dependency Treebank created at the
|
| 4 |
+
Charles University in Prague by Loganathan Ramasamy.
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Introduction
|
| 8 |
+
|
| 9 |
+
The treebank was part of
|
| 10 |
+
HamleDT, a collection of treebanks converted to the Prague dependency style
|
| 11 |
+
(since 2011). Later versions of HamleDT added a conversion to the Stanford
|
| 12 |
+
dependencies (2014) and to Universal Dependencies (HamleDT 3.0, 2015). The
|
| 13 |
+
first release of Universal Dependencies that includes this treebank is UD v1.2
|
| 14 |
+
in November 2015. It is essentially the HamleDT conversion but the data is not
|
| 15 |
+
identical to HamleDT 3.0 because the conversion procedure has been further
|
| 16 |
+
improved.
|
| 17 |
+
|
| 18 |
+
## References:
|
| 19 |
+
|
| 20 |
+
* [TamilTB](http://ufal.mff.cuni.cz/~ramasamy/tamiltb/0.1/)
|
| 21 |
+
* [HamleDT](http://ufal.mff.cuni.cz/hamledt)
|
| 22 |
+
* [Treex](http://ufal.mff.cuni.cz/treex) is the software used for conversion
|
| 23 |
+
* [Interset](http://ufal.mff.cuni.cz/interset) was used to convert POS tags and features
|
| 24 |
+
* Loganathan Ramasamy, Zdeněk Žabokrtský. 2012.
|
| 25 |
+
[Prague Dependency Style Treebank for Tamil](http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html).
|
| 26 |
+
In: *Proceedings of Eighth International Conference on Language Resources and Evaluation (LREC 2012),*
|
| 27 |
+
İstanbul, Turkey, ISBN 978-2-9517408-7-7, pp. 1888–1894.
|
| 28 |
+
|
| 29 |
+
<pre>
|
| 30 |
+
@inproceedings{ta,
|
| 31 |
+
author = {Ramasamy, Loganathan and \v{Z}abokrtsk\'{y}, Zden\v{e}k},
|
| 32 |
+
year = {2012},
|
| 33 |
+
title = {Prague Dependency Style Treebank for {Tamil}},
|
| 34 |
+
booktitle = {Proceedings of Eighth International Conference on Language Resources and Evaluation ({LREC} 2012)},
|
| 35 |
+
address = {\.{I}stanbul, Turkey},
|
| 36 |
+
editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
|
| 37 |
+
isbn = {978-2-9517408-7-7},
|
| 38 |
+
pages = {1888--1894},
|
| 39 |
+
url = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/456.html}
|
| 40 |
+
}
|
| 41 |
+
</pre>
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Source of annotations
|
| 45 |
+
|
| 46 |
+
This table summarizes the origins and checking of the various columns of the CoNLL-U data.
|
| 47 |
+
|
| 48 |
+
| Column | Status |
|
| 49 |
+
| ------ | ------ |
|
| 50 |
+
| ID | Sentence segmentation and tokenization (including cutting off certain suffixes that constitute independent syntactic words) was automatically done and then hand-corrected. |
|
| 51 |
+
| FORM | Identical to TamilTB form. |
|
| 52 |
+
| LEMMA | Gold (preprocessed and then manually corrected). |
|
| 53 |
+
| UPOSTAG | Converted automatically from XPOSTAG (via [Interset](https://ufal.mff.cuni.cz/interset)). |
|
| 54 |
+
| XPOSTAG | Gold (preprocessed and then manually corrected). |
|
| 55 |
+
| FEATS | Converted automatically from XPOSTAG (via Interset). |
|
| 56 |
+
| HEAD | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
|
| 57 |
+
| DEPREL | Original TamilTB annotation is manual (preprocessed by a rule-based parser and then manually corrected). Automatic conversion to UD; human checking of patterns revealed by automatic consistency tests. |
|
| 58 |
+
| DEPS | — (currently unused) |
|
| 59 |
+
| MISC | Information about token spacing restored using heuristics. Mapping between multi-word tokens and syntactic words verified against the source text. |
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Changelog
|
| 63 |
+
|
| 64 |
+
* 2023-11-15 v2.13
|
| 65 |
+
* Fixed: no Gender for numerals and particles.
|
| 66 |
+
* 2021-05-15 v2.8
|
| 67 |
+
* Distinguished acl:relcl from other acl.
|
| 68 |
+
* Added enhanced dependencies for relative clauses.
|
| 69 |
+
* 2020-05-15 v2.6
|
| 70 |
+
* Added enhanced relations with case information.
|
| 71 |
+
* 2019-05-15 v2.4
|
| 72 |
+
* Fixed some annotation errors in the original treebank, re-run the conversion.
|
| 73 |
+
* Dative and instrumental objects are now treated as oblique arguments.
|
| 74 |
+
* 2018-04-15 v2.2
|
| 75 |
+
* Repository renamed from UD_Tamil to UD_Tamil-TTB.
|
| 76 |
+
* Added enhanced representation of dependencies propagated across coordination.
|
| 77 |
+
The distinction of shared and private dependents is derived deterministically from the original Prague annotation.
|
| 78 |
+
* 2017-03-01 v2.0
|
| 79 |
+
* Converted to UD v2 guidelines.
|
| 80 |
+
* Reconsidered PRON vs. DET distinction.
|
| 81 |
+
* Improved advmod vs. obl distinction.
|
| 82 |
+
* 2016-05-15 v1.3
|
| 83 |
+
* Added Latin transliteration of lemmas and full sentences.
|
| 84 |
+
* Added orthographic words (surface tokens) and their mapping to nodes.
|
| 85 |
+
* Improved conversion of AuxY.
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
<pre>
|
| 89 |
+
=== Machine-readable metadata (DO NOT REMOVE!) ================================
|
| 90 |
+
Data available since: UD v1.2
|
| 91 |
+
License: CC BY-NC-SA 3.0
|
| 92 |
+
Includes text: yes
|
| 93 |
+
Genre: news
|
| 94 |
+
Lemmas: converted from manual
|
| 95 |
+
UPOS: converted from manual
|
| 96 |
+
XPOS: manual native
|
| 97 |
+
Features: converted from manual
|
| 98 |
+
Relations: converted from manual
|
| 99 |
+
Contributors: Ramasamy, Loganathan; Zeman, Daniel
|
| 100 |
+
Contributing: elsewhere
|
| 101 |
+
Contact: zeman@ufal.mff.cuni.cz
|
| 102 |
+
===============================================================================
|
| 103 |
+
</pre>
|
UD_Tamil-TTB-master/eval.log
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Running the following version of UD tools:
|
| 2 |
+
commit e9726a6a7d6913193d90edb45a4cb549235c5b16
|
| 3 |
+
Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
|
| 4 |
+
Date: Sat Nov 4 17:10:55 2023 +0100
|
| 5 |
+
Evaluating the following revision of UD_Tamil-TTB:
|
| 6 |
+
commit c1739c0397fd034200edaf4e403c2e4c9923dd75
|
| 7 |
+
Merge: 1392fa0 fbea79c
|
| 8 |
+
Author: Dan Zeman <zeman@ufal.mff.cuni.cz>
|
| 9 |
+
Size: counted 9581 of 9581 words (nodes).
|
| 10 |
+
Size: min(0, log((N/1000)**2)) = 4.51956394133747.
|
| 11 |
+
Size: maximum value 13.815511 is for 1000000 words or more.
|
| 12 |
+
Split: Did not find more than 10000 training words.
|
| 13 |
+
Split: Did not find at least 10000 development words.
|
| 14 |
+
Split: Did not find at least 10000 test words.
|
| 15 |
+
Lemmas: source of annotation (from README) factor is 0.8.
|
| 16 |
+
Universal POS tags: 14 out of 17 found in the corpus.
|
| 17 |
+
Universal POS tags: source of annotation (from README) factor is 0.8.
|
| 18 |
+
Features: 8280 out of 9581 total words have one or more features.
|
| 19 |
+
Features: source of annotation (from README) factor is 0.8.
|
| 20 |
+
Universal relations: 25 out of 37 found in the corpus.
|
| 21 |
+
Universal relations: source of annotation (from README) factor is 0.8.
|
| 22 |
+
Udapi:
|
| 23 |
+
TOTAL 205
|
| 24 |
+
Udapi: found 205 bugs.
|
| 25 |
+
Udapi: worst expected case (threshold) is one bug per 10 words. There are 9581 words.
|
| 26 |
+
Genres: found 1 out of 17 known.
|
| 27 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-dev.conllu
|
| 28 |
+
[Line 9 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '11:obl:dat'
|
| 29 |
+
|
| 30 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 31 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 32 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 33 |
+
|
| 34 |
+
[Line 10 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '7:obl:dat'
|
| 35 |
+
[Line 11 Sent dev-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இல்:nom' in '11:obl:இல்:nom'
|
| 36 |
+
[Line 32 Sent dev-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '10:obl:com'
|
| 37 |
+
[Line 45 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
|
| 38 |
+
[Line 48 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '11:obl:com'
|
| 39 |
+
[Line 50 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '10:nmod:nom'
|
| 40 |
+
[Line 58 Sent dev-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '18:nmod:nom'
|
| 41 |
+
[Line 68 Sent dev-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '23:obl:loc'
|
| 42 |
+
...suppressing further errors regarding Enhanced
|
| 43 |
+
Enhanced errors: 351
|
| 44 |
+
*** FAILED *** with 351 errors
|
| 45 |
+
Exit code: 1
|
| 46 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-test.conllu
|
| 47 |
+
[Line 6 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இலிருந்து:nom' in '4:nmod:இலிருந்து:nom'
|
| 48 |
+
|
| 49 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 50 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 51 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 52 |
+
|
| 53 |
+
[Line 13 Sent test-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:dat' in '9:obl:dat'
|
| 54 |
+
[Line 28 Sent test-s2]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:இலிருந்து:nom' in '9:obl:இலிருந்து:nom'
|
| 55 |
+
[Line 42 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '2:nmod:nom'
|
| 56 |
+
[Line 43 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '5:obl:loc'
|
| 57 |
+
[Line 44 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
|
| 58 |
+
[Line 49 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '9:nmod:dat'
|
| 59 |
+
[Line 54 Sent test-s3]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:arg:இடம்:gen' in '15:obl:arg:இடம்:gen'
|
| 60 |
+
[Line 66 Sent test-s4]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:com' in '5:obl:com'
|
| 61 |
+
...suppressing further errors regarding Enhanced
|
| 62 |
+
[Line 2738 Sent test-s118 Node 7]: [L3 Syntax too-many-subjects] Multiple subjects [4, 6] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
|
| 63 |
+
Enhanced errors: 483
|
| 64 |
+
Syntax errors: 1
|
| 65 |
+
*** FAILED *** with 484 errors
|
| 66 |
+
Exit code: 1
|
| 67 |
+
/net/work/people/zeman/unidep/tools/validate.py --lang ta --max-err=10 UD_Tamil-TTB/ta_ttb-ud-train.conllu
|
| 68 |
+
[Line 5 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:அருகே:nom' in '18:obl:அருகே:nom'
|
| 69 |
+
|
| 70 |
+
The following 63 enhanced relations are currently permitted in language [ta]:
|
| 71 |
+
acl, acl:relcl, advcl, advcl:cond, advmod, advmod:emph, advmod:lmod, amod, appos, aux, aux:neg, aux:pass, case, cc, ccomp, clf, compound, compound:lvc, compound:prt, compound:redup, compound:svc, conj, cop, csubj, csubj:xsubj, dep, det, discourse, dislocated, expl, fixed, flat, flat:name, goeswith, iobj, list, mark, nmod, nmod:poss, nsubj, nsubj:nc, nsubj:nc:xsubj, nsubj:pass, nsubj:pass:xsubj, nsubj:xsubj, nummod, obj, obl, obl:agent, obl:arg, obl:cmpr, obl:inst, obl:lmod, obl:pmod, obl:tmod, orphan, parataxis, punct, ref, reparandum, root, vocative, xcomp
|
| 72 |
+
See https://quest.ms.mff.cuni.cz/udvalidator/cgi-bin/unidep/langspec/specify_edeprel.pl for details.
|
| 73 |
+
|
| 74 |
+
[Line 7 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '4:nmod:nom'
|
| 75 |
+
[Line 8 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
|
| 76 |
+
[Line 9 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '6:nmod:nom'
|
| 77 |
+
[Line 10 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:nom' in '11:nmod:nom'
|
| 78 |
+
[Line 16 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '12:nmod:dat'
|
| 79 |
+
[Line 19 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:dat' in '15:nmod:dat'
|
| 80 |
+
[Line 20 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'nmod:இல்:nom' in '17:nmod:இல்:nom'
|
| 81 |
+
[Line 22 Sent train-s1]: [L4 Enhanced unknown-edeprel] Unknown enhanced relation type 'obl:loc' in '18:obl:loc'
|
| 82 |
+
...suppressing further errors regarding Enhanced
|
| 83 |
+
[Line 4427 Sent train-s192 Node 25]: [L3 Syntax too-many-subjects] Multiple subjects [11, 17] not subtyped as ':outer'. Outer subjects are allowed if a clause acts as the predicate of another clause.
|
| 84 |
+
Enhanced errors: 1922
|
| 85 |
+
Syntax errors: 1
|
| 86 |
+
*** FAILED *** with 1923 errors
|
| 87 |
+
Exit code: 1
|
| 88 |
+
Validity: 0.01
|
| 89 |
+
(weight=0.0769230769230769) * (score{features}=0.8) = 0.0615384615384615
|
| 90 |
+
(weight=0.0769230769230769) * (score{genres}=0.0588235294117647) = 0.00452488687782805
|
| 91 |
+
(weight=0.0769230769230769) * (score{lemmas}=0.8) = 0.0615384615384615
|
| 92 |
+
(weight=0.256410256410256) * (score{size}=0.327136946721963) = 0.0838812683902469
|
| 93 |
+
(weight=0.0512820512820513) * (score{split}=0.01) = 0.000512820512820513
|
| 94 |
+
(weight=0.0769230769230769) * (score{tags}=0.658823529411765) = 0.0506787330316742
|
| 95 |
+
(weight=0.307692307692308) * (score{udapi}=0.786034860661726) = 0.241856880203608
|
| 96 |
+
(weight=0.0769230769230769) * (score{udeprels}=0.540540540540541) = 0.0415800415800416
|
| 97 |
+
(TOTAL score=0.546111553673142) * (availability=1) * (validity=0.01) = 0.00546111553673142
|
| 98 |
+
STARS = 0
|
| 99 |
+
UD_Tamil-TTB 0.00546111553673142 0
|
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD-commented.odt
ADDED
|
Binary file (71.5 kB). View file
|
|
|
UD_Tamil-TTB-master/not-to-release/issues-Tamil-UD.odt
ADDED
|
Binary file (71.1 kB). View file
|
|
|
UD_Tamil-TTB-master/not-to-release/issues-UD-2.3.odt
ADDED
|
Binary file (39.2 kB). View file
|
|
|
UD_Tamil-TTB-master/stats.xml
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<treebank>
|
| 3 |
+
<!-- tokens means "surface tokens", e.g. Spanish "vámonos" counts as one token
|
| 4 |
+
words means "syntactic words", e.g. Spanish "vámonos" is split to two words, "vamos" and "nos"
|
| 5 |
+
fused is the number of tokens that are split to two or more syntactic words
|
| 6 |
+
The words and fused elements can be omitted if no token is split to smaller syntactic words. -->
|
| 7 |
+
<size>
|
| 8 |
+
<total><sentences>600</sentences><tokens>8635</tokens><words>9581</words><fused>835</fused></total>
|
| 9 |
+
<train><sentences>400</sentences><tokens>5734</tokens><words>6329</words><fused>520</fused></train>
|
| 10 |
+
<dev><sentences>80</sentences><tokens>1129</tokens><words>1263</words><fused>121</fused></dev>
|
| 11 |
+
<test><sentences>120</sentences><tokens>1772</tokens><words>1989</words><fused>194</fused></test>
|
| 12 |
+
</size>
|
| 13 |
+
<lemmas unique="2024" /><!-- ., ,, உள், உம், என், படு, இரு, ஆகு, அவர், இந்தியா, தெரிவி, செய், ஆக, ஆன, நாடு -->
|
| 14 |
+
<forms unique="3584" /><!-- ., ,, உம், ஆக, உள்ளது, ஆன, என்று, உள்ள, அவர், வேண்டும், இந்த, பட்ட், மற்றும், அரசு, நாடுகள் -->
|
| 15 |
+
<fusions unique="620" /><!-- என்றும், இடையிலான, செயலாளராக, தெரிவிக்கப்பட்டுள்ளது, தெரிவித்துள்ளது, தெரிவித்துள்ளார், இடத்தையும், குறிப்பிடத்தக்கது, வெளியிட்டுள்ள, Kஉறித்தும், பேருக்கும், ஆதரவாக, காரணமாக, குறிப்பிடப்பட்டுள்ளது, கூறியுள்ளார் -->
|
| 16 |
+
<!-- Statistics of universal POS tags. The comments show the most frequent lemmas. -->
|
| 17 |
+
<tags unique="14">
|
| 18 |
+
<tag name="ADJ">557</tag><!-- உள், மத்திய, இரு, ஒரு, நடைபெறு, உள்ளிடு, புதிய, கடந்த, முன்னாள், வரு -->
|
| 19 |
+
<tag name="ADP">293</tag><!-- ஆகு, உடன், குறி, இலிருந்து, மீது, உள், இரு, மூலம், சார்பு, போல் -->
|
| 20 |
+
<tag name="ADV">384</tag><!-- ஆகு, இன்று, மேலும், ஆனால், பின்னர், இதனால், இது, இதுகுறித்து, ஏற்கெனவே, இதுவரை -->
|
| 21 |
+
<tag name="AUX">634</tag><!-- உள், படு, வேண்டு, இரு, வரு, கொள், இல், செய், விடு, வா -->
|
| 22 |
+
<tag name="CCONJ">46</tag><!-- மற்றும், அல்லது -->
|
| 23 |
+
<tag name="DET">120</tag><!-- இந்த, அந்த, எந்த, மிக, அதிகம், மிகவும், முழுவதும், அந்தந்த, ஒரு, குறைவு -->
|
| 24 |
+
<tag name="NOUN">2758</tag><!-- அரசு, நாடு, ஆண்டு, தலைவர், மக்கள், இடம், பேர், கட்சி, செயலாளர், முதல்வர் -->
|
| 25 |
+
<tag name="NUM">274</tag><!-- இரு, ஆயிரம், 2, லட்சம், மூன்று, 10, ஒன்று, 20, இரண்டு, ஒரு -->
|
| 26 |
+
<tag name="PART">654</tag><!-- உம், என், ஆன, ஆக, ஆகு, போது, தான், ஏ, ஆவது, ஓ -->
|
| 27 |
+
<tag name="PRON">236</tag><!-- அவர், இது, அது, தன், அனைவர், என், யார், நான், நாம், இவர் -->
|
| 28 |
+
<tag name="PROPN">1370</tag><!-- இந்தியா, அமெரிக்கா, இலங்கை, பாகிஸ்தான், சென்னை, தமிழகம், ஒபாமா, அதிமுக, காங்கிரஸ், ஜெயலலிதா -->
|
| 29 |
+
<tag name="PUNCT">1000</tag><!-- ., ,, -, :, (, ), ", ரூ, ரூ., ; -->
|
| 30 |
+
<tag name="VERB">1254</tag><!-- தெரிவி, செய், கூறு, இரு, செல், பெறு, வழங்கு, நடைபெறு, குறிப்பிடு, ஏற்படு -->
|
| 31 |
+
<tag name="X">1</tag><!-- என் -->
|
| 32 |
+
</tags>
|
| 33 |
+
<!-- Statistics of features and values. The comments show the most frequent word forms. -->
|
| 34 |
+
<feats unique="41">
|
| 35 |
+
<feat name="AdpType" value="Post" upos="ADP">288</feat><!-- ஆக, உடன், இலிருந்து, குறித்து, மீது, சார்பில், மூலம், இடம், இருந்து, இடையில் -->
|
| 36 |
+
<feat name="Animacy" value="Anim" upos="AUX,NOUN,PRON,PROPN,VERB">420</feat><!-- உள்ளனர், பேர், மக்கள், அதிகாரிகள், அனைவரும், அவர்கள், தனது, புலிகள், போலீஸார், நான் -->
|
| 37 |
+
<feat name="Case" value="Acc" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">340</feat><!-- அதை, அவர்களை, இடத்தைய், இதை, நிலையங்களை, வெடிகளை, அணையை, அரசை, ஆட்சியை, உத்தரவை -->
|
| 38 |
+
<feat name="Case" value="Com" upos="NOUN">13</feat><!-- மனிதாபிமானத்தோடு, அமைப்புடன், அவருடன், உயிருடன், சிலருடன், தங்கப்பதக்கத்துடன், துணையோடு, நம்பிக்கையோடு, நேயத்துடன், நேயத்தோடு -->
|
| 39 |
+
<feat name="Case" value="Dat" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">262</feat><!-- இந்தியாவுக்கு, மணிக்கு, அவர்களுக்கு, ஆண்டுக்கு, பேருக்க், மக்களுக்கு, அரசுக்கு, அவருக்கு, ஆண்டுகளுக்குப், கொள்வதற்க் -->
|
| 40 |
+
<feat name="Case" value="Gen" upos="NOUN,PRON,PROPN,VERB">177</feat><!-- அரசின், தனது, அவரது, ஒபாமாவின், நாடுகளின், நாட்டின், அமெரிக்காவின், அவர்களது, இதன், இந்தியாவின் -->
|
| 41 |
+
<feat name="Case" value="Ins" upos="AUX,NOUN,PART,PRON,PROPN,VERB">24</feat><!-- உள்ளதால், என்பதால், காரணத்தால், அளிக்காததால், அவர்களால், ஆகியதால், ஆனதால், இல்லாததால், எங்களால், ஒப்பந்தத்தால் -->
|
| 42 |
+
<feat name="Case" value="Loc" upos="NOUN,NUM,PRON,PROPN,VERB">487</feat><!-- நிலையில், அறிக்கையில், பகுதியில், வகையில், இந்தியாவில், கவுன்சிலில், தலைமையில், அளவில், சிறையில், சென்னையில் -->
|
| 43 |
+
<feat name="Case" value="Nom" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">2929</feat><!-- அவர், அரசு, நாடுகள், இந்தியா, தலைவர், செயலாளர், முதல்வர், பேர், ஆண்டு, மக்கள் -->
|
| 44 |
+
<feat name="Gender" value="Com" upos="AUX,NOUN,PRON,PROPN,VERB">1217</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், உள்ளனர், தெரிவித்தார், முதல்வர், பேர், மக்கள், என்றார் -->
|
| 45 |
+
<feat name="Gender" value="Masc" upos="NOUN">2</feat><!-- அற்றவன், ஆடவனின் -->
|
| 46 |
+
<feat name="Gender" value="Neut" upos="AUX,NOUN,PRON,PROPN,VERB">4042</feat><!-- உள்ளது, வேண்டும், அரசு, நாடுகள், இந்தியா, இல்லை, பட்டது, இந்திய, ஆண்டு, அமெரிக்க -->
|
| 47 |
+
<feat name="Mood" value="Cnd" upos="AUX,PART,VERB">28</feat><!-- இருந்தால், விட்டால், ஆனால், உடைத்தால், ஏற்பட்டால், கட்டினால், பட்டால், பெற்றால், வந்தால், இருப்பின் -->
|
| 48 |
+
<feat name="Mood" value="Imp" upos="VERB">1</feat><!-- இருங்கள் -->
|
| 49 |
+
<feat name="Mood" value="Ind" upos="AUX,VERB">718</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
|
| 50 |
+
<feat name="Number" value="Plur" upos="AUX,NOUN,PRON,PROPN,VERB">909</feat><!-- நாடுகள், இல்லை, உள்ளனர், பேர், மக்கள், உள்ளன, அதிகாரிகள், வருகின்றனர், அனைவரும், அவர்கள் -->
|
| 51 |
+
<feat name="Number" value="Sing" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">4395</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, இந்தியா, உள்ளார், தலைவர், பட்டது, செயலாளர், தெரிவித்தார் -->
|
| 52 |
+
<feat name="NumForm" value="Digit" upos="NUM">149</feat><!-- 2, 10, 20, 3, 80, 16, 4, 5, 50, 6 -->
|
| 53 |
+
<feat name="NumType" value="Card" upos="DET,NUM">282</feat><!-- இரு, 2, ஆயிரம், மூன்று, லட்சம், 10, 20, 3, 80, இரண்டு -->
|
| 54 |
+
<feat name="NumType" value="Ord" upos="ADJ,NUM">52</feat><!-- முதல், இரண்டாவது, 1992-ம், 1-ம், 12-ம், 125-ம், 15-ம், 21-ம், 11வத��, 12வது -->
|
| 55 |
+
<feat name="Person" value="1" upos="AUX,PRON,VERB">68</feat><!-- நான், உள்ளோம், எனது, நாம், உள்ளேன், தனக்கு, நாங்கள், போராடுவோம், எனக்கு, கொள்கிறேன் -->
|
| 56 |
+
<feat name="Person" value="2" upos="AUX,PRON,VERB">12</feat><!-- நீங்கள், அஞ்சுகிறீர்கள், இருக்கிறீர்கள், இருங்கள், உங்களுக்கு, உங்களைச், உங்கள், கவலைப்படாதீர்கள், வருகிறீர்கள், விரும்புகிறீர்கள் -->
|
| 57 |
+
<feat name="Person" value="3" upos="AUX,NOUN,NUM,PART,PRON,PROPN,VERB">5224</feat><!-- உள்ளது, அவர், வேண்டும், அரசு, நாடுகள், இந்தியா, உள்ளார், இல்லை, தலைவர், பட்டது -->
|
| 58 |
+
<feat name="Polarity" value="Neg" upos="ADJ,AUX,VERB">35</feat><!-- முடியாது, கூடாது, இல்லாமல், செயல்படாமல், செய்யாத, தரா, நிறைவேற்றா, மாட்டாது, அல்லாத, எடுக்காத -->
|
| 59 |
+
<feat name="Polarity" value="Pos" upos="ADJ,ADP,ADV,AUX,NOUN,NUM,PART,VERB">2294</feat><!-- உள்ளது, என்று, உள்ள, வேண்டும், பட்ட், உள்ளார், இல்லை, என, என்ற், பட்டது -->
|
| 60 |
+
<feat name="Polite" value="Form" upos="AUX,NOUN,NUM,PRON,PROPN,VERB">798</feat><!-- அவர், உள்ளார், தலைவர், செயலாளர், தெரிவித்தார், முதல்வர், என்றார், அமைச்சர், அதிபர், உறுப்பினர் -->
|
| 61 |
+
<feat name="PronType" value="Ind" upos="PRON">8</feat><!-- யாரும், எதுவும், யாருக்கும், யாரையும் -->
|
| 62 |
+
<feat name="PronType" value="Int" upos="PRON">6</feat><!-- யார், எத்தகையது, ஏத், யாருடைய -->
|
| 63 |
+
<feat name="PronType" value="Prs" upos="PRON">222</feat><!-- அவர், இது, அனைவரும், அவர்கள், தனது, அது, அதை, நான், அவரது, அவர்களை -->
|
| 64 |
+
<feat name="PunctType" value="Comm" upos="PUNCT">400</feat><!-- ,, -, :, (, ), ", ரூ, ரூ., ;, ’ -->
|
| 65 |
+
<feat name="PunctType" value="Peri" upos="PUNCT">600</feat><!-- . -->
|
| 66 |
+
<feat name="Reflex" value="Yes" upos="PRON">16</feat><!-- தனது, தனக்கு, தங்களது, தங்களின், தன்னைப், தமக்கு -->
|
| 67 |
+
<feat name="Tense" value="Fut" upos="ADJ,ADV,AUX,NOUN,PART,VERB">356</feat><!-- வேண்டும், நடைபெறும், ப்படும், வரும், இருக்கும், என்பது, இருப்பத், ஏற்படும், சேர்ந்தவர்கள், படும் -->
|
| 68 |
+
<feat name="Tense" value="Past" upos="ADJ,AUX,NOUN,PART,VERB">518</feat><!-- பட்டது, தெரிவித்தார், என்ற, என்றார், உள்ளிட்ட, இருந்தது, இருந்த, நடைபெற்ற, வந்த, இருந்தார் -->
|
| 69 |
+
<feat name="Tense" value="Pres" upos="ADJ,AUX,NOUN,PART,VERB">123</feat><!-- வருகின்றனர், வருகிறது, படுகிறது, தெரிகிறது, வருகின்றன, இருக்கிறது, என்கிற, தெரிவிக்கிறது, படுகின்றனர், இருக்கிற -->
|
| 70 |
+
<feat name="VerbForm" value="Fin" upos="AUX,PART,VERB">747</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், பட்டது, உள்ளன, என்றார், இருந்தது -->
|
| 71 |
+
<feat name="VerbForm" value="Ger" upos="AUX,PART,VERB">210</feat><!-- உள்ளத், என்பது, இருப்பத், விட்டத், கூறியத், பட்டது, இருந்தத், உயிரிழந்தத், உயிரிழப்பத், உள்ளதால் -->
|
| 72 |
+
<feat name="VerbForm" value="Inf" upos="AUX,PART,VERB">476</feat><!-- என்று, என, என்ற், செய்யப், தெரிவிக்கப், செய்ய, வழங்கப், நியமிக்கப், ப்பட, குறிப்பிடத் -->
|
| 73 |
+
<feat name="VerbForm" value="Part" upos="ADJ,ADP,ADV,AUX,NOUN,PART,VERB">882</feat><!-- உள்ள, பட்ட், பட்டு, கொண்டு, தெரிவித்த், செய்து, என்ற, உள்ளிட்ட, இருந்த, நடைபெற்ற -->
|
| 74 |
+
<feat name="Voice" value="Act" upos="AUX,VERB">1616</feat><!-- உள்ளது, வேண்டும், உள்ளார், இல்லை, உள்ளனர், தெரிவித்தார், கொண்டு, தெரிவித்த், செய்து, உள்ளன -->
|
| 75 |
+
<feat name="Voice" value="Pass" upos="AUX,VERB">155</feat><!-- பட்ட், பட்டது, பட்டு, ப்படும், படுகிறது, ப்பட, படவ், படுகின்றனர், பட்டதற்கு, பட்டனர் -->
|
| 76 |
+
</feats>
|
| 77 |
+
<!-- Statistics of universal dependency relations. -->
|
| 78 |
+
<deps unique="30">
|
| 79 |
+
<dep name="acl">63</dep>
|
| 80 |
+
<dep name="acl:relcl">69</dep>
|
| 81 |
+
<dep name="advcl">358</dep>
|
| 82 |
+
<dep name="advmod">401</dep>
|
| 83 |
+
<dep name="advmod:emph">231</dep>
|
| 84 |
+
<dep name="amod">549</dep>
|
| 85 |
+
<dep name="aux">608</dep>
|
| 86 |
+
<dep name="case">270</dep>
|
| 87 |
+
<dep name="cc">103</dep>
|
| 88 |
+
<dep name="ccomp">166</dep>
|
| 89 |
+
<dep name="compound">13</dep>
|
| 90 |
+
<dep name="compound:prt">1</dep>
|
| 91 |
+
<dep name="conj">236</dep>
|
| 92 |
+
<dep name="cop">1</dep>
|
| 93 |
+
<dep name="csubj">11</dep>
|
| 94 |
+
<dep name="dep">1</dep>
|
| 95 |
+
<dep name="det">114</dep>
|
| 96 |
+
<dep name="iobj">27</dep>
|
| 97 |
+
<dep name="mark">280</dep>
|
| 98 |
+
<dep name="nmod">2024</dep>
|
| 99 |
+
<dep name="nsubj">664</dep>
|
| 100 |
+
<dep name="nsubj:pass">1</dep>
|
| 101 |
+
<dep name="nummod">239</dep>
|
| 102 |
+
<dep name="obj">537</dep>
|
| 103 |
+
<dep name="obl">888</dep>
|
| 104 |
+
<dep name="obl:arg">89</dep>
|
| 105 |
+
<dep name="parataxis">5</dep>
|
| 106 |
+
<dep name="punct">1000</dep>
|
| 107 |
+
<dep name="root">600</dep>
|
| 108 |
+
<dep name="xcomp">32</dep>
|
| 109 |
+
</deps>
|
| 110 |
+
</treebank>
|
UD_Tamil-TTB-master/ta_ttb-ud-dev.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
UD_Tamil-TTB-master/ta_ttb-ud-test.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
UD_Tamil-TTB-master/ta_ttb-ud-train.conllu
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
try:
    # Consolidated: the original imported Flask and session twice.
    from flask import (Flask, request, render_template, redirect, url_for,
                       session, send_file)
    from flask_wtf import FlaskForm, RecaptchaField
    from wtforms import (StringField, SubmitField, RadioField, DateTimeField,
                         SelectField, TextAreaField)
    from wtforms.validators import DataRequired
    from flaskext.markdown import Markdown
    from arc_eager import Process
    import os
except ImportError as e:
    # Narrowed from `except Exception`: only missing modules are expected
    # here; anything else should propagate.  Execution still continues (as
    # before), so a later NameError will follow if Flask itself is missing.
    print(e)
    print("Some Modules are Missing")
| 19 |
+
|
| 20 |
+
app = Flask(__name__)
# Enable the `markdown` Jinja filter in templates.
Markdown(app)
# NOTE(review): hard-coded secret key is fine for a demo but should come
# from the environment in any real deployment.
app.config["SECRET_KEY"] = 'mysecretkey'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Widgets(FlaskForm):
    """Single-field form for the sentence the user wants parsed."""

    # Free-text sentence input; no validators, so empty submissions validate.
    Statement = StringField(label="STATEMENT")

    submit = SubmitField(label="Submit")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def foo(value):
    """Placeholder hook; currently unused (see the commented call in thanks())."""
    print("Work to be done")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@app.after_request
|
| 37 |
+
def add_header(r):
    """
    Add headers that force browsers and proxies not to cache responses,
    so a re-submitted sentence always shows the freshly generated image.

    Fix: the original set ``Cache-Control`` twice; the later assignment
    ('public, max-age=0') silently overwrote the stricter no-store policy
    the docstring promised.  Only the strict policy is kept.
    """
    r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
    r.headers["Pragma"] = "no-cache"
    r.headers["Expires"] = "0"
    return r
|
| 47 |
+
|
| 48 |
+
@app.route("/", methods=["GET", "POST"])
def home():
    """Render the input form; on a valid POST, stash the sentence and redirect."""
    form = Widgets()
    if request.method == 'POST' and form.validate_on_submit():
        statement = form.Statement.data
        print(statement)
        # Hand the sentence to /thanks via the session.
        session['data'] = statement
        return redirect('/thanks')
    return render_template('home.html', form=form)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@app.route("/thanks", methods=["GET", "POST"])
def thanks():
    """Parse the sentence stored in the session and show the result page.

    Fixes:
    - Guard against a missing session value (opening /thanks directly used
      to raise KeyError and return a 500); redirect home instead.
    - Removed the dead ``ex`` example structure that was never used.
    """
    val = session.get('data')
    if val is None:
        return redirect('/')
    txt, err = Process(val)
    lines = txt.split('\n')
    # `show` hides the image when graph generation failed.
    return render_template('thanks.html', user_image='/static/process.png',
                           text=lines, show=not err)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
    # debug=True enables the reloader and interactive debugger;
    # do not leave this enabled in production.
    app.run(debug=True)
|
arc_eager.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from conllu import parse_incr, parse
|
| 2 |
+
from nltk.parse import DependencyGraph, DependencyEvaluator
|
| 3 |
+
from nltk.parse.transitionparser import TransitionParser
|
| 4 |
+
import pickle
|
| 5 |
+
import pygraphviz as pgv
|
| 6 |
+
from test_hn_pos import test_fn
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
def Process(sentence):
    """Run the full parsing pipeline on one raw sentence.

    The sentence is tokenised and POS-tagged, written out as a CoNLL-X
    skeleton, parsed by the pretrained biaffine parser (invoked as a
    subprocess), and the resulting tree is rendered to static/process.png.

    Returns:
        (txt, err): the parsed CoNLL-X text for the sentence, and a flag
        that is True when the graph image could not be generated.
    """
    # Normalise the ASCII pipe to the Devanagari danda sentence terminator.
    words = sentence.replace('|', '।').split()
    tags = test_fn(words)
    fill = '_'
    rows = []
    # zip() deliberately truncates to the shorter of words/tags, matching
    # the original behavior; `tag` itself is not written into the skeleton.
    for idx, (word, tag) in enumerate(zip(words, tags), start=1):
        # 10 CoNLL-X columns: ID, FORM, then underscore placeholders.
        rows.append('\t'.join([str(idx), word] + [fill] * 8))
    text = '\n'.join(rows)
    # The parser's loader expects more than one sentence, so duplicate it.
    text = text + '\n\n' + text
    with open('biaffine-parser-master/data/naive3.conllx', 'w') as f:
        f.write(text)
    # Run the parser in its own working directory; restore the cwd even if
    # the call raises (the original left the process stranded on failure).
    os.chdir('biaffine-parser-master')
    try:
        os.system('python3.7 run.py predict --feat=bert --fdata=data/naive3.conllx --fpred=data/naive3.conllx')
    finally:
        os.chdir('..')
    with open('biaffine-parser-master/data/naive3.conllx', 'r') as f:
        txt = f.read().split('\n\n')[0]

    err = False
    try:
        out = DependencyGraph(txt)
        G = pgv.AGraph(out.to_dot())
        G.layout(prog='dot')  # use dot
        G.draw('static/process.png')
    except Exception:
        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit
        # still propagate.  Rendering can fail on malformed parser output.
        err = True
        txt += '''Error generating graph.\n'''
    return txt, err
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
## creates dependency graph list according to nltk library specification
|
| 51 |
+
def DepGraphList(sentenceList):
    """Convert conllu-parsed sentences into NLTK DependencyGraph objects.

    Sentences that NLTK cannot turn into a graph are skipped.  Prints the
    number of parsed and skipped sentences before returning.

    Fix: the bare ``except:`` is narrowed to ``except Exception`` so that
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    dgList = []
    parsed, skipped = 0, 0
    for sentence in sentenceList:
        # 4-column format understood by DependencyGraph:
        # FORM UPOS HEAD DEPREL (deprel upper-cased per NLTK convention).
        lines = [' '.join([token['form'], token['upostag'],
                           str(token['head']), token['deprel'].upper()])
                 for token in sentence]
        try:
            dg = DependencyGraph('\n'.join(lines))
        except Exception:
            skipped += 1
            continue
        parsed += 1
        dgList.append(dg)
    print(parsed, skipped)
    return dgList
|
| 68 |
+
|
| 69 |
+
def main():
    """Entry point: run the pipeline on a sample Hindi sentence.

    The commented lines below preserve the earlier NLTK transition-parser
    workflow (data loading, training, pickling and evaluation) for
    reference; the live path is the single Process() call.
    """
    # data_file = open('data/test.conllu', 'r', encoding='utf-8')
    # sentences = list(parse_incr(data_file))
    # training_set = DepGraphList(sentences[len(sentences)//4:])
    # test_set = DepGraphList(sentences[0:len(sentences)//4])
    # parser = TransitionParser('arc-eager')
    # parser.train(training_set, 'models/arc_eager.model')
    # with open('models/parser2.pkl', 'wb') as out:
    #     pickle.dump(parser, out)
    # with open('models/parser2.pkl', 'rb') as in_file:
    #     parser = pickle.load(in_file)
    # predictions = parser.parse(test_set, 'models/arc_eager.model')
    # print(DependencyEvaluator(predictions, test_set).eval())
    Process('राम अच्छा पुरुष है |')
    return
|
| 91 |
+
|
| 92 |
+
if __name__=='__main__':
    # Script entry point: runs the demo parse in main().
    main()
|
best_mapping.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52585379a81d5f8275ba347b5ddee6d69c196bc24fb0012713a63ec173b6312b
|
| 3 |
+
size 3523565
|
biaffine-parser-master/.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ignore data files
|
| 2 |
+
data
|
| 3 |
+
|
| 4 |
+
# ignore bash scripts
|
| 5 |
+
*.sh
|
| 6 |
+
|
| 7 |
+
# ignore experimental results
|
| 8 |
+
exp
|
| 9 |
+
results
|
| 10 |
+
|
| 11 |
+
# ignore log files
|
| 12 |
+
log*
|
| 13 |
+
|
| 14 |
+
# ignore pycache
|
| 15 |
+
__pycache__
|
| 16 |
+
|
| 17 |
+
# ignore saved model
|
| 18 |
+
*.pkl
|
| 19 |
+
*.pt
|
| 20 |
+
|
| 21 |
+
# ignore vscode
|
| 22 |
+
.vscode
|
biaffine-parser-master/.travis.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
language: python
|
| 2 |
+
|
| 3 |
+
# Setup anaconda
|
| 4 |
+
before_install:
|
| 5 |
+
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
|
| 6 |
+
- chmod +x miniconda.sh
|
| 7 |
+
- ./miniconda.sh -b -p $HOME/miniconda
|
| 8 |
+
- export PATH=$HOME/miniconda/bin:$PATH
|
| 9 |
+
- conda update --yes --quiet conda
|
| 10 |
+
- conda config --set always_yes yes
|
| 11 |
+
- conda create --quiet -n py37 python=3.7
|
| 12 |
+
- source activate py37
|
| 13 |
+
|
| 14 |
+
# Install packages
|
| 15 |
+
install:
|
| 16 |
+
- conda install --quiet pytorch=1.3.0 -c pytorch
|
| 17 |
+
- conda install --quiet flake8
|
| 18 |
+
- pip install -r requirements.txt
|
| 19 |
+
|
| 20 |
+
script:
|
| 21 |
+
- flake8 .
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
|
| 25 |
+
chmod +x miniconda.sh
|
| 26 |
+
./miniconda.sh -b -p $HOME/miniconda
|
| 27 |
+
export PATH=$HOME/miniconda/bin:$PATH
|
| 28 |
+
conda update --yes --quiet conda
|
| 29 |
+
conda config --set always_yes yes
|
| 30 |
+
conda create --quiet -n py37 python=3.7
|
| 31 |
+
source activate py37
|
| 32 |
+
|
| 33 |
+
chmod +x miniconda.sh \
|
| 34 |
+
./miniconda.sh -b -p $HOME/miniconda \
|
| 35 |
+
export PATH=$HOME/miniconda/bin:$PATH \
|
| 36 |
+
source activate py37
|
biaffine-parser-master/README.md
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Biaffine Parser
|
| 2 |
+
|
| 3 |
+
[](https://travis-ci.org/zysite/biaffine-parser)
|
| 4 |
+
[](https://github.com/zysite/biaffine-parser/blob/master/LICENSE)
|
| 5 |
+
[](https://github.com/zysite/biaffine-parser/stargazers)
|
| 6 |
+
[](https://github.com/zysite/biaffine-parser/network/members)
|
| 7 |
+
|
| 8 |
+
An implementation of "Deep Biaffine Attention for Neural Dependency Parsing".
|
| 9 |
+
|
| 10 |
+
Details and [hyperparameter choices](#Hyperparameters) are almost identical to those described in the paper,
|
| 11 |
+
except that we provide the Eisner rather than MST algorithm to ensure well-formedness.
|
| 12 |
+
Practically, projective decoding like Eisner is the best choice since PTB contains mostly (99.9%) projective trees.
|
| 13 |
+
|
| 14 |
+
Besides the basic implementations, we also provide other features to replace the POS tags (TAG),
|
| 15 |
+
i.e., character-level embeddings (CHAR) and BERT.
|
| 16 |
+
|
| 17 |
+
## Requirements
|
| 18 |
+
|
| 19 |
+
* `python`: 3.7.0
|
| 20 |
+
* [`pytorch`](https://github.com/pytorch/pytorch): 1.3.0
|
| 21 |
+
* [`transformers`](https://github.com/huggingface/transformers): 2.1.1
|
| 22 |
+
|
| 23 |
+
## Datasets
|
| 24 |
+
|
| 25 |
+
The model is evaluated on the Stanford Dependency conversion ([v3.3.0](https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip)) of the English Penn Treebank with POS tags predicted by [Stanford POS tagger](https://nlp.stanford.edu/software/stanford-postagger-full-2018-10-16.zip).
|
| 26 |
+
|
| 27 |
+
For all datasets, we follow the conventional data splits:
|
| 28 |
+
|
| 29 |
+
* Train: 02-21 (39,832 sentences)
|
| 30 |
+
* Dev: 22 (1,700 sentences)
|
| 31 |
+
* Test: 23 (2,416 sentences)
|
| 32 |
+
|
| 33 |
+
## Performance
|
| 34 |
+
|
| 35 |
+
| FEAT | UAS | LAS | Speed (Sents/s) |
|
| 36 |
+
| ------------- | :---: | :---: | :-------------: |
|
| 37 |
+
| TAG | 95.90 | 94.25 | 1696.22 |
|
| 38 |
+
| TAG + Eisner | 95.93 | 94.28 | 350.46 |
|
| 39 |
+
| CHAR | 95.99 | 94.38 | 1464.59 |
|
| 40 |
+
| CHAR + Eisner | 96.02 | 94.41 | 323.73 |
|
| 41 |
+
| BERT | 96.64 | 95.11 | 438.72 |
|
| 42 |
+
| BERT + Eisner | 96.65 | 95.12 | 214.68 |
|
| 43 |
+
|
| 44 |
+
Note that punctuation is ignored in all evaluation metrics for PTB.
|
| 45 |
+
|
| 46 |
+
Aside from using consistent hyperparameters, there are some keypoints that significantly affect the performance:
|
| 47 |
+
|
| 48 |
+
- Dividing the pretrained embedding by its standard-deviation
|
| 49 |
+
- Applying the same dropout mask at every recurrent timestep
|
| 50 |
+
- Jointly dropping the word and additional feature representations
|
| 51 |
+
|
| 52 |
+
For the above reasons, we may have to give up some native modules in pytorch (e.g., `LSTM` and `Dropout`),
|
| 53 |
+
and use custom ones instead.
|
| 54 |
+
|
| 55 |
+
As shown above, our results have outperformed the [official implementation](https://github.com/tdozat/Parser-v1) (95.74 and 94.08).
|
| 56 |
+
Incorporating character-level features or external embeddings like BERT can further improve the performance of the model.
|
| 57 |
+
|
| 58 |
+
## Usage
|
| 59 |
+
|
| 60 |
+
You can start the training, evaluation and prediction process by using subcommands registered in `parser.cmds`.
|
| 61 |
+
|
| 62 |
+
```sh
|
| 63 |
+
$ python run.py -h
|
| 64 |
+
usage: run.py [-h] {evaluate,predict,train} ...
|
| 65 |
+
|
| 66 |
+
Create the Biaffine Parser model.
|
| 67 |
+
|
| 68 |
+
optional arguments:
|
| 69 |
+
-h, --help show this help message and exit
|
| 70 |
+
|
| 71 |
+
Commands:
|
| 72 |
+
{evaluate,predict,train}
|
| 73 |
+
evaluate Evaluate the specified model and dataset.
|
| 74 |
+
predict Use a trained model to make predictions.
|
| 75 |
+
train Train a model.
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Before triggering the subcommands, please make sure that the data files are in CoNLL-X format.
|
| 79 |
+
If some fields are missing, you can use underscores as placeholders.
|
| 80 |
+
Below are some examples:
|
| 81 |
+
|
| 82 |
+
```sh
|
| 83 |
+
$ python run.py train -p -d=0 -f=exp/ptb.char --feat=char \
|
| 84 |
+
--ftrain=data/ptb/train.conllx \
|
| 85 |
+
--fdev=data/ptb/dev.conllx \
|
| 86 |
+
--ftest=data/ptb/test.conllx \
|
| 87 |
+
--fembed=data/glove.6B.100d.txt \
|
| 88 |
+
--unk=unk
|
| 89 |
+
|
| 90 |
+
$ python run.py evaluate -d=0 -f=exp/ptb.char --feat=char --tree \
|
| 91 |
+
--fdata=data/ptb/test.conllx
|
| 92 |
+
|
| 93 |
+
$ cat data/naive.conllx
|
| 94 |
+
1 Too _ _ _ _ _ _ _ _
|
| 95 |
+
2 young _ _ _ _ _ _ _ _
|
| 96 |
+
3 too _ _ _ _ _ _ _ _
|
| 97 |
+
4 simple _ _ _ _ _ _ _ _
|
| 98 |
+
5 , _ _ _ _ _ _ _ _
|
| 99 |
+
6 sometimes _ _ _ _ _ _ _ _
|
| 100 |
+
7 naive _ _ _ _ _ _ _ _
|
| 101 |
+
8 . _ _ _ _ _ _ _ _
|
| 102 |
+
|
| 103 |
+
$ python run.py predict -d=0 -f=exp/ptb.char --feat=char --tree \
|
| 104 |
+
--fdata=data/naive.conllx \
|
| 105 |
+
--fpred=naive.conllx
|
| 106 |
+
|
| 107 |
+
$ cat naive.conllx
|
| 108 |
+
1 Too _ _ _ _ 2 advmod _ _
|
| 109 |
+
2 young _ _ _ _ 0 root _ _
|
| 110 |
+
3 too _ _ _ _ 4 advmod _ _
|
| 111 |
+
4 simple _ _ _ _ 2 dep _ _
|
| 112 |
+
5 , _ _ _ _ 2 punct _ _
|
| 113 |
+
6 sometimes _ _ _ _ 7 advmod _ _
|
| 114 |
+
7 naive _ _ _ _ 2 dep _ _
|
| 115 |
+
8 . _ _ _ _ 2 punct _ _
|
| 116 |
+
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
All the optional arguments of the subcommands are as follows:
|
| 120 |
+
|
| 121 |
+
```sh
|
| 122 |
+
$ python run.py train -h
|
| 123 |
+
usage: run.py train [-h] [--buckets BUCKETS] [--punct] [--ftrain FTRAIN]
|
| 124 |
+
[--fdev FDEV] [--ftest FTEST] [--fembed FEMBED]
|
| 125 |
+
[--unk UNK] [--conf CONF] [--file FILE] [--preprocess]
|
| 126 |
+
[--device DEVICE] [--seed SEED] [--threads THREADS]
|
| 127 |
+
[--tree] [--feat {tag,char,bert}]
|
| 128 |
+
|
| 129 |
+
optional arguments:
|
| 130 |
+
-h, --help show this help message and exit
|
| 131 |
+
--buckets BUCKETS max num of buckets to use
|
| 132 |
+
--punct whether to include punctuation
|
| 133 |
+
--ftrain FTRAIN path to train file
|
| 134 |
+
--fdev FDEV path to dev file
|
| 135 |
+
--ftest FTEST path to test file
|
| 136 |
+
--fembed FEMBED path to pretrained embeddings
|
| 137 |
+
--unk UNK unk token in pretrained embeddings
|
| 138 |
+
--conf CONF, -c CONF path to config file
|
| 139 |
+
--file FILE, -f FILE path to saved files
|
| 140 |
+
--preprocess, -p whether to preprocess the data first
|
| 141 |
+
--device DEVICE, -d DEVICE
|
| 142 |
+
ID of GPU to use
|
| 143 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 144 |
+
--threads THREADS, -t THREADS
|
| 145 |
+
max num of threads
|
| 146 |
+
--tree whether to ensure well-formedness
|
| 147 |
+
--feat {tag,char,bert}
|
| 148 |
+
choices of additional features
|
| 149 |
+
|
| 150 |
+
$ python run.py evaluate -h
|
| 151 |
+
usage: run.py evaluate [-h] [--batch-size BATCH_SIZE] [--buckets BUCKETS]
|
| 152 |
+
[--punct] [--fdata FDATA] [--conf CONF] [--file FILE]
|
| 153 |
+
[--preprocess] [--device DEVICE] [--seed SEED]
|
| 154 |
+
[--threads THREADS] [--tree] [--feat {tag,char,bert}]
|
| 155 |
+
|
| 156 |
+
optional arguments:
|
| 157 |
+
-h, --help show this help message and exit
|
| 158 |
+
--batch-size BATCH_SIZE
|
| 159 |
+
batch size
|
| 160 |
+
--buckets BUCKETS max num of buckets to use
|
| 161 |
+
--punct whether to include punctuation
|
| 162 |
+
--fdata FDATA path to dataset
|
| 163 |
+
--conf CONF, -c CONF path to config file
|
| 164 |
+
--file FILE, -f FILE path to saved files
|
| 165 |
+
--preprocess, -p whether to preprocess the data first
|
| 166 |
+
--device DEVICE, -d DEVICE
|
| 167 |
+
ID of GPU to use
|
| 168 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 169 |
+
--threads THREADS, -t THREADS
|
| 170 |
+
max num of threads
|
| 171 |
+
--tree whether to ensure well-formedness
|
| 172 |
+
--feat {tag,char,bert}
|
| 173 |
+
choices of additional features
|
| 174 |
+
|
| 175 |
+
$ python run.py predict -h
|
| 176 |
+
usage: run.py predict [-h] [--batch-size BATCH_SIZE] [--fdata FDATA]
|
| 177 |
+
[--fpred FPRED] [--conf CONF] [--file FILE]
|
| 178 |
+
[--preprocess] [--device DEVICE] [--seed SEED]
|
| 179 |
+
[--threads THREADS] [--tree] [--feat {tag,char,bert}]
|
| 180 |
+
|
| 181 |
+
optional arguments:
|
| 182 |
+
-h, --help show this help message and exit
|
| 183 |
+
--batch-size BATCH_SIZE
|
| 184 |
+
batch size
|
| 185 |
+
--fdata FDATA path to dataset
|
| 186 |
+
--fpred FPRED path to predicted result
|
| 187 |
+
--conf CONF, -c CONF path to config file
|
| 188 |
+
--file FILE, -f FILE path to saved files
|
| 189 |
+
--preprocess, -p whether to preprocess the data first
|
| 190 |
+
--device DEVICE, -d DEVICE
|
| 191 |
+
ID of GPU to use
|
| 192 |
+
--seed SEED, -s SEED seed for generating random numbers
|
| 193 |
+
--threads THREADS, -t THREADS
|
| 194 |
+
max num of threads
|
| 195 |
+
--tree whether to ensure well-formedness
|
| 196 |
+
--feat {tag,char,bert}
|
| 197 |
+
choices of additional features
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
## Hyperparameters
|
| 201 |
+
|
| 202 |
+
| Param | Description | Value |
|
| 203 |
+
| :------------ | :----------------------------------------------------------- | :--------------------------------------------------------------------: |
|
| 204 |
+
| n_embed | dimension of embeddings | 100 |
|
| 205 |
+
| n_char_embed | dimension of char embeddings | 50 |
|
| 206 |
+
| n_bert_layers | number of bert layers to use | 4 |
|
| 207 |
+
| embed_dropout | dropout ratio of embeddings | 0.33 |
|
| 208 |
+
| n_lstm_hidden | dimension of lstm hidden states | 400 |
|
| 209 |
+
| n_lstm_layers | number of lstm layers | 3 |
|
| 210 |
+
| lstm_dropout | dropout ratio of lstm | 0.33 |
|
| 211 |
+
| n_mlp_arc | arc mlp size | 500 |
|
| 212 |
+
| n_mlp_rel | label mlp size | 100 |
|
| 213 |
+
| mlp_dropout | dropout ratio of mlp | 0.33 |
|
| 214 |
+
| lr | starting learning rate of training | 2e-3 |
|
| 215 |
+
| betas | hyperparameters of momentum and L2 norm | (0.9, 0.9) |
|
| 216 |
+
| epsilon | stability constant | 1e-12 |
|
| 217 |
+
| annealing | formula of learning rate annealing | <img src="https://latex.codecogs.com/gif.latex?.75^{\frac{t}{5000}}"/> |
|
| 218 |
+
| batch_size | approximate number of tokens per training update | 5000 |
|
| 219 |
+
| epochs | max number of epochs | 50000 |
|
| 220 |
+
| patience | patience for early stop | 100 |
|
| 221 |
+
| min_freq | minimum frequency of words in the training set not discarded | 2 |
|
| 222 |
+
| fix_len | fixed length of a word | 20 |
|
| 223 |
+
|
| 224 |
+
## References
|
| 225 |
+
|
| 226 |
+
* [Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734)
|
biaffine-parser-master/config.ini
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[Data]
|
| 2 |
+
bert_model = 'bert-base-multilingual-uncased'
|
| 3 |
+
|
| 4 |
+
[Network]
|
| 5 |
+
n_embed = 768
|
| 6 |
+
n_char_embed = 50
|
| 7 |
+
n_bert_layers = 4
|
| 8 |
+
embed_dropout = .33
|
| 9 |
+
n_lstm_hidden = 400
|
| 10 |
+
n_lstm_layers = 3
|
| 11 |
+
lstm_dropout = .33
|
| 12 |
+
n_mlp_arc = 500
|
| 13 |
+
n_mlp_rel = 100
|
| 14 |
+
mlp_dropout = .33
|
| 15 |
+
|
| 16 |
+
[Optimizer]
|
| 17 |
+
lr = 2e-3
|
| 18 |
+
mu = .9
|
| 19 |
+
nu = .9
|
| 20 |
+
epsilon = 1e-12
|
| 21 |
+
clip = 5.0
|
| 22 |
+
decay = .75
|
| 23 |
+
decay_steps = 5000
|
| 24 |
+
|
| 25 |
+
[Run]
|
| 26 |
+
batch_size = 1000
|
| 27 |
+
epochs = 300
|
| 28 |
+
patience = 30
|
| 29 |
+
min_freq = 2
|
| 30 |
+
fix_len = 20
|
biaffine-parser-master/data/naive3.conllx
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1 அதற்கு _ _ _ _ 2 obl _ _
|
| 2 |
+
2 கடந்த _ _ _ _ 4 amod _ _
|
| 3 |
+
3 சட்டப்பேரவைத் _ _ _ _ 4 nmod _ _
|
| 4 |
+
4 தேர்தலில் _ _ _ _ 5 obl _ _
|
| 5 |
+
5 பலன்கிட்டியது _ _ _ _ 0 root _ _
|
| 6 |
+
6 . _ _ _ _ 5 punct _ _
|
| 7 |
+
|
biaffine-parser-master/data/ptb/tamdev.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/data/ptb/tamtest.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/data/ptb/tamtrain.conllx
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
biaffine-parser-master/exp/ptb/fields
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:333059b6bc4af3b1eca7d03cdeeef162912b6e2e5d96a0d7373d05c0beab614d
|
| 3 |
+
size 3189679
|
biaffine-parser-master/exp/ptb/model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:626032b06dd8f45c5755707990069dc0451e00a0c2e7992bfbd62d05a078dc0e
|
| 3 |
+
size 736388858
|
biaffine-parser-master/parser/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from .model import Model
|
| 4 |
+
|
| 5 |
+
__all__ = ['Model']
|
biaffine-parser-master/parser/cmds/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from .evaluate import Evaluate
|
| 4 |
+
from .predict import Predict
|
| 5 |
+
from .train import Train
|
| 6 |
+
|
| 7 |
+
__all__ = ['Evaluate', 'Predict', 'Train']
|
biaffine-parser-master/parser/cmds/cmd.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from parser.utils import Embedding
|
| 5 |
+
from parser.utils.alg import eisner
|
| 6 |
+
from parser.utils.common import bos, pad, unk
|
| 7 |
+
from parser.utils.corpus import CoNLL, Corpus
|
| 8 |
+
from parser.utils.field import BertField, CharField, Field
|
| 9 |
+
from parser.utils.fn import ispunct
|
| 10 |
+
from parser.utils.metric import Metric
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
from transformers import AutoTokenizer, BertTokenizer
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class CMD(object):
    """Shared base for the train/evaluate/predict subcommands.

    Calling an instance with parsed CLI ``args`` builds (or reloads) the
    vocabulary fields and the loss criterion; subclasses then reuse the
    train/evaluate/predict helpers defined here.
    """

    def __call__(self, args):
        """Prepare fields, vocabularies and the loss criterion from ``args``."""
        self.args = args
        if not os.path.exists(args.file):
            os.mkdir(args.file)
        if not os.path.exists(args.fields) or args.preprocess:
            # Build the fields from scratch and cache them at args.fields.
            print("Preprocess the data")
            self.WORD = Field('words', pad=pad, unk=unk, bos=bos, lower=True)
            if args.feat == 'char':
                self.FEAT = CharField('chars', pad=pad, unk=unk, bos=bos,
                                      fix_len=args.fix_len, tokenize=list)
            elif args.feat == 'bert':
                tokenizer = BertTokenizer.from_pretrained(args.bert_model)
                # tokenizer = AutoTokenizer.from_pretrained("sailen7/finetuning-sentiment-model-3000-samples")
                self.FEAT = BertField('bert', pad='[PAD]', bos='[CLS]',
                                      tokenize=tokenizer.encode)
            else:
                self.FEAT = Field('tags', bos=bos)
            self.HEAD = Field('heads', bos=bos, use_vocab=False, fn=int)
            self.REL = Field('rels', bos=bos)
            if args.feat in ('char', 'bert'):
                # char/bert features share the FORM column with words.
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.HEAD, DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD, CPOS=self.FEAT,
                                    HEAD=self.HEAD, DEPREL=self.REL)

            train = Corpus.load(args.ftrain, self.fields)
            # Pretrained-embedding loading is disabled; kept for reference:
            # if args.fembed:
            #     embed = Embedding.load(args.fembed, args.unk)
            # else:
            embed = None
            self.WORD.build(train, args.min_freq, embed)
            self.FEAT.build(train)
            self.REL.build(train)
            torch.save(self.fields, args.fields)
        else:
            # Reload the fields cached by a previous preprocessing run.
            self.fields = torch.load(args.fields)
            if args.feat in ('char', 'bert'):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.HEAD, self.REL = self.fields.HEAD, self.fields.DEPREL
        # Vocabulary ids of punctuation tokens, used to mask evaluation.
        self.puncts = torch.tensor([i for s, i in self.WORD.vocab.stoi.items()
                                    if ispunct(s)]).to(args.device)
        self.criterion = nn.CrossEntropyLoss()

        print(f"{self.WORD}\n{self.FEAT}\n{self.HEAD}\n{self.REL}")
        # Expose vocab sizes and special indices to the model config.
        args.update({
            'n_words': self.WORD.vocab.n_init,
            'n_feats': len(self.FEAT.vocab),
            'n_rels': len(self.REL.vocab),
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index
        })

    def train(self, loader):
        """Run one training epoch over ``loader`` (batches of padded tensors)."""
        self.model.train()

        for words, feats, arcs, rels in loader:
            self.optimizer.zero_grad()

            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence (the BOS/root slot)
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss = self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.args.clip)
            self.optimizer.step()
            self.scheduler.step()

    @torch.no_grad()
    def evaluate(self, loader):
        """Return (mean loss, attachment Metric) over ``loader``."""
        self.model.eval()

        loss, metric = 0, Metric()

        for words, feats, arcs, rels in loader:
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            arc_scores, rel_scores = self.model(words, feats)
            loss += self.get_loss(arc_scores, rel_scores, arcs, rels, mask)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # ignore all punctuation if not specified
            if not self.args.punct:
                mask &= words.unsqueeze(-1).ne(self.puncts).all(-1)
            metric(arc_preds, rel_preds, arcs, rels, mask)
        loss /= len(loader)

        return loss, metric

    @torch.no_grad()
    def predict(self, loader):
        """Return (all_arcs, all_rels) predictions, one list per sentence."""
        self.model.eval()

        all_arcs, all_rels = [], []
        for words, feats in loader:
            print("words ->", words, " ", "features -> ", feats)
            mask = words.ne(self.args.pad_index)
            # ignore the first token of each sentence
            mask[:, 0] = 0
            lens = mask.sum(1).tolist()
            arc_scores, rel_scores = self.model(words, feats)
            arc_preds, rel_preds = self.decode(arc_scores, rel_scores, mask)
            # Split the flat masked predictions back into per-sentence lists.
            all_arcs.extend(arc_preds[mask].split(lens))
            all_rels.extend(rel_preds[mask].split(lens))
        all_arcs = [seq.tolist() for seq in all_arcs]
        all_rels = [self.REL.vocab.id2token(seq.tolist()) for seq in all_rels]

        return all_arcs, all_rels

    def get_loss(self, arc_scores, rel_scores, arcs, rels, mask):
        """Cross-entropy of arc heads plus labels, restricted to ``mask``."""
        arc_scores, arcs = arc_scores[mask], arcs[mask]
        rel_scores, rels = rel_scores[mask], rels[mask]
        # Select the label scores at each token's gold head position.
        rel_scores = rel_scores[torch.arange(len(arcs)), arcs]
        arc_loss = self.criterion(arc_scores, arcs)
        rel_loss = self.criterion(rel_scores, rels)
        loss = arc_loss + rel_loss

        return loss

    def decode(self, arc_scores, rel_scores, mask):
        """Greedy (or Eisner, if --tree) head decoding plus label argmax."""
        if self.args.tree:
            # Eisner guarantees a well-formed projective tree.
            arc_preds = eisner(arc_scores, mask)
        else:
            arc_preds = arc_scores.argmax(-1)
        rel_preds = rel_scores.argmax(-1)
        # Pick the label scored against each token's predicted head.
        rel_preds = rel_preds.gather(-1, arc_preds.unsqueeze(-1)).squeeze(-1)

        return arc_preds, rel_preds
|
biaffine-parser-master/parser/cmds/evaluate.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Evaluate(CMD):
    """Subcommand that scores a trained parser on a held-out dataset."""

    def add_subparser(self, name, parser):
        """Register the `evaluate` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Evaluate the specified model and dataset.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')

        return subparser

    def __call__(self, args):
        """Load data and model, run evaluation, and report loss/metric."""
        super(Evaluate, self).__call__(args)

        print("Load the dataset")
        eval_corpus = Corpus.load(args.fdata, self.fields)
        eval_set = TextDataset(eval_corpus, self.fields, args.buckets)
        # attach a batched loader to the dataset
        eval_set.loader = batchify(eval_set, args.batch_size)
        print(f"{len(eval_set)} sentences, "
              f"{len(eval_set.loader)} batches, "
              f"{len(eval_set.buckets)} buckets")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Evaluate the dataset")
        tic = datetime.now()
        loss, metric = self.evaluate(eval_set.loader)
        elapsed = datetime.now() - tic
        print(f"Loss: {loss:.4f} {metric}")
        print(f"{elapsed}s elapsed, "
              f"{len(eval_set) / elapsed.total_seconds():.2f} Sents/s")
|
biaffine-parser-master/parser/cmds/predict.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Predict(CMD):
    """Subcommand that runs a trained parser over a dataset and writes
    the predicted heads and relation labels to a CoNLL-X file."""

    def add_subparser(self, name, parser):
        """Register the `predict` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Use a trained model to make predictions.'
        )
        subparser.add_argument('--batch-size', default=1000, type=int,
                               help='batch size')
        subparser.add_argument('--fdata', default='data/ptb/tamtest.conllx',
                               help='path to dataset')
        subparser.add_argument('--fpred', default='pred.conllx',
                               help='path to predicted result')

        return subparser

    def __call__(self, args):
        """Load data and model, predict, and save results to args.fpred."""
        super(Predict, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        # only word and feature fields are needed for prediction
        dataset = TextDataset(corpus, [self.WORD, self.FEAT])
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        # fix: removed leftover debug print(type(dataset))
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Make predictions on the dataset")
        start = datetime.now()
        corpus.heads, corpus.rels = self.predict(dataset.loader)
        print(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        total_time = datetime.now() - start
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
|
biaffine-parser-master/parser/cmds/train.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
from parser import Model
|
| 5 |
+
from parser.cmds.cmd import CMD
|
| 6 |
+
from parser.utils.corpus import Corpus
|
| 7 |
+
from parser.utils.data import TextDataset, batchify
|
| 8 |
+
from parser.utils.metric import Metric
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn as nn
|
| 12 |
+
from torch.optim import Adam
|
| 13 |
+
from torch.optim.lr_scheduler import ExponentialLR
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Train(CMD):
    """Subcommand that trains the parser, keeping the checkpoint with the
    best dev metric, and finally reports its score on the test set."""

    def add_subparser(self, name, parser):
        """Register the `train` subcommand and its CLI options."""
        subparser = parser.add_parser(
            name, help='Train a model.'
        )
        subparser.add_argument('--buckets', default=10, type=int,
                               help='max num of buckets to use')
        subparser.add_argument('--punct', action='store_true',
                               help='whether to include punctuation')
        subparser.add_argument('--ftrain', default='data/ptb/tamtrain.conllx',
                               help='path to train file')
        subparser.add_argument('--fdev', default='data/ptb/tamdev.conllx',
                               help='path to dev file')
        subparser.add_argument('--ftest', default='data/ptb/tamtest.conllx',
                               help='path to test file')
        # subparser.add_argument('--fembed', default='data/tam.txt',
        #                        help='path to pretrained embeddings')
        subparser.add_argument('--unk', default='unk',
                               help='unk token in pretrained embeddings')

        return subparser

    def __call__(self, args):
        """Full training loop with early stopping on dev performance."""
        super(Train, self).__call__(args)

        train = Corpus.load(args.ftrain, self.fields)
        dev = Corpus.load(args.fdev, self.fields)
        test = Corpus.load(args.ftest, self.fields)

        train = TextDataset(train, self.fields, args.buckets)
        dev = TextDataset(dev, self.fields, args.buckets)
        test = TextDataset(test, self.fields, args.buckets)
        # set the data loaders; only the train loader shuffles
        train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        test.loader = batchify(test, args.batch_size)
        print(f"{'train:':6} {len(train):5} sentences, "
              f"{len(train.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        # fix: dev/test lines previously reported len(train.buckets)
        print(f"{'dev:':6} {len(dev):5} sentences, "
              f"{len(dev.loader):3} batches, "
              f"{len(dev.buckets)} buckets")
        print(f"{'test:':6} {len(test):5} sentences, "
              f"{len(test.loader):3} batches, "
              f"{len(test.buckets)} buckets")

        print("Create the model")
        self.model = Model(args).load_pretrained(self.WORD.embed)
        print(f"{self.model}\n")
        self.model = self.model.to(args.device)
        if torch.cuda.device_count() > 1:
            print("GPU")
            self.model = nn.DataParallel(self.model)
        self.optimizer = Adam(self.model.parameters(),
                              args.lr,
                              (args.mu, args.nu),
                              args.epsilon)
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1/args.decay_steps))

        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            self.train(train.loader)

            print(f"Epoch {epoch} / {args.epochs}:")
            loss, train_metric = self.evaluate(train.loader)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = self.evaluate(dev.loader)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = self.evaluate(test.loader)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience:
                best_e, best_metric = epoch, dev_metric
                # unwrap nn.DataParallel before saving, if wrapped
                if hasattr(self.model, 'module'):
                    self.model.module.save(args.model)
                else:
                    self.model.save(args.model)
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            # early stopping: no dev improvement for `patience` epochs
            if epoch - best_e >= args.patience:
                break
        self.model = Model.load(args.model)
        loss, metric = self.evaluate(test.loader)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
|
biaffine-parser-master/parser/config.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from ast import literal_eval
|
| 4 |
+
from configparser import ConfigParser
|
| 5 |
+
from argparse import Namespace
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Config(ConfigParser):
    """ConfigParser wrapper that exposes every option as an attribute.

    All option values in all sections are parsed with
    ``ast.literal_eval`` and stored on an internal ``Namespace``, so
    ``cfg.n_embed`` returns a Python value, not a string.
    """

    def __init__(self, path):
        super(Config, self).__init__()

        self.read(path)
        self.namespace = Namespace()
        parsed = {}
        for section in self.sections():
            for option, raw in self.items(section):
                # literal_eval turns "100" -> 100, "'char'" -> 'char', ...
                parsed[option] = literal_eval(raw)
        self.update(parsed)

    def __repr__(self):
        divider = "-" * 15 + "-+-" + "-" * 25 + "\n"
        rows = [divider, f"{'Param':15} | {'Value':^25}\n", divider]
        for option, value in vars(self.namespace).items():
            rows.append(f"{option:15} | {str(value):^25}\n")
        rows.append(divider)

        return "".join(rows)

    def __getattr__(self, attr):
        # fall back to the namespace for unknown attributes
        return getattr(self.namespace, attr)

    def __getstate__(self):
        return vars(self)

    def __setstate__(self, state):
        self.__dict__.update(state)

    def update(self, kwargs):
        """Store every (name, value) pair on the namespace; returns self."""
        for option, value in kwargs.items():
            setattr(self.namespace, option, value)

        return self
|
biaffine-parser-master/parser/model.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules import CHAR_LSTM, MLP, BertEmbedding, Biaffine, BiLSTM
|
| 4 |
+
from parser.modules.dropout import IndependentDropout, SharedDropout
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
from torch.nn.utils.rnn import (pack_padded_sequence, pad_packed_sequence,
|
| 9 |
+
pad_sequence)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Model(nn.Module):
    """Biaffine-attention dependency parser.

    The encoder concatenates word embeddings with feature embeddings
    (char-LSTM, BERT subwords, or a plain embedding table), runs a
    BiLSTM, and two biaffine layers score all head-dependent pairs
    (arcs) and all labels per pair (relations).
    """

    def __init__(self, args):
        super(Model, self).__init__()

        self.args = args
        # the embedding layer
        self.word_embed = nn.Embedding(num_embeddings=args.n_words,
                                       embedding_dim=args.n_embed)
        # feature embeddings: characters, BERT subwords, or tag indices
        if args.feat == 'char':
            self.feat_embed = CHAR_LSTM(n_chars=args.n_feats,
                                        n_embed=args.n_char_embed,
                                        n_out=args.n_embed)
        elif args.feat == 'bert':
            self.feat_embed = BertEmbedding(model=args.bert_model,
                                            n_layers=args.n_bert_layers,
                                            n_out=args.n_embed)
        else:
            self.feat_embed = nn.Embedding(num_embeddings=args.n_feats,
                                           embedding_dim=args.n_embed)
        self.embed_dropout = IndependentDropout(p=args.embed_dropout)

        # the word-lstm layer
        self.lstm = BiLSTM(input_size=args.n_embed*2,
                           hidden_size=args.n_lstm_hidden,
                           num_layers=args.n_lstm_layers,
                           dropout=args.lstm_dropout)
        self.lstm_dropout = SharedDropout(p=args.lstm_dropout)

        # MLPs project LSTM states into separate arc/rel spaces for
        # heads (h) and dependents (d)
        self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_arc,
                             dropout=args.mlp_dropout)
        self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)
        self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden*2,
                             n_hidden=args.n_mlp_rel,
                             dropout=args.mlp_dropout)

        # the Biaffine layers
        self.arc_attn = Biaffine(n_in=args.n_mlp_arc,
                                 bias_x=True,
                                 bias_y=False)
        self.rel_attn = Biaffine(n_in=args.n_mlp_rel,
                                 n_out=args.n_rels,
                                 bias_x=True,
                                 bias_y=True)
        self.pad_index = args.pad_index
        self.unk_index = args.unk_index

    def load_pretrained(self, embed=None):
        """Attach pretrained word embeddings.

        The learned word table is zero-initialized so training starts
        from the pretrained values alone.
        NOTE(review): `forward` concatenates `pretrained(words)` to the
        learned embedding (dim=2), which changes the encoder input width
        — confirm the pretrained dim matches what BiLSTM expects.
        """
        if embed is not None:
            self.pretrained = nn.Embedding.from_pretrained(embed)
            nn.init.zeros_(self.word_embed.weight)

        return self

    def forward(self, words, feats):
        """Score arcs and relations for a batch.

        Args:
            words: [batch_size, seq_len] padded word indices.
            feats: feature input; layout depends on ``args.feat``
                (char index matrix, BERT triple, or tag indices).

        Returns:
            s_arc: [batch_size, seq_len, seq_len] head scores
                (padded head positions set to -inf).
            s_rel: [batch_size, seq_len, seq_len, n_rels] label scores.
        """
        batch_size, seq_len = words.shape
        # get the mask and lengths of given batch
        mask = words.ne(self.pad_index)
        lens = mask.sum(dim=1)
        # set the indices larger than num_embeddings to unk_index
        ext_mask = words.ge(self.word_embed.num_embeddings)
        ext_words = words.masked_fill(ext_mask, self.unk_index)

        # get outputs from embedding layers
        word_embed = self.word_embed(ext_words)
        if hasattr(self, 'pretrained'):
            word_embed = torch.cat((word_embed, self.pretrained(words)), dim=2)
        if self.args.feat == 'char':
            # fix: removed leftover debug print(mask.shape)
            feat_embed = self.feat_embed(feats[mask])
            feat_embed = pad_sequence(feat_embed.split(lens.tolist()), True)
        elif self.args.feat == 'bert':
            feat_embed = self.feat_embed(*feats)
        else:
            feat_embed = self.feat_embed(feats)
        word_embed, feat_embed = self.embed_dropout(word_embed, feat_embed)
        # concatenate the word and feat representations
        embed = torch.cat((word_embed, feat_embed), dim=-1)

        # pack_padded_sequence requires lengths on the CPU
        lens = lens.to('cpu')
        x = pack_padded_sequence(embed, lens, True, False)
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, True, total_length=seq_len)
        x = self.lstm_dropout(x)

        # apply MLPs to the BiLSTM output states
        arc_h = self.mlp_arc_h(x)
        arc_d = self.mlp_arc_d(x)
        rel_h = self.mlp_rel_h(x)
        rel_d = self.mlp_rel_d(x)

        # get arc and rel scores from the bilinear attention
        # [batch_size, seq_len, seq_len]
        s_arc = self.arc_attn(arc_d, arc_h)
        # [batch_size, seq_len, seq_len, n_rels]
        s_rel = self.rel_attn(rel_d, rel_h).permute(0, 2, 3, 1)
        # set the scores that exceed the length of each sentence to -inf
        s_arc.masked_fill_(~mask.unsqueeze(1), float('-inf'))

        return s_arc, s_rel

    @classmethod
    def load(cls, path):
        """Restore a model (and its pretrained embeddings) from `path`."""
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        state = torch.load(path, map_location=device)
        model = cls(state['args'])
        model.load_pretrained(state['pretrained'])
        # strict=False: pretrained embeddings are restored separately above
        model.load_state_dict(state['state_dict'], False)
        model.to(device)

        return model

    def save(self, path):
        """Save args and weights; the (frozen) pretrained embedding table
        is stored separately from state_dict."""
        state_dict, pretrained = self.state_dict(), None
        if hasattr(self, 'pretrained'):
            pretrained = state_dict.pop('pretrained.weight')
        state = {
            'args': self.args,
            'state_dict': state_dict,
            'pretrained': pretrained
        }
        torch.save(state, path)
|
biaffine-parser-master/parser/modules/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from . import dropout
|
| 4 |
+
from .bert import BertEmbedding
|
| 5 |
+
from .biaffine import Biaffine
|
| 6 |
+
from .bilstm import BiLSTM
|
| 7 |
+
from .char_lstm import CHAR_LSTM
|
| 8 |
+
from .mlp import MLP
|
| 9 |
+
|
| 10 |
+
__all__ = ['CHAR_LSTM', 'MLP', 'BertEmbedding',
|
| 11 |
+
'Biaffine', 'BiLSTM', 'dropout']
|
biaffine-parser-master/parser/modules/bert.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from transformers import BertModel
|
| 6 |
+
|
| 7 |
+
from .scalar_mix import ScalarMix
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class BertEmbedding(nn.Module):
    """Word-level embeddings from a (by default frozen) BERT encoder.

    The last `n_layers` hidden states are combined with a ScalarMix,
    subword vectors are mean-pooled per word, and the result is
    projected to `n_out` dimensions.
    """

    def __init__(self, model, n_layers, n_out, requires_grad=False):
        super(BertEmbedding, self).__init__()

        # fix: removed an unused hard-coded `proxies` dict
        # output_hidden_states=True makes the model also return the
        # per-layer activations consumed by ScalarMix below
        self.bert = BertModel.from_pretrained(model, output_hidden_states=True)

        self.bert = self.bert.requires_grad_(requires_grad)
        self.n_layers = n_layers
        self.n_out = n_out
        self.requires_grad = requires_grad
        self.hidden_size = self.bert.config.hidden_size

        self.scalar_mix = ScalarMix(n_layers)
        self.projection = nn.Linear(self.hidden_size, n_out, False)

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += f"n_layers={self.n_layers}, n_out={self.n_out}"
        if self.requires_grad:
            s += f", requires_grad={self.requires_grad}"
        s += ')'

        return s

    def forward(self, subwords, bert_lens, bert_mask):
        """Embed a batch of subword sequences at the word level.

        Args:
            subwords: subword id tensor fed to BERT.
            bert_lens: [batch_size, seq_len] number of subwords per word
                (0 marks padding words).
            bert_mask: attention mask over `subwords`.

        Returns:
            [batch_size, seq_len, n_out] word embeddings.
        """
        batch_size, seq_len = bert_lens.shape
        mask = bert_lens.gt(0)

        if not self.requires_grad:
            self.bert.eval()
        # fix: a single forward pass through BERT — the previous version
        # ran the encoder twice and discarded the first result
        # NOTE(review): assumes the legacy tuple return format
        # (sequence_output, pooled_output, hidden_states); confirm for
        # the installed transformers version
        _, _, bert = self.bert(subwords, attention_mask=bert_mask)
        bert = bert[-self.n_layers:]
        bert = self.scalar_mix(bert)
        # regroup subword vectors by word and mean-pool each group
        bert = bert[bert_mask].split(bert_lens[mask].tolist())
        bert = torch.stack([i.mean(0) for i in bert])
        bert_embed = bert.new_zeros(batch_size, seq_len, self.hidden_size)
        bert_embed = bert_embed.masked_scatter_(mask.unsqueeze(-1), bert)
        bert_embed = self.projection(bert_embed)

        return bert_embed
|
biaffine-parser-master/parser/modules/biaffine.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Biaffine(nn.Module):
    """Biaffine scorer: s(x, y) = x^T W y with optional bias features.

    With ``bias_x``/``bias_y`` a constant 1 is appended to the inputs,
    turning the bilinear form into a full (bi)affine one.
    """

    def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True):
        super(Biaffine, self).__init__()

        self.n_in = n_in
        self.n_out = n_out
        self.bias_x = bias_x
        self.bias_y = bias_y
        # one (n_in+bias_x) x (n_in+bias_y) bilinear form per channel
        self.weight = nn.Parameter(torch.Tensor(n_out,
                                                n_in + bias_x,
                                                n_in + bias_y))
        self.reset_parameters()

    def extra_repr(self):
        parts = [f"n_in={self.n_in}", f"n_out={self.n_out}"]
        if self.bias_x:
            parts.append(f"bias_x={self.bias_x}")
        if self.bias_y:
            parts.append(f"bias_y={self.bias_y}")

        return ", ".join(parts)

    def reset_parameters(self):
        nn.init.zeros_(self.weight)

    def forward(self, x, y):
        if self.bias_x:
            ones = torch.ones_like(x[..., :1])
            x = torch.cat((x, ones), -1)
        if self.bias_y:
            ones = torch.ones_like(y[..., :1])
            y = torch.cat((y, ones), -1)
        # [batch_size, n_out, seq_len, seq_len]
        scores = torch.einsum('bxi,oij,byj->boxy', x, self.weight, y)
        # drop the channel dimension when n_out == 1
        return scores.squeeze(1)
|
biaffine-parser-master/parser/modules/bilstm.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules.dropout import SharedDropout
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from torch.nn.modules.rnn import apply_permutation
|
| 8 |
+
from torch.nn.utils.rnn import PackedSequence
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BiLSTM(nn.Module):
    """Multi-layer bidirectional LSTM over ``PackedSequence`` input.

    Built from explicit ``LSTMCell``s so that a variational (shared)
    dropout mask can be applied to the recurrent hidden states, which
    ``nn.LSTM`` does not support.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0):
        super(BiLSTM, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.f_cells = nn.ModuleList()
        self.b_cells = nn.ModuleList()
        for _ in range(self.num_layers):
            self.f_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            self.b_cells.append(nn.LSTMCell(input_size=input_size,
                                            hidden_size=hidden_size))
            # layers above the first consume both directions' outputs
            input_size = hidden_size * 2

        self.reset_parameters()

    def __repr__(self):
        desc = f"{self.input_size}, {self.hidden_size}"
        if self.num_layers > 1:
            desc += f", num_layers={self.num_layers}"
        if self.dropout > 0:
            desc += f", dropout={self.dropout}"

        return f"{self.__class__.__name__}({desc})"

    def reset_parameters(self):
        for param in self.parameters():
            if len(param.shape) > 1:
                # weight matrices: orthogonal init
                nn.init.orthogonal_(param)
            else:
                # bias vectors: zero init
                nn.init.zeros_(param)

    def permute_hidden(self, hx, permutation):
        """Reorder (h, c) along the batch dim; no-op when unsorted."""
        if permutation is None:
            return hx

        return (apply_permutation(hx[0], permutation),
                apply_permutation(hx[1], permutation))

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        """Run one direction of one layer over a packed batch.

        `x` is the packed data split per time step; `batch_sizes` gives
        the (non-increasing) number of live sequences at each step.
        """
        init = state = hx
        finals, outputs = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            # one recurrent dropout mask per sequence, reused every step
            drop_mask = SharedDropout.get_mask(init[0], self.dropout)

        for t in steps:
            prev_size, cur_size = len(state[0]), batch_sizes[t]
            if prev_size < cur_size:
                # batch grows (reverse pass): append fresh initial states
                state = [torch.cat((h, ih[prev_size:cur_size]))
                         for h, ih in zip(state, init)]
            else:
                # batch shrinks: retire states of finished sequences
                finals.append([h[cur_size:] for h in state])
                state = [h[:cur_size] for h in state]
            state = list(cell(x[t], state))
            outputs.append(state[0])
            if self.training:
                state[0] = state[0] * drop_mask[:cur_size]
        if reverse:
            finals = state
            outputs.reverse()
        else:
            finals.append(state)
            finals = [torch.cat(h) for h in zip(*reversed(finals))]
        outputs = torch.cat(outputs)

        return outputs, finals

    def forward(self, sequence, hx=None):
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

        if hx is None:
            init = x.new_zeros(self.num_layers * 2, batch_size,
                               self.hidden_size)
            h, c = init, init
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        # [num_layers, directions, batch, hidden]
        h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
        c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

        for i in range(self.num_layers):
            x = torch.split(x, batch_sizes)
            if self.training:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [chunk * mask[:len(chunk)] for chunk in x]
            x_f, (h_f, c_f) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                 hx=(h[i, 1], c[i, 1]),
                                                 cell=self.b_cells[i],
                                                 batch_sizes=batch_sizes,
                                                 reverse=True)
            x = torch.cat((x_f, x_b), -1)
            h_n.append(torch.stack((h_f, h_b)))
            c_n.append(torch.stack((c_f, c_b)))
        x = PackedSequence(x,
                           sequence.batch_sizes,
                           sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx
|
biaffine-parser-master/parser/modules/char_lstm.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
from torch.nn.utils.rnn import pack_padded_sequence
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CHAR_LSTM(nn.Module):
    """Word representations from a character-level bidirectional LSTM.

    Each word's characters are embedded and run through a one-layer
    BiLSTM; the final forward and backward states are concatenated to
    give an ``n_out``-dimensional word vector.
    """

    def __init__(self, n_chars, n_embed, n_out):
        super(CHAR_LSTM, self).__init__()

        # character embedding lookup (index 0 is treated as padding)
        self.embed = nn.Embedding(num_embeddings=n_chars,
                                  embedding_dim=n_embed)
        # each direction contributes n_out // 2 dimensions
        self.lstm = nn.LSTM(input_size=n_embed,
                            hidden_size=n_out//2,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        # positions holding real characters (0 marks padding)
        char_mask = x.gt(0)
        word_lens = char_mask.sum(dim=1)

        packed = pack_padded_sequence(self.embed(x), word_lens, True, False)
        _, (hidden, _) = self.lstm(packed)
        # concatenate the final forward and backward hidden states
        return torch.cat(torch.unbind(hidden), dim=-1)
|
biaffine-parser-master/parser/modules/dropout.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SharedDropout(nn.Module):
    """Dropout with a single mask shared across the time dimension.

    One Bernoulli mask is sampled per sequence (from the first time
    step) and applied, rescaled, to every step — the variational
    dropout of Gal & Ghahramani.
    """

    def __init__(self, p=0.5, batch_first=True):
        super(SharedDropout, self).__init__()

        self.p = p
        self.batch_first = batch_first

    def extra_repr(self):
        desc = f"p={self.p}"
        if self.batch_first:
            desc += f", batch_first={self.batch_first}"

        return desc

    def forward(self, x):
        if not self.training:
            return x
        # sample the mask from a single time step ...
        template = x[:, 0] if self.batch_first else x[0]
        mask = self.get_mask(template, self.p)
        # ... and broadcast it (in place) over all steps
        x *= mask.unsqueeze(1) if self.batch_first else mask

        return x

    @staticmethod
    def get_mask(x, p):
        # keep-mask rescaled by 1/(1-p) to preserve the expectation
        return x.new_empty(x.shape).bernoulli_(1 - p) / (1 - p)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class IndependentDropout(nn.Module):
    """Token-level dropout over several aligned inputs with rescaling.

    Each input gets its own per-token mask; where only some inputs survive,
    the survivors are scaled up so the expected total stays constant.
    """

    def __init__(self, p=0.5):
        super(IndependentDropout, self).__init__()

        self.p = p

    def extra_repr(self):
        return f"p={self.p}"

    def forward(self, *items):
        if self.training:
            masks = [item.new_empty(item.shape[:2]).bernoulli_(1 - self.p)
                     for item in items]
            total = sum(masks)
            # scale survivors; max with ones avoids division by zero
            scale = len(items) / total.max(torch.ones_like(total))
            items = [item * (mask * scale).unsqueeze(dim=-1)
                     for item, mask in zip(items, masks)]

        return items
|
biaffine-parser-master/parser/modules/mlp.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from parser.modules.dropout import SharedDropout
|
| 4 |
+
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class MLP(nn.Module):
    """One linear projection followed by LeakyReLU and shared dropout."""

    def __init__(self, n_in, n_hidden, dropout=0):
        super(MLP, self).__init__()

        self.linear = nn.Linear(n_in, n_hidden)
        self.activation = nn.LeakyReLU(negative_slope=0.1)
        self.dropout = SharedDropout(p=dropout)

        self.reset_parameters()

    def reset_parameters(self):
        # orthogonal weights keep activations well-conditioned at init
        nn.init.orthogonal_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)

    def forward(self, x):
        return self.dropout(self.activation(self.linear(x)))
|
biaffine-parser-master/parser/modules/scalar_mix.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ScalarMix(nn.Module):
    """Learned scalar mixture of layer representations (ELMo-style).

    Computes ``gamma * sum_k softmax(w)_k * h_k`` over a list of equally
    shaped tensors, with optional dropout on the normalized weights.
    """

    def __init__(self, n_layers, dropout=0):
        super(ScalarMix, self).__init__()

        self.n_layers = n_layers
        # one learnable weight per layer plus a global scale
        self.weights = nn.Parameter(torch.zeros(n_layers))
        self.gamma = nn.Parameter(torch.tensor([1.0]))
        # fixed: the raw float was first stored in self.dropout and then
        # immediately overwritten with the module; the dead store is removed
        self.dropout = nn.Dropout(dropout)

    def extra_repr(self):
        s = f"n_layers={self.n_layers}"
        if self.dropout.p > 0:
            s += f", dropout={self.dropout.p}"

        return s

    def forward(self, tensors):
        """Mix `tensors` (an iterable of same-shaped tensors) into one."""
        normed_weights = self.dropout(self.weights.softmax(-1))
        weighted_sum = sum(w * h for w, h in zip(normed_weights, tensors))

        return self.gamma * weighted_sum
|
biaffine-parser-master/parser/utils/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from . import corpus, data, field, fn, metric
|
| 4 |
+
from .embedding import Embedding
|
| 5 |
+
from .vocab import Vocab
|
| 6 |
+
|
| 7 |
+
__all__ = ['Corpus', 'Embedding', 'Vocab',
|
| 8 |
+
'corpus', 'data', 'field', 'fn', 'metric']
|
biaffine-parser-master/parser/utils/alg.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def kmeans(x, k):
    """Cluster the values of `x` into `k` buckets with 1-d k-means.

    Args:
        x: an iterable of numbers (here: sentence lengths).
        k: the desired number of clusters.

    Returns:
        centroids: the mean value of each non-empty cluster.
        clusters: for each centroid, the indices into `x` assigned to it.
    """
    x = torch.tensor(x, dtype=torch.float)
    # count the frequency of each datapoint
    d, indices, f = x.unique(return_inverse=True, return_counts=True)
    # calculate the sum of the values of the same datapoints
    total = d * f
    # initialize k centroids randomly
    c, old = d[torch.randperm(len(d))[:k]], None
    # assign labels to each datapoint based on centroids
    dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # make sure number of datapoints is greater than that of clusters
    assert len(d) >= k, f"unable to assign {len(d)} datapoints to {k} clusters"

    while old is None or not c.equal(old):
        # if an empty cluster is encountered,
        # choose the farthest datapoint from the biggest cluster
        # and move that to the empty one
        for i in range(k):
            if not y.eq(i).any():
                mask = y.eq(torch.arange(k).unsqueeze(-1))
                lens = mask.sum(dim=-1)
                biggest = mask[lens.argmax()].nonzero().view(-1)
                farthest = dists[biggest].argmax()
                y[biggest[farthest]] = i
        mask = y.eq(torch.arange(k).unsqueeze(-1))
        # update the centroids
        c, old = (total * mask).sum(-1) / (f * mask).sum(-1), c
        # re-assign all datapoints to clusters
        dists, y = torch.abs_(d.unsqueeze(-1) - c).min(dim=-1)
    # assign all datapoints to the new-generated clusters
    # without considering the empty ones
    y, assigned = y[indices], y.unique().tolist()
    # get the centroids of the assigned clusters
    centroids = c[assigned].tolist()
    # map all values of datapoints to buckets
    clusters = [torch.where(y.eq(i))[0].tolist() for i in assigned]

    return centroids, clusters
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def eisner(scores, mask):
    """First-order Eisner algorithm for projective dependency decoding.

    Args:
        scores: [batch_size, seq_len, seq_len] arc scores.
            NOTE(review): after the permute below they are indexed on the
            two token dimensions first and batch last; the exact arc
            orientation should be confirmed against the biaffine output.
        mask: [batch_size, seq_len] mask of real tokens.

    Returns:
        a [batch_size, max_len] tensor of predicted head positions,
        padded with `pad_sequence`.
    """
    lens = mask.sum(1)
    batch_size, seq_len, _ = scores.shape
    scores = scores.permute(2, 1, 0)
    # s_i/s_c: scores of incomplete/complete spans; p_i/p_c: backpointers
    s_i = torch.full_like(scores, float('-inf'))
    s_c = torch.full_like(scores, float('-inf'))
    p_i = scores.new_zeros(seq_len, seq_len, batch_size).long()
    p_c = scores.new_zeros(seq_len, seq_len, batch_size).long()
    s_c.diagonal().fill_(0)

    # bottom-up over span widths w
    for w in range(1, seq_len):
        # n: the number of spans of width w
        n = seq_len - w
        starts = p_i.new_tensor(range(n)).unsqueeze(0)
        # ilr = C(i->r) + C(j->r+1)
        ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1))
        # [batch_size, n, w]
        ilr = ilr.permute(2, 0, 1)
        il = ilr + scores.diagonal(-w).unsqueeze(-1)
        # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j
        il_span, il_path = il.max(-1)
        s_i.diagonal(-w).copy_(il_span)
        p_i.diagonal(-w).copy_(il_path + starts)
        ir = ilr + scores.diagonal(w).unsqueeze(-1)
        # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j
        ir_span, ir_path = ir.max(-1)
        s_i.diagonal(w).copy_(ir_span)
        p_i.diagonal(w).copy_(ir_path + starts)

        # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j
        cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0))
        cl_span, cl_path = cl.permute(2, 0, 1).max(-1)
        s_c.diagonal(-w).copy_(cl_span)
        p_c.diagonal(-w).copy_(cl_path + starts)
        # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j
        cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0)
        cr_span, cr_path = cr.permute(2, 0, 1).max(-1)
        s_c.diagonal(w).copy_(cr_span)
        # forbid attaching to ROOT unless the span covers the whole sentence
        s_c[0, w][lens.ne(w)] = float('-inf')
        p_c.diagonal(w).copy_(cr_path + starts + 1)

    # recover per-sentence head arrays from the backpointer tables
    predicts = []
    p_c = p_c.permute(2, 0, 1).cpu()
    p_i = p_i.permute(2, 0, 1).cpu()
    for i, length in enumerate(lens.tolist()):
        heads = p_c.new_ones(length + 1, dtype=torch.long)
        backtrack(p_i[i], p_c[i], heads, 0, length, True)
        predicts.append(heads.to(mask.device))

    return pad_sequence(predicts, True)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def backtrack(p_i, p_c, heads, i, j, complete):
    """Recursively recover head assignments from Eisner backpointers.

    `p_i`/`p_c` are the incomplete/complete backpointer tables; `heads`
    is filled in place with the head of each position.
    """
    if i == j:
        return
    if complete:
        # split the complete span at its recorded backpointer
        split = p_c[i, j]
        backtrack(p_i, p_c, heads, i, split, False)
        backtrack(p_i, p_c, heads, split, j, True)
        return
    # an incomplete span fixes the arc i -> j
    split = p_i[i, j]
    heads[j] = i
    lo, hi = sorted((i, j))
    backtrack(p_i, p_c, heads, lo, split, True)
    backtrack(p_i, p_c, heads, hi, split + 1, True)
|
| 111 |
+
|
| 112 |
+
def stripe(x, n, w, offset=(0, 0), dim=1):
    r'''Returns a diagonal stripe of the tensor without copying.

    Parameters:
        x (Tensor): the input tensor with 2 or more dims.
        n (int): the length of the stripe.
        w (int): the width of the stripe.
        offset (tuple): the offset of the first two dims.
        dim (int): 0 if returns a horizontal stripe; 1 else.

    Example::
    >>> x = torch.arange(25).view(5, 5)
    >>> x
    tensor([[ 0,  1,  2,  3,  4],
            [ 5,  6,  7,  8,  9],
            [10, 11, 12, 13, 14],
            [15, 16, 17, 18, 19],
            [20, 21, 22, 23, 24]])
    >>> stripe(x, 2, 3, (1, 1))
    tensor([[ 6,  7,  8],
            [12, 13, 14]])
    >>> stripe(x, 2, 3, dim=0)
    tensor([[ 0,  5, 10],
            [ 6, 11, 16]])
    '''
    x = x.contiguous()
    seq_len = x.size(1)
    # size (in elements) of one trailing cell, e.g. the batch dim of s_c
    numel = x[0, 0].numel()
    stride = list(x.stride())
    # stepping one row advances down the main diagonal
    stride[0] = (seq_len + 1) * numel
    # stepping one column moves right (dim=1) or down (dim=0)
    stride[1] = (seq_len if dim == 0 else 1) * numel
    row_off, col_off = offset
    return x.as_strided(size=(n, w, *x.shape[2:]),
                        stride=stride,
                        storage_offset=(row_off * seq_len + col_off) * numel)
|
biaffine-parser-master/parser/utils/common.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-

# Special tokens shared by the parser's vocabularies.
pad = '<pad>'  # padding token
unk = '<unk>'  # unknown / out-of-vocabulary token
bos = '<bos>'  # beginning-of-sentence token
eos = '<eos>'  # end-of-sentence token
|
biaffine-parser-master/parser/utils/corpus.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections import namedtuple
|
| 4 |
+
from collections.abc import Iterable
|
| 5 |
+
from parser.utils.field import Field
|
| 6 |
+
|
| 7 |
+
# One CoNLL-X row: the ten standard columns; unspecified columns are None.
CoNLL = namedtuple(typename='CoNLL',
                   field_names=['ID', 'FORM', 'LEMMA', 'CPOS', 'POS',
                                'FEATS', 'HEAD', 'DEPREL', 'PHEAD', 'PDEPREL'],
                   defaults=[None]*10)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Sentence(object):
    """A single sentence whose per-field value columns live as attributes."""

    def __init__(self, fields, values):
        for field, value in zip(fields, values):
            if isinstance(field, Iterable):
                # a group of sub-fields shares one column of raw values
                for sub in field:
                    setattr(self, sub.name, value)
            else:
                setattr(self, field.name, value)
        self.fields = fields

    @property
    def values(self):
        """Yield the raw value column of every field, in field order."""
        for field in self.fields:
            if isinstance(field, Iterable):
                yield getattr(self, field[0].name)
            else:
                yield getattr(self, field.name)

    def __len__(self):
        return len(next(iter(self.values)))

    def __repr__(self):
        rows = ('\t'.join(map(str, token)) for token in zip(*self.values))
        return '\n'.join(rows) + '\n'
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class Corpus(object):
    """A list of Sentences plus the fields that describe their columns."""

    def __init__(self, fields, sentences):
        super(Corpus, self).__init__()

        self.fields = fields
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __repr__(self):
        return '\n'.join(str(sentence) for sentence in self)

    def __getitem__(self, index):
        return self.sentences[index]

    def __getattr__(self, name):
        # Fixed: removed a leftover debug `print(name)`.
        # NOTE: this is a generator function, so `corpus.attr` yields the
        # attribute lazily, one sentence at a time.
        if not hasattr(self.sentences[0], name):
            raise AttributeError
        for sentence in self.sentences:
            yield getattr(sentence, name)

    def __setattr__(self, name, value):
        # 'fields'/'sentences' are real attributes; any other assignment is
        # scattered element-wise over the sentences
        if name in ['fields', 'sentences']:
            self.__dict__[name] = value
        else:
            for i, sentence in enumerate(self.sentences):
                setattr(sentence, name, value[i])

    @classmethod
    def load(cls, path, fields):
        """Read a CoNLL-style file; blank lines separate sentences."""
        start, sentences = 0, []
        # placeholder columns get an anonymous Field named by position
        fields = [field if field is not None else Field(str(i))
                  for i, field in enumerate(fields)]
        with open(path, 'r') as f:
            lines = [line.strip() for line in f]
            for i, line in enumerate(lines):
                if not line:
                    values = list(zip(*[l.split('\t') for l in lines[start:i]]))
                    sentences.append(Sentence(fields, values))
                    start = i + 1

        return cls(fields, sentences)

    def save(self, path):
        """Write the corpus back out in its textual form."""
        with open(path, 'w') as f:
            f.write(f"{self}\n")
|
biaffine-parser-master/parser/utils/data.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections.abc import Iterable
|
| 4 |
+
from itertools import chain
|
| 5 |
+
from parser.utils.alg import kmeans
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch.nn.utils.rnn import pad_sequence
|
| 9 |
+
from torch.utils.data import DataLoader, Dataset, Sampler
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class TextDataLoader(DataLoader):
    """DataLoader that pads each field's batch and moves it to the device."""

    def __init__(self, *args, **kwargs):
        super(TextDataLoader, self).__init__(*args, **kwargs)

        self.fields = self.dataset.fields

    def __iter__(self):
        # Fixed: the device was re-queried for every single batch;
        # resolve it once per epoch instead.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        for raw_batch in super(TextDataLoader, self).__iter__():
            batch = []
            for data, field in zip(raw_batch, self.fields):
                if isinstance(data[0], torch.Tensor):
                    data = pad_sequence(data, True, field.pad_index).to(device)
                elif isinstance(data[0], Iterable):
                    # e.g. BertField items are (subwords, lens, mask) triples
                    data = [pad_sequence(f, True, field.pad_index).to(device)
                            for f in zip(*data)]
                batch.append(data)
            yield batch
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class TextDataset(Dataset):
    """Dataset over a Corpus, numericalized per field and bucketed by length."""

    def __init__(self, corpus, fields, n_buckets=1):
        super(TextDataset, self).__init__()

        self.corpus = corpus
        # flatten nested field tuples and drop placeholder (None) columns
        self.fields = list(chain(*[
            field if isinstance(field, Iterable) else [field]
            for field in fields if field is not None
        ]))
        for field in self.fields:
            value = field.numericalize(getattr(corpus, field.name))
            setattr(self, field.name, value)
        # NOTE: the final bucket count is roughly equal to n_buckets
        # Fixed: the original summed `bool(field.bos)` twice instead of
        # counting bos and eos once each.
        # (`field` here is the last field of the loop above.)
        self.lengths = [len(i) + sum([bool(field.bos), bool(field.eos)])
                        for i in corpus]
        self.buckets = dict(zip(*kmeans(self.lengths, n_buckets)))

    def __getitem__(self, index):
        for field in self.fields:
            yield getattr(self, field.name)[index]

    def __len__(self):
        return len(self.corpus)

    @property
    def loader(self):
        # raises until a loader has been attached via the setter
        if hasattr(self, 'data_loader'):
            return self.data_loader
        else:
            raise AttributeError

    @loader.setter
    def loader(self, data_loader):
        self.data_loader = data_loader

    @classmethod
    def collate_fn(cls, batch):
        # transpose a batch of per-sentence tuples into per-field columns
        return (field for field in zip(*batch))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class TextSampler(Sampler):
    """Yields batches of sentence indices bucket by bucket.

    `buckets` maps a centroid length to the indices of sentences of
    similar length; each bucket is split into roughly
    ``size * len(bucket) / batch_size`` chunks.
    """

    def __init__(self, buckets, batch_size, shuffle=False):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sizes, self.buckets = zip(*buckets.items())
        # the number of chunks in each bucket, clipped to [1, len(bucket)]
        self.chunks = [
            min(len(bucket), max(round(size * len(bucket) / batch_size), 1))
            for size, bucket in zip(self.sizes, self.buckets)
        ]

    def __iter__(self):
        # shuffling permutes both the bucket order and indices inside buckets
        order = torch.randperm if self.shuffle else torch.arange
        for b in order(len(self.buckets)).tolist():
            bucket, n_chunks = self.buckets[b], self.chunks[b]
            split_sizes = [(len(bucket) - k - 1) // n_chunks + 1
                           for k in range(n_chunks)]
            # DON'T use `torch.chunk`, which may return a wrong chunk count
            for positions in order(len(bucket)).split(split_sizes):
                yield [bucket[p] for p in positions.tolist()]

    def __len__(self):
        return sum(self.chunks)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def batchify(dataset, batch_size, shuffle=False):
    """Wrap a TextDataset in a TextDataLoader with bucketed batching."""
    sampler = TextSampler(buckets=dataset.buckets,
                          batch_size=batch_size,
                          shuffle=shuffle)

    return TextDataLoader(dataset=dataset,
                          batch_sampler=sampler,
                          collate_fn=dataset.collate_fn)
|
biaffine-parser-master/parser/utils/embedding.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Embedding(object):
    """Pretrained token embeddings loaded from a plain-text file."""

    def __init__(self, tokens, vectors, unk=None):
        super(Embedding, self).__init__()
        self.tokens = tokens
        # Fixed: the original kept only the first component of every
        # vector (`v[0]`) and left a debug print behind.
        self.vectors = torch.tensor(vectors)
        self.pretrained = {w: v for w, v in zip(tokens, vectors)}
        # Fixed: the `unk` argument was silently ignored in favor of a
        # hardcoded '[UNK]', which broke `unk_index` for non-BERT vocabs.
        self.unk = unk

    def __len__(self):
        return len(self.tokens)

    def __contains__(self, token):
        return token in self.pretrained

    @property
    def dim(self):
        # Fixed: size(0) is the vocabulary size, not the embedding width.
        return self.vectors.size(1)

    @property
    def unk_index(self):
        if self.unk is not None:
            return self.tokens.index(self.unk)
        else:
            raise AttributeError

    @classmethod
    def load(cls, path, unk=None):
        """Parse a whitespace-separated `token v1 v2 ...` file."""
        with open(path, 'r') as f:
            lines = [line for line in f]
        splits = [line.split() for line in lines]
        tokens, vectors = zip(*[(s[0], list(map(float, s[1:])))
                                for s in splits])

        return cls(tokens, vectors, unk=unk)
|
biaffine-parser-master/parser/utils/field.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
from collections import Counter
|
| 4 |
+
from parser.utils.vocab import Vocab
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Field(object):
    """Describes how one corpus column is preprocessed and indexed."""

    def __init__(self, name, pad=None, unk=None, bos=None, eos=None,
                 lower=False, use_vocab=True, tokenize=None, fn=None):
        self.name = name
        self.pad = pad
        self.unk = unk
        self.bos = bos
        self.eos = eos
        self.lower = lower
        self.use_vocab = use_vocab
        self.tokenize = tokenize
        self.fn = fn

        # the defined special tokens, in a fixed order the *_index
        # properties below rely on
        self.specials = [token for token in [pad, unk, bos, eos]
                         if token is not None]

    def __repr__(self):
        params = []
        if self.pad is not None:
            params.append(f"pad={self.pad}")
        if self.unk is not None:
            params.append(f"unk={self.unk}")
        if self.bos is not None:
            params.append(f"bos={self.bos}")
        if self.eos is not None:
            params.append(f"eos={self.eos}")
        if self.lower:
            params.append(f"lower={self.lower}")
        if not self.use_vocab:
            params.append(f"use_vocab={self.use_vocab}")

        return (f"({self.name}): {self.__class__.__name__}("
                + ", ".join(params) + ")")

    @property
    def pad_index(self):
        if self.pad is None:
            return 0
        return self.specials.index(self.pad)

    @property
    def unk_index(self):
        if self.unk is None:
            return 0
        return self.specials.index(self.unk)

    @property
    def bos_index(self):
        return self.specials.index(self.bos)

    @property
    def eos_index(self):
        return self.specials.index(self.eos)

    def transform(self, sequence):
        """Apply tokenization, lowercasing and `fn` to one raw sequence."""
        if self.tokenize is not None:
            sequence = self.tokenize(sequence)
        if self.lower:
            sequence = [str.lower(token) for token in sequence]
        if self.fn is not None:
            sequence = [self.fn(token) for token in sequence]

        return sequence

    def build(self, corpus, min_freq=1, embed=None):
        """Build the vocab from `corpus` (optionally with pretrained vectors)."""
        sequences = getattr(corpus, self.name)
        counter = Counter(token for sequence in sequences
                          for token in self.transform(sequence))
        self.vocab = Vocab(counter, min_freq, self.specials)

        if not embed:
            self.embed = None
        else:
            tokens = self.transform(embed.tokens)
            # if the `unk` token has existed in the pretrained,
            # then replace it with a self-defined one
            if embed.unk:
                tokens[embed.unk_index] = self.unk

            self.vocab.extend(tokens)
            self.embed = torch.zeros(len(self.vocab), embed.dim)
            self.embed[self.vocab.token2id(tokens)] = embed.vectors
            # normalize so pretrained vectors have unit std overall
            self.embed /= torch.std(self.embed)

    def numericalize(self, sequences):
        """Map raw sequences to tensors of ids (with bos/eos if configured)."""
        sequences = [self.transform(sequence) for sequence in sequences]
        if self.use_vocab:
            sequences = [self.vocab.token2id(sequence)
                         for sequence in sequences]
        if self.bos:
            sequences = [[self.bos_index] + sequence for sequence in sequences]
        if self.eos:
            sequences = [sequence + [self.eos_index] for sequence in sequences]

        return [torch.tensor(sequence) for sequence in sequences]
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class CharField(Field):
|
| 107 |
+
|
| 108 |
+
def __init__(self, *args, **kwargs):
|
| 109 |
+
self.fix_len = kwargs.pop('fix_len') if 'fix_len' in kwargs else -1
|
| 110 |
+
super(CharField, self).__init__(*args, **kwargs)
|
| 111 |
+
|
| 112 |
+
def build(self, corpus, min_freq=1, embed=None):
|
| 113 |
+
sequences = getattr(corpus, self.name)
|
| 114 |
+
counter = Counter(char for sequence in sequences for token in sequence
|
| 115 |
+
for char in self.transform(token))
|
| 116 |
+
self.vocab = Vocab(counter, min_freq, self.specials)
|
| 117 |
+
|
| 118 |
+
if not embed:
|
| 119 |
+
self.embed = None
|
| 120 |
+
else:
|
| 121 |
+
tokens = self.transform(embed.tokens)
|
| 122 |
+
# if the `unk` token has existed in the pretrained,
|
| 123 |
+
# then replace it with a self-defined one
|
| 124 |
+
if embed.unk:
|
| 125 |
+
tokens[embed.unk_index] = self.unk
|
| 126 |
+
|
| 127 |
+
self.vocab.extend(tokens)
|
| 128 |
+
self.embed = torch.zeros(len(self.vocab), embed.dim)
|
| 129 |
+
self.embed[self.vocab.token2id(tokens)] = embed.vectors
|
| 130 |
+
|
| 131 |
+
def numericalize(self, sequences):
|
| 132 |
+
sequences = [[self.transform(token) for token in sequence]
|
| 133 |
+
for sequence in sequences]
|
| 134 |
+
if self.fix_len <= 0:
|
| 135 |
+
self.fix_len = max(len(token) for sequence in sequences
|
| 136 |
+
for token in sequence)
|
| 137 |
+
if self.use_vocab:
|
| 138 |
+
sequences = [[self.vocab.token2id(token) for token in sequence]
|
| 139 |
+
for sequence in sequences]
|
| 140 |
+
if self.bos:
|
| 141 |
+
sequences = [[self.vocab.token2id(self.bos)] + sequence
|
| 142 |
+
for sequence in sequences]
|
| 143 |
+
if self.eos:
|
| 144 |
+
sequences = [sequence + [self.vocab.token2id(self.eos)]
|
| 145 |
+
for sequence in sequences]
|
| 146 |
+
sequences = [
|
| 147 |
+
torch.tensor([ids[:self.fix_len] + [0] * (self.fix_len - len(ids))
|
| 148 |
+
for ids in sequence])
|
| 149 |
+
for sequence in sequences
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
return sequences
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
class BertField(Field):
|
| 156 |
+
|
| 157 |
+
def numericalize(self, sequences):
|
| 158 |
+
subwords, lens = [], []
|
| 159 |
+
sequences = [([self.bos] if self.bos else []) + list(sequence) +
|
| 160 |
+
([self.eos] if self.eos else [])
|
| 161 |
+
for sequence in sequences]
|
| 162 |
+
|
| 163 |
+
for sequence in sequences:
|
| 164 |
+
sequence = [self.transform(token) for token in sequence]
|
| 165 |
+
sequence = [piece if piece else self.transform(self.pad)
|
| 166 |
+
for piece in sequence]
|
| 167 |
+
subwords.append(sum(sequence, []))
|
| 168 |
+
lens.append(torch.tensor([len(piece) for piece in sequence]))
|
| 169 |
+
subwords = [torch.tensor(pieces) for pieces in subwords]
|
| 170 |
+
mask = [torch.ones(len(pieces)).ge(0) for pieces in subwords]
|
| 171 |
+
|
| 172 |
+
return list(zip(subwords, lens, mask))
|