Upload ConlluTokenClassificationPipeline
- README.md +199 -0
- config.json +773 -0
- configuration.py +40 -0
- dependency_classifier.py +301 -0
- encoder.py +109 -0
- mlp_classifier.py +46 -0
- model.safetensors +3 -0
- modeling_parser.py +171 -0
- pipeline.py +236 -0
- utils.py +69 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
config.json
ADDED
@@ -0,0 +1,773 @@
{
  "activation": "relu",
  "architectures": [
    "CobaldParser"
  ],
  "auto_map": {
    "AutoConfig": "configuration.CobaldParserConfig",
    "AutoModel": "modeling_parser.CobaldParser"
  },
  "consecutive_null_limit": 3,
  "custom_pipelines": {
    "conllu-parsing": {
      "impl": "pipeline.ConlluTokenClassificationPipeline",
      "pt": [
        "AutoModel"
      ],
      "tf": [],
      "type": "text"
    }
  },
  "deepslot_classifier_hidden_size": 256,
  "dependency_classifier_hidden_size": 128,
  "dropout": 0.1,
  "encoder_model_name": "xlm-roberta-base",
  "lemma_classifier_hidden_size": 512,
  "misc_classifier_hidden_size": 512,
  "model_type": "cobald_parser",
  "morphology_classifier_hidden_size": 512,
  "null_classifier_hidden_size": 512,
  "semclass_classifier_hidden_size": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.52.2",
  "vocabulary": {
    "deepslot": {
      "0": "Addition",
      "1": "AdditionalParticipant",
      "2": "Addressee",
      "3": "Agent",
      "4": "Agent_Metaphoric",
      "5": "BeneMalefactive",
      "6": "Cause",
      "7": "Ch_Parameter",
      "8": "Ch_Reference",
      "9": "Characteristic",
      "10": "ClassifiedEntity",
      "11": "Comparison",
      "12": "ComparisonBase",
      "13": "Concession",
      "14": "Concurrent",
      "15": "Condition",
      "16": "ContrAgent",
      "17": "ContrObject",
      "18": "Correlative",
      "19": "Criterion",
      "20": "Degree",
      "21": "DegreeNumerative",
      "22": "Elective",
      "23": "Empty_Subject_It",
      "24": "Experiencer",
      "25": "Experiencer_Metaphoric",
      "26": "Function",
      "27": "Instrument_Situation",
      "28": "Landmark",
      "29": "Limitation",
      "30": "Locative",
      "31": "Locative_FinalPoint",
      "32": "Member",
      "33": "MetaphoricLocative",
      "34": "Motive",
      "35": "Name_Title",
      "36": "Object",
      "37": "Object_Relation",
      "38": "Object_Situation",
      "39": "Opposition",
      "40": "OrderInTimeAndSpace",
      "41": "Parenthetical",
      "42": "Part",
      "43": "Part_Situation",
      "44": "ParticipleRelativeClause",
      "45": "Possessor",
      "46": "Possessor_Metaphoric",
      "47": "Predicate",
      "48": "Predicate_Noun",
      "49": "PrincipleOfOrganization",
      "50": "Purpose",
      "51": "QuantifiedEntity",
      "52": "Quantity",
      "53": "Raising_Target",
      "54": "Relative",
      "55": "Resultative",
      "56": "SetEnvironment",
      "57": "Set_General",
      "58": "Source",
      "59": "Specification",
      "60": "Specifier_Number",
      "61": "Sphere",
      "62": "StaffOfPossessors",
      "63": "Standpoint",
      "64": "State",
      "65": "SupportedEntity",
      "66": "Theme",
      "67": "Time",
      "68": "Vocative"
    },
    "eud_deprel": {
      "0": "acl",
      "1": "acl:att",
      "2": "acl:cleft",
      "3": "acl:med",
      "4": "acl:mot",
      "5": "acl:om",
      "6": "acl:p\u00e5",
      "7": "acl:relcl",
      "8": "acl:som",
      "9": "acl:\u00e4n",
      "10": "advcl",
      "11": "advcl:att",
      "12": "advcl:d\u00e4rf\u00f6r_att",
      "13": "advcl:d\u00e5",
      "14": "advcl:eftersom",
      "15": "advcl:f\u00f6r_att",
      "16": "advcl:f\u00f6rutsatt_att",
      "17": "advcl:innan",
      "18": "advcl:liksom",
      "19": "advcl:med_att",
      "20": "advcl:n\u00e4r",
      "21": "advcl:om",
      "22": "advcl:p\u00e5",
      "23": "advcl:samtidigt_som",
      "24": "advcl:sedan",
      "25": "advcl:som",
      "26": "advcl:\u00e4n",
      "27": "advmod",
      "28": "amod",
      "29": "appos",
      "30": "aux",
      "31": "aux:pass",
      "32": "case",
      "33": "cc",
      "34": "ccomp",
      "35": "compound:prt",
      "36": "conj",
      "37": "conj:and",
      "38": "conj:eller",
      "39": "conj:fast",
      "40": "conj:men",
      "41": "conj:och",
      "42": "conj:respektive",
      "43": "conj:samt",
      "44": "conj:som",
      "45": "conj:ty",
      "46": "conj:utan",
      "47": "cop",
      "48": "csubj",
      "49": "csubj:pass",
      "50": "det",
      "51": "dislocated",
      "52": "expl",
      "53": "fixed",
      "54": "flat",
      "55": "iobj",
      "56": "mark",
      "57": "nmod",
      "58": "nmod:av",
      "59": "nmod:efter",
      "60": "nmod:fr\u00e5n",
      "61": "nmod:f\u00f6r",
      "62": "nmod:hos",
      "63": "nmod:i",
      "64": "nmod:inom",
      "65": "nmod:med",
      "66": "nmod:mellan",
      "67": "nmod:mot",
      "68": "nmod:oavsett",
      "69": "nmod:om",
      "70": "nmod:poss",
      "71": "nmod:p\u00e5",
      "72": "nmod:till",
      "73": "nmod:under",
      "74": "nmod:utanf\u00f6r",
      "75": "nmod:vid",
      "76": "nmod:\u00e5t",
      "77": "nsubj",
      "78": "nsubj:pass",
      "79": "nsubj:xsubj",
      "80": "nummod",
      "81": "obj",
      "82": "obl",
      "83": "obl:agent",
      "84": "obl:as",
      "85": "obl:av",
      "86": "obl:bland",
      "87": "obl:efter",
      "88": "obl:enligt",
      "89": "obl:for",
      "90": "obl:fr\u00e5n",
      "91": "obl:f\u00f6r",
      "92": "obl:genom",
      "93": "obl:hos",
      "94": "obl:i",
      "95": "obl:inom",
      "96": "obl:med",
      "97": "obl:med_avseende_p\u00e5",
      "98": "obl:mellan",
      "99": "obl:mot",
      "100": "obl:om",
      "101": "obl:omkring",
      "102": "obl:p\u00e5",
      "103": "obl:runtomkring",
      "104": "obl:som",
      "105": "obl:till",
      "106": "obl:trots",
      "107": "obl:under",
      "108": "obl:ur",
      "109": "obl:utan",
      "110": "obl:utanf\u00f6r",
      "111": "obl:vid",
      "112": "obl:\u00e4n",
      "113": "obl:\u00e5",
      "114": "obl:\u00e5t",
      "115": "parataxis",
      "116": "punct",
      "117": "ref",
      "118": "root",
      "119": "vocative",
      "120": "xcomp"
    },
    "joint_feats": {
      "0": "ADJ#Adjective#Abbr=Yes",
      "1": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos",
      "2": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos|Gender=Com|Number=Sing",
      "3": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part",
      "4": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Sup",
      "5": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos",
      "6": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing",
      "7": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part",
      "8": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "9": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "10": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Number=Sing",
      "11": "ADJ#Adjective#Case=Nom|Degree=Cmp",
      "12": "ADJ#Adjective#Case=Nom|Degree=Pos",
      "13": "ADJ#Adjective#Case=Nom|Degree=Pos|Number=Plur",
      "14": "ADJ#Adjective#Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part",
      "15": "ADJ#Adjective#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part",
      "16": "ADJ#Adjective#Degree=Pos|Foreign=Yes",
      "17": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing",
      "18": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "19": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "20": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "21": "ADJ#Noun#Case=Nom|Definite=Def|Degree=Pos",
      "22": "ADJ#Noun#Case=Nom|Degree=Pos",
      "23": "ADJ#Numeral#Case=Nom|Definite=Def|Degree=Pos",
      "24": "ADJ#Numeral#Case=Nom|NumType=Ord",
      "25": "ADJ#Verb#Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part",
      "26": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part",
      "27": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "28": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "29": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur|Tense=Past|VerbForm=Part",
      "30": "ADJ#Verb#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "31": "ADJ#Verb#Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part",
      "32": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "33": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "34": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part",
      "35": "ADJ#_#Case=Nom|Degree=Pos",
      "36": "ADP#Adjective#_",
      "37": "ADP#Adverb#_",
      "38": "ADP#Conjunction#_",
      "39": "ADP#Preposition#_",
      "40": "ADP#_#_",
      "41": "ADV#Adjective#_",
      "42": "ADV#Adverb#Abbr=Yes",
      "43": "ADV#Adverb#Degree=Cmp",
      "44": "ADV#Adverb#Degree=Pos",
      "45": "ADV#Adverb#Degree=Sup",
      "46": "ADV#Adverb#Degree=Sup|Polarity=Neg",
      "47": "ADV#Adverb#Polarity=Neg",
      "48": "ADV#Adverb#_",
      "49": "ADV#Conjunction#_",
      "50": "ADV#Invariable#Degree=Cmp",
      "51": "ADV#Invariable#Degree=Sup",
      "52": "ADV#Noun#_",
      "53": "ADV#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "54": "ADV#Pronoun#_",
      "55": "ADV#_#Degree=Cmp",
      "56": "ADV#_#Degree=Sup",
      "57": "ADV#_#_",
      "58": "AUX#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Act",
      "59": "AUX#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
      "60": "AUX#Verb#VerbForm=Inf|Voice=Act",
      "61": "AUX#Verb#VerbForm=Sup|Voice=Act",
      "62": "CCONJ#Conjunction#_",
      "63": "CCONJ#_#_",
      "64": "DET#Adjective#Gender=Com|Number=Sing|PronType=Tot",
      "65": "DET#Adjective#Gender=Neut|Number=Sing|PronType=Tot",
      "66": "DET#Adjective#Number=Plur|PronType=Tot",
      "67": "DET#Article#Definite=Def|Gender=Com|Number=Sing|PronType=Art",
      "68": "DET#Article#Definite=Def|Gender=Neut|Number=Sing|PronType=Art",
      "69": "DET#Article#Definite=Def|Number=Plur|PronType=Art",
      "70": "DET#Article#Definite=Ind|Gender=Com|Number=Sing|PronType=Art",
      "71": "DET#Article#Definite=Ind|Gender=Neut|Number=Sing|PronType=Art",
      "72": "DET#Article#Definite=Ind|Gender=Neut|Number=Sing|PronType=Artt",
      "73": "DET#Article#Definite=Ind|PronType=Art",
      "74": "DET#Numeral#Definite=Ind|Gender=Neut|Number=Sing|PronType=Art",
      "75": "DET#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Art",
      "76": "DET#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Dem",
      "77": "DET#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Art",
      "78": "DET#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Dem",
      "79": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Art",
      "80": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Dem",
      "81": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Tot",
      "82": "DET#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "83": "DET#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Int",
      "84": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "85": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "86": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Tot",
      "87": "DET#Pronoun#Definite=Ind|Number=Plur|PronType=Ind",
      "88": "DET#Pronoun#Definite=Ind|Number=Sing|PronType=Tot",
      "89": "DET#Pronoun#PronType=Ind",
      "90": "DET#_#Gender=Neut|Number=Sing|PronType=Tot",
      "91": "NOUN#Noun#Abbr=Yes",
      "92": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Com|Number=Plur",
      "93": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Com|Number=Sing",
      "94": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Neut|Number=Plur",
      "95": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Neut|Number=Sing",
      "96": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Com|Number=Plur",
      "97": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Neut|Number=Plur",
      "98": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Neut|Number=Sing",
      "99": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Com|Number=Plur",
      "100": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Com|Number=Sing",
      "101": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Neut|Number=Plur",
      "102": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Neut|Number=Sing",
      "103": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Plur",
      "104": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "105": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Plur",
      "106": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
      "107": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Singg",
      "108": "NOUN#Noun#Gender=Com",
      "109": "NOUN#Noun#Number=Plur",
      "110": "NOUN#Noun#Number=Sing",
      "111": "NOUN#Noun#_",
      "112": "NOUN#_#Case=Nom|Definite=Def|Gender=Com|Number=Sing",
      "113": "NOUN#_#Case=Nom|Definite=Def|Gender=Neut|Number=Sing",
      "114": "NOUN#_#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "115": "NOUN#_#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
      "116": "NUM#Article#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|NumType=Card",
      "117": "NUM#Noun#Case=Nom|NumType=Card",
      "118": "NUM#Numeral#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|NumType=Card",
      "119": "NUM#Numeral#Case=Nom|NumType=Card",
      "120": "PART#Particle#Polarity=Neg",
      "121": "PART#Preposition#_",
      "122": "PRON#Adjective#Definite=Ind|Number=Plur|PronType=Ind",
      "123": "PRON#Adjective#Definite=Ind|Number=Plur|PronType=Tot",
      "124": "PRON#Adverb#Definite=Def|Gender=Neut|Number=Sing|PronType=Prs",
      "125": "PRON#Adverb#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "126": "PRON#Adverb#_",
      "127": "PRON#Article#Case=Nom|Definite=Def|Number=Plur|PronType=Prs",
      "128": "PRON#Conjunction#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "129": "PRON#Conjunction#PronType=Rel",
      "130": "PRON#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "131": "PRON#Noun#Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "132": "PRON#Noun#Definite=Def|Number=Plur|PronType=Prs",
      "133": "PRON#Noun#Definite=Ind|Number=Plur|PronType=Ind",
      "134": "PRON#Numeral#Definite=Ind|Gender=Com|Number=Sing|PronType=Prs",
      "135": "PRON#Numeral#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "136": "PRON#Pronoun#Case=Acc|Definite=Def|Gender=Com|Number=Plur|PronType=Prs",
      "137": "PRON#Pronoun#Case=Acc|Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "138": "PRON#Pronoun#Case=Acc|Definite=Def|Number=Plur|PronType=Prs",
      "139": "PRON#Pronoun#Case=Acc|Definite=Def|PronType=Prs",
      "140": "PRON#Pronoun#Case=Gen|Definite=Def|Gender=Com|Number=Sing|Poss=Yes|PronType=Prs",
      "141": "PRON#Pronoun#Case=Nom|Definite=Def|Gender=Com|Number=Plur|PronType=Prs",
      "142": "PRON#Pronoun#Case=Nom|Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "143": "PRON#Pronoun#Case=Nom|Definite=Def|Number=Plur|PronType=Prs",
      "144": "PRON#Pronoun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "145": "PRON#Pronoun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Rel",
      "146": "PRON#Pronoun#Definite=Def|Gender=Com|Number=Sing|Poss=Yes|PronType=Prs",
      "147": "PRON#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "148": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
      "149": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Dem",
      "150": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Prs",
      "151": "PRON#Pronoun#Definite=Def|Number=Plur|Poss=Yes|PronType=Prs",
      "152": "PRON#Pronoun#Definite=Def|Number=Plur|PronType=Dem",
      "153": "PRON#Pronoun#Definite=Def|Number=Plur|PronType=Prs",
      "154": "PRON#Pronoun#Definite=Def|Poss=Yes|PronType=Prs",
      "155": "PRON#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "156": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "157": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "158": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Neg",
      "159": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "160": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Rel",
      "161": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Tot",
      "162": "PRON#Pronoun#Definite=Ind|Number=Plur|PronType=Rel",
      "163": "PRON#Pronoun#Number=Plur",
      "164": "PRON#Pronoun#PronType=Rel",
      "165": "PRON#Verb#Definite=Def|Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
      "166": "PRON#_#Case=Acc|Definite=Def|PronType=Prs",
      "167": "PRON#_#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "168": "PRON#_#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "169": "PROPN#Noun#Case=Gen",
      "170": "PROPN#Noun#Case=Nom",
      "171": "PROPN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "172": "PUNCT#PUNCT#_",
      "173": "SCONJ#Conjunction#_",
      "174": "SCONJ#Preposition#_",
      "175": "SCONJ#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "176": "SCONJ#_#_",
      "177": "VERB#Adjective#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass",
      "178": "VERB#Verb#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass",
      "179": "VERB#Verb#Mood=Imp|VerbForm=Fin|Voice=Act",
      "180": "VERB#Verb#Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "181": "VERB#Verb#Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "182": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin",
      "183": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Act",
      "184": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Pass",
      "185": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin",
      "186": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
      "187": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Pass",
      "188": "VERB#Verb#Tense=Past|VerbForm=Part",
      "189": "VERB#Verb#VerbForm=Inf",
      "190": "VERB#Verb#VerbForm=Inf|Voice=Act",
      "191": "VERB#Verb#VerbForm=Inf|Voice=Pass",
      "192": "VERB#Verb#VerbForm=Sup",
      "193": "VERB#Verb#VerbForm=Sup|Voice=Act",
      "194": "VERB#Verb#VerbForm=Sup|Voice=Pass"
    },
    "lemma_rule": {
      "0": "cut_prefix=0|cut_suffix=0|append_suffix=",
      "1": "cut_prefix=0|cut_suffix=0|append_suffix=a",
      "2": "cut_prefix=0|cut_suffix=0|append_suffix=ma",
      "3": "cut_prefix=0|cut_suffix=1|append_suffix=",
      "4": "cut_prefix=0|cut_suffix=1|append_suffix=a",
      "5": "cut_prefix=0|cut_suffix=1|append_suffix=as",
      "6": "cut_prefix=0|cut_suffix=1|append_suffix=d",
      "7": "cut_prefix=0|cut_suffix=1|append_suffix=en",
      "8": "cut_prefix=0|cut_suffix=1|append_suffix=g",
      "9": "cut_prefix=0|cut_suffix=1|append_suffix=ja",
      "10": "cut_prefix=0|cut_suffix=1|append_suffix=n",
      "11": "cut_prefix=0|cut_suffix=1|append_suffix=na",
      "12": "cut_prefix=0|cut_suffix=1|append_suffix=ola",
      "13": "cut_prefix=0|cut_suffix=1|append_suffix=ym",
      "14": "cut_prefix=0|cut_suffix=2|append_suffix=",
      "15": "cut_prefix=0|cut_suffix=2|append_suffix=a",
      "16": "cut_prefix=0|cut_suffix=2|append_suffix=an",
      "17": "cut_prefix=0|cut_suffix=2|append_suffix=ara",
      "18": "cut_prefix=0|cut_suffix=2|append_suffix=dd",
      "19": "cut_prefix=0|cut_suffix=2|append_suffix=e",
      "20": "cut_prefix=0|cut_suffix=2|append_suffix=en",
      "21": "cut_prefix=0|cut_suffix=2|append_suffix=g",
      "22": "cut_prefix=0|cut_suffix=2|append_suffix=i",
      "23": "cut_prefix=0|cut_suffix=2|append_suffix=igga",
      "24": "cut_prefix=0|cut_suffix=2|append_suffix=ja",
      "25": "cut_prefix=0|cut_suffix=2|append_suffix=mal",
      "26": "cut_prefix=0|cut_suffix=2|append_suffix=n",
      "27": "cut_prefix=0|cut_suffix=2|append_suffix=na",
      "28": "cut_prefix=0|cut_suffix=2|append_suffix=on",
      "29": "cut_prefix=0|cut_suffix=2|append_suffix=u",
      "30": "cut_prefix=0|cut_suffix=2|append_suffix=um",
      "31": "cut_prefix=0|cut_suffix=2|append_suffix=unna",
      "32": "cut_prefix=0|cut_suffix=2|append_suffix=ycket",
      "33": "cut_prefix=0|cut_suffix=2|append_suffix=yda",
      "34": "cut_prefix=0|cut_suffix=2|append_suffix=yta",
      "35": "cut_prefix=0|cut_suffix=2|append_suffix=\u00e5",
      "36": "cut_prefix=0|cut_suffix=2|append_suffix=\u00e5ta",
      "37": "cut_prefix=0|cut_suffix=3|append_suffix=",
      "38": "cut_prefix=0|cut_suffix=3|append_suffix=a",
      "39": "cut_prefix=0|cut_suffix=3|append_suffix=an",
      "40": "cut_prefix=0|cut_suffix=3|append_suffix=and_annat",
      "41": "cut_prefix=0|cut_suffix=3|append_suffix=as",
      "42": "cut_prefix=0|cut_suffix=3|append_suffix=e",
      "43": "cut_prefix=0|cut_suffix=3|append_suffix=er",
      "44": "cut_prefix=0|cut_suffix=3|append_suffix=i",
      "45": "cut_prefix=0|cut_suffix=3|append_suffix=jag",
      "46": "cut_prefix=0|cut_suffix=3|append_suffix=liten",
      "47": "cut_prefix=0|cut_suffix=3|append_suffix=nan",
      "48": "cut_prefix=0|cut_suffix=3|append_suffix=nna",
      "49": "cut_prefix=0|cut_suffix=3|append_suffix=ola",
      "50": "cut_prefix=0|cut_suffix=3|append_suffix=r",
      "51": "cut_prefix=0|cut_suffix=3|append_suffix=ra",
      "52": "cut_prefix=0|cut_suffix=3|append_suffix=vi",
      "53": "cut_prefix=0|cut_suffix=3|append_suffix=ycket",
      "54": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e4ga",
      "55": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e4gga",
      "56": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e5",
      "57": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e5_kallad",
      "58": "cut_prefix=0|cut_suffix=4|append_suffix=",
      "59": "cut_prefix=0|cut_suffix=4|append_suffix=a",
      "60": "cut_prefix=0|cut_suffix=4|append_suffix=ader",
      "61": "cut_prefix=0|cut_suffix=4|append_suffix=an",
      "62": "cut_prefix=0|cut_suffix=4|append_suffix=e",
      "63": "cut_prefix=0|cut_suffix=4|append_suffix=ola",
      "64": "cut_prefix=0|cut_suffix=4|append_suffix=on",
      "65": "cut_prefix=0|cut_suffix=4|append_suffix=or",
      "66": "cut_prefix=0|cut_suffix=4|append_suffix=ot",
      "67": "cut_prefix=0|cut_suffix=4|append_suffix=r",
      "68": "cut_prefix=0|cut_suffix=4|append_suffix=ra",
      "69": "cut_prefix=0|cut_suffix=4|append_suffix=\u00e5g",
      "70": "cut_prefix=0|cut_suffix=4|append_suffix=\u00f6ra",
      "71": "cut_prefix=0|cut_suffix=5|append_suffix=",
      "72": "cut_prefix=0|cut_suffix=5|append_suffix=a",
      "73": "cut_prefix=0|cut_suffix=5|append_suffix=an",
      "74": "cut_prefix=0|cut_suffix=5|append_suffix=d\u00e5lig",
      "75": "cut_prefix=0|cut_suffix=5|append_suffix=er",
      "76": "cut_prefix=0|cut_suffix=5|append_suffix=g\u00e4rna",
      "77": "cut_prefix=0|cut_suffix=5|append_suffix=oder",
      "78": "cut_prefix=0|cut_suffix=5|append_suffix=on",
      "79": "cut_prefix=0|cut_suffix=5|append_suffix=r",
      "80": "cut_prefix=0|cut_suffix=5|append_suffix=ra",
      "81": "cut_prefix=0|cut_suffix=6|append_suffix=er",
      "82": "cut_prefix=0|cut_suffix=8|append_suffix=or",
      "83": "cut_prefix=1|cut_suffix=0|append_suffix=",
      "84": "cut_prefix=1|cut_suffix=0|append_suffix=a",
      "85": "cut_prefix=1|cut_suffix=3|append_suffix=",
      "86": "cut_prefix=1|cut_suffix=3|append_suffix=te",
      "87": "cut_prefix=2|cut_suffix=0|append_suffix=",
      "88": "cut_prefix=2|cut_suffix=0|append_suffix=a",
      "89": "cut_prefix=2|cut_suffix=1|append_suffix=empel",
      "90": "cut_prefix=2|cut_suffix=1|append_suffix=n",
      "91": "cut_prefix=2|cut_suffix=2|append_suffix=",
      "92": "cut_prefix=2|cut_suffix=2|append_suffix=a",
      "93": "cut_prefix=2|cut_suffix=3|append_suffix=",
      "94": "cut_prefix=2|cut_suffix=3|append_suffix=as",
      "95": "cut_prefix=2|cut_suffix=3|append_suffix=n"
    },
    "misc": {
      "0": "Cxn=rc-that-nsubj",
      "1": "Cxn=rc-that-obj",
      "2": "Cxn=rc-wh-nsubj",
      "3": "Cxn=rc-wh-obl",
      "4": "Cxn=rc-wh-obl-pfront",
      "5": "Promoted=Yes|SpaceAfter=No",
      "6": "SpaceAfter=No",
      "7": "ellipsis"
    },
    "semclass": {
      "0": "ABILITY_OF_BEING",
      "1": "ACTIVITY",
      "2": "APPARATUS",
      "3": "AREA_OF_HUMAN_ACTIVITY",
      "4": "ARRANGEMENTS",
      "5": "ARTICLES",
      "6": "ATTRIBUTIVE",
      "7": "AUXILIARY_VERBS",
      "8": "BAD_DANGEROUS_EVENT",
      "9": "BE",
      "10": "BEGIN_TO_TAKE_PLACE",
      "11": "BEHAVIOUR",
      "12": "BEING",
      "13": "BUSINESS",
      "14": "BUSY_FREE_OCCUPIED",
      "15": "CHANGE_OF_POST_AND_JOB",
      "16": "CHARACTERISTIC_GENERAL",
      "17": "CHOOSING_SORTING",
      "18": "CH_APPEARANCE",
      "19": "CH_ASPECT",
      "20": "CH_BENEFIT",
      "21": "CH_BY_SENSORY_PERCEPTION",
      "22": "CH_COMPOSITION",
      "23": "CH_DEGREE",
      "24": "CH_DEGREE_AND_INTENSITY",
      "25": "CH_DISPOSITION_AND_MOTION",
      "26": "CH_DISTRIBUTION",
      "27": "CH_EVALUATION",
      "28": "CH_EVALUATION_OF_HUMAN_TEMPER_AND_ACTIVITY",
      "29": "CH_FUNCTIONING_OF_ENTITY",
      "30": "CH_INFORMATION",
      "31": "CH_INTENTION_CONCENTRATION",
      "32": "CH_MAGNITUDE",
      "33": "CH_OF_CONNECTIONS",
      "34": "CH_PARAMETER_SPEED",
      "35": "CH_POWER_AND_EFFECT",
      "36": "CH_PRICE_AND_SUMS",
      "37": "CH_REFERENCE_AND_QUANTIFICATION",
      "38": "CH_RENOWN",
      "39": "CH_RESISTANCE_TO_IMPACT",
      "40": "CH_SALIENCE",
      "41": "CH_SCALE",
      "42": "CH_SOCIAL_CHARACTERISTIC",
      "43": "CH_SPHERE_OF_COVERAGE",
      "44": "CH_SYSTEM_STRUCTURE",
      "45": "CH_TYPE_OF_POSSESSION_AND_PARTICIPATION",
      "46": "CIRCUMSTANCE",
      "47": "CLOTHES",
      "48": "CONDITION_SITUATION",
      "49": "CONFLICT_INTERACTION",
      "50": "CONJUNCTIONS",
      "51": "CONTAIN_INCLUDE_FORM",
      "52": "CONTINUE_TO_HAVE",
      "53": "CONTINUE_TO_TAKE_PLACE",
      "54": "COORDINATING_CONJUNCTIONS",
      "55": "COSMOS_AND_COSMIC_OBJECTS",
      "56": "COST",
      "57": "COUNTRY_AS_ADMINISTRATIVE_UNIT",
      "58": "CREATION_VERBS",
      "59": "DEFEND_SAVE",
      "60": "DESTRUCTION_VERBS",
      "61": "DIFFICULTIES",
      "62": "DIFFICULT_AND_EASY",
      "63": "DIMENSIONS_CHAR",
      "64": "DISCOURSIVE_UNITS",
      "65": "DOCUMENT",
      "66": "ECONOMY",
      "67": "EMOTIONS_AND_THEIR_EXPRESSION",
      "68": "EMPTY_SUBJECT",
      "69": "END_TO_TAKE_PLACE",
      "70": "ENTITY_AS_RESULT_OF_ACTIVITY",
      "71": "ENTITY_OR_SITUATION_PRONOUN",
      "72": "EVERYDAY_PROCESSING",
      "73": "EXISTENCE_AND_POSSESSION",
      "74": "FACT_INCIDENT",
      "75": "FEELING_AS_CONDITION",
      "76": "FURNISHINGS_AND_DECORATION",
      "77": "GENERAL_ACTION",
      "78": "GRAMMATICAL_ELEMENTS",
      "79": "HIERARCHICAL_VERBS",
      "80": "IDENTIFYING_ATTRIBUTE",
      "81": "IDIOMATICAL_ELEMENTS",
      "82": "INFORMATION",
      "83": "INTELLECTUAL_ACTIVITY",
      "84": "INTERPERSONAL_RELATIONS",
      "85": "KIND",
      "86": "KITCHENWARE_AND_TABLEWARE",
      "87": "KNOWLEDGE_FROM_EXPERIENCE_AND_DEDUCTION",
      "88": "LACK_AND_PLENTY",
      "89": "LAWS_AND_STANDARDS",
      "90": "MANAGE_FAIL_CONDITION",
      "91": "MARKET_AS_AREA_OF_ACTIVITY",
      "92": "MENTAL_OBJECT",
      "93": "METHOD_APPROACH_TECHNIQUE",
      "94": "MODALITY",
      "95": "MONEY",
      "96": "MOTION",
      "97": "NONPRODUCTIVE_AREA",
      "98": "OBJECT_BY_FUNCTION_AND_PROPERTY",
      "99": "ORGANIZATION",
      "100": "PARTICLES",
      "101": "PART_OF_CONSTRUCTION",
      "102": "PART_OF_ORGANISM",
      "103": "PART_OF_WORLD",
      "104": "PART_OR_PORTION_OF_ENTITY",
      "105": "PERCEPTION_ACTIVITY",
      "106": "PHRASAL_PARTICLES",
      "107": "PHYSICAL_AND_BIOLOGICAL_PROPERTIES",
      "108": "PHYSICAL_OBJECT_AND_SUBSTANCE_CHAR",
      "109": "PHYSICAL_PSYCHIC_CONDITION",
      "110": "PHYSIOLOGICAL_PROCESSES",
      "111": "PLACE",
      "112": "POSITION_AS_STATUS",
      "113": "POSITION_IN_SPACE",
      "114": "POWER_RIGHT",
      "115": "PREMISES",
      "116": "PREPOSITION",
      "117": "PROBLEMS_TO_SOLVE",
      "118": "PROCESS_AND_ITS_STAGES",
      "119": "PUBLIC_AND_POLITICAL_ACTIVITY",
      "120": "RELATIVE_SPACE",
      "121": "RESULTS_OF_GIVING_INFORMATION_AND_SPEECH_ACTIVITY",
      "122": "RESULTS_OF_MAKING_DECISIONS",
      "123": "RESULT_CONSEQUENCE",
      "124": "RISK_DANGER",
      "125": "SCHEDULE_FOR_ACTIVITY",
      "126": "SCIENCE",
      "127": "SCIENTIFIC_AND_LITERARY_WORK",
      "128": "SITUATION",
      "129": "SOCIAL_CONDITIONS_OF_BEING",
      "130": "SPHERE_OF_ACTIVITY_GENERAL",
      "131": "STATE_AREA",
      "132": "STATE_OF_MIND",
      "133": "SUBSTANCE",
      "134": "SYMBOLS_FOR_INFORMATION_TRANSFER",
      "135": "TENDENCY_AND_DISPOSITION",
      "136": "TERRITORY_AREA",
      "137": "TEXT_OBJECTS_AND_DOCUMENTS",
      "138": "THE_EARTH_AND_ITS_SPATIAL_PARTS",
      "139": "THE_GOOD_BAD",
      "140": "TIME",
      "141": "TOPIC_SUBJECT",
      "142": "TOTALITY_OF_DEGREE",
      "143": "TO_ADAPT",
      "144": "TO_ADD",
      "145": "TO_ANALYSE_AND_RESEARCH",
      "146": "TO_APPROACH_COME_TO_SOME_POINT_OR_STATE",
      "147": "TO_BE_BASED",
      "148": "TO_CALL_AND_DESIGNATE",
      "149": "TO_CANCEL",
      "150": "TO_CARE_AND_BRING_UP",
      "151": "TO_CHANGE",
      "152": "TO_CHARACTERIZE",
      "153": "TO_COME_OR_TO_LEAVE_SPHERE_OF_ACTIVITY",
      "154": "TO_COMMIT",
      "155": "TO_COMMUNICATE",
      "156": "TO_COMPEL_AND_EVOKE",
      "157": "TO_CONTRIBUTE_AND_HINDER",
      "158": "TO_DECIDE",
      "159": "TO_DEVELOP",
      "160": "TO_DISAPPEAR_LOSE_GET_RID_OF",
      "161": "TO_ECONOMIZE",
      "162": "TO_EXIST",
      "163": "TO_FEEL_AND_EXPRESS_MENTAL_ATTITUDE_TO",
      "164": "TO_FLOW_IN_TIME",
      "165": "TO_GET",
      "166": "TO_GIVE",
      "167": "TO_INTERPRET",
      "168": "TO_INVOLVE",
      "169": "TO_JOIN",
      "170": "TO_KEEP_VIOLATE_NORMS",
      "171": "TO_LEARN_AND_RESEARCH",
      "172": "TO_MAKE",
      "173": "TO_MARRY_DIVORCE_ENGAGE",
      "174": "TO_MEAN",
      "175": "TO_MIX",
      "176": "TO_PARTICIPATE",
      "177": "TO_PERCEIVE",
      "178": "TO_POSSESS",
      "179": "TO_PUNISH",
      "180": "TO_REACT",
      "181": "TO_REBEL",
      "182": "TO_RESTORE",
      "183": "TO_SEEK_FIND",
      "184": "TO_SET",
      "185": "TO_SHARE",
      "186": "TO_SHOW",
      "187": "TO_TAKE",
      "188": "TO_THINK_ABOUT",
      "189": "TO_USE",
      "190": "TO_WAIT",
      "191": "TO_WORK",
      "192": "URBAN_SPACE_AND_ROADS",
      "193": "VALUABLE",
      "194": "VERBAL_COMMUNICATION",
      "195": "VISUAL_CHARACTERISTICS",
      "196": "VISUAL_REPRESENTATION",
      "197": "WORLD_OUTLOOK"
    },
    "ud_deprel": {
      "0": "acl",
      "1": "acl:cleft",
      "2": "acl:relcl",
      "3": "advcl",
      "4": "advmod",
      "5": "amod",
      "6": "appos",
      "7": "aux",
      "8": "aux:pass",
      "9": "case",
      "10": "cc",
      "11": "ccomp",
      "12": "compound:prt",
      "13": "conj",
      "14": "cop",
      "15": "csubj",
      "16": "csubj:pass",
      "17": "det",
      "18": "dislocated",
      "19": "expl",
      "20": "fixed",
      "21": "flat",
      "22": "iobj",
      "23": "mark",
      "24": "nmod",
      "25": "nmod:poss",
      "26": "nsubj",
      "27": "nsubj:pass",
      "28": "nummod",
      "29": "obj",
      "30": "obl",
      "31": "obl:agent",
      "32": "parataxis",
      "33": "punct",
      "34": "root",
      "35": "vocative",
      "36": "xcomp"
    }
  }
}
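
The `custom_pipelines` block above registers the task name `conllu-parsing` and points it at `pipeline.ConlluTokenClassificationPipeline`, so the parser can be loaded through the standard `transformers` pipeline factory with remote code enabled. A minimal sketch of that loading path ("<repo-id>" is a placeholder for the actual Hub repository name, which is not part of this commit):

from transformers import pipeline

parser = pipeline(
    "conllu-parsing",
    model="<repo-id>",
    trust_remote_code=True,  # required: the impl and model classes ship with the repo
)
print(parser("Det här är ett exempel."))

The `auto_map` entry serves the same purpose for the Auto classes, so `AutoModel.from_pretrained("<repo-id>", trust_remote_code=True)` should resolve to `modeling_parser.CobaldParser`.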
configuration.py
ADDED
@@ -0,0 +1,40 @@
from transformers import PretrainedConfig


class CobaldParserConfig(PretrainedConfig):
    model_type = "cobald_parser"

    def __init__(
        self,
        encoder_model_name: str | None = None,
        null_classifier_hidden_size: int = 0,
        lemma_classifier_hidden_size: int = 0,
        morphology_classifier_hidden_size: int = 0,
        dependency_classifier_hidden_size: int = 0,
        misc_classifier_hidden_size: int = 0,
        deepslot_classifier_hidden_size: int = 0,
        semclass_classifier_hidden_size: int = 0,
        activation: str = 'relu',
        dropout: float = 0.1,
        consecutive_null_limit: int = 0,
        vocabulary: dict[str, dict[int, str]] = {},
        **kwargs
    ):
        self.encoder_model_name = encoder_model_name
        self.null_classifier_hidden_size = null_classifier_hidden_size
        self.consecutive_null_limit = consecutive_null_limit
        self.lemma_classifier_hidden_size = lemma_classifier_hidden_size
        self.morphology_classifier_hidden_size = morphology_classifier_hidden_size
        self.dependency_classifier_hidden_size = dependency_classifier_hidden_size
        self.misc_classifier_hidden_size = misc_classifier_hidden_size
        self.deepslot_classifier_hidden_size = deepslot_classifier_hidden_size
        self.semclass_classifier_hidden_size = semclass_classifier_hidden_size
        self.activation = activation
        self.dropout = dropout
        # The serialized config stores mappings as strings,
        # e.g. {"0": "acl", "1": "conj"}, so we have to convert them to int.
        self.vocabulary = {
            column: {int(k): v for k, v in labels.items()}
            for column, labels in vocabulary.items()
        }
        super().__init__(**kwargs)
dependency_classifier.py
ADDED
@@ -0,0 +1,301 @@
from copy import deepcopy

import numpy as np

import torch
from torch import nn
from torch import Tensor, FloatTensor, BoolTensor, LongTensor
import torch.nn.functional as F

from transformers.activations import ACT2FN

from cobald_parser.bilinear_matrix_attention import BilinearMatrixAttention
from cobald_parser.chu_liu_edmonds import decode_mst
from cobald_parser.utils import pairwise_mask, replace_masked_values


class DependencyHeadBase(nn.Module):
    """
    Base class for scoring arcs and relations between tokens in a dependency tree/graph.
    """

    def __init__(self, hidden_size: int, n_rels: int):
        super().__init__()

        self.arc_attention = BilinearMatrixAttention(
            hidden_size,
            hidden_size,
            use_input_biases=True,
            n_labels=1
        )
        self.rel_attention = BilinearMatrixAttention(
            hidden_size,
            hidden_size,
            use_input_biases=True,
            n_labels=n_rels
        )

    def forward(
        self,
        h_arc_head: Tensor,       # [batch_size, seq_len, hidden_size]
        h_arc_dep: Tensor,        # ...
        h_rel_head: Tensor,       # ...
        h_rel_dep: Tensor,        # ...
        gold_arcs: LongTensor,    # [n_arcs, 4]
        null_mask: BoolTensor,    # [batch_size, seq_len]
        padding_mask: BoolTensor  # [batch_size, seq_len]
    ) -> dict[str, Tensor]:

        # Score arcs.
        # s_arc[:, i, j] = score of edge i -> j.
        s_arc = self.arc_attention(h_arc_head, h_arc_dep)
        # Mask undesirable values (padding, nulls, etc.) with -inf.
        mask2d = pairwise_mask(null_mask & padding_mask)
        replace_masked_values(s_arc, mask2d, replace_with=-1e8)
        # Score arcs' relations.
        # [batch_size, seq_len, seq_len, num_labels]
        s_rel = self.rel_attention(h_rel_head, h_rel_dep).permute(0, 2, 3, 1)

        # Calculate loss.
        loss = 0.0
        if gold_arcs is not None:
            loss += self.calc_arc_loss(s_arc, gold_arcs)
            loss += self.calc_rel_loss(s_rel, gold_arcs)

        # Predict arcs based on the scores.
        # [batch_size, seq_len, seq_len]
        pred_arcs_matrix = self.predict_arcs(s_arc, null_mask, padding_mask)
        # [batch_size, seq_len, seq_len]
        pred_rels_matrix = self.predict_rels(s_rel)
        # [n_pred_arcs, 4]
        preds_combined = self.combine_arcs_rels(pred_arcs_matrix, pred_rels_matrix)
        return {
            'preds': preds_combined,
            'loss': loss
        }

    @staticmethod
    def calc_arc_loss(
        s_arc: Tensor,         # [batch_size, seq_len, seq_len]
        gold_arcs: LongTensor  # [n_arcs, 4]
    ) -> Tensor:
        """Calculate arc loss."""
        raise NotImplementedError

    @staticmethod
    def calc_rel_loss(
        s_rel: Tensor,         # [batch_size, seq_len, seq_len, num_labels]
        gold_arcs: LongTensor  # [n_arcs, 4]
    ) -> Tensor:
        batch_idxs, arcs_from, arcs_to, rels = gold_arcs.T
        return F.cross_entropy(s_rel[batch_idxs, arcs_from, arcs_to], rels)

    def predict_arcs(
        self,
        s_arc: Tensor,            # [batch_size, seq_len, seq_len]
        null_mask: BoolTensor,    # [batch_size, seq_len]
        padding_mask: BoolTensor  # [batch_size, seq_len]
    ) -> LongTensor:
        """Predict arcs from scores."""
        raise NotImplementedError

    def predict_rels(
        self,
        s_rel: FloatTensor
    ) -> LongTensor:
        return s_rel.argmax(dim=-1).long()

    @staticmethod
    def combine_arcs_rels(
        pred_arcs: LongTensor,
        pred_rels: LongTensor
    ) -> LongTensor:
        """Select relations towards predicted arcs."""
        assert pred_arcs.shape == pred_rels.shape
        # Get indices where arcs exist
        indices = pred_arcs.nonzero(as_tuple=True)
        batch_idxs, from_idxs, to_idxs = indices
        # Get corresponding relation types
        rel_types = pred_rels[batch_idxs, from_idxs, to_idxs]
        # Stack as [batch_idx, from_idx, to_idx, rel_type]
        return torch.stack([batch_idxs, from_idxs, to_idxs, rel_types], dim=1)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class DependencyHead(DependencyHeadBase):
|
| 125 |
+
"""
|
| 126 |
+
Basic UD syntax specialization that predicts single edge for each token.
|
| 127 |
+
"""
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def predict_arcs(
|
| 131 |
+
self,
|
| 132 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 133 |
+
null_mask: BoolTensor, # [batch_size, seq_len]
|
| 134 |
+
padding_mask: BoolTensor # [batch_size, seq_len, seq_len]
|
| 135 |
+
) -> Tensor:
|
| 136 |
+
|
| 137 |
+
if self.training:
|
| 138 |
+
# During training, use fast greedy decoding.
|
| 139 |
+
# - [batch_size, seq_len]
|
| 140 |
+
pred_arcs_seq = s_arc.argmax(dim=1)
|
| 141 |
+
else:
|
| 142 |
+
# FIXME
|
| 143 |
+
# During inference, decode Maximum Spanning Tree.
|
| 144 |
+
# pred_arcs_seq = self._mst_decode(s_arc, padding_mask)
|
| 145 |
+
pred_arcs_seq = s_arc.argmax(dim=1)
|
| 146 |
+
|
| 147 |
+
# Upscale arcs sequence of shape [batch_size, seq_len]
|
| 148 |
+
# to matrix of shape [batch_size, seq_len, seq_len].
|
| 149 |
+
pred_arcs = F.one_hot(pred_arcs_seq, num_classes=pred_arcs_seq.size(1)).long().transpose(1, 2)
|
| 150 |
+
# Apply mask one more time (even though s_arc is already masked),
|
| 151 |
+
# because argmax erases information about masked values.
|
| 152 |
+
mask2d = pairwise_mask(null_mask & padding_mask)
|
| 153 |
+
replace_masked_values(pred_arcs, mask2d, replace_with=0)
|
| 154 |
+
return pred_arcs
|
| 155 |
+
|
| 156 |
+
def _mst_decode(
|
| 157 |
+
self,
|
| 158 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 159 |
+
padding_mask: Tensor
|
| 160 |
+
) -> tuple[Tensor, Tensor]:
|
| 161 |
+
|
| 162 |
+
batch_size = s_arc.size(0)
|
| 163 |
+
device = s_arc.device
|
| 164 |
+
s_arc = s_arc.cpu()
|
| 165 |
+
|
| 166 |
+
# Convert scores to probabilities, as `decode_mst` expects non-negative values.
|
| 167 |
+
arc_probs = nn.functional.softmax(s_arc, dim=1)
|
| 168 |
+
|
| 169 |
+
# `decode_mst` knows nothing about UD and ROOT, so we have to manually
|
| 170 |
+
# zero probabilities of arcs leading to ROOT to make sure ROOT is a source node
|
| 171 |
+
# of a graph.
|
| 172 |
+
|
| 173 |
+
# Decode ROOT positions from diagonals.
|
| 174 |
+
# shape: [batch_size]
|
| 175 |
+
root_idxs = arc_probs.diagonal(dim1=1, dim2=2).argmax(dim=-1)
|
| 176 |
+
# Zero out arcs leading to ROOTs.
|
| 177 |
+
arc_probs[torch.arange(batch_size), :, root_idxs] = 0.0
|
| 178 |
+
|
| 179 |
+
pred_arcs = []
|
| 180 |
+
for sample_idx in range(batch_size):
|
| 181 |
+
energy = arc_probs[sample_idx]
|
| 182 |
+
length = padding_mask[sample_idx].sum()
|
| 183 |
+
heads = decode_mst(energy, length)
|
| 184 |
+
# Some nodes may be isolated. Pick heads greedily in this case.
|
| 185 |
+
heads[heads <= 0] = s_arc[sample_idx].argmax(dim=1)[heads <= 0]
|
| 186 |
+
pred_arcs.append(heads)
|
| 187 |
+
|
| 188 |
+
# shape: [batch_size, seq_len]
|
| 189 |
+
pred_arcs = torch.from_numpy(np.stack(pred_arcs)).long().to(device)
|
| 190 |
+
return pred_arcs
|
| 191 |
+
|
| 192 |
+
@staticmethod
|
| 193 |
+
def calc_arc_loss(
|
| 194 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 195 |
+
gold_arcs: LongTensor # [n_arcs, 4]
|
| 196 |
+
) -> tuple[Tensor, Tensor]:
|
| 197 |
+
batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
|
| 198 |
+
return F.cross_entropy(s_arc[batch_idxs, :, to_idxs], from_idxs)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class MultiDependencyHead(DependencyHeadBase):
|
| 202 |
+
"""
|
| 203 |
+
Enhanced UD syntax specialization that predicts multiple edges for each token.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
def predict_arcs(
|
| 207 |
+
self,
|
| 208 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 209 |
+
null_mask: BoolTensor, # [batch_size, seq_len]
|
| 210 |
+
padding_mask: BoolTensor # [batch_size, seq_len]
|
| 211 |
+
) -> Tensor:
|
| 212 |
+
# Convert scores to probabilities.
|
| 213 |
+
arc_probs = torch.sigmoid(s_arc)
|
| 214 |
+
# Find confident arcs (with prob > 0.5).
|
| 215 |
+
return arc_probs.round().long()
|
| 216 |
+
|
| 217 |
+
@staticmethod
|
| 218 |
+
def calc_arc_loss(
|
| 219 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 220 |
+
gold_arcs: LongTensor # [n_arcs, 4]
|
| 221 |
+
) -> Tensor:
|
| 222 |
+
batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
|
| 223 |
+
# Gold arcs but as a matrix, where matrix[i, arcs_from, arc_to] = 1.0 if arcs is present.
|
| 224 |
+
gold_arcs_matrix = torch.zeros_like(s_arc)
|
| 225 |
+
gold_arcs_matrix[batch_idxs, from_idxs, to_idxs] = 1.0
|
| 226 |
+
# Padded arcs's logits are huge negative values that doesn't contribute to the loss.
|
| 227 |
+
return F.binary_cross_entropy_with_logits(s_arc, gold_arcs_matrix)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
class DependencyClassifier(nn.Module):
|
| 231 |
+
"""
|
| 232 |
+
Dozat and Manning's biaffine dependency classifier.
|
| 233 |
+
"""
|
| 234 |
+
|
| 235 |
+
def __init__(
|
| 236 |
+
self,
|
| 237 |
+
input_size: int,
|
| 238 |
+
hidden_size: int,
|
| 239 |
+
n_rels_ud: int,
|
| 240 |
+
n_rels_eud: int,
|
| 241 |
+
activation: str,
|
| 242 |
+
dropout: float,
|
| 243 |
+
):
|
| 244 |
+
super().__init__()
|
| 245 |
+
|
| 246 |
+
self.arc_dep_mlp = nn.Sequential(
|
| 247 |
+
nn.Dropout(dropout),
|
| 248 |
+
nn.Linear(input_size, hidden_size),
|
| 249 |
+
ACT2FN[activation],
|
| 250 |
+
nn.Dropout(dropout)
|
| 251 |
+
)
|
| 252 |
+
# All mlps are equal.
|
| 253 |
+
self.arc_head_mlp = deepcopy(self.arc_dep_mlp)
|
| 254 |
+
self.rel_dep_mlp = deepcopy(self.arc_dep_mlp)
|
| 255 |
+
self.rel_head_mlp = deepcopy(self.arc_dep_mlp)
|
| 256 |
+
|
| 257 |
+
self.dependency_head_ud = DependencyHead(hidden_size, n_rels_ud)
|
| 258 |
+
self.dependency_head_eud = MultiDependencyHead(hidden_size, n_rels_eud)
|
| 259 |
+
|
| 260 |
+
def forward(
|
| 261 |
+
self,
|
| 262 |
+
embeddings: Tensor, # [batch_size, seq_len, embedding_size]
|
| 263 |
+
gold_ud: Tensor, # [n_ud_arcs, 4]
|
| 264 |
+
gold_eud: Tensor, # [n_eud_arcs, 4]
|
| 265 |
+
null_mask: Tensor, # [batch_size, seq_len]
|
| 266 |
+
padding_mask: Tensor # [batch_size, seq_len]
|
| 267 |
+
) -> dict[str, Tensor]:
|
| 268 |
+
|
| 269 |
+
# - [batch_size, seq_len, hidden_size]
|
| 270 |
+
h_arc_head = self.arc_head_mlp(embeddings)
|
| 271 |
+
h_arc_dep = self.arc_dep_mlp(embeddings)
|
| 272 |
+
h_rel_head = self.rel_head_mlp(embeddings)
|
| 273 |
+
h_rel_dep = self.rel_dep_mlp(embeddings)
|
| 274 |
+
|
| 275 |
+
# Share the h vectors between dependency and multi-dependency heads.
|
| 276 |
+
output_ud = self.dependency_head_ud(
|
| 277 |
+
h_arc_head,
|
| 278 |
+
h_arc_dep,
|
| 279 |
+
h_rel_head,
|
| 280 |
+
h_rel_dep,
|
| 281 |
+
gold_arcs=gold_ud,
|
| 282 |
+
null_mask=null_mask,
|
| 283 |
+
padding_mask=padding_mask
|
| 284 |
+
)
|
| 285 |
+
output_eud = self.dependency_head_eud(
|
| 286 |
+
h_arc_head,
|
| 287 |
+
h_arc_dep,
|
| 288 |
+
h_rel_head,
|
| 289 |
+
h_rel_dep,
|
| 290 |
+
gold_arcs=gold_eud,
|
| 291 |
+
# Ignore null mask in E-UD
|
| 292 |
+
null_mask=torch.ones_like(padding_mask),
|
| 293 |
+
padding_mask=padding_mask
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
return {
|
| 297 |
+
'preds_ud': output_ud["preds"],
|
| 298 |
+
'preds_eud': output_eud["preds"],
|
| 299 |
+
'loss_ud': output_ud["loss"],
|
| 300 |
+
'loss_eud': output_eud["loss"]
|
| 301 |
+
}
|
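A minimal shape-check sketch of how `DependencyClassifier` is meant to be called (illustration only, not part of the uploaded files; the sizes are placeholders, the import path is assumed, and the `gold_*` rows follow the `[batch_idx, from_idx, to_idx, rel_id]` layout used above):

import torch
# Assumed import path; adjust to wherever dependency_classifier.py lives.
from cobald_parser.dependency_classifier import DependencyClassifier

clf = DependencyClassifier(
    input_size=768, hidden_size=128,
    n_rels_ud=40, n_rels_eud=50,
    activation="gelu", dropout=0.1
)
embeddings = torch.randn(2, 10, 768)                  # [batch_size, seq_len, embedding_size]
gold_ud = torch.tensor([[0, 3, 1, 7], [1, 0, 2, 5]])  # [n_arcs, 4]: (batch, from, to, rel)
gold_eud = gold_ud.clone()
null_mask = torch.ones(2, 10, dtype=torch.bool)
padding_mask = torch.ones(2, 10, dtype=torch.bool)

out = clf(embeddings, gold_ud, gold_eud, null_mask, padding_mask)
# out["preds_ud"] / out["preds_eud"]: [n_pred_arcs, 4]
# out["loss_ud"] / out["loss_eud"]: scalar losses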
encoder.py
ADDED
@@ -0,0 +1,109 @@
import torch
from torch import nn
from torch import Tensor, LongTensor

from transformers import AutoTokenizer, AutoModel


class WordTransformerEncoder(nn.Module):
    """
    Encodes sentences into word-level embeddings using a pretrained MLM transformer.
    """
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # A model like BERT, RoBERTa, etc.
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, words: list[list[str]]) -> Tensor:
        """
        Build word embeddings.

        - Tokenizes input sentences into subtokens.
        - Passes the subtokens through the pre-trained transformer model.
        - Aggregates subtoken embeddings into word embeddings using mean pooling.
        """
        batch_size = len(words)

        # BPE tokenization: split words into subtokens, e.g. ['kidding'] -> ['▁ki', 'dding'].
        subtokens = self.tokenizer(
            words,
            padding=True,
            truncation=True,
            is_split_into_words=True,
            return_tensors='pt'
        )
        subtokens = subtokens.to(self.model.device)
        # Index words from 1 and reserve 0 for special subtokens (e.g. <s>, </s>, padding, etc.).
        # This numeration makes the following aggregation easier.
        words_ids = torch.stack([
            torch.tensor(
                [word_id + 1 if word_id is not None else 0 for word_id in subtokens.word_ids(batch_idx)],
                dtype=torch.long,
                device=self.model.device
            )
            for batch_idx in range(batch_size)
        ])

        # Run the model and extract subtoken embeddings from the last layer.
        subtokens_embeddings = self.model(**subtokens).last_hidden_state

        # Aggregate subtoken embeddings into word embeddings.
        # [batch_size, n_words, embedding_size]
        words_embeddings = self._aggregate_subtokens_embeddings(subtokens_embeddings, words_ids)
        return words_embeddings

    def _aggregate_subtokens_embeddings(
        self,
        subtokens_embeddings: Tensor,  # [batch_size, n_subtokens, embedding_size]
        words_ids: LongTensor          # [batch_size, n_subtokens]
    ) -> Tensor:
        """
        Aggregate subtoken embeddings into word embeddings by averaging.

        This method ensures that multiple subtokens corresponding to a single word are combined
        into a single embedding.
        """
        batch_size, n_subtokens, embedding_size = subtokens_embeddings.shape
        # The number of words in a sentence plus an "auxiliary" word in the beginning.
        n_words = torch.max(words_ids) + 1

        words_embeddings = torch.zeros(
            size=(batch_size, n_words, embedding_size),
            dtype=subtokens_embeddings.dtype,
            device=self.model.device
        )
        words_ids_expanded = words_ids.unsqueeze(-1).expand(batch_size, n_subtokens, embedding_size)

        # Use scatter_reduce_ to average embeddings of subtokens corresponding to the same word.
        # All the padding and special subtokens will be aggregated into the "auxiliary" first
        # embedding, namely into words_embeddings[:, 0, :].
        words_embeddings.scatter_reduce_(
            dim=1,
            index=words_ids_expanded,
            src=subtokens_embeddings,
            reduce="mean",
            include_self=False
        )
        # Now remove the auxiliary word in the beginning.
        words_embeddings = words_embeddings[:, 1:, :]
        return words_embeddings

    def get_embedding_size(self) -> int:
        """Returns the embedding size of the transformer model, e.g. 768 for BERT."""
        return self.model.config.hidden_size

    def get_embeddings_layer(self):
        """Returns the embeddings module."""
        return self.model.embeddings

    def get_transformer_layers(self) -> list[nn.Module]:
        """
        Return a flat list of all transformer *block* layers, excluding embeddings, poolers, etc.
        """
        layers = []
        for sub in self.model.modules():
            # Find all ModuleLists (these hold the actual block layers).
            if isinstance(sub, nn.ModuleList):
                layers.extend(list(sub))
        return layers
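A minimal usage sketch of the encoder (illustration only, not part of the uploaded files; the checkpoint name is a placeholder and the import path is assumed):

from cobald_parser.encoder import WordTransformerEncoder  # assumed path

encoder = WordTransformerEncoder("bert-base-uncased")
sentences = [["The", "cat", "sleeps", "."], ["Hi", "!"]]
embeddings = encoder(sentences)
# [2, 4, 768]: padded to the longest sentence; one mean-pooled vector per word,
# with zero vectors left at padded word positions.
print(embeddings.shape, encoder.get_embedding_size())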
mlp_classifier.py
ADDED
@@ -0,0 +1,46 @@
import torch
from torch import nn
from torch import Tensor, LongTensor

from transformers.activations import ACT2FN


class MlpClassifier(nn.Module):
    """Simple feed-forward multilayer perceptron classifier."""

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        n_classes: int,
        activation: str,
        dropout: float,
        class_weights: list[float] = None,
    ):
        super().__init__()

        self.n_classes = n_classes
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, hidden_size),
            ACT2FN[activation],
            nn.Dropout(dropout),
            nn.Linear(hidden_size, n_classes)
        )
        if class_weights is not None:
            # CrossEntropyLoss expects floating-point class weights.
            class_weights = torch.tensor(class_weights, dtype=torch.float)
        self.cross_entropy = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, embeddings: Tensor, labels: LongTensor = None) -> dict:
        logits = self.classifier(embeddings)
        # Calculate loss.
        loss = 0.0
        if labels is not None:
            # Reshape tensors to match the expected dimensions.
            loss = self.cross_entropy(
                logits.view(-1, self.n_classes),
                labels.view(-1)
            )
        # Predictions.
        preds = logits.argmax(dim=-1)
        return {'preds': preds, 'loss': loss}
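A minimal usage sketch (illustration only, not part of the uploaded files; import path and sizes are placeholders):

import torch
from cobald_parser.mlp_classifier import MlpClassifier  # assumed path

clf = MlpClassifier(input_size=768, hidden_size=256, n_classes=10,
                    activation="relu", dropout=0.1)
embeddings = torch.randn(2, 5, 768)    # [batch_size, seq_len, input_size]
labels = torch.randint(0, 10, (2, 5))  # [batch_size, seq_len]
out = clf(embeddings, labels)
# out["preds"]: [2, 5] argmax class ids; out["loss"]: scalar cross-entropy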
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ce1af3c99b565ace5df9845a3b3531c6161957e0952cba24b236bdc24583d6f
size 1134190536
modeling_parser.py
ADDED
@@ -0,0 +1,171 @@
from torch import nn
from torch import LongTensor
from transformers import PreTrainedModel

from .configuration import CobaldParserConfig
from .encoder import WordTransformerEncoder
from .mlp_classifier import MlpClassifier
from .dependency_classifier import DependencyClassifier
from .utils import (
    build_padding_mask,
    build_null_mask,
    prepend_cls,
    remove_nulls,
    add_nulls
)


class CobaldParser(PreTrainedModel):
    """Morpho-Syntax-Semantic Parser."""

    config_class = CobaldParserConfig

    def __init__(self, config: CobaldParserConfig):
        super().__init__(config)

        self.encoder = WordTransformerEncoder(
            model_name=config.encoder_model_name
        )
        embedding_size = self.encoder.get_embedding_size()

        self.classifiers = nn.ModuleDict()
        self.classifiers["null"] = MlpClassifier(
            input_size=embedding_size,
            hidden_size=config.null_classifier_hidden_size,
            n_classes=config.consecutive_null_limit + 1,
            activation=config.activation,
            dropout=config.dropout
        )
        if "lemma_rule" in config.vocabulary:
            self.classifiers["lemma_rule"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.lemma_classifier_hidden_size,
                n_classes=len(config.vocabulary["lemma_rule"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "joint_feats" in config.vocabulary:
            self.classifiers["joint_feats"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.morphology_classifier_hidden_size,
                n_classes=len(config.vocabulary["joint_feats"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "ud_deprel" in config.vocabulary or "eud_deprel" in config.vocabulary:
            self.classifiers["syntax"] = DependencyClassifier(
                input_size=embedding_size,
                hidden_size=config.dependency_classifier_hidden_size,
                n_rels_ud=len(config.vocabulary["ud_deprel"]),
                n_rels_eud=len(config.vocabulary["eud_deprel"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "misc" in config.vocabulary:
            self.classifiers["misc"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.misc_classifier_hidden_size,
                n_classes=len(config.vocabulary["misc"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "deepslot" in config.vocabulary:
            self.classifiers["deepslot"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.deepslot_classifier_hidden_size,
                n_classes=len(config.vocabulary["deepslot"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "semclass" in config.vocabulary:
            self.classifiers["semclass"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.semclass_classifier_hidden_size,
                n_classes=len(config.vocabulary["semclass"]),
                activation=config.activation,
                dropout=config.dropout
            )

    def forward(
        self,
        words: list[list[str]],
        counting_masks: LongTensor = None,
        lemma_rules: LongTensor = None,
        joint_feats: LongTensor = None,
        deps_ud: LongTensor = None,
        deps_eud: LongTensor = None,
        miscs: LongTensor = None,
        deepslots: LongTensor = None,
        semclasses: LongTensor = None,
        sent_ids: list[str] = None,
        texts: list[str] = None,
        inference_mode: bool = False
    ) -> dict:
        output = {}

        # The extra [CLS] token accounts for the case when #NULL is the first token in a sentence.
        words_with_cls = prepend_cls(words)
        words_without_nulls = remove_nulls(words_with_cls)
        # Embeddings of words without nulls.
        embeddings_without_nulls = self.encoder(words_without_nulls)
        # Predict nulls.
        null_output = self.classifiers["null"](embeddings_without_nulls, counting_masks)
        output["counting_mask"] = null_output['preds']
        output["loss"] = null_output["loss"]

        # "Teacher forcing": during training, pass the original words (with gold nulls)
        # to the classification heads, so that they are trained on correct sentences.
        if inference_mode:
            # Restore predicted nulls in the original sentences.
            output["words"] = add_nulls(words, null_output["preds"])
        else:
            output["words"] = words

        # Encode words with nulls.
        # [batch_size, seq_len, embedding_size]
        embeddings = self.encoder(output["words"])

        # Predict lemmas and morphological features.
        if "lemma_rule" in self.classifiers:
            lemma_output = self.classifiers["lemma_rule"](embeddings, lemma_rules)
            output["lemma_rules"] = lemma_output['preds']
            output["loss"] += lemma_output['loss']

        if "joint_feats" in self.classifiers:
            joint_feats_output = self.classifiers["joint_feats"](embeddings, joint_feats)
            output["joint_feats"] = joint_feats_output['preds']
            output["loss"] += joint_feats_output['loss']

        # Predict syntax.
        if "syntax" in self.classifiers:
            padding_mask = build_padding_mask(output["words"], self.device)
            null_mask = build_null_mask(output["words"], self.device)
            deps_output = self.classifiers["syntax"](
                embeddings,
                deps_ud,
                deps_eud,
                null_mask,
                padding_mask
            )
            output["deps_ud"] = deps_output['preds_ud']
            output["deps_eud"] = deps_output['preds_eud']
            output["loss"] += deps_output['loss_ud'] + deps_output['loss_eud']

        # Predict miscellaneous features.
        if "misc" in self.classifiers:
            misc_output = self.classifiers["misc"](embeddings, miscs)
            output["miscs"] = misc_output['preds']
            output["loss"] += misc_output['loss']

        # Predict semantics.
        if "deepslot" in self.classifiers:
            deepslot_output = self.classifiers["deepslot"](embeddings, deepslots)
            output["deepslots"] = deepslot_output['preds']
            output["loss"] += deepslot_output['loss']

        if "semclass" in self.classifiers:
            semclass_output = self.classifiers["semclass"](embeddings, semclasses)
            output["semclasses"] = semclass_output['preds']
            output["loss"] += semclass_output['loss']

        return output
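A rough sketch of the inference entry point (illustration only, not part of the uploaded files; the config field values are placeholders, the import paths are assumed, and it assumes `CobaldParserConfig` accepts these fields as keyword arguments):

from cobald_parser.configuration import CobaldParserConfig  # assumed paths
from cobald_parser.modeling_parser import CobaldParser

config = CobaldParserConfig(
    encoder_model_name="bert-base-uncased",
    null_classifier_hidden_size=256,
    consecutive_null_limit=3,
    activation="gelu",
    dropout=0.1,
    vocabulary={}  # no optional heads configured: only null prediction runs
)
parser = CobaldParser(config)
output = parser(words=[["Hello", "world", "!"]], inference_mode=True)
# output["words"] is the sentence with predicted #NULL tokens restored;
# output["counting_mask"] holds the per-position null counts.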
pipeline.py
ADDED
@@ -0,0 +1,236 @@
from transformers import Pipeline

from src.lemmatize_helper import reconstruct_lemma


class ConlluTokenClassificationPipeline(Pipeline):
    def __init__(
        self,
        model,
        tokenizer: callable = None,
        sentenizer: callable = None,
        **kwargs
    ):
        super().__init__(model=model, **kwargs)
        self.tokenizer = tokenizer
        self.sentenizer = sentenizer

    #@override
    def _sanitize_parameters(self, output_format: str = 'list', **kwargs):
        if output_format not in ['list', 'str']:
            raise ValueError(
                f"output_format must be 'str' or 'list', not {output_format}"
            )
        # Capture output_format for postprocessing.
        return {}, {}, {'output_format': output_format}

    def preprocess(self, inputs: str) -> dict:
        if not isinstance(inputs, str):
            raise ValueError("pipeline input must be a string (text)")

        sentences = [sentence for sentence in self.sentenizer(inputs)]
        words = [
            [word for word in self.tokenizer(sentence)]
            for sentence in sentences
        ]
        # Stash for later post-processing.
        self._texts = sentences
        return {"words": words}

    def _forward(self, model_inputs: dict) -> dict:
        return self.model(**model_inputs, inference_mode=True)

    #@override
    def postprocess(self, model_outputs: dict, output_format: str) -> list[dict] | str:
        sentences = self._decode_model_output(model_outputs)
        # Format sentences into a CoNLL-U string if requested.
        if output_format == 'str':
            sentences = self._format_as_conllu(sentences)
        return sentences

    def _decode_model_output(self, model_outputs: dict) -> list[dict]:
        n_sentences = len(model_outputs["words"])

        sentences_decoded = []
        for i in range(n_sentences):

            def select_arcs(arcs, batch_idx):
                # Select arcs whose batch index == batch_idx.
                # Returns a tensor of shape [n_selected_arcs, 3].
                return arcs[arcs[:, 0] == batch_idx][:, 1:]

            # Model outputs are padded tensors, so only keep the first `n_words` labels.
            n_words = len(model_outputs["words"][i])

            optional_tags = {}
            if "lemma_rules" in model_outputs:
                optional_tags["lemma_rule_ids"] = model_outputs["lemma_rules"][i, :n_words].tolist()
            if "joint_feats" in model_outputs:
                optional_tags["joint_feats_ids"] = model_outputs["joint_feats"][i, :n_words].tolist()
            if "deps_ud" in model_outputs:
                optional_tags["deps_ud"] = select_arcs(model_outputs["deps_ud"], i).tolist()
            if "deps_eud" in model_outputs:
                optional_tags["deps_eud"] = select_arcs(model_outputs["deps_eud"], i).tolist()
            if "miscs" in model_outputs:
                optional_tags["misc_ids"] = model_outputs["miscs"][i, :n_words].tolist()
            if "deepslots" in model_outputs:
                optional_tags["deepslot_ids"] = model_outputs["deepslots"][i, :n_words].tolist()
            if "semclasses" in model_outputs:
                optional_tags["semclass_ids"] = model_outputs["semclasses"][i, :n_words].tolist()

            sentence_decoded = self._decode_sentence(
                text=self._texts[i],
                words=model_outputs["words"][i],
                **optional_tags,
            )
            sentences_decoded.append(sentence_decoded)
        return sentences_decoded

    def _decode_sentence(
        self,
        text: str,
        words: list[str],
        lemma_rule_ids: list[int] = None,
        joint_feats_ids: list[int] = None,
        deps_ud: list[list[int]] = None,
        deps_eud: list[list[int]] = None,
        misc_ids: list[int] = None,
        deepslot_ids: list[int] = None,
        semclass_ids: list[int] = None
    ) -> dict:

        # Enumerate words in the sentence, starting from 1.
        ids = self._enumerate_words(words)

        result = {
            "text": text,
            "words": words,
            "ids": ids
        }

        # Decode lemmas.
        if lemma_rule_ids:
            result["lemmas"] = [
                reconstruct_lemma(
                    word,
                    self.model.config.vocabulary["lemma_rule"][lemma_rule_id]
                )
                for word, lemma_rule_id in zip(words, lemma_rule_ids, strict=True)
            ]
        # Decode POS and features.
        if joint_feats_ids:
            upos, xpos, feats = zip(
                *[
                    self.model.config.vocabulary["joint_feats"][joint_feats_id].split('#')
                    for joint_feats_id in joint_feats_ids
                ],
                strict=True
            )
            result["upos"] = list(upos)
            result["xpos"] = list(xpos)
            result["feats"] = list(feats)
        # Decode syntax.
        renumerate_and_decode_arcs = lambda arcs, id2rel: [
            (
                # `ids` stores the inverse mapping from the internal numeration to the standard
                # CoNLL-U numeration, so simply use ids[internal_idx] to retrieve the token id
                # for an internal index.
                ids[arc_from] if arc_from != arc_to else '0',
                ids[arc_to],
                id2rel[deprel_id]
            )
            for arc_from, arc_to, deprel_id in arcs
        ]
        if deps_ud:
            result["deps_ud"] = renumerate_and_decode_arcs(
                deps_ud,
                self.model.config.vocabulary["ud_deprel"]
            )
        if deps_eud:
            result["deps_eud"] = renumerate_and_decode_arcs(
                deps_eud,
                self.model.config.vocabulary["eud_deprel"]
            )
        # Decode misc.
        if misc_ids:
            result["miscs"] = [
                self.model.config.vocabulary["misc"][misc_id]
                for misc_id in misc_ids
            ]
        # Decode semantics.
        if deepslot_ids:
            result["deepslots"] = [
                self.model.config.vocabulary["deepslot"][deepslot_id]
                for deepslot_id in deepslot_ids
            ]
        if semclass_ids:
            result["semclasses"] = [
                self.model.config.vocabulary["semclass"][semclass_id]
                for semclass_id in semclass_ids
            ]
        return result

    @staticmethod
    def _enumerate_words(words: list[str]) -> list[str]:
        ids = []
        current_id = 0
        current_null_count = 0
        for word in words:
            if word == "#NULL":
                current_null_count += 1
                ids.append(f"{current_id}.{current_null_count}")
            else:
                current_id += 1
                current_null_count = 0
                ids.append(f"{current_id}")
        return ids

    @staticmethod
    def _format_as_conllu(sentences: list[dict]) -> str:
        """
        Format a list of sentence dicts into a CoNLL-U formatted string.
        """
        formatted = []
        for sentence in sentences:
            # The first line is the text metadata.
            lines = [f"# text = {sentence['text']}"]

            id2idx = {token_id: idx for idx, token_id in enumerate(sentence['ids'])}

            # Basic syntax.
            heads = [''] * len(id2idx)
            deprels = [''] * len(id2idx)
            if "deps_ud" in sentence:
                for arc_from, arc_to, deprel in sentence['deps_ud']:
                    token_idx = id2idx[arc_to]
                    heads[token_idx] = arc_from
                    deprels[token_idx] = deprel

            # Enhanced syntax.
            deps_dicts = [{} for _ in range(len(id2idx))]
            if "deps_eud" in sentence:
                for arc_from, arc_to, deprel in sentence['deps_eud']:
                    token_idx = id2idx[arc_to]
                    deps_dicts[token_idx][arc_from] = deprel

            for idx, token_id in enumerate(sentence['ids']):
                word = sentence['words'][idx]
                lemma = sentence['lemmas'][idx] if "lemmas" in sentence else ''
                upos = sentence['upos'][idx] if "upos" in sentence else ''
                xpos = sentence['xpos'][idx] if "xpos" in sentence else ''
                feats = sentence['feats'][idx] if "feats" in sentence else ''
                deps = '|'.join(f"{head}:{rel}" for head, rel in deps_dicts[idx].items()) or '_'
                misc = sentence['miscs'][idx] if "miscs" in sentence else ''
                deepslot = sentence['deepslots'][idx] if "deepslots" in sentence else ''
                semclass = sentence['semclasses'][idx] if "semclasses" in sentence else ''
                # CoNLL-U columns.
                line = '\t'.join([
                    token_id, word, lemma, upos, xpos, feats, heads[idx],
                    deprels[idx], deps, misc, deepslot, semclass
                ])
                lines.append(line)
            formatted.append('\n'.join(lines))
        return '\n\n'.join(formatted)
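A usage sketch of the pipeline (illustration only, not part of the uploaded files; it assumes a `parser` instance from modeling_parser.py, and uses razdel's `sentenize`/`tokenize` purely as an example, since any callables that split text into sentences and sentences into words will do):

from razdel import sentenize, tokenize

pipe = ConlluTokenClassificationPipeline(
    model=parser,
    tokenizer=lambda sentence: [t.text for t in tokenize(sentence)],
    sentenizer=lambda text: [s.text for s in sentenize(text)],
)
conllu = pipe("Hello world. How are you?", output_format="str")
print(conllu)  # one "# text = ..." block per sentence, 12 tab-separated columns per token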
utils.py
ADDED
@@ -0,0 +1,69 @@
import torch
from torch import Tensor


def pad_sequences(sequences: list[Tensor], padding_value: int) -> Tensor:
    """
    Stack 1d tensors (sequences) into a single 2d tensor so that each sequence is padded on the
    right.
    """
    return torch.nn.utils.rnn.pad_sequence(sequences, padding_value=padding_value, batch_first=True)


def _build_condition_mask(sentences: list[list[str]], condition_fn: callable, device) -> Tensor:
    masks = [
        torch.tensor([condition_fn(word) for word in sentence], dtype=torch.bool, device=device)
        for sentence in sentences
    ]
    return pad_sequences(masks, padding_value=False)

def build_padding_mask(sentences: list[list[str]], device) -> Tensor:
    return _build_condition_mask(sentences, condition_fn=lambda word: True, device=device)

def build_null_mask(sentences: list[list[str]], device) -> Tensor:
    return _build_condition_mask(sentences, condition_fn=lambda word: word != "#NULL", device=device)


def pairwise_mask(masks1d: Tensor) -> Tensor:
    """
    Calculate the outer product of a mask, i.e. masks2d[:, i, j] = masks1d[:, i] & masks1d[:, j].
    """
    return masks1d[:, None, :] & masks1d[:, :, None]


# Credits: https://docs.allennlp.org/main/api/nn/util/#replace_masked_values
def replace_masked_values(tensor: Tensor, mask: Tensor, replace_with: float):
    """
    Replace all masked values in tensor with `replace_with`.
    """
    assert tensor.dim() == mask.dim(), f"tensor.dim() of {tensor.dim()} != mask.dim() of {mask.dim()}"
    tensor.masked_fill_(~mask, replace_with)


def prepend_cls(sentences: list[list[str]]) -> list[list[str]]:
    """
    Return a copy of sentences with a [CLS] token prepended.
    """
    return [["[CLS]", *sentence] for sentence in sentences]

def remove_nulls(sentences: list[list[str]]) -> list[list[str]]:
    """
    Return a copy of sentences with nulls removed.
    """
    return [[word for word in sentence if word != "#NULL"] for sentence in sentences]

def add_nulls(sentences: list[list[str]], counting_masks) -> list[list[str]]:
    """
    Return a copy of sentences with nulls restored according to the counting masks.
    """
    sentences_with_nulls = []
    for sentence, counting_mask in zip(sentences, counting_masks, strict=True):
        sentence_with_nulls = []
        assert 0 < len(counting_mask)
        # Account for the leading (CLS) auxiliary token.
        sentence_with_nulls.extend(["#NULL"] * counting_mask[0])
        for word, n_nulls_to_insert in zip(sentence, counting_mask[1:], strict=True):
            sentence_with_nulls.append(word)
            sentence_with_nulls.extend(["#NULL"] * n_nulls_to_insert)
        sentences_with_nulls.append(sentence_with_nulls)
    return sentences_with_nulls
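A small worked example of the counting-mask convention used by `add_nulls` (illustration only, not part of the uploaded files): index 0 counts nulls before the first word (the slot covered by the prepended [CLS] token), and index i+1 counts nulls inserted after word i.

sentences = [["Do", "you", "like", "it", "?"]]
# Six counts for five words: [before first word, after "Do", after "you", ...]
counting_masks = [[0, 0, 0, 1, 0, 0]]
print(add_nulls(sentences, counting_masks))
# [['Do', 'you', 'like', '#NULL', 'it', '?']]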