Upload ConlluTokenClassificationPipeline
- README.md +199 -0
- config.json +773 -0
- configuration.py +40 -0
- dependency_classifier.py +301 -0
- encoder.py +109 -0
- mlp_classifier.py +46 -0
- model.safetensors +3 -0
- modeling_parser.py +171 -0
- pipeline.py +236 -0
- utils.py +69 -0
README.md
ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
config.json
ADDED
@@ -0,0 +1,773 @@
{
  "activation": "relu",
  "architectures": [
    "CobaldParser"
  ],
  "auto_map": {
    "AutoConfig": "configuration.CobaldParserConfig",
    "AutoModel": "modeling_parser.CobaldParser"
  },
  "consecutive_null_limit": 3,
  "custom_pipelines": {
    "conllu-parsing": {
      "impl": "pipeline.ConlluTokenClassificationPipeline",
      "pt": [
        "AutoModel"
      ],
      "tf": [],
      "type": "text"
    }
  },
  "deepslot_classifier_hidden_size": 256,
  "dependency_classifier_hidden_size": 128,
  "dropout": 0.1,
  "encoder_model_name": "xlm-roberta-base",
  "lemma_classifier_hidden_size": 512,
  "misc_classifier_hidden_size": 512,
  "model_type": "cobald_parser",
  "morphology_classifier_hidden_size": 512,
  "null_classifier_hidden_size": 512,
  "semclass_classifier_hidden_size": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.52.2",
  "vocabulary": {
    "deepslot": {
      "0": "Addition",
      "1": "AdditionalParticipant",
      "2": "Addressee",
      "3": "Agent",
      "4": "Agent_Metaphoric",
      "5": "BeneMalefactive",
      "6": "Cause",
      "7": "Ch_Parameter",
      "8": "Ch_Reference",
      "9": "Characteristic",
      "10": "ClassifiedEntity",
      "11": "Comparison",
      "12": "ComparisonBase",
      "13": "Concession",
      "14": "Concurrent",
      "15": "Condition",
      "16": "ContrAgent",
      "17": "ContrObject",
      "18": "Correlative",
      "19": "Criterion",
      "20": "Degree",
      "21": "DegreeNumerative",
      "22": "Elective",
      "23": "Empty_Subject_It",
      "24": "Experiencer",
      "25": "Experiencer_Metaphoric",
      "26": "Function",
      "27": "Instrument_Situation",
      "28": "Landmark",
      "29": "Limitation",
      "30": "Locative",
      "31": "Locative_FinalPoint",
      "32": "Member",
      "33": "MetaphoricLocative",
      "34": "Motive",
      "35": "Name_Title",
      "36": "Object",
      "37": "Object_Relation",
      "38": "Object_Situation",
      "39": "Opposition",
      "40": "OrderInTimeAndSpace",
      "41": "Parenthetical",
      "42": "Part",
      "43": "Part_Situation",
      "44": "ParticipleRelativeClause",
      "45": "Possessor",
      "46": "Possessor_Metaphoric",
      "47": "Predicate",
      "48": "Predicate_Noun",
      "49": "PrincipleOfOrganization",
      "50": "Purpose",
      "51": "QuantifiedEntity",
      "52": "Quantity",
      "53": "Raising_Target",
      "54": "Relative",
      "55": "Resultative",
      "56": "SetEnvironment",
      "57": "Set_General",
      "58": "Source",
      "59": "Specification",
      "60": "Specifier_Number",
      "61": "Sphere",
      "62": "StaffOfPossessors",
      "63": "Standpoint",
      "64": "State",
      "65": "SupportedEntity",
      "66": "Theme",
      "67": "Time",
      "68": "Vocative"
    },
    "eud_deprel": {
      "0": "acl",
      "1": "acl:att",
      "2": "acl:cleft",
      "3": "acl:med",
      "4": "acl:mot",
      "5": "acl:om",
      "6": "acl:p\u00e5",
      "7": "acl:relcl",
      "8": "acl:som",
      "9": "acl:\u00e4n",
      "10": "advcl",
      "11": "advcl:att",
      "12": "advcl:d\u00e4rf\u00f6r_att",
      "13": "advcl:d\u00e5",
      "14": "advcl:eftersom",
      "15": "advcl:f\u00f6r_att",
      "16": "advcl:f\u00f6rutsatt_att",
      "17": "advcl:innan",
      "18": "advcl:liksom",
      "19": "advcl:med_att",
      "20": "advcl:n\u00e4r",
      "21": "advcl:om",
      "22": "advcl:p\u00e5",
      "23": "advcl:samtidigt_som",
      "24": "advcl:sedan",
      "25": "advcl:som",
      "26": "advcl:\u00e4n",
      "27": "advmod",
      "28": "amod",
      "29": "appos",
      "30": "aux",
      "31": "aux:pass",
      "32": "case",
      "33": "cc",
      "34": "ccomp",
      "35": "compound:prt",
      "36": "conj",
      "37": "conj:and",
      "38": "conj:eller",
      "39": "conj:fast",
      "40": "conj:men",
      "41": "conj:och",
      "42": "conj:respektive",
      "43": "conj:samt",
      "44": "conj:som",
      "45": "conj:ty",
      "46": "conj:utan",
      "47": "cop",
      "48": "csubj",
      "49": "csubj:pass",
      "50": "det",
      "51": "dislocated",
      "52": "expl",
      "53": "fixed",
      "54": "flat",
      "55": "iobj",
      "56": "mark",
      "57": "nmod",
      "58": "nmod:av",
      "59": "nmod:efter",
      "60": "nmod:fr\u00e5n",
      "61": "nmod:f\u00f6r",
      "62": "nmod:hos",
      "63": "nmod:i",
      "64": "nmod:inom",
      "65": "nmod:med",
      "66": "nmod:mellan",
      "67": "nmod:mot",
      "68": "nmod:oavsett",
      "69": "nmod:om",
      "70": "nmod:poss",
      "71": "nmod:p\u00e5",
      "72": "nmod:till",
      "73": "nmod:under",
      "74": "nmod:utanf\u00f6r",
      "75": "nmod:vid",
      "76": "nmod:\u00e5t",
      "77": "nsubj",
      "78": "nsubj:pass",
      "79": "nsubj:xsubj",
      "80": "nummod",
      "81": "obj",
      "82": "obl",
      "83": "obl:agent",
      "84": "obl:as",
      "85": "obl:av",
      "86": "obl:bland",
      "87": "obl:efter",
      "88": "obl:enligt",
      "89": "obl:for",
      "90": "obl:fr\u00e5n",
      "91": "obl:f\u00f6r",
      "92": "obl:genom",
      "93": "obl:hos",
      "94": "obl:i",
      "95": "obl:inom",
      "96": "obl:med",
      "97": "obl:med_avseende_p\u00e5",
      "98": "obl:mellan",
      "99": "obl:mot",
      "100": "obl:om",
      "101": "obl:omkring",
      "102": "obl:p\u00e5",
      "103": "obl:runtomkring",
      "104": "obl:som",
      "105": "obl:till",
      "106": "obl:trots",
      "107": "obl:under",
      "108": "obl:ur",
      "109": "obl:utan",
      "110": "obl:utanf\u00f6r",
      "111": "obl:vid",
      "112": "obl:\u00e4n",
      "113": "obl:\u00e5",
      "114": "obl:\u00e5t",
      "115": "parataxis",
      "116": "punct",
      "117": "ref",
      "118": "root",
      "119": "vocative",
      "120": "xcomp"
    },
    "joint_feats": {
      "0": "ADJ#Adjective#Abbr=Yes",
      "1": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos",
      "2": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos|Gender=Com|Number=Sing",
      "3": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part",
      "4": "ADJ#Adjective#Case=Nom|Definite=Def|Degree=Sup",
      "5": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos",
      "6": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing",
      "7": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part",
      "8": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "9": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "10": "ADJ#Adjective#Case=Nom|Definite=Ind|Degree=Pos|Number=Sing",
      "11": "ADJ#Adjective#Case=Nom|Degree=Cmp",
      "12": "ADJ#Adjective#Case=Nom|Degree=Pos",
      "13": "ADJ#Adjective#Case=Nom|Degree=Pos|Number=Plur",
      "14": "ADJ#Adjective#Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part",
      "15": "ADJ#Adjective#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part",
      "16": "ADJ#Adjective#Degree=Pos|Foreign=Yes",
      "17": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing",
      "18": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "19": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "20": "ADJ#Adverb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "21": "ADJ#Noun#Case=Nom|Definite=Def|Degree=Pos",
      "22": "ADJ#Noun#Case=Nom|Degree=Pos",
      "23": "ADJ#Numeral#Case=Nom|Definite=Def|Degree=Pos",
      "24": "ADJ#Numeral#Case=Nom|NumType=Ord",
      "25": "ADJ#Verb#Case=Nom|Definite=Def|Degree=Pos|Tense=Past|VerbForm=Part",
      "26": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Com|Number=Sing|Tense=Past|VerbForm=Part",
      "27": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "28": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "29": "ADJ#Verb#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur|Tense=Past|VerbForm=Part",
      "30": "ADJ#Verb#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing|Tense=Past|VerbForm=Part",
      "31": "ADJ#Verb#Case=Nom|Degree=Pos|Tense=Pres|VerbForm=Part",
      "32": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Gender=Neut|Number=Sing",
      "33": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Number=Plur",
      "34": "ADJ#_#Case=Nom|Definite=Ind|Degree=Pos|Number=Sing|Tense=Past|VerbForm=Part",
      "35": "ADJ#_#Case=Nom|Degree=Pos",
      "36": "ADP#Adjective#_",
      "37": "ADP#Adverb#_",
      "38": "ADP#Conjunction#_",
      "39": "ADP#Preposition#_",
      "40": "ADP#_#_",
      "41": "ADV#Adjective#_",
      "42": "ADV#Adverb#Abbr=Yes",
      "43": "ADV#Adverb#Degree=Cmp",
      "44": "ADV#Adverb#Degree=Pos",
      "45": "ADV#Adverb#Degree=Sup",
      "46": "ADV#Adverb#Degree=Sup|Polarity=Neg",
      "47": "ADV#Adverb#Polarity=Neg",
      "48": "ADV#Adverb#_",
      "49": "ADV#Conjunction#_",
      "50": "ADV#Invariable#Degree=Cmp",
      "51": "ADV#Invariable#Degree=Sup",
      "52": "ADV#Noun#_",
      "53": "ADV#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "54": "ADV#Pronoun#_",
      "55": "ADV#_#Degree=Cmp",
      "56": "ADV#_#Degree=Sup",
      "57": "ADV#_#_",
      "58": "AUX#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Act",
      "59": "AUX#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
      "60": "AUX#Verb#VerbForm=Inf|Voice=Act",
      "61": "AUX#Verb#VerbForm=Sup|Voice=Act",
      "62": "CCONJ#Conjunction#_",
      "63": "CCONJ#_#_",
      "64": "DET#Adjective#Gender=Com|Number=Sing|PronType=Tot",
      "65": "DET#Adjective#Gender=Neut|Number=Sing|PronType=Tot",
      "66": "DET#Adjective#Number=Plur|PronType=Tot",
      "67": "DET#Article#Definite=Def|Gender=Com|Number=Sing|PronType=Art",
      "68": "DET#Article#Definite=Def|Gender=Neut|Number=Sing|PronType=Art",
      "69": "DET#Article#Definite=Def|Number=Plur|PronType=Art",
      "70": "DET#Article#Definite=Ind|Gender=Com|Number=Sing|PronType=Art",
      "71": "DET#Article#Definite=Ind|Gender=Neut|Number=Sing|PronType=Art",
      "72": "DET#Article#Definite=Ind|Gender=Neut|Number=Sing|PronType=Artt",
      "73": "DET#Article#Definite=Ind|PronType=Art",
      "74": "DET#Numeral#Definite=Ind|Gender=Neut|Number=Sing|PronType=Art",
      "75": "DET#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Art",
      "76": "DET#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Dem",
      "77": "DET#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Art",
      "78": "DET#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Dem",
      "79": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Art",
      "80": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Dem",
      "81": "DET#Pronoun#Definite=Def|Number=Plur|PronType=Tot",
      "82": "DET#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "83": "DET#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Int",
      "84": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "85": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "86": "DET#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Tot",
      "87": "DET#Pronoun#Definite=Ind|Number=Plur|PronType=Ind",
      "88": "DET#Pronoun#Definite=Ind|Number=Sing|PronType=Tot",
      "89": "DET#Pronoun#PronType=Ind",
      "90": "DET#_#Gender=Neut|Number=Sing|PronType=Tot",
      "91": "NOUN#Noun#Abbr=Yes",
      "92": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Com|Number=Plur",
      "93": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Com|Number=Sing",
      "94": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Neut|Number=Plur",
      "95": "NOUN#Noun#Case=Gen|Definite=Def|Gender=Neut|Number=Sing",
      "96": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Com|Number=Plur",
      "97": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Neut|Number=Plur",
      "98": "NOUN#Noun#Case=Gen|Definite=Ind|Gender=Neut|Number=Sing",
      "99": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Com|Number=Plur",
      "100": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Com|Number=Sing",
      "101": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Neut|Number=Plur",
      "102": "NOUN#Noun#Case=Nom|Definite=Def|Gender=Neut|Number=Sing",
      "103": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Plur",
      "104": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "105": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Plur",
      "106": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
      "107": "NOUN#Noun#Case=Nom|Definite=Ind|Gender=Neut|Number=Singg",
      "108": "NOUN#Noun#Gender=Com",
      "109": "NOUN#Noun#Number=Plur",
      "110": "NOUN#Noun#Number=Sing",
      "111": "NOUN#Noun#_",
      "112": "NOUN#_#Case=Nom|Definite=Def|Gender=Com|Number=Sing",
      "113": "NOUN#_#Case=Nom|Definite=Def|Gender=Neut|Number=Sing",
      "114": "NOUN#_#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "115": "NOUN#_#Case=Nom|Definite=Ind|Gender=Neut|Number=Sing",
      "116": "NUM#Article#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|NumType=Card",
      "117": "NUM#Noun#Case=Nom|NumType=Card",
      "118": "NUM#Numeral#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|NumType=Card",
      "119": "NUM#Numeral#Case=Nom|NumType=Card",
      "120": "PART#Particle#Polarity=Neg",
      "121": "PART#Preposition#_",
      "122": "PRON#Adjective#Definite=Ind|Number=Plur|PronType=Ind",
      "123": "PRON#Adjective#Definite=Ind|Number=Plur|PronType=Tot",
      "124": "PRON#Adverb#Definite=Def|Gender=Neut|Number=Sing|PronType=Prs",
      "125": "PRON#Adverb#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "126": "PRON#Adverb#_",
      "127": "PRON#Article#Case=Nom|Definite=Def|Number=Plur|PronType=Prs",
      "128": "PRON#Conjunction#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "129": "PRON#Conjunction#PronType=Rel",
      "130": "PRON#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "131": "PRON#Noun#Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "132": "PRON#Noun#Definite=Def|Number=Plur|PronType=Prs",
      "133": "PRON#Noun#Definite=Ind|Number=Plur|PronType=Ind",
      "134": "PRON#Numeral#Definite=Ind|Gender=Com|Number=Sing|PronType=Prs",
      "135": "PRON#Numeral#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "136": "PRON#Pronoun#Case=Acc|Definite=Def|Gender=Com|Number=Plur|PronType=Prs",
      "137": "PRON#Pronoun#Case=Acc|Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "138": "PRON#Pronoun#Case=Acc|Definite=Def|Number=Plur|PronType=Prs",
      "139": "PRON#Pronoun#Case=Acc|Definite=Def|PronType=Prs",
      "140": "PRON#Pronoun#Case=Gen|Definite=Def|Gender=Com|Number=Sing|Poss=Yes|PronType=Prs",
      "141": "PRON#Pronoun#Case=Nom|Definite=Def|Gender=Com|Number=Plur|PronType=Prs",
      "142": "PRON#Pronoun#Case=Nom|Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "143": "PRON#Pronoun#Case=Nom|Definite=Def|Number=Plur|PronType=Prs",
      "144": "PRON#Pronoun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "145": "PRON#Pronoun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing|PronType=Rel",
      "146": "PRON#Pronoun#Definite=Def|Gender=Com|Number=Sing|Poss=Yes|PronType=Prs",
      "147": "PRON#Pronoun#Definite=Def|Gender=Com|Number=Sing|PronType=Prs",
      "148": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
      "149": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Dem",
      "150": "PRON#Pronoun#Definite=Def|Gender=Neut|Number=Sing|PronType=Prs",
      "151": "PRON#Pronoun#Definite=Def|Number=Plur|Poss=Yes|PronType=Prs",
      "152": "PRON#Pronoun#Definite=Def|Number=Plur|PronType=Dem",
      "153": "PRON#Pronoun#Definite=Def|Number=Plur|PronType=Prs",
      "154": "PRON#Pronoun#Definite=Def|Poss=Yes|PronType=Prs",
      "155": "PRON#Pronoun#Definite=Ind|Gender=Com|Number=Sing|PronType=Ind",
      "156": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "157": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "158": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Neg",
      "159": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "160": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Rel",
      "161": "PRON#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Tot",
      "162": "PRON#Pronoun#Definite=Ind|Number=Plur|PronType=Rel",
      "163": "PRON#Pronoun#Number=Plur",
      "164": "PRON#Pronoun#PronType=Rel",
      "165": "PRON#Verb#Definite=Def|Gender=Neut|Number=Sing|Poss=Yes|PronType=Prs",
      "166": "PRON#_#Case=Acc|Definite=Def|PronType=Prs",
      "167": "PRON#_#Definite=Ind|Gender=Neut|Number=Sing|PronType=Ind",
      "168": "PRON#_#Definite=Ind|Gender=Neut|Number=Sing|PronType=Prs",
      "169": "PROPN#Noun#Case=Gen",
      "170": "PROPN#Noun#Case=Nom",
      "171": "PROPN#Noun#Case=Nom|Definite=Ind|Gender=Com|Number=Sing",
      "172": "PUNCT#PUNCT#_",
      "173": "SCONJ#Conjunction#_",
      "174": "SCONJ#Preposition#_",
      "175": "SCONJ#Pronoun#Definite=Ind|Gender=Neut|Number=Sing|PronType=Int",
      "176": "SCONJ#_#_",
      "177": "VERB#Adjective#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass",
      "178": "VERB#Verb#Case=Nom|Number=Plur|Tense=Past|VerbForm=Part|Voice=Pass",
      "179": "VERB#Verb#Mood=Imp|VerbForm=Fin|Voice=Act",
      "180": "VERB#Verb#Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "181": "VERB#Verb#Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
      "182": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin",
      "183": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Act",
      "184": "VERB#Verb#Mood=Ind|Tense=Past|VerbForm=Fin|Voice=Pass",
      "185": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin",
      "186": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Act",
      "187": "VERB#Verb#Mood=Ind|Tense=Pres|VerbForm=Fin|Voice=Pass",
      "188": "VERB#Verb#Tense=Past|VerbForm=Part",
      "189": "VERB#Verb#VerbForm=Inf",
      "190": "VERB#Verb#VerbForm=Inf|Voice=Act",
      "191": "VERB#Verb#VerbForm=Inf|Voice=Pass",
      "192": "VERB#Verb#VerbForm=Sup",
      "193": "VERB#Verb#VerbForm=Sup|Voice=Act",
      "194": "VERB#Verb#VerbForm=Sup|Voice=Pass"
    },
    "lemma_rule": {
      "0": "cut_prefix=0|cut_suffix=0|append_suffix=",
      "1": "cut_prefix=0|cut_suffix=0|append_suffix=a",
      "2": "cut_prefix=0|cut_suffix=0|append_suffix=ma",
      "3": "cut_prefix=0|cut_suffix=1|append_suffix=",
      "4": "cut_prefix=0|cut_suffix=1|append_suffix=a",
      "5": "cut_prefix=0|cut_suffix=1|append_suffix=as",
      "6": "cut_prefix=0|cut_suffix=1|append_suffix=d",
      "7": "cut_prefix=0|cut_suffix=1|append_suffix=en",
      "8": "cut_prefix=0|cut_suffix=1|append_suffix=g",
      "9": "cut_prefix=0|cut_suffix=1|append_suffix=ja",
      "10": "cut_prefix=0|cut_suffix=1|append_suffix=n",
      "11": "cut_prefix=0|cut_suffix=1|append_suffix=na",
      "12": "cut_prefix=0|cut_suffix=1|append_suffix=ola",
      "13": "cut_prefix=0|cut_suffix=1|append_suffix=ym",
      "14": "cut_prefix=0|cut_suffix=2|append_suffix=",
      "15": "cut_prefix=0|cut_suffix=2|append_suffix=a",
      "16": "cut_prefix=0|cut_suffix=2|append_suffix=an",
      "17": "cut_prefix=0|cut_suffix=2|append_suffix=ara",
      "18": "cut_prefix=0|cut_suffix=2|append_suffix=dd",
      "19": "cut_prefix=0|cut_suffix=2|append_suffix=e",
      "20": "cut_prefix=0|cut_suffix=2|append_suffix=en",
      "21": "cut_prefix=0|cut_suffix=2|append_suffix=g",
      "22": "cut_prefix=0|cut_suffix=2|append_suffix=i",
      "23": "cut_prefix=0|cut_suffix=2|append_suffix=igga",
      "24": "cut_prefix=0|cut_suffix=2|append_suffix=ja",
      "25": "cut_prefix=0|cut_suffix=2|append_suffix=mal",
      "26": "cut_prefix=0|cut_suffix=2|append_suffix=n",
      "27": "cut_prefix=0|cut_suffix=2|append_suffix=na",
      "28": "cut_prefix=0|cut_suffix=2|append_suffix=on",
      "29": "cut_prefix=0|cut_suffix=2|append_suffix=u",
      "30": "cut_prefix=0|cut_suffix=2|append_suffix=um",
      "31": "cut_prefix=0|cut_suffix=2|append_suffix=unna",
      "32": "cut_prefix=0|cut_suffix=2|append_suffix=ycket",
      "33": "cut_prefix=0|cut_suffix=2|append_suffix=yda",
      "34": "cut_prefix=0|cut_suffix=2|append_suffix=yta",
      "35": "cut_prefix=0|cut_suffix=2|append_suffix=\u00e5",
      "36": "cut_prefix=0|cut_suffix=2|append_suffix=\u00e5ta",
      "37": "cut_prefix=0|cut_suffix=3|append_suffix=",
      "38": "cut_prefix=0|cut_suffix=3|append_suffix=a",
      "39": "cut_prefix=0|cut_suffix=3|append_suffix=an",
      "40": "cut_prefix=0|cut_suffix=3|append_suffix=and_annat",
      "41": "cut_prefix=0|cut_suffix=3|append_suffix=as",
      "42": "cut_prefix=0|cut_suffix=3|append_suffix=e",
      "43": "cut_prefix=0|cut_suffix=3|append_suffix=er",
      "44": "cut_prefix=0|cut_suffix=3|append_suffix=i",
      "45": "cut_prefix=0|cut_suffix=3|append_suffix=jag",
      "46": "cut_prefix=0|cut_suffix=3|append_suffix=liten",
      "47": "cut_prefix=0|cut_suffix=3|append_suffix=nan",
      "48": "cut_prefix=0|cut_suffix=3|append_suffix=nna",
      "49": "cut_prefix=0|cut_suffix=3|append_suffix=ola",
      "50": "cut_prefix=0|cut_suffix=3|append_suffix=r",
      "51": "cut_prefix=0|cut_suffix=3|append_suffix=ra",
      "52": "cut_prefix=0|cut_suffix=3|append_suffix=vi",
      "53": "cut_prefix=0|cut_suffix=3|append_suffix=ycket",
      "54": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e4ga",
      "55": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e4gga",
      "56": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e5",
      "57": "cut_prefix=0|cut_suffix=3|append_suffix=\u00e5_kallad",
      "58": "cut_prefix=0|cut_suffix=4|append_suffix=",
      "59": "cut_prefix=0|cut_suffix=4|append_suffix=a",
      "60": "cut_prefix=0|cut_suffix=4|append_suffix=ader",
      "61": "cut_prefix=0|cut_suffix=4|append_suffix=an",
      "62": "cut_prefix=0|cut_suffix=4|append_suffix=e",
      "63": "cut_prefix=0|cut_suffix=4|append_suffix=ola",
      "64": "cut_prefix=0|cut_suffix=4|append_suffix=on",
      "65": "cut_prefix=0|cut_suffix=4|append_suffix=or",
      "66": "cut_prefix=0|cut_suffix=4|append_suffix=ot",
      "67": "cut_prefix=0|cut_suffix=4|append_suffix=r",
      "68": "cut_prefix=0|cut_suffix=4|append_suffix=ra",
      "69": "cut_prefix=0|cut_suffix=4|append_suffix=\u00e5g",
      "70": "cut_prefix=0|cut_suffix=4|append_suffix=\u00f6ra",
      "71": "cut_prefix=0|cut_suffix=5|append_suffix=",
      "72": "cut_prefix=0|cut_suffix=5|append_suffix=a",
      "73": "cut_prefix=0|cut_suffix=5|append_suffix=an",
      "74": "cut_prefix=0|cut_suffix=5|append_suffix=d\u00e5lig",
      "75": "cut_prefix=0|cut_suffix=5|append_suffix=er",
      "76": "cut_prefix=0|cut_suffix=5|append_suffix=g\u00e4rna",
      "77": "cut_prefix=0|cut_suffix=5|append_suffix=oder",
      "78": "cut_prefix=0|cut_suffix=5|append_suffix=on",
      "79": "cut_prefix=0|cut_suffix=5|append_suffix=r",
      "80": "cut_prefix=0|cut_suffix=5|append_suffix=ra",
      "81": "cut_prefix=0|cut_suffix=6|append_suffix=er",
      "82": "cut_prefix=0|cut_suffix=8|append_suffix=or",
      "83": "cut_prefix=1|cut_suffix=0|append_suffix=",
      "84": "cut_prefix=1|cut_suffix=0|append_suffix=a",
      "85": "cut_prefix=1|cut_suffix=3|append_suffix=",
      "86": "cut_prefix=1|cut_suffix=3|append_suffix=te",
      "87": "cut_prefix=2|cut_suffix=0|append_suffix=",
      "88": "cut_prefix=2|cut_suffix=0|append_suffix=a",
      "89": "cut_prefix=2|cut_suffix=1|append_suffix=empel",
      "90": "cut_prefix=2|cut_suffix=1|append_suffix=n",
      "91": "cut_prefix=2|cut_suffix=2|append_suffix=",
      "92": "cut_prefix=2|cut_suffix=2|append_suffix=a",
      "93": "cut_prefix=2|cut_suffix=3|append_suffix=",
      "94": "cut_prefix=2|cut_suffix=3|append_suffix=as",
      "95": "cut_prefix=2|cut_suffix=3|append_suffix=n"
    },
    "misc": {
      "0": "Cxn=rc-that-nsubj",
      "1": "Cxn=rc-that-obj",
      "2": "Cxn=rc-wh-nsubj",
      "3": "Cxn=rc-wh-obl",
      "4": "Cxn=rc-wh-obl-pfront",
      "5": "Promoted=Yes|SpaceAfter=No",
      "6": "SpaceAfter=No",
      "7": "ellipsis"
    },
    "semclass": {
      "0": "ABILITY_OF_BEING",
      "1": "ACTIVITY",
      "2": "APPARATUS",
      "3": "AREA_OF_HUMAN_ACTIVITY",
      "4": "ARRANGEMENTS",
      "5": "ARTICLES",
      "6": "ATTRIBUTIVE",
      "7": "AUXILIARY_VERBS",
      "8": "BAD_DANGEROUS_EVENT",
      "9": "BE",
      "10": "BEGIN_TO_TAKE_PLACE",
      "11": "BEHAVIOUR",
      "12": "BEING",
      "13": "BUSINESS",
      "14": "BUSY_FREE_OCCUPIED",
      "15": "CHANGE_OF_POST_AND_JOB",
      "16": "CHARACTERISTIC_GENERAL",
      "17": "CHOOSING_SORTING",
      "18": "CH_APPEARANCE",
      "19": "CH_ASPECT",
      "20": "CH_BENEFIT",
      "21": "CH_BY_SENSORY_PERCEPTION",
      "22": "CH_COMPOSITION",
      "23": "CH_DEGREE",
      "24": "CH_DEGREE_AND_INTENSITY",
      "25": "CH_DISPOSITION_AND_MOTION",
      "26": "CH_DISTRIBUTION",
      "27": "CH_EVALUATION",
      "28": "CH_EVALUATION_OF_HUMAN_TEMPER_AND_ACTIVITY",
      "29": "CH_FUNCTIONING_OF_ENTITY",
      "30": "CH_INFORMATION",
      "31": "CH_INTENTION_CONCENTRATION",
      "32": "CH_MAGNITUDE",
      "33": "CH_OF_CONNECTIONS",
      "34": "CH_PARAMETER_SPEED",
      "35": "CH_POWER_AND_EFFECT",
      "36": "CH_PRICE_AND_SUMS",
      "37": "CH_REFERENCE_AND_QUANTIFICATION",
      "38": "CH_RENOWN",
      "39": "CH_RESISTANCE_TO_IMPACT",
      "40": "CH_SALIENCE",
      "41": "CH_SCALE",
      "42": "CH_SOCIAL_CHARACTERISTIC",
      "43": "CH_SPHERE_OF_COVERAGE",
      "44": "CH_SYSTEM_STRUCTURE",
      "45": "CH_TYPE_OF_POSSESSION_AND_PARTICIPATION",
      "46": "CIRCUMSTANCE",
      "47": "CLOTHES",
      "48": "CONDITION_SITUATION",
      "49": "CONFLICT_INTERACTION",
      "50": "CONJUNCTIONS",
      "51": "CONTAIN_INCLUDE_FORM",
      "52": "CONTINUE_TO_HAVE",
      "53": "CONTINUE_TO_TAKE_PLACE",
      "54": "COORDINATING_CONJUNCTIONS",
      "55": "COSMOS_AND_COSMIC_OBJECTS",
      "56": "COST",
      "57": "COUNTRY_AS_ADMINISTRATIVE_UNIT",
      "58": "CREATION_VERBS",
      "59": "DEFEND_SAVE",
      "60": "DESTRUCTION_VERBS",
      "61": "DIFFICULTIES",
      "62": "DIFFICULT_AND_EASY",
      "63": "DIMENSIONS_CHAR",
      "64": "DISCOURSIVE_UNITS",
      "65": "DOCUMENT",
      "66": "ECONOMY",
      "67": "EMOTIONS_AND_THEIR_EXPRESSION",
      "68": "EMPTY_SUBJECT",
      "69": "END_TO_TAKE_PLACE",
      "70": "ENTITY_AS_RESULT_OF_ACTIVITY",
      "71": "ENTITY_OR_SITUATION_PRONOUN",
      "72": "EVERYDAY_PROCESSING",
      "73": "EXISTENCE_AND_POSSESSION",
      "74": "FACT_INCIDENT",
      "75": "FEELING_AS_CONDITION",
      "76": "FURNISHINGS_AND_DECORATION",
      "77": "GENERAL_ACTION",
      "78": "GRAMMATICAL_ELEMENTS",
      "79": "HIERARCHICAL_VERBS",
      "80": "IDENTIFYING_ATTRIBUTE",
      "81": "IDIOMATICAL_ELEMENTS",
      "82": "INFORMATION",
      "83": "INTELLECTUAL_ACTIVITY",
      "84": "INTERPERSONAL_RELATIONS",
      "85": "KIND",
      "86": "KITCHENWARE_AND_TABLEWARE",
      "87": "KNOWLEDGE_FROM_EXPERIENCE_AND_DEDUCTION",
      "88": "LACK_AND_PLENTY",
      "89": "LAWS_AND_STANDARDS",
      "90": "MANAGE_FAIL_CONDITION",
      "91": "MARKET_AS_AREA_OF_ACTIVITY",
      "92": "MENTAL_OBJECT",
      "93": "METHOD_APPROACH_TECHNIQUE",
      "94": "MODALITY",
      "95": "MONEY",
      "96": "MOTION",
      "97": "NONPRODUCTIVE_AREA",
      "98": "OBJECT_BY_FUNCTION_AND_PROPERTY",
      "99": "ORGANIZATION",
      "100": "PARTICLES",
      "101": "PART_OF_CONSTRUCTION",
      "102": "PART_OF_ORGANISM",
      "103": "PART_OF_WORLD",
      "104": "PART_OR_PORTION_OF_ENTITY",
      "105": "PERCEPTION_ACTIVITY",
      "106": "PHRASAL_PARTICLES",
      "107": "PHYSICAL_AND_BIOLOGICAL_PROPERTIES",
      "108": "PHYSICAL_OBJECT_AND_SUBSTANCE_CHAR",
      "109": "PHYSICAL_PSYCHIC_CONDITION",
      "110": "PHYSIOLOGICAL_PROCESSES",
      "111": "PLACE",
      "112": "POSITION_AS_STATUS",
      "113": "POSITION_IN_SPACE",
      "114": "POWER_RIGHT",
      "115": "PREMISES",
      "116": "PREPOSITION",
      "117": "PROBLEMS_TO_SOLVE",
      "118": "PROCESS_AND_ITS_STAGES",
      "119": "PUBLIC_AND_POLITICAL_ACTIVITY",
      "120": "RELATIVE_SPACE",
      "121": "RESULTS_OF_GIVING_INFORMATION_AND_SPEECH_ACTIVITY",
      "122": "RESULTS_OF_MAKING_DECISIONS",
      "123": "RESULT_CONSEQUENCE",
      "124": "RISK_DANGER",
      "125": "SCHEDULE_FOR_ACTIVITY",
      "126": "SCIENCE",
      "127": "SCIENTIFIC_AND_LITERARY_WORK",
      "128": "SITUATION",
      "129": "SOCIAL_CONDITIONS_OF_BEING",
      "130": "SPHERE_OF_ACTIVITY_GENERAL",
      "131": "STATE_AREA",
      "132": "STATE_OF_MIND",
      "133": "SUBSTANCE",
      "134": "SYMBOLS_FOR_INFORMATION_TRANSFER",
      "135": "TENDENCY_AND_DISPOSITION",
      "136": "TERRITORY_AREA",
      "137": "TEXT_OBJECTS_AND_DOCUMENTS",
      "138": "THE_EARTH_AND_ITS_SPATIAL_PARTS",
      "139": "THE_GOOD_BAD",
      "140": "TIME",
      "141": "TOPIC_SUBJECT",
      "142": "TOTALITY_OF_DEGREE",
      "143": "TO_ADAPT",
      "144": "TO_ADD",
      "145": "TO_ANALYSE_AND_RESEARCH",
      "146": "TO_APPROACH_COME_TO_SOME_POINT_OR_STATE",
      "147": "TO_BE_BASED",
      "148": "TO_CALL_AND_DESIGNATE",
      "149": "TO_CANCEL",
      "150": "TO_CARE_AND_BRING_UP",
      "151": "TO_CHANGE",
      "152": "TO_CHARACTERIZE",
      "153": "TO_COME_OR_TO_LEAVE_SPHERE_OF_ACTIVITY",
      "154": "TO_COMMIT",
      "155": "TO_COMMUNICATE",
      "156": "TO_COMPEL_AND_EVOKE",
      "157": "TO_CONTRIBUTE_AND_HINDER",
      "158": "TO_DECIDE",
      "159": "TO_DEVELOP",
      "160": "TO_DISAPPEAR_LOSE_GET_RID_OF",
      "161": "TO_ECONOMIZE",
      "162": "TO_EXIST",
      "163": "TO_FEEL_AND_EXPRESS_MENTAL_ATTITUDE_TO",
      "164": "TO_FLOW_IN_TIME",
      "165": "TO_GET",
      "166": "TO_GIVE",
      "167": "TO_INTERPRET",
      "168": "TO_INVOLVE",
      "169": "TO_JOIN",
      "170": "TO_KEEP_VIOLATE_NORMS",
      "171": "TO_LEARN_AND_RESEARCH",
      "172": "TO_MAKE",
      "173": "TO_MARRY_DIVORCE_ENGAGE",
      "174": "TO_MEAN",
      "175": "TO_MIX",
      "176": "TO_PARTICIPATE",
      "177": "TO_PERCEIVE",
      "178": "TO_POSSESS",
      "179": "TO_PUNISH",
      "180": "TO_REACT",
      "181": "TO_REBEL",
      "182": "TO_RESTORE",
      "183": "TO_SEEK_FIND",
      "184": "TO_SET",
      "185": "TO_SHARE",
      "186": "TO_SHOW",
      "187": "TO_TAKE",
      "188": "TO_THINK_ABOUT",
      "189": "TO_USE",
      "190": "TO_WAIT",
      "191": "TO_WORK",
      "192": "URBAN_SPACE_AND_ROADS",
      "193": "VALUABLE",
      "194": "VERBAL_COMMUNICATION",
      "195": "VISUAL_CHARACTERISTICS",
      "196": "VISUAL_REPRESENTATION",
      "197": "WORLD_OUTLOOK"
    },
    "ud_deprel": {
      "0": "acl",
      "1": "acl:cleft",
      "2": "acl:relcl",
      "3": "advcl",
      "4": "advmod",
      "5": "amod",
      "6": "appos",
      "7": "aux",
      "8": "aux:pass",
      "9": "case",
      "10": "cc",
      "11": "ccomp",
      "12": "compound:prt",
      "13": "conj",
      "14": "cop",
      "15": "csubj",
      "16": "csubj:pass",
      "17": "det",
      "18": "dislocated",
      "19": "expl",
      "20": "fixed",
      "21": "flat",
      "22": "iobj",
      "23": "mark",
      "24": "nmod",
      "25": "nmod:poss",
      "26": "nsubj",
      "27": "nsubj:pass",
      "28": "nummod",
      "29": "obj",
      "30": "obl",
      "31": "obl:agent",
      "32": "parataxis",
      "33": "punct",
      "34": "root",
      "35": "vocative",
      "36": "xcomp"
    }
  }
}
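
The `custom_pipelines` block above registers the task name `conllu-parsing` and points it at `pipeline.ConlluTokenClassificationPipeline`, so the parser can be loaded through the standard `transformers` pipeline factory with remote code enabled. A minimal sketch of that loading path ("<repo-id>" is a placeholder for the actual Hub repository name, which is not part of this commit):

from transformers import pipeline

parser = pipeline(
    "conllu-parsing",
    model="<repo-id>",
    trust_remote_code=True,  # required: the impl and model classes ship with the repo
)
print(parser("Det här är ett exempel."))

The `auto_map` entry serves the same purpose for the Auto classes, so `AutoModel.from_pretrained("<repo-id>", trust_remote_code=True)` should resolve to `modeling_parser.CobaldParser`.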
configuration.py
ADDED
@@ -0,0 +1,40 @@
from transformers import PretrainedConfig


class CobaldParserConfig(PretrainedConfig):
    model_type = "cobald_parser"

    def __init__(
        self,
        encoder_model_name: str | None = None,
        null_classifier_hidden_size: int = 0,
        lemma_classifier_hidden_size: int = 0,
        morphology_classifier_hidden_size: int = 0,
        dependency_classifier_hidden_size: int = 0,
        misc_classifier_hidden_size: int = 0,
        deepslot_classifier_hidden_size: int = 0,
        semclass_classifier_hidden_size: int = 0,
        activation: str = 'relu',
        dropout: float = 0.1,
        consecutive_null_limit: int = 0,
        vocabulary: dict[str, dict[int, str]] = {},
        **kwargs
    ):
        self.encoder_model_name = encoder_model_name
        self.null_classifier_hidden_size = null_classifier_hidden_size
        self.consecutive_null_limit = consecutive_null_limit
        self.lemma_classifier_hidden_size = lemma_classifier_hidden_size
        self.morphology_classifier_hidden_size = morphology_classifier_hidden_size
        self.dependency_classifier_hidden_size = dependency_classifier_hidden_size
        self.misc_classifier_hidden_size = misc_classifier_hidden_size
        self.deepslot_classifier_hidden_size = deepslot_classifier_hidden_size
        self.semclass_classifier_hidden_size = semclass_classifier_hidden_size
        self.activation = activation
        self.dropout = dropout
        # The serialized config stores mappings as strings,
        # e.g. {"0": "acl", "1": "conj"}, so we have to convert them to int.
        self.vocabulary = {
            column: {int(k): v for k, v in labels.items()}
            for column, labels in vocabulary.items()
        }
        super().__init__(**kwargs)
dependency_classifier.py
ADDED
@@ -0,0 +1,301 @@
from copy import deepcopy

import numpy as np

import torch
from torch import nn
from torch import Tensor, FloatTensor, BoolTensor, LongTensor
import torch.nn.functional as F

from transformers.activations import ACT2FN

from cobald_parser.bilinear_matrix_attention import BilinearMatrixAttention
from cobald_parser.chu_liu_edmonds import decode_mst
from cobald_parser.utils import pairwise_mask, replace_masked_values


class DependencyHeadBase(nn.Module):
    """
    Base class for scoring arcs and relations between tokens in a dependency tree/graph.
    """

    def __init__(self, hidden_size: int, n_rels: int):
        super().__init__()

        self.arc_attention = BilinearMatrixAttention(
            hidden_size,
            hidden_size,
            use_input_biases=True,
            n_labels=1
        )
        self.rel_attention = BilinearMatrixAttention(
            hidden_size,
            hidden_size,
            use_input_biases=True,
            n_labels=n_rels
        )

    def forward(
        self,
        h_arc_head: Tensor,       # [batch_size, seq_len, hidden_size]
        h_arc_dep: Tensor,        # ...
        h_rel_head: Tensor,       # ...
        h_rel_dep: Tensor,        # ...
        gold_arcs: LongTensor,    # [n_arcs, 4]
        null_mask: BoolTensor,    # [batch_size, seq_len]
        padding_mask: BoolTensor  # [batch_size, seq_len]
    ) -> dict[str, Tensor]:

        # Score arcs.
        # s_arc[:, i, j] = score of edge i -> j.
        s_arc = self.arc_attention(h_arc_head, h_arc_dep)
        # Mask undesirable values (padding, nulls, etc.) with -inf.
        mask2d = pairwise_mask(null_mask & padding_mask)
        replace_masked_values(s_arc, mask2d, replace_with=-1e8)
        # Score arcs' relations.
        # [batch_size, seq_len, seq_len, num_labels]
        s_rel = self.rel_attention(h_rel_head, h_rel_dep).permute(0, 2, 3, 1)

        # Calculate loss.
        loss = 0.0
        if gold_arcs is not None:
            loss += self.calc_arc_loss(s_arc, gold_arcs)
            loss += self.calc_rel_loss(s_rel, gold_arcs)

        # Predict arcs based on the scores.
        # [batch_size, seq_len, seq_len]
        pred_arcs_matrix = self.predict_arcs(s_arc, null_mask, padding_mask)
        # [batch_size, seq_len, seq_len]
        pred_rels_matrix = self.predict_rels(s_rel)
        # [n_pred_arcs, 4]
        preds_combined = self.combine_arcs_rels(pred_arcs_matrix, pred_rels_matrix)
        return {
            'preds': preds_combined,
            'loss': loss
        }

    @staticmethod
    def calc_arc_loss(
        s_arc: Tensor,         # [batch_size, seq_len, seq_len]
        gold_arcs: LongTensor  # [n_arcs, 4]
    ) -> Tensor:
        """Calculate arc loss."""
        raise NotImplementedError

    @staticmethod
    def calc_rel_loss(
        s_rel: Tensor,         # [batch_size, seq_len, seq_len, num_labels]
        gold_arcs: LongTensor  # [n_arcs, 4]
    ) -> Tensor:
        batch_idxs, arcs_from, arcs_to, rels = gold_arcs.T
        return F.cross_entropy(s_rel[batch_idxs, arcs_from, arcs_to], rels)

    def predict_arcs(
        self,
        s_arc: Tensor,            # [batch_size, seq_len, seq_len]
        null_mask: BoolTensor,    # [batch_size, seq_len]
        padding_mask: BoolTensor  # [batch_size, seq_len]
    ) -> LongTensor:
        """Predict arcs from scores."""
        raise NotImplementedError

    def predict_rels(
        self,
        s_rel: FloatTensor
    ) -> LongTensor:
        return s_rel.argmax(dim=-1).long()

    @staticmethod
    def combine_arcs_rels(
        pred_arcs: LongTensor,
        pred_rels: LongTensor
    ) -> LongTensor:
        """Select relations towards predicted arcs."""
        assert pred_arcs.shape == pred_rels.shape
        # Get indices where arcs exist
        indices = pred_arcs.nonzero(as_tuple=True)
        batch_idxs, from_idxs, to_idxs = indices
        # Get corresponding relation types
        rel_types = pred_rels[batch_idxs, from_idxs, to_idxs]
        # Stack as [batch_idx, from_idx, to_idx, rel_type]
        return torch.stack([batch_idxs, from_idxs, to_idxs, rel_types], dim=1)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class DependencyHead(DependencyHeadBase):
|
| 125 |
+
"""
|
| 126 |
+
Basic UD syntax specialization that predicts single edge for each token.
|
| 127 |
+
"""
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def predict_arcs(
|
| 131 |
+
self,
|
| 132 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 133 |
+
null_mask: BoolTensor, # [batch_size, seq_len]
|
| 134 |
+
padding_mask: BoolTensor # [batch_size, seq_len, seq_len]
|
| 135 |
+
) -> Tensor:
|
| 136 |
+
|
| 137 |
+
if self.training:
|
| 138 |
+
# During training, use fast greedy decoding.
|
| 139 |
+
# - [batch_size, seq_len]
|
| 140 |
+
pred_arcs_seq = s_arc.argmax(dim=1)
|
| 141 |
+
else:
|
| 142 |
+
# FIXME
|
| 143 |
+
# During inference, decode Maximum Spanning Tree.
|
| 144 |
+
# pred_arcs_seq = self._mst_decode(s_arc, padding_mask)
|
| 145 |
+
pred_arcs_seq = s_arc.argmax(dim=1)
|
| 146 |
+
|
| 147 |
+
# Upscale arcs sequence of shape [batch_size, seq_len]
|
| 148 |
+
# to matrix of shape [batch_size, seq_len, seq_len].
|
| 149 |
+
pred_arcs = F.one_hot(pred_arcs_seq, num_classes=pred_arcs_seq.size(1)).long().transpose(1, 2)
|
| 150 |
+
# Apply mask one more time (even though s_arc is already masked),
|
| 151 |
+
# because argmax erases information about masked values.
|
| 152 |
+
mask2d = pairwise_mask(null_mask & padding_mask)
|
| 153 |
+
replace_masked_values(pred_arcs, mask2d, replace_with=0)
|
| 154 |
+
return pred_arcs
|
| 155 |
+
|
| 156 |
+
def _mst_decode(
|
| 157 |
+
self,
|
| 158 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 159 |
+
padding_mask: Tensor
|
| 160 |
+
) -> tuple[Tensor, Tensor]:
|
| 161 |
+
|
| 162 |
+
batch_size = s_arc.size(0)
|
| 163 |
+
device = s_arc.device
|
| 164 |
+
s_arc = s_arc.cpu()
|
| 165 |
+
|
| 166 |
+
# Convert scores to probabilities, as `decode_mst` expects non-negative values.
|
| 167 |
+
arc_probs = nn.functional.softmax(s_arc, dim=1)
|
| 168 |
+
|
| 169 |
+
# `decode_mst` knows nothing about UD and ROOT, so we have to manually
|
| 170 |
+
# zero probabilities of arcs leading to ROOT to make sure ROOT is a source node
|
| 171 |
+
# of a graph.
|
| 172 |
+
|
| 173 |
+
# Decode ROOT positions from diagonals.
|
| 174 |
+
# shape: [batch_size]
|
| 175 |
+
root_idxs = arc_probs.diagonal(dim1=1, dim2=2).argmax(dim=-1)
|
| 176 |
+
# Zero out arcs leading to ROOTs.
|
| 177 |
+
arc_probs[torch.arange(batch_size), :, root_idxs] = 0.0
|
| 178 |
+
|
| 179 |
+
pred_arcs = []
|
| 180 |
+
for sample_idx in range(batch_size):
|
| 181 |
+
energy = arc_probs[sample_idx]
|
| 182 |
+
length = padding_mask[sample_idx].sum()
|
| 183 |
+
heads = decode_mst(energy, length)
|
| 184 |
+
# Some nodes may be isolated. Pick heads greedily in this case.
|
| 185 |
+
heads[heads <= 0] = s_arc[sample_idx].argmax(dim=1)[heads <= 0]
|
| 186 |
+
pred_arcs.append(heads)
|
| 187 |
+
|
| 188 |
+
# shape: [batch_size, seq_len]
|
| 189 |
+
pred_arcs = torch.from_numpy(np.stack(pred_arcs)).long().to(device)
|
| 190 |
+
return pred_arcs
|
| 191 |
+
|
| 192 |
+
@staticmethod
|
| 193 |
+
def calc_arc_loss(
|
| 194 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 195 |
+
gold_arcs: LongTensor # [n_arcs, 4]
|
| 196 |
+
) -> tuple[Tensor, Tensor]:
|
| 197 |
+
batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
|
| 198 |
+
return F.cross_entropy(s_arc[batch_idxs, :, to_idxs], from_idxs)
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class MultiDependencyHead(DependencyHeadBase):
|
| 202 |
+
"""
|
| 203 |
+
Enhanced UD syntax specialization that predicts multiple edges for each token.
|
| 204 |
+
"""
|
| 205 |
+
|
| 206 |
+
def predict_arcs(
|
| 207 |
+
self,
|
| 208 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 209 |
+
null_mask: BoolTensor, # [batch_size, seq_len]
|
| 210 |
+
padding_mask: BoolTensor # [batch_size, seq_len]
|
| 211 |
+
) -> Tensor:
|
| 212 |
+
# Convert scores to probabilities.
|
| 213 |
+
arc_probs = torch.sigmoid(s_arc)
|
| 214 |
+
# Find confident arcs (with prob > 0.5).
|
| 215 |
+
return arc_probs.round().long()
|
| 216 |
+
|
| 217 |
+
@staticmethod
|
| 218 |
+
def calc_arc_loss(
|
| 219 |
+
s_arc: Tensor, # [batch_size, seq_len, seq_len]
|
| 220 |
+
gold_arcs: LongTensor # [n_arcs, 4]
|
| 221 |
+
) -> Tensor:
|
| 222 |
+
batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
|
| 223 |
+
# Gold arcs but as a matrix, where matrix[i, arcs_from, arc_to] = 1.0 if arcs is present.
|
| 224 |
+
gold_arcs_matrix = torch.zeros_like(s_arc)
|
| 225 |
+
gold_arcs_matrix[batch_idxs, from_idxs, to_idxs] = 1.0
|
| 226 |
+
# Padded arcs's logits are huge negative values that doesn't contribute to the loss.
|
| 227 |
+
return F.binary_cross_entropy_with_logits(s_arc, gold_arcs_matrix)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
class DependencyClassifier(nn.Module):
|
| 231 |
+
"""
|
| 232 |
+
Dozat and Manning's biaffine dependency classifier.
|
| 233 |
+
"""
|
| 234 |
+
|
| 235 |
+
def __init__(
|
| 236 |
+
self,
|
| 237 |
+
input_size: int,
|
| 238 |
+
hidden_size: int,
|
| 239 |
+
n_rels_ud: int,
|
| 240 |
+
n_rels_eud: int,
|
| 241 |
+
activation: str,
|
| 242 |
+
dropout: float,
|
| 243 |
+
):
|
| 244 |
+
super().__init__()
|
| 245 |
+
|
| 246 |
+
self.arc_dep_mlp = nn.Sequential(
|
| 247 |
+
nn.Dropout(dropout),
|
| 248 |
+
nn.Linear(input_size, hidden_size),
|
| 249 |
+
ACT2FN[activation],
|
| 250 |
+
nn.Dropout(dropout)
|
| 251 |
+
)
|
| 252 |
+
# All mlps are equal.
|
| 253 |
+
self.arc_head_mlp = deepcopy(self.arc_dep_mlp)
|
| 254 |
+
self.rel_dep_mlp = deepcopy(self.arc_dep_mlp)
|
| 255 |
+
self.rel_head_mlp = deepcopy(self.arc_dep_mlp)
|
| 256 |
+
|
| 257 |
+
self.dependency_head_ud = DependencyHead(hidden_size, n_rels_ud)
|
| 258 |
+
self.dependency_head_eud = MultiDependencyHead(hidden_size, n_rels_eud)
|
| 259 |
+
|
| 260 |
+
def forward(
|
| 261 |
+
self,
|
| 262 |
+
embeddings: Tensor, # [batch_size, seq_len, embedding_size]
|
| 263 |
+
gold_ud: Tensor, # [n_ud_arcs, 4]
|
| 264 |
+
gold_eud: Tensor, # [n_eud_arcs, 4]
|
| 265 |
+
null_mask: Tensor, # [batch_size, seq_len]
|
| 266 |
+
padding_mask: Tensor # [batch_size, seq_len]
|
| 267 |
+
) -> dict[str, Tensor]:
|
| 268 |
+
|
| 269 |
+
# - [batch_size, seq_len, hidden_size]
|
| 270 |
+
h_arc_head = self.arc_head_mlp(embeddings)
|
| 271 |
+
h_arc_dep = self.arc_dep_mlp(embeddings)
|
| 272 |
+
h_rel_head = self.rel_head_mlp(embeddings)
|
| 273 |
+
h_rel_dep = self.rel_dep_mlp(embeddings)
|
| 274 |
+
|
| 275 |
+
# Share the h vectors between dependency and multi-dependency heads.
|
| 276 |
+
output_ud = self.dependency_head_ud(
|
| 277 |
+
h_arc_head,
|
| 278 |
+
h_arc_dep,
|
| 279 |
+
h_rel_head,
|
| 280 |
+
h_rel_dep,
|
| 281 |
+
gold_arcs=gold_ud,
|
| 282 |
+
null_mask=null_mask,
|
| 283 |
+
padding_mask=padding_mask
|
| 284 |
+
)
|
| 285 |
+
output_eud = self.dependency_head_eud(
|
| 286 |
+
h_arc_head,
|
| 287 |
+
h_arc_dep,
|
| 288 |
+
h_rel_head,
|
| 289 |
+
h_rel_dep,
|
| 290 |
+
gold_arcs=gold_eud,
|
| 291 |
+
# Ignore null mask in E-UD
|
| 292 |
+
null_mask=torch.ones_like(padding_mask),
|
| 293 |
+
padding_mask=padding_mask
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
return {
|
| 297 |
+
'preds_ud': output_ud["preds"],
|
| 298 |
+
'preds_eud': output_eud["preds"],
|
| 299 |
+
'loss_ud': output_ud["loss"],
|
| 300 |
+
'loss_eud': output_eud["loss"]
|
| 301 |
+
}
|
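A minimal shape-check sketch of how `DependencyClassifier` is meant to be called (illustration only, not part of the uploaded files; the sizes are placeholders, the import path is assumed, and the `gold_*` rows follow the `[batch_idx, from_idx, to_idx, rel_id]` layout used above):

import torch
# Assumed import path; adjust to wherever dependency_classifier.py lives.
from cobald_parser.dependency_classifier import DependencyClassifier

clf = DependencyClassifier(
    input_size=768, hidden_size=128,
    n_rels_ud=40, n_rels_eud=50,
    activation="gelu", dropout=0.1
)
embeddings = torch.randn(2, 10, 768)                  # [batch_size, seq_len, embedding_size]
gold_ud = torch.tensor([[0, 3, 1, 7], [1, 0, 2, 5]])  # [n_arcs, 4]: (batch, from, to, rel)
gold_eud = gold_ud.clone()
null_mask = torch.ones(2, 10, dtype=torch.bool)
padding_mask = torch.ones(2, 10, dtype=torch.bool)

out = clf(embeddings, gold_ud, gold_eud, null_mask, padding_mask)
# out["preds_ud"] / out["preds_eud"]: [n_pred_arcs, 4]
# out["loss_ud"] / out["loss_eud"]: scalar losses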
encoder.py
ADDED
@@ -0,0 +1,109 @@
import torch
from torch import nn
from torch import Tensor, LongTensor

from transformers import AutoTokenizer, AutoModel


class WordTransformerEncoder(nn.Module):
    """
    Encodes sentences into word-level embeddings using a pretrained MLM transformer.
    """
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # A model like BERT, RoBERTa, etc.
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, words: list[list[str]]) -> Tensor:
        """
        Build word embeddings.

        - Tokenizes input sentences into subtokens.
        - Passes the subtokens through the pre-trained transformer model.
        - Aggregates subtoken embeddings into word embeddings using mean pooling.
        """
        batch_size = len(words)

        # BPE tokenization: split words into subtokens, e.g. ['kidding'] -> ['▁ki', 'dding'].
        subtokens = self.tokenizer(
            words,
            padding=True,
            truncation=True,
            is_split_into_words=True,
            return_tensors='pt'
        )
        subtokens = subtokens.to(self.model.device)
        # Index words from 1 and reserve 0 for special subtokens (e.g. <s>, </s>, padding, etc.).
        # This numeration makes the following aggregation easier.
        words_ids = torch.stack([
            torch.tensor(
                [word_id + 1 if word_id is not None else 0 for word_id in subtokens.word_ids(batch_idx)],
                dtype=torch.long,
                device=self.model.device
            )
            for batch_idx in range(batch_size)
        ])

        # Run the model and extract subtoken embeddings from the last layer.
        subtokens_embeddings = self.model(**subtokens).last_hidden_state

        # Aggregate subtoken embeddings into word embeddings.
        # [batch_size, n_words, embedding_size]
        words_embeddings = self._aggregate_subtokens_embeddings(subtokens_embeddings, words_ids)
        return words_embeddings

    def _aggregate_subtokens_embeddings(
        self,
        subtokens_embeddings: Tensor,  # [batch_size, n_subtokens, embedding_size]
        words_ids: LongTensor          # [batch_size, n_subtokens]
    ) -> Tensor:
        """
        Aggregate subtoken embeddings into word embeddings by averaging.

        This method ensures that multiple subtokens corresponding to a single word are combined
        into a single embedding.
        """
        batch_size, n_subtokens, embedding_size = subtokens_embeddings.shape
        # The number of words in a sentence plus an "auxiliary" word in the beginning.
        n_words = torch.max(words_ids) + 1

        words_embeddings = torch.zeros(
            size=(batch_size, n_words, embedding_size),
            dtype=subtokens_embeddings.dtype,
            device=self.model.device
        )
        words_ids_expanded = words_ids.unsqueeze(-1).expand(batch_size, n_subtokens, embedding_size)

        # Use scatter_reduce_ to average embeddings of subtokens corresponding to the same word.
        # All the padding and special subtokens will be aggregated into the "auxiliary" first
        # embedding, namely into words_embeddings[:, 0, :].
        words_embeddings.scatter_reduce_(
            dim=1,
            index=words_ids_expanded,
            src=subtokens_embeddings,
            reduce="mean",
            include_self=False
        )
        # Now remove the auxiliary word in the beginning.
        words_embeddings = words_embeddings[:, 1:, :]
        return words_embeddings

    def get_embedding_size(self) -> int:
        """Returns the embedding size of the transformer model, e.g. 768 for BERT."""
        return self.model.config.hidden_size

    def get_embeddings_layer(self):
        """Returns the embeddings module."""
        return self.model.embeddings

    def get_transformer_layers(self) -> list[nn.Module]:
        """
        Return a flat list of all transformer *block* layers, excluding embeddings, poolers, etc.
        """
        layers = []
        for sub in self.model.modules():
            # Find all ModuleLists (these hold the actual block layers).
            if isinstance(sub, nn.ModuleList):
                layers.extend(list(sub))
        return layers
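A minimal usage sketch of the encoder (illustration only, not part of the uploaded files; the checkpoint name is a placeholder and the import path is assumed):

from cobald_parser.encoder import WordTransformerEncoder  # assumed path

encoder = WordTransformerEncoder("bert-base-uncased")
sentences = [["The", "cat", "sleeps", "."], ["Hi", "!"]]
embeddings = encoder(sentences)
# [2, 4, 768]: padded to the longest sentence; one mean-pooled vector per word,
# with zero vectors left at padded word positions.
print(embeddings.shape, encoder.get_embedding_size())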
mlp_classifier.py
ADDED
@@ -0,0 +1,46 @@
import torch
from torch import nn
from torch import Tensor, LongTensor

from transformers.activations import ACT2FN


class MlpClassifier(nn.Module):
    """Simple feed-forward multilayer perceptron classifier."""

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        n_classes: int,
        activation: str,
        dropout: float,
        class_weights: list[float] = None,
    ):
        super().__init__()

        self.n_classes = n_classes
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, hidden_size),
            ACT2FN[activation],
            nn.Dropout(dropout),
            nn.Linear(hidden_size, n_classes)
        )
        if class_weights is not None:
            # CrossEntropyLoss expects floating-point class weights.
            class_weights = torch.tensor(class_weights, dtype=torch.float)
        self.cross_entropy = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, embeddings: Tensor, labels: LongTensor = None) -> dict:
        logits = self.classifier(embeddings)
        # Calculate loss.
        loss = 0.0
        if labels is not None:
            # Reshape tensors to match the expected dimensions.
            loss = self.cross_entropy(
                logits.view(-1, self.n_classes),
                labels.view(-1)
            )
        # Predictions.
        preds = logits.argmax(dim=-1)
        return {'preds': preds, 'loss': loss}
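A minimal usage sketch (illustration only, not part of the uploaded files; import path and sizes are placeholders):

import torch
from cobald_parser.mlp_classifier import MlpClassifier  # assumed path

clf = MlpClassifier(input_size=768, hidden_size=256, n_classes=10,
                    activation="relu", dropout=0.1)
embeddings = torch.randn(2, 5, 768)    # [batch_size, seq_len, input_size]
labels = torch.randint(0, 10, (2, 5))  # [batch_size, seq_len]
out = clf(embeddings, labels)
# out["preds"]: [2, 5] argmax class ids; out["loss"]: scalar cross-entropy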
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8ce1af3c99b565ace5df9845a3b3531c6161957e0952cba24b236bdc24583d6f
size 1134190536
modeling_parser.py
ADDED
@@ -0,0 +1,171 @@
from torch import nn
from torch import LongTensor
from transformers import PreTrainedModel

from .configuration import CobaldParserConfig
from .encoder import WordTransformerEncoder
from .mlp_classifier import MlpClassifier
from .dependency_classifier import DependencyClassifier
from .utils import (
    build_padding_mask,
    build_null_mask,
    prepend_cls,
    remove_nulls,
    add_nulls
)


class CobaldParser(PreTrainedModel):
    """Morpho-Syntax-Semantic Parser."""

    config_class = CobaldParserConfig

    def __init__(self, config: CobaldParserConfig):
        super().__init__(config)

        self.encoder = WordTransformerEncoder(
            model_name=config.encoder_model_name
        )
        embedding_size = self.encoder.get_embedding_size()

        self.classifiers = nn.ModuleDict()
        self.classifiers["null"] = MlpClassifier(
            input_size=embedding_size,
            hidden_size=config.null_classifier_hidden_size,
            n_classes=config.consecutive_null_limit + 1,
            activation=config.activation,
            dropout=config.dropout
        )
        if "lemma_rule" in config.vocabulary:
            self.classifiers["lemma_rule"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.lemma_classifier_hidden_size,
                n_classes=len(config.vocabulary["lemma_rule"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "joint_feats" in config.vocabulary:
            self.classifiers["joint_feats"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.morphology_classifier_hidden_size,
                n_classes=len(config.vocabulary["joint_feats"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "ud_deprel" in config.vocabulary or "eud_deprel" in config.vocabulary:
            self.classifiers["syntax"] = DependencyClassifier(
                input_size=embedding_size,
                hidden_size=config.dependency_classifier_hidden_size,
                n_rels_ud=len(config.vocabulary["ud_deprel"]),
                n_rels_eud=len(config.vocabulary["eud_deprel"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "misc" in config.vocabulary:
            self.classifiers["misc"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.misc_classifier_hidden_size,
                n_classes=len(config.vocabulary["misc"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "deepslot" in config.vocabulary:
            self.classifiers["deepslot"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.deepslot_classifier_hidden_size,
                n_classes=len(config.vocabulary["deepslot"]),
                activation=config.activation,
                dropout=config.dropout
            )
        if "semclass" in config.vocabulary:
            self.classifiers["semclass"] = MlpClassifier(
                input_size=embedding_size,
                hidden_size=config.semclass_classifier_hidden_size,
                n_classes=len(config.vocabulary["semclass"]),
                activation=config.activation,
                dropout=config.dropout
            )

    def forward(
        self,
        words: list[list[str]],
        counting_masks: LongTensor = None,
        lemma_rules: LongTensor = None,
        joint_feats: LongTensor = None,
        deps_ud: LongTensor = None,
        deps_eud: LongTensor = None,
        miscs: LongTensor = None,
        deepslots: LongTensor = None,
        semclasses: LongTensor = None,
        sent_ids: list[str] = None,
        texts: list[str] = None,
        inference_mode: bool = False
    ) -> dict:
        output = {}

        # The extra [CLS] token accounts for the case when #NULL is the first token in a sentence.
        words_with_cls = prepend_cls(words)
        words_without_nulls = remove_nulls(words_with_cls)
        # Embeddings of words without nulls.
        embeddings_without_nulls = self.encoder(words_without_nulls)
        # Predict nulls.
        null_output = self.classifiers["null"](embeddings_without_nulls, counting_masks)
        output["counting_mask"] = null_output['preds']
        output["loss"] = null_output["loss"]

        # "Teacher forcing": during training, pass the original words (with gold nulls)
        # to the classification heads, so that they are trained on correct sentences.
        if inference_mode:
            # Restore predicted nulls in the original sentences.
            output["words"] = add_nulls(words, null_output["preds"])
        else:
            output["words"] = words

        # Encode words with nulls.
        # [batch_size, seq_len, embedding_size]
        embeddings = self.encoder(output["words"])

        # Predict lemmas and morphological features.
        if "lemma_rule" in self.classifiers:
            lemma_output = self.classifiers["lemma_rule"](embeddings, lemma_rules)
            output["lemma_rules"] = lemma_output['preds']
            output["loss"] += lemma_output['loss']

        if "joint_feats" in self.classifiers:
            joint_feats_output = self.classifiers["joint_feats"](embeddings, joint_feats)
            output["joint_feats"] = joint_feats_output['preds']
            output["loss"] += joint_feats_output['loss']

        # Predict syntax.
        if "syntax" in self.classifiers:
            padding_mask = build_padding_mask(output["words"], self.device)
            null_mask = build_null_mask(output["words"], self.device)
            deps_output = self.classifiers["syntax"](
                embeddings,
                deps_ud,
                deps_eud,
                null_mask,
                padding_mask
            )
            output["deps_ud"] = deps_output['preds_ud']
            output["deps_eud"] = deps_output['preds_eud']
            output["loss"] += deps_output['loss_ud'] + deps_output['loss_eud']

        # Predict miscellaneous features.
        if "misc" in self.classifiers:
            misc_output = self.classifiers["misc"](embeddings, miscs)
            output["miscs"] = misc_output['preds']
            output["loss"] += misc_output['loss']

        # Predict semantics.
        if "deepslot" in self.classifiers:
            deepslot_output = self.classifiers["deepslot"](embeddings, deepslots)
            output["deepslots"] = deepslot_output['preds']
            output["loss"] += deepslot_output['loss']

        if "semclass" in self.classifiers:
            semclass_output = self.classifiers["semclass"](embeddings, semclasses)
            output["semclasses"] = semclass_output['preds']
            output["loss"] += semclass_output['loss']

        return output
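A rough sketch of the inference entry point (illustration only, not part of the uploaded files; the config field values are placeholders, the import paths are assumed, and it assumes `CobaldParserConfig` accepts these fields as keyword arguments):

from cobald_parser.configuration import CobaldParserConfig  # assumed paths
from cobald_parser.modeling_parser import CobaldParser

config = CobaldParserConfig(
    encoder_model_name="bert-base-uncased",
    null_classifier_hidden_size=256,
    consecutive_null_limit=3,
    activation="gelu",
    dropout=0.1,
    vocabulary={}  # no optional heads configured: only null prediction runs
)
parser = CobaldParser(config)
output = parser(words=[["Hello", "world", "!"]], inference_mode=True)
# output["words"] is the sentence with predicted #NULL tokens restored;
# output["counting_mask"] holds the per-position null counts.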
pipeline.py
ADDED
@@ -0,0 +1,236 @@
from transformers import Pipeline

from src.lemmatize_helper import reconstruct_lemma


class ConlluTokenClassificationPipeline(Pipeline):
    def __init__(
        self,
        model,
        tokenizer: callable = None,
        sentenizer: callable = None,
        **kwargs
    ):
        super().__init__(model=model, **kwargs)
        self.tokenizer = tokenizer
        self.sentenizer = sentenizer

    #@override
    def _sanitize_parameters(self, output_format: str = 'list', **kwargs):
        if output_format not in ['list', 'str']:
            raise ValueError(
                f"output_format must be 'str' or 'list', not {output_format}"
            )
        # Capture output_format for postprocessing.
        return {}, {}, {'output_format': output_format}

    def preprocess(self, inputs: str) -> dict:
        if not isinstance(inputs, str):
            raise ValueError("pipeline input must be a string (text)")

        sentences = [sentence for sentence in self.sentenizer(inputs)]
        words = [
            [word for word in self.tokenizer(sentence)]
            for sentence in sentences
        ]
        # Stash for later post-processing.
        self._texts = sentences
        return {"words": words}

    def _forward(self, model_inputs: dict) -> dict:
        return self.model(**model_inputs, inference_mode=True)

    #@override
    def postprocess(self, model_outputs: dict, output_format: str) -> list[dict] | str:
        sentences = self._decode_model_output(model_outputs)
        # Format sentences into a CoNLL-U string if requested.
        if output_format == 'str':
            sentences = self._format_as_conllu(sentences)
        return sentences

    def _decode_model_output(self, model_outputs: dict) -> list[dict]:
        n_sentences = len(model_outputs["words"])

        sentences_decoded = []
        for i in range(n_sentences):

            def select_arcs(arcs, batch_idx):
                # Select arcs whose batch index == batch_idx.
                # Returns a tensor of shape [n_selected_arcs, 3].
                return arcs[arcs[:, 0] == batch_idx][:, 1:]

            # Model outputs are padded tensors, so only keep the first `n_words` labels.
            n_words = len(model_outputs["words"][i])

            optional_tags = {}
            if "lemma_rules" in model_outputs:
                optional_tags["lemma_rule_ids"] = model_outputs["lemma_rules"][i, :n_words].tolist()
            if "joint_feats" in model_outputs:
                optional_tags["joint_feats_ids"] = model_outputs["joint_feats"][i, :n_words].tolist()
            if "deps_ud" in model_outputs:
                optional_tags["deps_ud"] = select_arcs(model_outputs["deps_ud"], i).tolist()
            if "deps_eud" in model_outputs:
                optional_tags["deps_eud"] = select_arcs(model_outputs["deps_eud"], i).tolist()
            if "miscs" in model_outputs:
                optional_tags["misc_ids"] = model_outputs["miscs"][i, :n_words].tolist()
            if "deepslots" in model_outputs:
                optional_tags["deepslot_ids"] = model_outputs["deepslots"][i, :n_words].tolist()
            if "semclasses" in model_outputs:
                optional_tags["semclass_ids"] = model_outputs["semclasses"][i, :n_words].tolist()

            sentence_decoded = self._decode_sentence(
                text=self._texts[i],
                words=model_outputs["words"][i],
                **optional_tags,
            )
            sentences_decoded.append(sentence_decoded)
        return sentences_decoded

    def _decode_sentence(
        self,
        text: str,
        words: list[str],
        lemma_rule_ids: list[int] = None,
        joint_feats_ids: list[int] = None,
        deps_ud: list[list[int]] = None,
        deps_eud: list[list[int]] = None,
        misc_ids: list[int] = None,
        deepslot_ids: list[int] = None,
        semclass_ids: list[int] = None
    ) -> dict:

        # Enumerate words in the sentence, starting from 1.
        ids = self._enumerate_words(words)

        result = {
            "text": text,
            "words": words,
            "ids": ids
        }

        # Decode lemmas.
        if lemma_rule_ids:
            result["lemmas"] = [
                reconstruct_lemma(
                    word,
                    self.model.config.vocabulary["lemma_rule"][lemma_rule_id]
                )
                for word, lemma_rule_id in zip(words, lemma_rule_ids, strict=True)
            ]
        # Decode POS and features.
        if joint_feats_ids:
            upos, xpos, feats = zip(
                *[
                    self.model.config.vocabulary["joint_feats"][joint_feats_id].split('#')
                    for joint_feats_id in joint_feats_ids
                ],
                strict=True
            )
            result["upos"] = list(upos)
            result["xpos"] = list(xpos)
            result["feats"] = list(feats)
        # Decode syntax.
        renumerate_and_decode_arcs = lambda arcs, id2rel: [
            (
                # `ids` stores the inverse mapping from the internal numeration to the standard
                # CoNLL-U numeration, so simply use ids[internal_idx] to retrieve the token id
                # for an internal index.
                ids[arc_from] if arc_from != arc_to else '0',
                ids[arc_to],
                id2rel[deprel_id]
            )
            for arc_from, arc_to, deprel_id in arcs
        ]
        if deps_ud:
            result["deps_ud"] = renumerate_and_decode_arcs(
                deps_ud,
                self.model.config.vocabulary["ud_deprel"]
            )
        if deps_eud:
            result["deps_eud"] = renumerate_and_decode_arcs(
                deps_eud,
                self.model.config.vocabulary["eud_deprel"]
            )
        # Decode misc.
        if misc_ids:
            result["miscs"] = [
                self.model.config.vocabulary["misc"][misc_id]
                for misc_id in misc_ids
            ]
        # Decode semantics.
        if deepslot_ids:
            result["deepslots"] = [
                self.model.config.vocabulary["deepslot"][deepslot_id]
                for deepslot_id in deepslot_ids
            ]
        if semclass_ids:
            result["semclasses"] = [
                self.model.config.vocabulary["semclass"][semclass_id]
                for semclass_id in semclass_ids
            ]
        return result

    @staticmethod
    def _enumerate_words(words: list[str]) -> list[str]:
        ids = []
        current_id = 0
        current_null_count = 0
        for word in words:
            if word == "#NULL":
                current_null_count += 1
                ids.append(f"{current_id}.{current_null_count}")
            else:
                current_id += 1
                current_null_count = 0
                ids.append(f"{current_id}")
        return ids

    @staticmethod
    def _format_as_conllu(sentences: list[dict]) -> str:
        """
        Format a list of sentence dicts into a CoNLL-U formatted string.
        """
        formatted = []
        for sentence in sentences:
            # The first line is the text metadata.
            lines = [f"# text = {sentence['text']}"]

            id2idx = {token_id: idx for idx, token_id in enumerate(sentence['ids'])}

            # Basic syntax.
            heads = [''] * len(id2idx)
            deprels = [''] * len(id2idx)
            if "deps_ud" in sentence:
                for arc_from, arc_to, deprel in sentence['deps_ud']:
                    token_idx = id2idx[arc_to]
                    heads[token_idx] = arc_from
                    deprels[token_idx] = deprel

            # Enhanced syntax.
            deps_dicts = [{} for _ in range(len(id2idx))]
            if "deps_eud" in sentence:
                for arc_from, arc_to, deprel in sentence['deps_eud']:
                    token_idx = id2idx[arc_to]
                    deps_dicts[token_idx][arc_from] = deprel

            for idx, token_id in enumerate(sentence['ids']):
                word = sentence['words'][idx]
                lemma = sentence['lemmas'][idx] if "lemmas" in sentence else ''
                upos = sentence['upos'][idx] if "upos" in sentence else ''
                xpos = sentence['xpos'][idx] if "xpos" in sentence else ''
                feats = sentence['feats'][idx] if "feats" in sentence else ''
                deps = '|'.join(f"{head}:{rel}" for head, rel in deps_dicts[idx].items()) or '_'
                misc = sentence['miscs'][idx] if "miscs" in sentence else ''
                deepslot = sentence['deepslots'][idx] if "deepslots" in sentence else ''
                semclass = sentence['semclasses'][idx] if "semclasses" in sentence else ''
                # CoNLL-U columns.
                line = '\t'.join([
                    token_id, word, lemma, upos, xpos, feats, heads[idx],
                    deprels[idx], deps, misc, deepslot, semclass
                ])
                lines.append(line)
            formatted.append('\n'.join(lines))
        return '\n\n'.join(formatted)
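A usage sketch of the pipeline (illustration only, not part of the uploaded files; it assumes a `parser` instance from modeling_parser.py, and uses razdel's `sentenize`/`tokenize` purely as an example, since any callables that split text into sentences and sentences into words will do):

from razdel import sentenize, tokenize

pipe = ConlluTokenClassificationPipeline(
    model=parser,
    tokenizer=lambda sentence: [t.text for t in tokenize(sentence)],
    sentenizer=lambda text: [s.text for s in sentenize(text)],
)
conllu = pipe("Hello world. How are you?", output_format="str")
print(conllu)  # one "# text = ..." block per sentence, 12 tab-separated columns per token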
utils.py
ADDED
@@ -0,0 +1,69 @@
import torch
from torch import Tensor


def pad_sequences(sequences: list[Tensor], padding_value: int) -> Tensor:
    """
    Stack 1d tensors (sequences) into a single 2d tensor so that each sequence is padded on the
    right.
    """
    return torch.nn.utils.rnn.pad_sequence(sequences, padding_value=padding_value, batch_first=True)


def _build_condition_mask(sentences: list[list[str]], condition_fn: callable, device) -> Tensor:
    masks = [
        torch.tensor([condition_fn(word) for word in sentence], dtype=torch.bool, device=device)
        for sentence in sentences
    ]
    return pad_sequences(masks, padding_value=False)

def build_padding_mask(sentences: list[list[str]], device) -> Tensor:
    return _build_condition_mask(sentences, condition_fn=lambda word: True, device=device)

def build_null_mask(sentences: list[list[str]], device) -> Tensor:
    return _build_condition_mask(sentences, condition_fn=lambda word: word != "#NULL", device=device)


def pairwise_mask(masks1d: Tensor) -> Tensor:
    """
    Calculate the outer product of a mask, i.e. masks2d[:, i, j] = masks1d[:, i] & masks1d[:, j].
    """
    return masks1d[:, None, :] & masks1d[:, :, None]


# Credits: https://docs.allennlp.org/main/api/nn/util/#replace_masked_values
def replace_masked_values(tensor: Tensor, mask: Tensor, replace_with: float):
    """
    Replace all masked values in tensor with `replace_with`.
    """
    assert tensor.dim() == mask.dim(), f"tensor.dim() of {tensor.dim()} != mask.dim() of {mask.dim()}"
    tensor.masked_fill_(~mask, replace_with)


def prepend_cls(sentences: list[list[str]]) -> list[list[str]]:
    """
    Return a copy of sentences with a [CLS] token prepended.
    """
    return [["[CLS]", *sentence] for sentence in sentences]

def remove_nulls(sentences: list[list[str]]) -> list[list[str]]:
    """
    Return a copy of sentences with nulls removed.
    """
    return [[word for word in sentence if word != "#NULL"] for sentence in sentences]

def add_nulls(sentences: list[list[str]], counting_masks) -> list[list[str]]:
    """
    Return a copy of sentences with nulls restored according to the counting masks.
    """
    sentences_with_nulls = []
    for sentence, counting_mask in zip(sentences, counting_masks, strict=True):
        sentence_with_nulls = []
        assert 0 < len(counting_mask)
        # Account for the leading (CLS) auxiliary token.
        sentence_with_nulls.extend(["#NULL"] * counting_mask[0])
        for word, n_nulls_to_insert in zip(sentence, counting_mask[1:], strict=True):
            sentence_with_nulls.append(word)
            sentence_with_nulls.extend(["#NULL"] * n_nulls_to_insert)
        sentences_with_nulls.append(sentence_with_nulls)
    return sentences_with_nulls
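A small worked example of the counting-mask convention used by `add_nulls` (illustration only, not part of the uploaded files): index 0 counts nulls before the first word (the slot covered by the prepended [CLS] token), and index i+1 counts nulls inserted after word i.

sentences = [["Do", "you", "like", "it", "?"]]
# Six counts for five words: [before first word, after "Do", after "you", ...]
counting_masks = [[0, 0, 0, 1, 0, 0]]
print(add_nulls(sentences, counting_masks))
# [['Do', 'you', 'like', '#NULL', 'it', '?']]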