veryfansome commited on
Commit
0dfbd20
·
1 Parent(s): b249cec

Big cleanup

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. conll2012_dataset_maker.py +0 -120
  2. dataset/o3-mini_20250218/data-00000-of-00001.arrow +0 -3
  3. dataset/o3-mini_20250218/dataset_info.json +0 -110
  4. dataset/o3-mini_20250218/state.json +0 -13
  5. dataset/ud_augmented_1st_pass/dataset_dict.json +0 -1
  6. dataset/ud_augmented_1st_pass/test/data-00000-of-00001.arrow +0 -3
  7. dataset/ud_augmented_1st_pass/test/dataset_info.json +0 -166
  8. dataset/ud_augmented_1st_pass/test/state.json +0 -37
  9. dataset/ud_augmented_1st_pass/train/data-00000-of-00001.arrow +0 -3
  10. dataset/ud_augmented_1st_pass/train/dataset_info.json +0 -166
  11. dataset/ud_augmented_1st_pass/train/state.json +0 -37
  12. dataset/ud_augmented_1st_pass/validation/data-00000-of-00001.arrow +0 -3
  13. dataset/ud_augmented_1st_pass/validation/dataset_info.json +0 -166
  14. dataset/ud_augmented_1st_pass/validation/state.json +0 -37
  15. dataset/ud_augmented_jj_rb_types_20250320/dataset_dict.json +0 -1
  16. dataset/ud_augmented_jj_rb_types_20250320/test/data-00000-of-00001.arrow +0 -3
  17. dataset/ud_augmented_jj_rb_types_20250320/test/dataset_info.json +0 -222
  18. dataset/ud_augmented_jj_rb_types_20250320/test/state.json +0 -45
  19. dataset/ud_augmented_jj_rb_types_20250320/train/data-00000-of-00001.arrow +0 -3
  20. dataset/ud_augmented_jj_rb_types_20250320/train/dataset_info.json +0 -222
  21. dataset/ud_augmented_jj_rb_types_20250320/train/state.json +0 -45
  22. dataset/ud_augmented_jj_rb_types_20250320/validation/data-00000-of-00001.arrow +0 -3
  23. dataset/ud_augmented_jj_rb_types_20250320/validation/dataset_info.json +0 -222
  24. dataset/ud_augmented_jj_rb_types_20250320/validation/state.json +0 -45
  25. dataset/ud_transform_only_20250317/dataset_dict.json +0 -1
  26. dataset/ud_transform_only_20250317/test/data-00000-of-00001.arrow +0 -3
  27. dataset/ud_transform_only_20250317/test/dataset_info.json +0 -222
  28. dataset/ud_transform_only_20250317/test/state.json +0 -45
  29. dataset/ud_transform_only_20250317/train/data-00000-of-00001.arrow +0 -3
  30. dataset/ud_transform_only_20250317/train/dataset_info.json +0 -222
  31. dataset/ud_transform_only_20250317/train/state.json +0 -45
  32. dataset/ud_transform_only_20250317/validation/data-00000-of-00001.arrow +0 -3
  33. dataset/ud_transform_only_20250317/validation/dataset_info.json +0 -222
  34. dataset/ud_transform_only_20250317/validation/state.json +0 -45
  35. dataset_splitter.py +0 -43
  36. goemotions_predict.py +0 -63
  37. llama_dataset_maker.py +0 -194
  38. models/conll2012_en12_20250305/added_tokens.json +0 -3
  39. models/conll2012_en12_20250305/config.json +0 -135
  40. models/conll2012_en12_20250305/model.safetensors +0 -3
  41. models/conll2012_en12_20250305/special_tokens_map.json +0 -15
  42. models/conll2012_en12_20250305/spm.model +0 -3
  43. models/conll2012_en12_20250305/tokenizer.json +0 -0
  44. models/conll2012_en12_20250305/tokenizer_config.json +0 -60
  45. models/conll2012_en12_20250305/training_args.bin +0 -3
  46. models/ud_augmented_jj_rb_types_20250320_v2/README.md +0 -0
  47. models/ud_augmented_jj_rb_types_20250320_v2/added_tokens.json +0 -3
  48. models/ud_augmented_jj_rb_types_20250320_v2/config.json +0 -388
  49. models/ud_augmented_jj_rb_types_20250320_v2/model.safetensors +0 -3
  50. models/ud_augmented_jj_rb_types_20250320_v2/special_tokens_map.json +0 -51
conll2012_dataset_maker.py DELETED
@@ -1,120 +0,0 @@
1
- from datasets import load_dataset, DatasetDict
2
- import argparse
3
- import logging
4
-
5
- from utils import default_logging_config, get_uniq_training_labels, show_examples
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
-
10
- allowed_pos = {'``', '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'CC', 'CD', 'DT', 'EX',
11
- 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS',
12
- 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
13
- 'WDT', 'WP', 'WP$', 'WRB'}
14
-
15
- allowed_ner = {'O', 'B-PERSON', 'I-PERSON', 'B-NORP', 'I-NORP', 'B-FAC', 'I-FAC', 'B-ORG', 'I-ORG', 'B-GPE', 'I-GPE',
16
- 'B-LOC', 'I-LOC', 'B-PRODUCT', 'I-PRODUCT', 'B-DATE', 'I-DATE', 'B-TIME', 'I-TIME', 'B-PERCENT',
17
- 'I-PERCENT', 'B-MONEY', 'I-MONEY', 'B-QUANTITY', 'I-QUANTITY', 'B-ORDINAL', 'I-ORDINAL',
18
- 'B-CARDINAL', 'I-CARDINAL', 'B-EVENT', 'I-EVENT', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'B-LAW', 'I-LAW',
19
- 'B-LANGUAGE', 'I-LANGUAGE'}
20
-
21
-
22
- def is_valid_example(exp):
23
- """
24
- Simple filter that checks if all pos_tags are in allowed_pos
25
- and all ner_tags are in allowed_ner. If you do not want any
26
- filtering, simply return True.
27
- """
28
- # You can skip filtering by just returning True:
29
- # return True
30
-
31
- # If your dataset has multiple tokens with possibly different tags,
32
- # check them all:
33
- for pos_tag in exp["pos_tags"]:
34
- if pos_tag not in allowed_pos:
35
- return False
36
-
37
- for ner_tag in exp["ner_tags"]:
38
- if ner_tag not in allowed_ner:
39
- return False
40
-
41
- return True
42
-
43
-
44
- def transform_and_filter_dataset(onto_ds):
45
- """
46
- onto_ds is a DatasetDict with splits: 'train', 'validation', 'test', etc.
47
- Return a new DatasetDict with the same splits but:
48
- - Filter out unwanted examples
49
- - Possibly rename or remove columns
50
- - Possibly introduce new columns
51
- """
52
- pos_tag_int2str = onto_ds["train"].features["sentences"][0]["pos_tags"].feature.names
53
- ner_tag_int2str = onto_ds["train"].features["sentences"][0]["named_entities"].feature.names
54
-
55
- def flatten_ontonotes(batch):
56
- out = {
57
- "tokens": [],
58
- "ner_tags": [],
59
- "pos_tags": [],
60
- "verb_predicate": [],
61
- }
62
- for doc_id, sents in zip(batch["document_id"], batch["sentences"]):
63
- for sent_info in sents:
64
- out["tokens"].append(sent_info["words"])
65
- out["ner_tags"].append([ner_tag_int2str[i] for i in sent_info["named_entities"]])
66
- out["pos_tags"].append([pos_tag_int2str[i] for i in sent_info["pos_tags"]])
67
- out["verb_predicate"].append([("Yes" if s else "O") for s in sent_info["predicate_lemmas"]])
68
- return out
69
-
70
- new_splits = {}
71
- for split_name, split_ds in onto_ds.items():
72
- # Flatten
73
- flattened_ds = split_ds.map(
74
- flatten_ontonotes,
75
- batched=True,
76
- remove_columns=["sentences", "document_id"], # remove old columns
77
- )
78
-
79
- # Filter out invalid examples
80
- filtered_split = flattened_ds.filter(is_valid_example)
81
- new_splits[split_name] = filtered_split
82
-
83
- return DatasetDict(new_splits)
84
-
85
-
86
- # ------------------------------------------------------------------------------
87
- # 6) Main Script
88
- # ------------------------------------------------------------------------------
89
- if __name__ == "__main__":
90
- import logging.config
91
-
92
- arg_parser = argparse.ArgumentParser(description="Process OntoNotes CoNLL-2012 (English).")
93
- arg_parser.add_argument("--log-level", help="Log level.", action="store",
94
- default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])
95
- arg_parser.add_argument("--save", help="Save final dataset to disk.", action="store_true", default=False)
96
- arg_parser.add_argument("--save-path", help="Where to save final dataset.", default="./conll2012_en12_training_data")
97
- arg_parser.add_argument("--show", help="Show examples: <split>/<col>/<label>/<count>", default=None)
98
- args = arg_parser.parse_args()
99
-
100
- logging.config.dictConfig(default_logging_config)
101
- logger.setLevel(args.log_level)
102
-
103
- # 6a) Load OntoNotes (English) from the 'conll2012_ontonotesv5' script
104
- # This usually yields "train", "validation", "test" splits.
105
- ontonotes_ds = load_dataset("conll2012_ontonotesv5", "english_v12")
106
- logger.info(f"Splits loaded: {ontonotes_ds}")
107
-
108
- # 6b) Transform & Filter
109
- final_dataset = transform_and_filter_dataset(ontonotes_ds)
110
-
111
- # 6d) Show examples if user requested
112
- show_examples(final_dataset, args.show)
113
-
114
- # 6e) Log unique training labels (POS/NER) if you like
115
- get_uniq_training_labels(final_dataset)
116
-
117
- # 6f) Save to disk if requested
118
- if args.save:
119
- final_dataset.save_to_disk(args.save_path)
120
- logger.info("Saved dataset to %s", args.save_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/o3-mini_20250218/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d78eaab378f462e45adbf026f13c3d5b9a289ddea75d399dbbf0c12b0c25c11e
3
- size 40179808
 
 
 
 
dataset/o3-mini_20250218/dataset_info.json DELETED
@@ -1,110 +0,0 @@
1
- {
2
- "citation": "",
3
- "description": "",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "adj": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "adv": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "det": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "enc": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "func": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "misc": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "ner1": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "ner2": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "noun": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "pronoun": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "punct": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "verb": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "wh": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- }
107
- },
108
- "homepage": "",
109
- "license": ""
110
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/o3-mini_20250218/state.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "4a79c58b9023cf85",
8
- "_format_columns": null,
9
- "_format_kwargs": {},
10
- "_format_type": null,
11
- "_output_all_columns": false,
12
- "_split": null
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/dataset_dict.json DELETED
@@ -1 +0,0 @@
1
- {"splits": ["test", "train", "validation"]}
 
 
dataset/ud_augmented_1st_pass/test/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:72ff4b6e757e35c614da44f3b7c13fbde97c3411ddb9417bf9dc3586b82d8a13
3
- size 7444864
 
 
 
 
dataset/ud_augmented_1st_pass/test/dataset_info.json DELETED
@@ -1,166 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "Case": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Definite": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Degree": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Gender": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Mood": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "NumType": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "Number": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Person": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Poss": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Reflex": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Tense": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Typo": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "VerbForm": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "Emotion": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjType": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "NerLocation": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "NerOrganization": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "NerPerson": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- }
163
- },
164
- "homepage": "https://universaldependencies.org/",
165
- "license": ""
166
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/test/state.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "9e6189ebb4d00723",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "Case",
14
- "Definite",
15
- "Degree",
16
- "Gender",
17
- "Mood",
18
- "NumType",
19
- "Number",
20
- "Person",
21
- "Poss",
22
- "PronType",
23
- "Reflex",
24
- "Tense",
25
- "Typo",
26
- "VerbForm",
27
- "Emotion",
28
- "AdjType",
29
- "NerLocation",
30
- "NerOrganization",
31
- "NerPerson"
32
- ],
33
- "_format_kwargs": {},
34
- "_format_type": null,
35
- "_output_all_columns": false,
36
- "_split": null
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/train/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:db9cdfd62c504e0d4ed6ecc5ca28a067785440611e7915f0a42997e71e8cb27b
3
- size 30680400
 
 
 
 
dataset/ud_augmented_1st_pass/train/dataset_info.json DELETED
@@ -1,166 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "Case": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Definite": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Degree": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Gender": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Mood": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "NumType": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "Number": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Person": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Poss": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Reflex": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Tense": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Typo": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "VerbForm": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "Emotion": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjType": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "NerLocation": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "NerOrganization": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "NerPerson": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- }
163
- },
164
- "homepage": "https://universaldependencies.org/",
165
- "license": ""
166
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/train/state.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "ad7a0ecbd5effa06",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "Case",
14
- "Definite",
15
- "Degree",
16
- "Gender",
17
- "Mood",
18
- "NumType",
19
- "Number",
20
- "Person",
21
- "Poss",
22
- "PronType",
23
- "Reflex",
24
- "Tense",
25
- "Typo",
26
- "VerbForm",
27
- "Emotion",
28
- "AdjType",
29
- "NerLocation",
30
- "NerOrganization",
31
- "NerPerson"
32
- ],
33
- "_format_kwargs": {},
34
- "_format_type": null,
35
- "_output_all_columns": false,
36
- "_split": null
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/validation/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ae4a4bf4daf5bcee6e968dcdef9b2d303d781bfac1b8323a20072c53aff7b03
3
- size 4514168
 
 
 
 
dataset/ud_augmented_1st_pass/validation/dataset_info.json DELETED
@@ -1,166 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "Case": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Definite": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Degree": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Gender": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Mood": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "NumType": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "Number": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Person": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Poss": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Reflex": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Tense": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Typo": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "VerbForm": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "Emotion": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjType": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "NerLocation": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "NerOrganization": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "NerPerson": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- }
163
- },
164
- "homepage": "https://universaldependencies.org/",
165
- "license": ""
166
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_1st_pass/validation/state.json DELETED
@@ -1,37 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "9ba8b77501d6e582",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "Case",
14
- "Definite",
15
- "Degree",
16
- "Gender",
17
- "Mood",
18
- "NumType",
19
- "Number",
20
- "Person",
21
- "Poss",
22
- "PronType",
23
- "Reflex",
24
- "Tense",
25
- "Typo",
26
- "VerbForm",
27
- "Emotion",
28
- "AdjType",
29
- "NerLocation",
30
- "NerOrganization",
31
- "NerPerson"
32
- ],
33
- "_format_kwargs": {},
34
- "_format_type": null,
35
- "_output_all_columns": false,
36
- "_split": null
37
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/dataset_dict.json DELETED
@@ -1 +0,0 @@
1
- {"splits": ["test", "train", "validation"]}
 
 
dataset/ud_augmented_jj_rb_types_20250320/test/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:febd70182c55d126b9faaa6572684cf25ef41fa4191aa59e26d0789985bb9ec7
3
- size 9567448
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/test/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Tense": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "VerbForm": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "AdjHead": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "AdvHead": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "CdHead": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "ConjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "DetHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "InHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "MdHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "NounHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "PronHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "VerbHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "WhHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "AdjType": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "AdjGrad": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "AdjPos": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "AdvType": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/test/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "0172d8a8a781517f",
8
- "_format_columns": [
9
- "AdjGrad",
10
- "AdjHead",
11
- "AdjPos",
12
- "AdjType",
13
- "AdvHead",
14
- "AdvType",
15
- "Case",
16
- "CdHead",
17
- "ConjHead",
18
- "Definite",
19
- "Degree",
20
- "DetHead",
21
- "Gender",
22
- "InHead",
23
- "MdHead",
24
- "Mood",
25
- "NounHead",
26
- "NumType",
27
- "Number",
28
- "Person",
29
- "PronHead",
30
- "PronType",
31
- "Tense",
32
- "VerbForm",
33
- "VerbHead",
34
- "WhHead",
35
- "deprel",
36
- "pos",
37
- "text",
38
- "tokens",
39
- "xpos"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/train/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c0bcd7cd11b46e27a48db9579fb9c71767971617c690f53cc5535c6e4cc39c4
3
- size 38694752
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/train/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Tense": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "VerbForm": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "AdjHead": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "AdvHead": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "CdHead": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "ConjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "DetHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "InHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "MdHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "NounHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "PronHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "VerbHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "WhHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "AdjType": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "AdjGrad": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "AdjPos": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "AdvType": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/train/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "e4c987937dbd5ac4",
8
- "_format_columns": [
9
- "AdjGrad",
10
- "AdjHead",
11
- "AdjPos",
12
- "AdjType",
13
- "AdvHead",
14
- "AdvType",
15
- "Case",
16
- "CdHead",
17
- "ConjHead",
18
- "Definite",
19
- "Degree",
20
- "DetHead",
21
- "Gender",
22
- "InHead",
23
- "MdHead",
24
- "Mood",
25
- "NounHead",
26
- "NumType",
27
- "Number",
28
- "Person",
29
- "PronHead",
30
- "PronType",
31
- "Tense",
32
- "VerbForm",
33
- "VerbHead",
34
- "WhHead",
35
- "deprel",
36
- "pos",
37
- "text",
38
- "tokens",
39
- "xpos"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/validation/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba7c69aecada9238046f52b3bbffcec94c5ab2a7a23106803453578507a36d1c
3
- size 5724752
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/validation/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "PronType": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "Tense": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "VerbForm": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "AdjHead": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "AdvHead": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "CdHead": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "ConjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "DetHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "InHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "MdHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "NounHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "PronHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "VerbHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "WhHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "AdjType": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "AdjGrad": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "AdjPos": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "AdvType": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_augmented_jj_rb_types_20250320/validation/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "25f7c9b3bbb3e13e",
8
- "_format_columns": [
9
- "AdjGrad",
10
- "AdjHead",
11
- "AdjPos",
12
- "AdjType",
13
- "AdvHead",
14
- "AdvType",
15
- "Case",
16
- "CdHead",
17
- "ConjHead",
18
- "Definite",
19
- "Degree",
20
- "DetHead",
21
- "Gender",
22
- "InHead",
23
- "MdHead",
24
- "Mood",
25
- "NounHead",
26
- "NumType",
27
- "Number",
28
- "Person",
29
- "PronHead",
30
- "PronType",
31
- "Tense",
32
- "VerbForm",
33
- "VerbHead",
34
- "WhHead",
35
- "deprel",
36
- "pos",
37
- "text",
38
- "tokens",
39
- "xpos"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/dataset_dict.json DELETED
@@ -1 +0,0 @@
1
- {"splits": ["test", "train", "validation"]}
 
 
dataset/ud_transform_only_20250317/test/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7de427f099e36be3e7ab659b1d6522afa30137f6044a1254ec9c36eb3828929a
3
- size 9862544
 
 
 
 
dataset/ud_transform_only_20250317/test/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "Poss": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "PronType": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Reflex": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Tense": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "Typo": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "VerbForm": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "AdvHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "CdHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "ConjHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "DetHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "InHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "ModalHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "NounHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "PronounHead": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "ToHead": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "VerbHead": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "WhHead": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/test/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "12f8503a2d9f38d0",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "pos",
14
- "Case",
15
- "Definite",
16
- "Degree",
17
- "Gender",
18
- "Mood",
19
- "NumType",
20
- "Number",
21
- "Person",
22
- "Poss",
23
- "PronType",
24
- "Reflex",
25
- "Tense",
26
- "Typo",
27
- "VerbForm",
28
- "AdjHead",
29
- "AdvHead",
30
- "CdHead",
31
- "ConjHead",
32
- "DetHead",
33
- "InHead",
34
- "ModalHead",
35
- "NounHead",
36
- "PronounHead",
37
- "ToHead",
38
- "VerbHead",
39
- "WhHead"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/train/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc5931b824b13f2ac2bb76da737c46256137ef66365e90672f89f45a7b2b15bb
3
- size 40720208
 
 
 
 
dataset/ud_transform_only_20250317/train/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "Poss": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "PronType": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Reflex": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Tense": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "Typo": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "VerbForm": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "AdvHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "CdHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "ConjHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "DetHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "InHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "ModalHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "NounHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "PronounHead": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "ToHead": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "VerbHead": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "WhHead": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/train/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "fe61d5df50a17d24",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "pos",
14
- "Case",
15
- "Definite",
16
- "Degree",
17
- "Gender",
18
- "Mood",
19
- "NumType",
20
- "Number",
21
- "Person",
22
- "Poss",
23
- "PronType",
24
- "Reflex",
25
- "Tense",
26
- "Typo",
27
- "VerbForm",
28
- "AdjHead",
29
- "AdvHead",
30
- "CdHead",
31
- "ConjHead",
32
- "DetHead",
33
- "InHead",
34
- "ModalHead",
35
- "NounHead",
36
- "PronounHead",
37
- "ToHead",
38
- "VerbHead",
39
- "WhHead"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/validation/data-00000-of-00001.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:82163cd7f5263d89a61d25886b3c5b71bfdc5fae4f778d5e545e5cfe914eb6bc
3
- size 5980624
 
 
 
 
dataset/ud_transform_only_20250317/validation/dataset_info.json DELETED
@@ -1,222 +0,0 @@
1
- {
2
- "citation": "\\\n@misc{11234/1-3424,\ntitle = {Universal Dependencies 2.7},\nauthor = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Ackermann, Elia and Aepli, No{\\\"e}mi and Aghaei, Hamid and Agi{\\'c}, {\\v Z}eljko and Ahmadi, Amir and Ahrenberg, Lars and Ajede, Chika Kennedy and Aleksandravi{\\v c}i{\\=u}t{\\.e}, Gabriel{\\.e} and Alfina, Ika and Antonsen, Lene and Aplonova, Katya and Aquino, Angelina and Aragon, Carolina and Aranzabe, Maria Jesus and Arnard{\\'o}ttir, {\\t H}{\\'o}runn and Arutie, Gashaw and Arwidarasti, Jessica Naraiswari and Asahara, Masayuki and Ateyah, Luma and Atmaca, Furkan and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Balasubramani, Keerthana and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bedir, Seyyit Talha and Bengoetxea, Kepa and Berk, G{\\\"o}zde and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\\.e}, Agn{\\.e} and Bjarnad{\\'o}ttir, Krist{\\'{\\i}}n and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\\\"{\\i}}c and Borges V{\\\"o}lker, Emanuel and B{\\\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiroglu Eryigit, Gulsen and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and Ceplo, Slavomir and Cetin, Savas and Cetinoglu, Ozlem and Chalub, Fabricio and Chi, Ethan and Cho, Yongseok and Choi, Jinho and Chun, Jayeol and Cignarella, Alessandra T. 
and Cinkova, Silvie and Collomb, Aurelie and Coltekin, Cagr{\\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and Derin, Mehmet Oguz and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dinakaramani, Arawinda and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Tomaz and Etienne, Aline and Evelyn, Wograine and Facundes, Sidney and Farkas, Rich{\\'a}rd and Fernanda, Mar{\\'{\\i}}lia and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\\'a}udia and Fujita, Kazunori and Gajdosov{\\'a}, Katar{\\'{\\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\\\"a}rdenfors, Moa and Garza, Sebastian and Gerardi, Fabr{\\'{\\i}}cio Ferraz and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\\\"o}k{\\i}rmak, Memduh and Goldberg, Yoav and G{\\'o}mez Guinovart, Xavier and Gonz{\\'a}lez Saavedra,\nBerta and Grici{\\=u}t{\\.e}, Bernadeta and Grioni, Matias and Grobol, Lo{\\\"{\\i}}c and Gr{\\=u}z{\\={\\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\\'e}line and G{\\\"u}ng{\\\"o}r, Tunga and Habash, Nizar and Hafsteinsson, Hinrik and Haji{\\v c}, Jan and Haji{\\v c} jr., Jan and H{\\\"a}m{\\\"a}l{\\\"a}inen, Mika and H{\\`a} M{\\~y}, Linh and Han, Na-Rae and Hanifmuti, Muhammad Yudistira and Hardwick, Sam and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hellwig, Oliver and Hennig, Felix and Hladk{\\'a}, Barbora and Hlav{\\'a}{\\v c}ov{\\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Huber, Eva and Hwang, Jena and Ikeda, Takumi and Ingason, Anton Karl and Ion, Radu and Irimia, Elena and Ishola, {\\d O}l{\\'a}j{\\'{\\i}}d{\\'e} and Jel{\\'{\\i}}nek, Tom{\\'a}{\\v s} and Johannsen, Anders and J{\\'o}nsd{\\'o}ttir, Hildur and J{\\o}rgensen, Fredrik and 
Juutinen, Markus and K, Sarveswaran and Ka{\\c s}{\\i}kara, H{\\\"u}ner and Kaasen, Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\\'a}, V{\\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\\\"o}hn, Arne and K{\\\"o}ksal, Abdullatif and Kopacewicz, Kamil and Korkiakangas, Timo and Kotsyba, Natalia and Kovalevskait{\\.e}, Jolanta and Krek, Simon and Krishnamurthy, Parameswari and Kwak, Sookyoung and Laippala, Veronika and Lam, Lucia and Lambertino, Lorenzo and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\\^e} H{\\`{\\^o}}ng, Ph\u01b0\u01a1ng and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Levina, Maria and Li, Cheuk Ying and Li, Josie and Li, Keying and Li, Yuan and Lim, {KyungTae} and Linden, Krister and Ljubesic, Nikola and Loginova, Olga and Luthfi, Andry and Luukko, Mikko and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and Maranduc, Catalina and Marcek, David and Marheinecke, Katrin and Mart{\\'{\\i}}nez Alonso, H{\\'e}ctor and Martins, Andr{\\'e} and Masek, Jan and Matsuda, Hiroshi and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendonca, Gustavo and Miekka, Niko and Mischenkova, Karina and Misirpashayeva, Margarita and Missil{\\\"a}, Anna and Mititelu, Catalin and Mitrofan, Maria and Miyao, Yusuke and Mojiri Foroushani, {AmirHossein} and Moloodi, Amirsaeid and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Mori, Shinsuke and Morioka, Tomohiko and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\\\"u}{\\\"u}risep, Kaili and Nainwani, Pinkey and Nakhl{\\'e}, Mariam and Navarro Hor{\\~n}iacek, Juan Ignacio and Nedoluzhko,\nAnna and Ne{\\v 
s}pore-B{\\=e}rzkalne, Gunta and Nguy{\\~{\\^e}}n Th{\\d i}, L\u01b0\u01a1ng and Nguy{\\~{\\^e}}n Th{\\d i} Minh, Huy{\\`{\\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nourian, Alireza and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. and Ol{\\'u}{\\`o}kun, Ad{\\'e}day{\\d o}\u0300 and Omura, Mai and Onwuegbuzia, Emeka and Osenova, Petya and {\\\"O}stling, Robert and {\\O}vrelid, Lilja and {\\\"O}zate{\\c s}, {\\c S}aziye Bet{\\\"u}l and {\\\"O}zg{\\\"u}r, Arzucan and {\\\"O}zt{\\\"u}rk Ba{\\c s}aran, Balk{\\i}z and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\\L}api{\\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perkova, Natalia and Perrier, Guy and Petrov, Slav and Petrova, Daria and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalnina, Lauma and Pr{\\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\\\"a}{\\\"a}bis, Andriela and Rademaker, Alexandre and Rama, Taraka and Ramasamy, Loganathan and Ramisch, Carlos and Rashel, Fam and Rasooli, Mohammad Sadegh and Ravishankar, Vinit and Real, Livy and Rebeja, Petru and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\\ss}ler, Michael and Rimkut{\\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and R{\\\"o}gnvaldsson, Eir{\\'{\\i}}kur and Romanenko, Mykhailo and Rosa, Rudolf and Ro\u0219ca, Valentin and Rovati, Davide and Rudina, Olga and Rueter, Jack and R{\\'u}narsson, Kristjan and Sadde, Shoval and Safari, Pegah and Sagot, Benoit and Sahala, Aleksi and Saleh, Shadi and Salomoni, Alessio and Samardzi{\\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\\\"a}rg,\nDage and Saul{\\={\\i}}te, Baiba and Sawanakunanon, Yanin and Scannell, Kevin and Scarlata, 
Salvatore and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Sigur\u00f0sson, Einar Freyr and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\\'o}, Katalin and {\\v S}imkov{\\'a}, M{\\'a}ria and Simov, Kiril and Skachedubova, Maria and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Steingr{\\'{\\i}}msson, Stein{\\t h}{\\'o}r and Stella, Antonio and Straka, Milan and Strickland, Emmett and Strnadov{\\'a}, Jana and Suhr, Alane and Sulestio, Yogi Lesmana and Sulubacak, Umut and Suzuki, Shingo and Sz{\\'a}nt{\\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tan, Mary Ann C. and Tanaka, Takaaki and Tella, Samson and Tellier, Isabelle and Thomas, Guillaume and Torga, Liisi and Toska, Marsida and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and T{\\\"u}rk, Utku and Tyers, Francis and Uematsu, Sumire and Untilov, Roman and Uresov{\\'a}, Zdenka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wakasa, Aya and Wallenberg, Joel C. and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Widmer, Paul and Williams, Seyi and Wir{\\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\\'o}blewska, Alina and Yako, Mary and Yamashita, Kayo and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. 
and Yu, Zhuoran and Zabokrtsk{\\'y}, Zdenek and Zahra, Shorouq and Zeldes, Amir and Zhu, Hanzhi and Zhuravleva, Anna},\nurl = {http://hdl.handle.net/11234/1-3424},\nnote = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\\'U}FAL}), Faculty of Mathematics and Physics, Charles University},\ncopyright = {Licence Universal Dependencies v2.7},\nyear = {2020} }",
3
- "description": "Universal Dependencies is a project that seeks to develop cross-linguistically consistent treebank annotation for many languages, with the goal of facilitating multilingual parser development, cross-lingual learning, and parsing research from a language typology perspective. The annotation scheme is based on (universal) Stanford dependencies (de Marneffe et al., 2006, 2008, 2014), Google universal part-of-speech tags (Petrov et al., 2012), and the Interset interlingua for morphosyntactic tagsets (Zeman, 2008).",
4
- "features": {
5
- "text": {
6
- "dtype": "string",
7
- "_type": "Value"
8
- },
9
- "tokens": {
10
- "feature": {
11
- "dtype": "string",
12
- "_type": "Value"
13
- },
14
- "_type": "Sequence"
15
- },
16
- "xpos": {
17
- "feature": {
18
- "dtype": "string",
19
- "_type": "Value"
20
- },
21
- "_type": "Sequence"
22
- },
23
- "deprel": {
24
- "feature": {
25
- "dtype": "string",
26
- "_type": "Value"
27
- },
28
- "_type": "Sequence"
29
- },
30
- "pos": {
31
- "feature": {
32
- "dtype": "string",
33
- "_type": "Value"
34
- },
35
- "_type": "Sequence"
36
- },
37
- "Case": {
38
- "feature": {
39
- "dtype": "string",
40
- "_type": "Value"
41
- },
42
- "_type": "Sequence"
43
- },
44
- "Definite": {
45
- "feature": {
46
- "dtype": "string",
47
- "_type": "Value"
48
- },
49
- "_type": "Sequence"
50
- },
51
- "Degree": {
52
- "feature": {
53
- "dtype": "string",
54
- "_type": "Value"
55
- },
56
- "_type": "Sequence"
57
- },
58
- "Gender": {
59
- "feature": {
60
- "dtype": "string",
61
- "_type": "Value"
62
- },
63
- "_type": "Sequence"
64
- },
65
- "Mood": {
66
- "feature": {
67
- "dtype": "string",
68
- "_type": "Value"
69
- },
70
- "_type": "Sequence"
71
- },
72
- "NumType": {
73
- "feature": {
74
- "dtype": "string",
75
- "_type": "Value"
76
- },
77
- "_type": "Sequence"
78
- },
79
- "Number": {
80
- "feature": {
81
- "dtype": "string",
82
- "_type": "Value"
83
- },
84
- "_type": "Sequence"
85
- },
86
- "Person": {
87
- "feature": {
88
- "dtype": "string",
89
- "_type": "Value"
90
- },
91
- "_type": "Sequence"
92
- },
93
- "Poss": {
94
- "feature": {
95
- "dtype": "string",
96
- "_type": "Value"
97
- },
98
- "_type": "Sequence"
99
- },
100
- "PronType": {
101
- "feature": {
102
- "dtype": "string",
103
- "_type": "Value"
104
- },
105
- "_type": "Sequence"
106
- },
107
- "Reflex": {
108
- "feature": {
109
- "dtype": "string",
110
- "_type": "Value"
111
- },
112
- "_type": "Sequence"
113
- },
114
- "Tense": {
115
- "feature": {
116
- "dtype": "string",
117
- "_type": "Value"
118
- },
119
- "_type": "Sequence"
120
- },
121
- "Typo": {
122
- "feature": {
123
- "dtype": "string",
124
- "_type": "Value"
125
- },
126
- "_type": "Sequence"
127
- },
128
- "VerbForm": {
129
- "feature": {
130
- "dtype": "string",
131
- "_type": "Value"
132
- },
133
- "_type": "Sequence"
134
- },
135
- "AdjHead": {
136
- "feature": {
137
- "dtype": "string",
138
- "_type": "Value"
139
- },
140
- "_type": "Sequence"
141
- },
142
- "AdvHead": {
143
- "feature": {
144
- "dtype": "string",
145
- "_type": "Value"
146
- },
147
- "_type": "Sequence"
148
- },
149
- "CdHead": {
150
- "feature": {
151
- "dtype": "string",
152
- "_type": "Value"
153
- },
154
- "_type": "Sequence"
155
- },
156
- "ConjHead": {
157
- "feature": {
158
- "dtype": "string",
159
- "_type": "Value"
160
- },
161
- "_type": "Sequence"
162
- },
163
- "DetHead": {
164
- "feature": {
165
- "dtype": "string",
166
- "_type": "Value"
167
- },
168
- "_type": "Sequence"
169
- },
170
- "InHead": {
171
- "feature": {
172
- "dtype": "string",
173
- "_type": "Value"
174
- },
175
- "_type": "Sequence"
176
- },
177
- "ModalHead": {
178
- "feature": {
179
- "dtype": "string",
180
- "_type": "Value"
181
- },
182
- "_type": "Sequence"
183
- },
184
- "NounHead": {
185
- "feature": {
186
- "dtype": "string",
187
- "_type": "Value"
188
- },
189
- "_type": "Sequence"
190
- },
191
- "PronounHead": {
192
- "feature": {
193
- "dtype": "string",
194
- "_type": "Value"
195
- },
196
- "_type": "Sequence"
197
- },
198
- "ToHead": {
199
- "feature": {
200
- "dtype": "string",
201
- "_type": "Value"
202
- },
203
- "_type": "Sequence"
204
- },
205
- "VerbHead": {
206
- "feature": {
207
- "dtype": "string",
208
- "_type": "Value"
209
- },
210
- "_type": "Sequence"
211
- },
212
- "WhHead": {
213
- "feature": {
214
- "dtype": "string",
215
- "_type": "Value"
216
- },
217
- "_type": "Sequence"
218
- }
219
- },
220
- "homepage": "https://universaldependencies.org/",
221
- "license": ""
222
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset/ud_transform_only_20250317/validation/state.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "_data_files": [
3
- {
4
- "filename": "data-00000-of-00001.arrow"
5
- }
6
- ],
7
- "_fingerprint": "59aa93667916292e",
8
- "_format_columns": [
9
- "text",
10
- "tokens",
11
- "xpos",
12
- "deprel",
13
- "pos",
14
- "Case",
15
- "Definite",
16
- "Degree",
17
- "Gender",
18
- "Mood",
19
- "NumType",
20
- "Number",
21
- "Person",
22
- "Poss",
23
- "PronType",
24
- "Reflex",
25
- "Tense",
26
- "Typo",
27
- "VerbForm",
28
- "AdjHead",
29
- "AdvHead",
30
- "CdHead",
31
- "ConjHead",
32
- "DetHead",
33
- "InHead",
34
- "ModalHead",
35
- "NounHead",
36
- "PronounHead",
37
- "ToHead",
38
- "VerbHead",
39
- "WhHead"
40
- ],
41
- "_format_kwargs": {},
42
- "_format_type": null,
43
- "_output_all_columns": false,
44
- "_split": null
45
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dataset_splitter.py DELETED
@@ -1,43 +0,0 @@
1
- from datasets import DatasetDict, load_from_disk
2
- import argparse
3
-
4
- from openai_dataset_maker import features
5
-
6
def has_all_valid_labels(exp):
    """Return True when every label in the example is a known label.

    The "text" and "tokens" columns are not label columns and are ignored.
    Every other column is checked label-by-label against ``features[column]``.
    """
    non_label_columns = ("text", "tokens")
    return all(
        label in features[name]
        for name, labels in exp.items()
        if name not in non_label_columns
        for label in labels
    )
14
-
15
def is_evenly_shaped(exp):
    """Return True when all non-"text" columns of the example have the same length.

    An example whose only column is "text" (or that has no columns at all)
    yields no lengths and is therefore reported as not evenly shaped.
    """
    lengths = {len(values) for name, values in exp.items() if name != "text"}
    return len(lengths) == 1
22
-
23
-
24
if __name__ == '__main__':
    # Command-line entry point: load a dataset saved on disk, drop malformed
    # examples, and write a train/validation/test DatasetDict.
    arg_parser = argparse.ArgumentParser(
        description="Filter a saved dataset and split it into train/validation/test.")
    arg_parser.add_argument("data_path", help="Load dataset from specified path.",
                            action="store")
    arg_parser.add_argument("--save-path", help="Save final dataset to specified path.",
                            action="store", default="./training_data")
    args = arg_parser.parse_args()

    loaded_dataset = load_from_disk(args.data_path)
    # Keep only examples whose columns line up and whose labels are all known.
    loaded_dataset = loaded_dataset.filter(is_evenly_shaped)
    loaded_dataset = loaded_dataset.filter(has_all_valid_labels)

    # Both splits are seeded so the resulting partitions are reproducible
    # from run to run (the second split was previously unseeded).
    first_split = loaded_dataset.train_test_split(shuffle=True, seed=42, test_size=0.09)
    second_split = first_split["train"].train_test_split(shuffle=True, seed=42, test_size=0.1)

    new_ds = DatasetDict()
    new_ds["test"] = first_split["test"]
    new_ds["train"] = second_split["train"]
    new_ds["validation"] = second_split["test"]
    new_ds.save_to_disk(args.save_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
goemotions_predict.py DELETED
@@ -1,63 +0,0 @@
1
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
2
- import numpy as np
3
- import torch
4
-
5
- from utils import get_torch_device
6
-
7
-
8
class GoEmotionsPredictor:
    """Multi-label emotion predictor built on a sequence-classification checkpoint.

    Label names and decision thresholds are read from optional fields on the
    model config (``label_names``, ``per_label_thresholds``,
    ``best_global_threshold``); the global threshold defaults to 0.65.
    """

    def __init__(self, model_name_or_path: str, subfolder=None):
        # NOTE(review): transformers' own default for ``subfolder`` appears to be
        # the empty string; normalise None so local-path joins cannot see None.
        subfolder = subfolder or ""
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, subfolder=subfolder)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name_or_path, subfolder=subfolder)

        self.label_names = getattr(self.model.config, "label_names", None)
        self.per_label_thresh = getattr(self.model.config, "per_label_thresholds", None)
        self.global_thresh = getattr(self.model.config, "best_global_threshold", 0.65)

        self.device = get_torch_device()
        self.model.to(self.device)
        self.model.eval()

    def predict(self, texts, use_per_label=True):
        """
        Args:
            texts (list[str] | str): Raw text strings to classify. A single
                string is treated as a one-element batch.
            use_per_label (bool): If True, apply per-label thresholds when the
                checkpoint provides them; otherwise (or if False) apply the
                global threshold.
        Returns:
            A list of dicts, one per input text, each with
            {"text": ..., "emotions": [...]}. Empty input returns [].
        """
        # A bare string would otherwise be iterated character by character below.
        texts = [texts] if isinstance(texts, str) else list(texts)
        if not texts:
            return []

        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )
        # Move encodings to same device as the model
        encodings = {k: v.to(self.device) for k, v in encodings.items()}

        # 1) Run the model to get logits
        with torch.no_grad():
            outputs = self.model(**encodings)
        logits = outputs.logits
        probs = torch.sigmoid(logits).cpu().numpy()

        # 2) Determine predictions by thresholding. Per-label thresholds are
        #    optional on the config, so fall back to the global threshold
        #    instead of comparing against ``np.array(None)``.
        if use_per_label and self.per_label_thresh is not None:
            cutoff = np.asarray(self.per_label_thresh, dtype=float)
        else:
            cutoff = self.global_thresh
        preds = probs >= cutoff

        # 3) Convert the boolean mask to label names
        results = []
        for text, row_preds in zip(texts, preds):
            predicted_labels = [self.label_names[j] for j in np.flatnonzero(row_preds)]
            results.append({"text": text, "emotions": predicted_labels})

        return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llama_dataset_maker.py DELETED
@@ -1,194 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, Pipeline, pipeline
3
- import logging
4
- import torch
5
-
6
- from utils import get_torch_device
7
-
8
- logger = logging.getLogger(__name__)
9
-
10
-
11
class ChatModel(ABC):
    """Abstract interface for models that answer a chat-style conversation."""

    @abstractmethod
    def generate(self, messages: list[dict[str, str]]) -> dict[str, str]:
        """Produce a reply for ``messages``; concrete subclasses must override this."""
16
-
17
-
18
class AdjLabeler:
    """Asks a chat model, token by token, whether each token is an adjective."""

    def __init__(self, model: ChatModel):
        self.model = model

    @staticmethod
    def _to_label(assistant_message):
        """Map a model reply to "yes" or "no" (anything not starting with "yes" is "no")."""
        if isinstance(assistant_message, dict):
            content = assistant_message.get("content", "")
        else:
            content = assistant_message
        return "yes" if str(content).strip().lower().startswith("yes") else "no"

    def label_example(self, exp, feature_name):
        """Label every token of ``exp["tokens"]`` as adjective ("yes") or not ("no").

        Args:
            exp: Example mapping; only ``exp["tokens"]`` is read.
            feature_name: Currently unused; kept so existing callers keep working.
        Returns:
            A list with one "yes"/"no" label per token, in token order.
        """
        messages = [
            {"role": "system",
             "content": "You are a helpful Grammar tutor."},
            {"role": "user",
             "content": "An adjective is a word that describes a noun?"},
            {"role": "assistant",
             "content": "Yes, that's correct! An adjective relates to, modifies, or describes nouns."},
            {"role": "user",
             "content": "Are they always used with nouns?"},
            {"role": "assistant",
             "content": ("No, adjectives often appear directly before nouns (e.g. \"a red apple\") "
                         "but they can also follow linking verbs to describe the subject (e.g. \"The sky is blue\"). "
                         "Sometimes, adjectives are used as complements in certain constructions or phrases "
                         "(e.g. \"the rich\" or \"well-known author\").")},
            {"role": "user",
             "content": "They can have comparative or superlative forms too, right?"},
            {"role": "assistant",
             "content": ("Yes, that's right! The word \"fast\" can take a comparative form as in \"faster\" "
                         "or a superlative form as in \"fastest\". Some adjectives don't have comparative or "
                         "superlative forms but use the word \"more\" or \"most\" to become comparative or "
                         "superlative.")},
            {"role": "user",
             "content": f"How about this example: {exp['tokens']}"},
        ]

        token_labels = []
        for idx, token in enumerate(exp["tokens"]):
            # Extend the running conversation by exactly one question and one
            # answer per token; re-appending a copy of the whole history would
            # duplicate every earlier turn.
            messages.append({"role": "user",
                             "content": f"Is '{token}' at position {idx} an adjective? Answer 'yes' or 'no'."})

            # Pass a snapshot so the model wrapper cannot alter the history.
            assistant_message = self.model.generate(list(messages))
            logger.info(f"{assistant_message} - {token}")
            messages.append(assistant_message)
            token_labels.append(self._to_label(assistant_message))
        return token_labels
60
-
61
-
62
class LlamaPipeline(ChatModel):
    """ChatModel backed by a Hugging Face "text-generation" pipeline."""

    def __init__(self, model_name: str):
        self.device = get_torch_device()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        pipeline_options = {
            "model": model_name,
            "model_kwargs": {"torch_dtype": torch.bfloat16},
            "device_map": "auto",
        }
        self.pipeline = pipeline("text-generation", **pipeline_options)

    def generate(self, messages, max_new_tokens=1):
        """Run the pipeline on ``messages`` and return the last generated entry.

        The return value is the final element of ``generated_text`` from the
        first pipeline output (presumably the assistant message when chat
        messages are passed in; confirm against the pipeline's output format).
        """
        sampling_options = {
            "max_new_tokens": max_new_tokens,
            "pad_token_id": self.tokenizer.eos_token_id,
            "temperature": 0.6,
            "top_p": 0.9,
        }
        first_output = self.pipeline(messages, **sampling_options)[0]
        return first_output["generated_text"][-1]
82
-
83
-
84
class LlamaModel(ChatModel):
    """
    A wrapper around a Llama model checkpoint using Hugging Face Transformers.
    """

    def __init__(self, model_name: str):
        torch_device = get_torch_device()

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=str(torch_device),
            torch_dtype=torch.float16,
        )
        # NOTE(review): device_map already targets torch_device, so this move
        # looks redundant; kept as-is pending confirmation.
        self.model.to(torch_device)
        self.model.eval()

        # Adjust generation parameters as needed
        self.generation_config = GenerationConfig(
            max_new_tokens=1,
            pad_token_id=self.tokenizer.eos_token_id,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

    def generate(self, prompt: "str | list[dict[str, str]]") -> "str | dict[str, str]":
        """
        Generate text from the model.

        Args:
            prompt: Either a plain prompt string, or a list of chat messages
                (``{"role": ..., "content": ...}``) as required by ``ChatModel``.
        Returns:
            For a string prompt, the generated continuation as a string.
            For chat messages, an assistant message dict
            ``{"role": "assistant", "content": <continuation>}``.
        """
        is_chat = not isinstance(prompt, str)
        if is_chat:
            # Render the conversation with the tokenizer's chat template so the
            # class honours the ChatModel.generate(messages) contract.
            inputs = self.tokenizer.apply_chat_template(
                prompt,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            ).to(self.model.device)
        else:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                generation_config=self.generation_config
            )

        # Drop the prompt by token count; slicing the decoded text by
        # len(prompt) breaks whenever decoding does not reproduce the prompt
        # character-for-character.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = self.tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

        if is_chat:
            return {"role": "assistant", "content": completion}
        return completion
122
-
123
-
124
- # ----------------------------------
125
- # Putting It All Together
126
- # ----------------------------------
127
-
128
if __name__ == "__main__":
    # Manual smoke run: configure logging, load a Llama pipeline, and log the
    # adjective labels produced for a handful of hand-written sentences.
    import logging.config
    from utils import default_logging_config
    logging.config.dictConfig(default_logging_config)

    llama_pipeline = LlamaPipeline(
        model_name="meta-llama/Llama-3.2-3B-Instruct",
        #model_name="meta-llama/Llama-3.1-8B-Instruct",
    )
    adj_labeler = AdjLabeler(llama_pipeline)

    # Each case provides the raw "text" and its whitespace-split "tokens".
    # Most cases are disabled; only the uncommented ones are sent to the model.
    basic_cases = [
        #{"text": "Joan has a nice dog.",
        # "tokens": ["Joan", "has", "a", "nice", "dog."]},
        #{"text": "Bob is the most agile person I have ever met.",
        # "tokens": ["Bob", "is", "the", "most", "agile", "person", "I", "have", "ever", "met."]},
        #{"text": "He's a total shit head",
        # "tokens": ["He's", "a", "total", "shit", "head"]},
        #{"text": "The old, creaky house stood on the quiet street.",
        # "tokens": ["The", "old,", "creaky", "house", "stood", "on", "the", "quiet", "street."]},
        #{"text": "The sky turned brilliant blue as the sun emerged.",
        # "tokens": ["The", "sky", "turned", "brilliant", "blue", "as", "the", "sun", "emerged."]},
        #{"text": "They admired the well-behaved and enthusiastic children at the party.",
        # "tokens": ["They", "admired", "the", "well-behaved", "and", "enthusiastic", "children", "at", "the",
        #            "party."]},
        #{"text": "After dinner, she felt tired and content.",
        # "tokens": ["After", "dinner,", "she", "felt", "tired", "and", "content."]},
        #{"text": "The resourceful team devised a clever plan.",
        # "tokens": ["The", "resourceful", "team", "devised", "a", "clever", "plan."]},
        #{"text": "He handed over the thick book to the eager student.",
        # "tokens": ["He", "handed", "over", "the", "thick", "book", "to", "the", "eager", "student."]},
        #{"text": "We appreciated the delicious, handmade pie from our neighbor.",
        # "tokens": ["We", "appreciated", "the", "delicious,", "handmade", "pie", "from", "our", "neighbor."]},
        #{"text": "In the enchanted forest, sparkling fairies danced under the moonlight.",
        # "tokens": ["In", "the", "enchanted", "forest,", "sparkling", "fairies", "danced", "under", "the", "moonlight."]},
        #{"text": "The stray cats, hungry and dirty, roamed the narrow alley.",
        # "tokens": ["The", "stray", "cats,", "hungry", "and", "dirty,", "roamed", "the", "narrow", "alley."]},
        #{"text": "The challenging puzzle left the determined young boy both frustrated and excited.",
        # "tokens": ["The", "challenging", "puzzle", "left", "the", "determined", "young", "boy", "both", "frustrated",
        #            "and", "excited."]},

        {"text": "Big cars use a lot more gas.",
         "tokens": ["Big", "cars", "use", "a", "lot", "more", "gas."]},
        {"text": "My car is faster than my bicycle.",
         "tokens": ["My", "car", "is", "faster", "than", "my", "bicycle."]},
        #{"text": "This puzzle is more challenging than the one we solved yesterday.",
        # "tokens": ["This", "puzzle", "is", "more", "challenging", "than", "the", "one", "we", "solved", "yesterday."]},
        #{"text": "Among all the students, Lara is the most diligent.",
        # "tokens": ["Among", "all", "the", "students,", "Lara", "is", "the", "most", "diligent."]},
        #{"text": "That building is taller than the one next to it.",
        # "tokens": ["That", "building", "is", "taller", "than", "the", "one", "next", "to", "it."]},
        #{"text": "This book is more interesting than the movie adaptation.",
        # "tokens": ["This", "book", "is", "more", "interesting", "than", "the", "movie", "adaptation."]},
        #{"text": "Of all the fruits, mangoes are the sweetest.",
        # "tokens": ["Of", "all", "the", "fruits,", "mangoes", "are", "the", "sweetest."]},
        #{"text": "His running speed is quicker than anyone else's on the team.",
        # "tokens": ["His", "running", "speed", "is", "quicker", "than", "anyone", "else's", "on", "the", "team."]},
        #{"text": "The exam was easier than I had anticipated.",
        # "tokens": ["The", "exam", "was", "easier", "than", "I", "had", "anticipated."]},
        #{"text": "Among all the flavors, vanilla is the mildest.",
        # "tokens": ["Among", "all", "the", "flavors,", "vanilla", "is", "the", "mildest."]},
        #{"text": "The new smartphone is lighter than the previous version.",
        # "tokens": ["The", "new", "smartphone", "is", "lighter", "than", "the", "previous", "version."]},
    ]
    # Label each enabled case and log the tokens next to the returned labels.
    for case in basic_cases:
        adj_labels = adj_labeler.label_example(case, "adj")
        logger.info(f"\ntokens:\t{case['tokens']}\nadj:\t{adj_labels}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/conll2012_en12_20250305/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
models/conll2012_en12_20250305/config.json DELETED
@@ -1,135 +0,0 @@
1
- {
2
- "_name_or_path": "microsoft/deberta-v3-base",
3
- "architectures": [
4
- "MultiHeadModel"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "hidden_act": "gelu",
8
- "hidden_dropout_prob": 0.1,
9
- "hidden_size": 768,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 3072,
12
- "label_maps": {
13
- "ner_tags": [
14
- "B-QUANTITY",
15
- "I-PERSON",
16
- "B-LANGUAGE",
17
- "O",
18
- "I-LOC",
19
- "I-MONEY",
20
- "I-DATE",
21
- "I-CARDINAL",
22
- "I-WORK_OF_ART",
23
- "I-FAC",
24
- "B-FAC",
25
- "B-LOC",
26
- "I-PERCENT",
27
- "B-CARDINAL",
28
- "B-NORP",
29
- "B-TIME",
30
- "B-GPE",
31
- "I-LANGUAGE",
32
- "B-PERSON",
33
- "B-LAW",
34
- "I-LAW",
35
- "B-MONEY",
36
- "I-ORDINAL",
37
- "B-PRODUCT",
38
- "B-WORK_OF_ART",
39
- "B-ORDINAL",
40
- "B-DATE",
41
- "B-ORG",
42
- "I-GPE",
43
- "I-PRODUCT",
44
- "B-EVENT",
45
- "B-PERCENT",
46
- "I-EVENT",
47
- "I-ORG",
48
- "I-NORP",
49
- "I-QUANTITY",
50
- "I-TIME"
51
- ],
52
- "pos_tags": [
53
- "RBR",
54
- "POS",
55
- "EX",
56
- "VBP",
57
- "VBZ",
58
- "``",
59
- "PRP$",
60
- "WP",
61
- "VBD",
62
- "NN",
63
- "NNS",
64
- "WP$",
65
- "RB",
66
- "UH",
67
- ":",
68
- "NNPS",
69
- "LS",
70
- "HYPH",
71
- "RP",
72
- "WDT",
73
- "-LRB-",
74
- ",",
75
- "CC",
76
- "JJS",
77
- "MD",
78
- "JJR",
79
- "RBS",
80
- "SYM",
81
- "DT",
82
- "-RRB-",
83
- "FW",
84
- "TO",
85
- "PDT",
86
- "NNP",
87
- "ADD",
88
- "VB",
89
- "$",
90
- "VBG",
91
- "CD",
92
- "''",
93
- "WRB",
94
- "PRP",
95
- "NFP",
96
- "JJ",
97
- "VBN",
98
- ".",
99
- "IN"
100
- ],
101
- "verb_predicate": [
102
- "O",
103
- "Yes"
104
- ]
105
- },
106
- "layer_norm_eps": 1e-07,
107
- "legacy": true,
108
- "max_position_embeddings": 512,
109
- "max_relative_positions": -1,
110
- "model_type": "deberta-v2",
111
- "norm_rel_ebd": "layer_norm",
112
- "num_attention_heads": 12,
113
- "num_hidden_layers": 12,
114
- "num_labels_dict": {
115
- "ner_tags": 37,
116
- "pos_tags": 47,
117
- "verb_predicate": 2
118
- },
119
- "pad_token_id": 0,
120
- "pooler_dropout": 0,
121
- "pooler_hidden_act": "gelu",
122
- "pooler_hidden_size": 768,
123
- "pos_att_type": [
124
- "p2c",
125
- "c2p"
126
- ],
127
- "position_biased_input": false,
128
- "position_buckets": 256,
129
- "relative_attention": true,
130
- "share_att_key": true,
131
- "torch_dtype": "float32",
132
- "transformers_version": "4.49.0",
133
- "type_vocab_size": 0,
134
- "vocab_size": 128100
135
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/conll2012_en12_20250305/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4eefbb306d8c9dd94dd544cc95aaaf31d9f8b768ef9bb47ed54ecac6c8982e68
3
- size 735615520
 
 
 
 
models/conll2012_en12_20250305/special_tokens_map.json DELETED
@@ -1,15 +0,0 @@
1
- {
2
- "bos_token": "[CLS]",
3
- "cls_token": "[CLS]",
4
- "eos_token": "[SEP]",
5
- "mask_token": "[MASK]",
6
- "pad_token": "[PAD]",
7
- "sep_token": "[SEP]",
8
- "unk_token": {
9
- "content": "[UNK]",
10
- "lstrip": false,
11
- "normalized": true,
12
- "rstrip": false,
13
- "single_word": false
14
- }
15
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/conll2012_en12_20250305/spm.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
3
- size 2464616
 
 
 
 
models/conll2012_en12_20250305/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
models/conll2012_en12_20250305/tokenizer_config.json DELETED
@@ -1,60 +0,0 @@
1
- {
2
- "add_prefix_space": true,
3
- "added_tokens_decoder": {
4
- "0": {
5
- "content": "[PAD]",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "1": {
13
- "content": "[CLS]",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "2": {
21
- "content": "[SEP]",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "3": {
29
- "content": "[UNK]",
30
- "lstrip": false,
31
- "normalized": true,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "128000": {
37
- "content": "[MASK]",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": false,
41
- "single_word": false,
42
- "special": true
43
- }
44
- },
45
- "bos_token": "[CLS]",
46
- "clean_up_tokenization_spaces": false,
47
- "cls_token": "[CLS]",
48
- "do_lower_case": false,
49
- "eos_token": "[SEP]",
50
- "extra_special_tokens": {},
51
- "mask_token": "[MASK]",
52
- "model_max_length": 1000000000000000019884624838656,
53
- "pad_token": "[PAD]",
54
- "sep_token": "[SEP]",
55
- "sp_model_kwargs": {},
56
- "split_by_punct": false,
57
- "tokenizer_class": "DebertaV2Tokenizer",
58
- "unk_token": "[UNK]",
59
- "vocab_type": "spm"
60
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/conll2012_en12_20250305/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:778a691f83fb5dcfe8e9c9a6371dc7f31bf3842f4d67464cc949409d466b4db0
3
- size 5240
 
 
 
 
models/ud_augmented_jj_rb_types_20250320_v2/README.md DELETED
The diff for this file is too large to render. See raw diff
 
models/ud_augmented_jj_rb_types_20250320_v2/added_tokens.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "[MASK]": 128000
3
- }
 
 
 
 
models/ud_augmented_jj_rb_types_20250320_v2/config.json DELETED
@@ -1,388 +0,0 @@
1
- {
2
- "_name_or_path": "models/ud_augmented_jj_rb_types_20250320_v2",
3
- "architectures": [
4
- "MultiHeadModel"
5
- ],
6
- "attention_probs_dropout_prob": 0.1,
7
- "hidden_act": "gelu",
8
- "hidden_dropout_prob": 0.1,
9
- "hidden_size": 768,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 3072,
12
- "label_maps": {
13
- "AdjGrad": [
14
- "yes",
15
- "no",
16
- "O"
17
- ],
18
- "AdjHead": [
19
- "2",
20
- "4+",
21
- "-1",
22
- "O",
23
- "-4+",
24
- "3",
25
- "1",
26
- "-2",
27
- "-3"
28
- ],
29
- "AdjPos": [
30
- "postpositive",
31
- "attributive",
32
- "predicative",
33
- "O"
34
- ],
35
- "AdjType": [
36
- "material",
37
- "quantifying",
38
- "color",
39
- "size",
40
- "O",
41
- "quality",
42
- "purpose",
43
- "origin",
44
- "relational",
45
- "age",
46
- "shape",
47
- "limiting"
48
- ],
49
- "AdvHead": [
50
- "-3+",
51
- "2",
52
- "4+",
53
- "-1",
54
- "O",
55
- "3",
56
- "1",
57
- "-2"
58
- ],
59
- "AdvType": [
60
- "manner",
61
- "time",
62
- "negation",
63
- "conjunctive",
64
- "frequency",
65
- "focusing",
66
- "O",
67
- "place",
68
- "degree",
69
- "disjunct",
70
- "modal"
71
- ],
72
- "Case": [
73
- "Acc",
74
- "Nom",
75
- "O"
76
- ],
77
- "CdHead": [
78
- "-3+",
79
- "2",
80
- "-1",
81
- "3+",
82
- "O",
83
- "1",
84
- "-2"
85
- ],
86
- "ConjHead": [
87
- "2",
88
- "4+",
89
- "O",
90
- "3",
91
- "1",
92
- "-1+"
93
- ],
94
- "Definite": [
95
- "Def",
96
- "Ind",
97
- "O"
98
- ],
99
- "Degree": [
100
- "Cmp",
101
- "Pos",
102
- "Sup",
103
- "O"
104
- ],
105
- "DetHead": [
106
- "-2+",
107
- "2",
108
- "4+",
109
- "-1",
110
- "O",
111
- "3",
112
- "1"
113
- ],
114
- "Gender": [
115
- "Fem",
116
- "Masc",
117
- "Neut",
118
- "O"
119
- ],
120
- "InHead": [
121
- "4",
122
- "2",
123
- "-2+",
124
- "-1",
125
- "O",
126
- "3",
127
- "5+",
128
- "1"
129
- ],
130
- "MdHead": [
131
- "2",
132
- "3+",
133
- "O",
134
- "1",
135
- "-1+"
136
- ],
137
- "Mood": [
138
- "Imp",
139
- "Ind",
140
- "O"
141
- ],
142
- "NounHead": [
143
- "2",
144
- "4+",
145
- "-5+",
146
- "-1",
147
- "O",
148
- "3",
149
- "-4",
150
- "1",
151
- "-2",
152
- "-3"
153
- ],
154
- "NumType": [
155
- "Mult",
156
- "Card",
157
- "Ord",
158
- "O"
159
- ],
160
- "Number": [
161
- "Plur",
162
- "Sing",
163
- "O"
164
- ],
165
- "Person": [
166
- "3",
167
- "1",
168
- "2",
169
- "O"
170
- ],
171
- "PronHead": [
172
- "-2+",
173
- "2",
174
- "-1",
175
- "3+",
176
- "O",
177
- "1"
178
- ],
179
- "PronType": [
180
- "Dem",
181
- "O",
182
- "Rel",
183
- "Int",
184
- "Prs",
185
- "Art"
186
- ],
187
- "Tense": [
188
- "Past",
189
- "Pres",
190
- "O"
191
- ],
192
- "VerbForm": [
193
- "Ger",
194
- "Inf",
195
- "O",
196
- "Part",
197
- "Fin"
198
- ],
199
- "VerbHead": [
200
- "2",
201
- "4+",
202
- "-5+",
203
- "-1",
204
- "O",
205
- "3",
206
- "-4",
207
- "1",
208
- "-2",
209
- "-3"
210
- ],
211
- "WhHead": [
212
- "-2+",
213
- "2",
214
- "4+",
215
- "-1",
216
- "O",
217
- "3",
218
- "1"
219
- ],
220
- "deprel": [
221
- "punct",
222
- "compound",
223
- "obl",
224
- "case",
225
- "obj",
226
- "nsubj:pass",
227
- "cc:preconj",
228
- "list",
229
- "mark",
230
- "parataxis",
231
- "acl",
232
- "obl:npmod",
233
- "root",
234
- "nmod:poss",
235
- "flat",
236
- "iobj",
237
- "nsubj",
238
- "expl",
239
- "compound:prt",
240
- "cop",
241
- "vocative",
242
- "nmod",
243
- "aux:pass",
244
- "ccomp",
245
- "det",
246
- "csubj",
247
- "obl:tmod",
248
- "xcomp",
249
- "aux",
250
- "discourse",
251
- "acl:relcl",
252
- "cc",
253
- "nmod:npmod",
254
- "appos",
255
- "advcl",
256
- "conj",
257
- "fixed",
258
- "advmod",
259
- "det:predet",
260
- "amod",
261
- "nmod:tmod",
262
- "nummod"
263
- ],
264
- "pos": [
265
- "INTJ",
266
- "VERB",
267
- "SYM",
268
- "PROPN",
269
- "ADV",
270
- "AUX",
271
- "SCONJ",
272
- "ADJ",
273
- "DET",
274
- "NUM",
275
- "PRON",
276
- "NOUN",
277
- "X",
278
- "CCONJ",
279
- "ADP",
280
- "PUNCT",
281
- "PART"
282
- ],
283
- "xpos": [
284
- "FW",
285
- "RBR",
286
- "NNPS",
287
- "DT",
288
- "PDT",
289
- "EX",
290
- "HYPH",
291
- "CD",
292
- "ADD",
293
- "SYM",
294
- "PRP",
295
- "JJR",
296
- "MD",
297
- "WDT",
298
- "JJ",
299
- "RB",
300
- "RP",
301
- "TO",
302
- "NNP",
303
- "NN",
304
- "CC",
305
- "-RRB-",
306
- "VBP",
307
- "WRB",
308
- "''",
309
- "IN",
310
- ":",
311
- "LS",
312
- "-LRB-",
313
- "VBD",
314
- "VBN",
315
- ".",
316
- "VBZ",
317
- "VBG",
318
- "WP$",
319
- "JJS",
320
- "VB",
321
- "NNS",
322
- "``",
323
- "POS",
324
- "UH",
325
- "PRP$",
326
- "NFP",
327
- "$",
328
- "RBS",
329
- ",",
330
- "WP"
331
- ]
332
- },
333
- "layer_norm_eps": 1e-07,
334
- "legacy": true,
335
- "max_position_embeddings": 512,
336
- "max_relative_positions": -1,
337
- "model_type": "deberta-v2",
338
- "norm_rel_ebd": "layer_norm",
339
- "num_attention_heads": 12,
340
- "num_hidden_layers": 12,
341
- "num_labels_dict": {
342
- "AdjGrad": 3,
343
- "AdjHead": 9,
344
- "AdjPos": 4,
345
- "AdjType": 12,
346
- "AdvHead": 8,
347
- "AdvType": 11,
348
- "Case": 3,
349
- "CdHead": 7,
350
- "ConjHead": 6,
351
- "Definite": 3,
352
- "Degree": 4,
353
- "DetHead": 7,
354
- "Gender": 4,
355
- "InHead": 8,
356
- "MdHead": 5,
357
- "Mood": 3,
358
- "NounHead": 10,
359
- "NumType": 4,
360
- "Number": 3,
361
- "Person": 4,
362
- "PronHead": 6,
363
- "PronType": 6,
364
- "Tense": 3,
365
- "VerbForm": 5,
366
- "VerbHead": 10,
367
- "WhHead": 7,
368
- "deprel": 42,
369
- "pos": 17,
370
- "xpos": 47
371
- },
372
- "pad_token_id": 0,
373
- "pooler_dropout": 0,
374
- "pooler_hidden_act": "gelu",
375
- "pooler_hidden_size": 768,
376
- "pos_att_type": [
377
- "p2c",
378
- "c2p"
379
- ],
380
- "position_biased_input": false,
381
- "position_buckets": 256,
382
- "relative_attention": true,
383
- "share_att_key": true,
384
- "torch_dtype": "float32",
385
- "transformers_version": "4.49.0",
386
- "type_vocab_size": 0,
387
- "vocab_size": 128100
388
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/ud_augmented_jj_rb_types_20250320_v2/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:57b5dda10a3ea434194a6896dce6e4e1e1290a8bb3db6d753b78b5b7f2b510b7
3
- size 804672988
 
 
 
 
models/ud_augmented_jj_rb_types_20250320_v2/special_tokens_map.json DELETED
@@ -1,51 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "[CLS]",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "cls_token": {
10
- "content": "[CLS]",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "eos_token": {
17
- "content": "[SEP]",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "mask_token": {
24
- "content": "[MASK]",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- },
30
- "pad_token": {
31
- "content": "[PAD]",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false
36
- },
37
- "sep_token": {
38
- "content": "[SEP]",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false
43
- },
44
- "unk_token": {
45
- "content": "[UNK]",
46
- "lstrip": false,
47
- "normalized": true,
48
- "rstrip": false,
49
- "single_word": false
50
- }
51
- }