rain1024 committed on
Commit 0bdcadc · verified · 1 Parent(s): c4a4048

Upload src/corpus.py with huggingface_hub

Files changed (1)
  1. src/corpus.py +158 -0
src/corpus.py ADDED
@@ -0,0 +1,158 @@
"""
UDD-1 Corpus loader for dependency parsing.

This module provides a corpus class that downloads the UDD-1 dataset from
HuggingFace and converts it to CoNLL-U format for use with the underthesea
dependency parser trainer.
"""

from pathlib import Path
from typing import Optional


class UDD1Corpus:
    """
    Corpus class for the UDD-1 (Universal Dependency Dataset) for Vietnamese.

    This class downloads the UDD-1 dataset from HuggingFace and converts it to
    CoNLL-U format files that can be used with the underthesea ParserTrainer.

    Attributes:
        train: Path to the training data file (CoNLL-U format)
        dev: Path to the development/validation data file (CoNLL-U format)
        test: Path to the test data file (CoNLL-U format)

    Example:
        >>> from src.corpus import UDD1Corpus
        >>> corpus = UDD1Corpus()
        >>> print(corpus.train)  # Path to train.conllu
    """

    name = "UDD-1"

    def __init__(self, data_dir: Optional[str] = None, force_download: bool = False):
        """
        Initialize the UDD-1 corpus.

        Args:
            data_dir: Directory to store the converted CoNLL-U files.
                Defaults to ./data/UDD-1.
            force_download: If True, re-download and convert even if the
                files already exist.
        """
        if data_dir is None:
            data_dir = Path(__file__).parent.parent / "data" / "UDD-1"
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

        self._train = self.data_dir / "train.conllu"
        self._dev = self.data_dir / "dev.conllu"
        self._test = self.data_dir / "test.conllu"

        if force_download or not self._files_exist():
            self._download_and_convert()

    def _files_exist(self) -> bool:
        """Check if all required files exist."""
        return self._train.exists() and self._dev.exists() and self._test.exists()

    def _download_and_convert(self):
        """Download UDD-1 from HuggingFace and convert it to CoNLL-U format."""
        # Lazy import - only needed when downloading
        from datasets import load_dataset

        print("Downloading UDD-1 dataset from HuggingFace...")
        dataset = load_dataset("undertheseanlp/UDD-1")

        print("Converting to CoNLL-U format...")
        self._convert_split(dataset["train"], self._train)
        self._convert_split(dataset["validation"], self._dev)
        self._convert_split(dataset["test"], self._test)

        print(f"Dataset saved to {self.data_dir}")
        print(f"  Train: {len(dataset['train'])} sentences")
        print(f"  Dev: {len(dataset['validation'])} sentences")
        print(f"  Test: {len(dataset['test'])} sentences")

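    # Note: the HuggingFace dataset exposes its splits as "train",
    # "validation", and "test"; the "validation" split is written to the
    # local dev.conllu file to match the train/dev/test naming used
    # elsewhere in this class.
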
    def _convert_split(self, split, output_path: Path):
        """Convert a dataset split to CoNLL-U format."""
        with open(output_path, "w", encoding="utf-8") as f:
            for item in split:
                sent_id = item.get("sent_id", "")
                text = item.get("text", "")

                if sent_id:
                    f.write(f"# sent_id = {sent_id}\n")
                if text:
                    f.write(f"# text = {text}\n")

                tokens = item["tokens"]
                lemmas = item.get("lemmas", ["_"] * len(tokens))
                upos = item["upos"]
                xpos = item.get("xpos", ["_"] * len(tokens))
                feats = item.get("feats", ["_"] * len(tokens))
                heads = item["head"]
                deprels = item["deprel"]
                deps = item.get("deps", ["_"] * len(tokens))
                misc = item.get("misc", ["_"] * len(tokens))

                for i in range(len(tokens)):
                    token_id = i + 1
                    form = tokens[i]
                    lemma = lemmas[i] if lemmas[i] else "_"
                    upos_tag = upos[i] if upos[i] else "_"
                    xpos_tag = xpos[i] if xpos[i] else "_"
                    feat = feats[i] if feats[i] else "_"
                    head = int(heads[i]) if heads[i] else 0
                    deprel = deprels[i] if deprels[i] else "_"
                    dep = deps[i] if deps[i] else "_"
                    misc_val = misc[i] if misc[i] else "_"

                    line = f"{token_id}\t{form}\t{lemma}\t{upos_tag}\t{xpos_tag}\t{feat}\t{head}\t{deprel}\t{dep}\t{misc_val}"
                    f.write(line + "\n")

                f.write("\n")

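    # Each sentence is emitted as a standard 10-column, tab-separated
    # CoNLL-U block terminated by a blank line. An illustrative block
    # (hypothetical values, not taken from the actual corpus):
    #
    #   # sent_id = train-s1
    #   # text = Tôi học
    #   1	Tôi	_	PRON	_	_	2	nsubj	_	_
    #   2	học	_	VERB	_	_	0	root	_	_
    #
    # Columns: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC.
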
    @property
    def train(self) -> str:
        """Path to the training data file."""
        return str(self._train)

    @property
    def dev(self) -> str:
        """Path to the development/validation data file."""
        return str(self._dev)

    @property
    def test(self) -> str:
        """Path to the test data file."""
        return str(self._test)

    def get_statistics(self) -> dict:
        """Get dataset statistics."""
        # Lazy import - only needed for statistics
        from datasets import load_dataset

        dataset = load_dataset("undertheseanlp/UDD-1")

        stats = {
            "train_sentences": len(dataset["train"]),
            "dev_sentences": len(dataset["validation"]),
            "test_sentences": len(dataset["test"]),
            "train_tokens": sum(len(item["tokens"]) for item in dataset["train"]),
            "dev_tokens": sum(len(item["tokens"]) for item in dataset["validation"]),
            "test_tokens": sum(len(item["tokens"]) for item in dataset["test"]),
        }

        all_upos = set()
        all_deprels = set()
        for split in ["train", "validation", "test"]:
            for item in dataset[split]:
                all_upos.update(item["upos"])
                all_deprels.update(item["deprel"])

        stats["num_upos_tags"] = len(all_upos)
        stats["num_deprels"] = len(all_deprels)
        stats["upos_tags"] = sorted(all_upos)
        stats["deprels"] = sorted(all_deprels)

        return stats
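
A minimal usage sketch for the uploaded module (paths and printed values are
illustrative; the first instantiation triggers the download and conversion):

    from src.corpus import UDD1Corpus

    # Downloads undertheseanlp/UDD-1 and writes train/dev/test .conllu files
    # on first use; later runs reuse the files already on disk.
    corpus = UDD1Corpus()
    print(corpus.train)  # e.g. <repo>/data/UDD-1/train.conllu

    # Corpus-level statistics (re-loads the dataset from HuggingFace).
    stats = corpus.get_statistics()
    print(stats["train_sentences"], stats["num_upos_tags"], stats["num_deprels"])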