| """ |
| UDD-1 Corpus loader for dependency parsing. |
| |
| This module provides a corpus class that downloads the UDD-1 dataset from |
| HuggingFace and converts it to CoNLL format for use with the underthesea |
| dependency parser trainer. |
| """ |
|
|
import os
from pathlib import Path
from typing import Optional
|
|
|
|
class UDD1Corpus:
    """
    Corpus class for the UDD-1 (Universal Dependency Dataset) for Vietnamese.

    Downloads the UDD-1 dataset from HuggingFace (``undertheseanlp/UDD-1``)
    and converts each split to a CoNLL-U file usable with the underthesea
    ParserTrainer.

    Attributes:
        train: Path to the training data file (CoNLL-U format)
        dev: Path to the development/validation data file (CoNLL-U format)
        test: Path to the test data file (CoNLL-U format)

    Example:
        >>> from src.corpus import UDD1Corpus
        >>> corpus = UDD1Corpus()
        >>> print(corpus.train)  # Path to train.conllu
    """

    # Public corpus identifier; also used as the default on-disk directory name.
    name = "UDD-1"

    def __init__(self, data_dir: Optional[str] = None, force_download: bool = False):
        """
        Initialize the UDD-1 corpus, downloading and converting if needed.

        Args:
            data_dir: Directory to store the converted CoNLL files.
                Defaults to ./data/UDD-1 relative to the package root.
            force_download: If True, re-download and convert even if files exist.
        """
        if data_dir is None:
            data_dir = Path(__file__).parent.parent / "data" / "UDD-1"
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

        self._train = self.data_dir / "train.conllu"
        self._dev = self.data_dir / "dev.conllu"
        self._test = self.data_dir / "test.conllu"

        if force_download or not self._files_exist():
            self._download_and_convert()

    def _files_exist(self) -> bool:
        """Check if all three split files already exist on disk."""
        return self._train.exists() and self._dev.exists() and self._test.exists()

    @staticmethod
    def _load_dataset():
        """Load the UDD-1 dataset from HuggingFace.

        The import is kept local so the ``datasets`` dependency is only
        required when a download/statistics call actually happens.
        """
        from datasets import load_dataset

        return load_dataset("undertheseanlp/UDD-1")

    def _download_and_convert(self):
        """Download UDD-1 from HuggingFace and convert to CoNLL format."""
        print("Downloading UDD-1 dataset from HuggingFace...")
        dataset = self._load_dataset()

        print("Converting to CoNLL format...")
        self._convert_split(dataset["train"], self._train)
        self._convert_split(dataset["validation"], self._dev)
        self._convert_split(dataset["test"], self._test)

        print(f"Dataset saved to {self.data_dir}")
        print(f"  Train: {len(dataset['train'])} sentences")
        print(f"  Dev: {len(dataset['validation'])} sentences")
        print(f"  Test: {len(dataset['test'])} sentences")

    @staticmethod
    def _field(value) -> str:
        """Render one CoNLL-U column value, using "_" for empty/None entries."""
        return str(value) if value else "_"

    def _convert_split(self, split, output_path: Path):
        """Convert one dataset split to CoNLL-U format at *output_path*.

        Each sentence is written as optional ``# sent_id`` / ``# text``
        comment lines, one 10-column tab-separated row per token, and a
        trailing blank line separating sentences.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            for item in split:
                sent_id = item.get("sent_id", "")
                text = item.get("text", "")

                if sent_id:
                    f.write(f"# sent_id = {sent_id}\n")
                if text:
                    f.write(f"# text = {text}\n")

                tokens = item["tokens"]
                n = len(tokens)
                # Optional columns may be missing entirely OR present with a
                # None value; `or` covers both by falling back to "_" fillers.
                lemmas = item.get("lemmas") or ["_"] * n
                upos = item["upos"]
                xpos = item.get("xpos") or ["_"] * n
                feats = item.get("feats") or ["_"] * n
                heads = item["head"]
                deprels = item["deprel"]
                deps = item.get("deps") or ["_"] * n
                misc = item.get("misc") or ["_"] * n

                for i, form in enumerate(tokens):
                    # HEAD is an integer column; 0 marks the sentence root.
                    head = int(heads[i]) if heads[i] else 0
                    columns = (
                        str(i + 1),
                        str(form),
                        self._field(lemmas[i]),
                        self._field(upos[i]),
                        self._field(xpos[i]),
                        self._field(feats[i]),
                        str(head),
                        self._field(deprels[i]),
                        self._field(deps[i]),
                        self._field(misc[i]),
                    )
                    f.write("\t".join(columns) + "\n")

                f.write("\n")

    @property
    def train(self) -> str:
        """Path to training data file."""
        return str(self._train)

    @property
    def dev(self) -> str:
        """Path to development/validation data file."""
        return str(self._dev)

    @property
    def test(self) -> str:
        """Path to test data file."""
        return str(self._test)

    def get_statistics(self) -> dict:
        """Compute dataset statistics.

        Returns:
            dict with per-split sentence and token counts, plus the
            inventories of UPOS tags and dependency relations observed
            across all splits.

        Note:
            This re-loads the dataset from HuggingFace on every call.
        """
        dataset = self._load_dataset()

        stats = {
            "train_sentences": len(dataset["train"]),
            "dev_sentences": len(dataset["validation"]),
            "test_sentences": len(dataset["test"]),
            "train_tokens": sum(len(item["tokens"]) for item in dataset["train"]),
            "dev_tokens": sum(len(item["tokens"]) for item in dataset["validation"]),
            "test_tokens": sum(len(item["tokens"]) for item in dataset["test"]),
        }

        all_upos = set()
        all_deprels = set()
        for split in ("train", "validation", "test"):
            for item in dataset[split]:
                all_upos.update(item["upos"])
                all_deprels.update(item["deprel"])

        stats["num_upos_tags"] = len(all_upos)
        stats["num_deprels"] = len(all_deprels)
        stats["upos_tags"] = sorted(all_upos)
        stats["deprels"] = sorted(all_deprels)

        return stats
|
|