"""
UDD-1 Corpus loader for dependency parsing.

This module provides a corpus class that downloads the UDD-1 dataset from
HuggingFace and converts it to CoNLL format for use with the underthesea
dependency parser trainer.
"""
import os
from pathlib import Path


class UDD1Corpus:
    """
    Corpus class for the UDD-1 (Universal Dependency Dataset) for Vietnamese.

    This class downloads the UDD-1 dataset from HuggingFace and converts it
    to CoNLL-U format files that can be used with the underthesea
    ParserTrainer.

    Attributes:
        train: Path to the training data file (CoNLL format)
        dev: Path to the development/validation data file (CoNLL format)
        test: Path to the test data file (CoNLL format)

    Example:
        >>> from src.corpus import UDD1Corpus
        >>> corpus = UDD1Corpus()
        >>> print(corpus.train)  # Path to train.conllu
    """

    name = "UDD-1"

    def __init__(self, data_dir: str = None, force_download: bool = False):
        """
        Initialize the UDD-1 corpus.

        Args:
            data_dir: Directory to store the converted CoNLL files.
                Defaults to ./data/UDD-1 (relative to the package root).
            force_download: If True, re-download and convert even if the
                converted files already exist on disk.
        """
        if data_dir is None:
            # Default location: <package root>/data/UDD-1
            data_dir = Path(__file__).parent.parent / "data" / "UDD-1"
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)

        self._train = self.data_dir / "train.conllu"
        self._dev = self.data_dir / "dev.conllu"
        self._test = self.data_dir / "test.conllu"

        if force_download or not self._files_exist():
            self._download_and_convert()

    def _files_exist(self) -> bool:
        """Check if all required converted files exist on disk."""
        return all(p.exists() for p in (self._train, self._dev, self._test))

    def _download_and_convert(self) -> None:
        """Download UDD-1 from HuggingFace and convert to CoNLL format."""
        # Lazy import - only needed when downloading
        from datasets import load_dataset

        print("Downloading UDD-1 dataset from HuggingFace...")
        dataset = load_dataset("undertheseanlp/UDD-1")

        print("Converting to CoNLL format...")
        self._convert_split(dataset["train"], self._train)
        self._convert_split(dataset["validation"], self._dev)
        self._convert_split(dataset["test"], self._test)

        print(f"Dataset saved to {self.data_dir}")
        print(f"  Train: {len(dataset['train'])} sentences")
        print(f"  Dev: {len(dataset['validation'])} sentences")
        print(f"  Test: {len(dataset['test'])} sentences")

    def _convert_split(self, split, output_path: Path) -> None:
        """
        Convert a dataset split to CoNLL-U format.

        Args:
            split: Iterable of sentence dicts. Each item must provide
                "tokens", "upos", "head", and "deprel" lists of equal
                length; "sent_id", "text", "lemmas", "xpos", "feats",
                "deps", and "misc" are optional and default to "_".
            output_path: File the CoNLL-U sentences are written to
                (UTF-8, one blank line after each sentence).
        """
        with open(output_path, "w", encoding="utf-8") as f:
            for item in split:
                sent_id = item.get("sent_id", "")
                text = item.get("text", "")

                # Sentence-level comment lines, only when present.
                if sent_id:
                    f.write(f"# sent_id = {sent_id}\n")
                if text:
                    f.write(f"# text = {text}\n")

                tokens = item["tokens"]
                # Optional columns fall back to "_" placeholders.
                underscores = ["_"] * len(tokens)
                columns = zip(
                    tokens,
                    item.get("lemmas", underscores),
                    item["upos"],
                    item.get("xpos", underscores),
                    item.get("feats", underscores),
                    item["head"],
                    item["deprel"],
                    item.get("deps", underscores),
                    item.get("misc", underscores),
                )

                # CoNLL-U token IDs are 1-based.
                for token_id, (
                    form, lemma, upos_tag, xpos_tag, feat,
                    head, deprel, dep, misc_val,
                ) in enumerate(columns, start=1):
                    fields = [
                        str(token_id),
                        form,
                        lemma if lemma else "_",
                        upos_tag if upos_tag else "_",
                        xpos_tag if xpos_tag else "_",
                        feat if feat else "_",
                        # Falsy head (None, "", 0) becomes 0 — in CoNLL-U
                        # a head of 0 marks the root token.
                        str(int(head) if head else 0),
                        deprel if deprel else "_",
                        dep if dep else "_",
                        misc_val if misc_val else "_",
                    ]
                    f.write("\t".join(fields) + "\n")

                # Blank line terminates the sentence block.
                f.write("\n")

    @property
    def train(self) -> str:
        """Path to training data file."""
        return str(self._train)

    @property
    def dev(self) -> str:
        """Path to development/validation data file."""
        return str(self._dev)

    @property
    def test(self) -> str:
        """Path to test data file."""
        return str(self._test)

    def get_statistics(self) -> dict:
        """
        Get dataset statistics.

        Returns:
            Dict with sentence/token counts per split, the number of
            distinct UPOS tags and dependency relations, and their
            sorted label lists. Re-downloads (or reads the HF cache of)
            the dataset, independent of the converted files on disk.
        """
        # Lazy import - only needed for statistics
        from datasets import load_dataset

        dataset = load_dataset("undertheseanlp/UDD-1")
        stats = {
            "train_sentences": len(dataset["train"]),
            "dev_sentences": len(dataset["validation"]),
            "test_sentences": len(dataset["test"]),
            "train_tokens": sum(len(item["tokens"]) for item in dataset["train"]),
            "dev_tokens": sum(len(item["tokens"]) for item in dataset["validation"]),
            "test_tokens": sum(len(item["tokens"]) for item in dataset["test"]),
        }

        # Collect the label inventories across all three splits.
        all_upos = set()
        all_deprels = set()
        for split in ["train", "validation", "test"]:
            for item in dataset[split]:
                all_upos.update(item["upos"])
                all_deprels.update(item["deprel"])

        stats["num_upos_tags"] = len(all_upos)
        stats["num_deprels"] = len(all_deprels)
        stats["upos_tags"] = sorted(all_upos)
        stats["deprels"] = sorted(all_deprels)
        return stats