"""
UDD-1 Corpus loader for dependency parsing.
This module provides a corpus class that downloads the UDD-1 dataset from
HuggingFace and converts it to CoNLL format for use with the underthesea
dependency parser trainer.
"""
import os
from pathlib import Path
from typing import Optional
class UDD1Corpus:
"""
Corpus class for the UDD-1 (Universal Dependency Dataset) for Vietnamese.
This class downloads the UDD-1 dataset from HuggingFace and converts it to
CoNLL-U format files that can be used with the underthesea ParserTrainer.
Attributes:
train: Path to the training data file (CoNLL format)
dev: Path to the development/validation data file (CoNLL format)
test: Path to the test data file (CoNLL format)
Example:
>>> from src.corpus import UDD1Corpus
>>> corpus = UDD1Corpus()
>>> print(corpus.train) # Path to train.conllu
"""
name = "UDD-1"
    def __init__(self, data_dir: Optional[str] = None, force_download: bool = False):
        """
        Initialize the UDD-1 corpus.
        Args:
            data_dir: Directory to store the converted CoNLL files.
                Defaults to data/UDD-1 under the project root.
            force_download: If True, re-download and convert even if files exist.
        """
if data_dir is None:
data_dir = Path(__file__).parent.parent / "data" / "UDD-1"
self.data_dir = Path(data_dir)
self.data_dir.mkdir(parents=True, exist_ok=True)
self._train = self.data_dir / "train.conllu"
self._dev = self.data_dir / "dev.conllu"
self._test = self.data_dir / "test.conllu"
if force_download or not self._files_exist():
self._download_and_convert()
def _files_exist(self) -> bool:
"""Check if all required files exist."""
return self._train.exists() and self._dev.exists() and self._test.exists()
def _download_and_convert(self):
"""Download UDD-1 from HuggingFace and convert to CoNLL format."""
# Lazy import - only needed when downloading
from datasets import load_dataset
print(f"Downloading UDD-1 dataset from HuggingFace...")
dataset = load_dataset("undertheseanlp/UDD-1")
print(f"Converting to CoNLL format...")
self._convert_split(dataset["train"], self._train)
self._convert_split(dataset["validation"], self._dev)
self._convert_split(dataset["test"], self._test)
print(f"Dataset saved to {self.data_dir}")
print(f" Train: {len(dataset['train'])} sentences")
print(f" Dev: {len(dataset['validation'])} sentences")
print(f" Test: {len(dataset['test'])} sentences")
def _convert_split(self, split, output_path: Path):
"""Convert a dataset split to CoNLL-U format."""
with open(output_path, "w", encoding="utf-8") as f:
for item in split:
sent_id = item.get("sent_id", "")
text = item.get("text", "")
if sent_id:
f.write(f"# sent_id = {sent_id}\n")
if text:
f.write(f"# text = {text}\n")
tokens = item["tokens"]
lemmas = item.get("lemmas", ["_"] * len(tokens))
upos = item["upos"]
xpos = item.get("xpos", ["_"] * len(tokens))
feats = item.get("feats", ["_"] * len(tokens))
heads = item["head"]
deprels = item["deprel"]
deps = item.get("deps", ["_"] * len(tokens))
misc = item.get("misc", ["_"] * len(tokens))
for i in range(len(tokens)):
token_id = i + 1
form = tokens[i]
lemma = lemmas[i] if lemmas[i] else "_"
upos_tag = upos[i] if upos[i] else "_"
xpos_tag = xpos[i] if xpos[i] else "_"
feat = feats[i] if feats[i] else "_"
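                # In CoNLL-U, a HEAD of 0 marks the sentence root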
head = int(heads[i]) if heads[i] else 0
deprel = deprels[i] if deprels[i] else "_"
dep = deps[i] if deps[i] else "_"
misc_val = misc[i] if misc[i] else "_"
line = f"{token_id}\t{form}\t{lemma}\t{upos_tag}\t{xpos_tag}\t{feat}\t{head}\t{deprel}\t{dep}\t{misc_val}"
f.write(line + "\n")
f.write("\n")
@property
def train(self) -> str:
"""Path to training data file."""
return str(self._train)
@property
def dev(self) -> str:
"""Path to development/validation data file."""
return str(self._dev)
@property
def test(self) -> str:
"""Path to test data file."""
return str(self._test)
def get_statistics(self) -> dict:
"""Get dataset statistics."""
# Lazy import - only needed for statistics
from datasets import load_dataset
dataset = load_dataset("undertheseanlp/UDD-1")
stats = {
"train_sentences": len(dataset["train"]),
"dev_sentences": len(dataset["validation"]),
"test_sentences": len(dataset["test"]),
"train_tokens": sum(len(item["tokens"]) for item in dataset["train"]),
"dev_tokens": sum(len(item["tokens"]) for item in dataset["validation"]),
"test_tokens": sum(len(item["tokens"]) for item in dataset["test"]),
}
all_upos = set()
all_deprels = set()
for split in ["train", "validation", "test"]:
for item in dataset[split]:
all_upos.update(item["upos"])
all_deprels.update(item["deprel"])
stats["num_upos_tags"] = len(all_upos)
stats["num_deprels"] = len(all_deprels)
stats["upos_tags"] = sorted(all_upos)
stats["deprels"] = sorted(all_deprels)
return stats
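
# Minimal usage sketch (assumption: network access to HuggingFace on the
# first run; subsequent runs reuse the converted files under data/UDD-1).
if __name__ == "__main__":
    corpus = UDD1Corpus()
    print("train:", corpus.train)
    print("dev:", corpus.dev)
    print("test:", corpus.test)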