# tre-1 / src/dataset/vlsp2013_wtk.py
# Author: rain1024
# Commit: Add VLSP 2013 word segmentation, feature ablation study, and Hydra config (27e6434)
"""VLSP 2013 Word Tokenization (WTK) dataset.
Source: https://github.com/undertheseanlp/underthesea/releases/download/resources/c7veardo0e.zip
Format: syllable-level BIO tagging (B-W = beginning of word, I-W = inside word)
Sentences separated by blank lines.
Statistics:
Train: 75,389 sentences
Test: 2,120 sentences
"""
from pathlib import Path
# Root of the extracted dataset archive: <repo>/datasets/c7veardo0e,
# resolved three levels up from this file (src/dataset/ -> src/ -> repo root).
DATASET_DIR = Path(__file__).resolve().parent.parent.parent / "datasets" / "c7veardo0e"
# Train/test splits, one "<syllable>\t<tag>" pair per line (see module docstring).
TRAIN_PATH = DATASET_DIR / "train.txt"
TEST_PATH = DATASET_DIR / "test.txt"
def load_sentences(path: Path) -> list[list[tuple[str, str]]]:
    """Parse a blank-line-separated BIO-tagged file into sentences.

    Each non-empty line is expected to hold exactly two tab-separated
    fields, ``syllable`` and ``tag``; lines that do not split into exactly
    two fields are skipped silently.

    Returns a list of sentences, where each sentence is a list of
    (syllable, tag) tuples.
    """
    sentences: list[list[tuple[str, str]]] = []
    sentence: list[tuple[str, str]] = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split("\t")
                if len(fields) == 2:
                    sentence.append((fields[0], fields[1]))
            elif sentence:
                # A blank line terminates the current sentence.
                sentences.append(sentence)
                sentence = []
    # The file may not end with a blank line; flush the last sentence.
    if sentence:
        sentences.append(sentence)
    return sentences
def load_train() -> list[list[tuple[str, str]]]:
    """Load the VLSP 2013 WTK training split from ``TRAIN_PATH``."""
    return load_sentences(TRAIN_PATH)
def load_test() -> list[list[tuple[str, str]]]:
    """Load the VLSP 2013 WTK test split from ``TEST_PATH``."""
    return load_sentences(TEST_PATH)