# tre-1 / src/dataset/vlsp2013_wtk.py
# Author: rain1024
# Commit: Add VLSP 2013 word segmentation, feature ablation study, and Hydra config (27e6434)
"""VLSP 2013 Word Tokenization (WTK) dataset.
Source: https://github.com/undertheseanlp/underthesea/releases/download/resources/c7veardo0e.zip
Format: syllable-level BIO tagging (B-W = beginning of word, I-W = inside word)
Sentences separated by blank lines.
Statistics:
Train: 75,389 sentences
Test: 2,120 sentences
"""
from pathlib import Path
# Root of the extracted dataset archive: <repo>/datasets/c7veardo0e,
# resolved three levels up from this file (src/dataset/ -> src/ -> repo root).
DATASET_DIR = Path(__file__).resolve().parent.parent.parent / "datasets" / "c7veardo0e"
# Train/test splits, one "<syllable>\t<tag>" pair per line (see module docstring).
TRAIN_PATH = DATASET_DIR / "train.txt"
TEST_PATH = DATASET_DIR / "test.txt"
def load_sentences(path: Path) -> list[list[tuple[str, str]]]:
    """Parse a blank-line-separated BIO-tagged file into sentences.

    Each non-empty line is expected to hold exactly two tab-separated
    fields, ``syllable`` and ``tag``; lines that do not split into exactly
    two fields are skipped silently.

    Returns a list of sentences, where each sentence is a list of
    (syllable, tag) tuples.
    """
    sentences: list[list[tuple[str, str]]] = []
    sentence: list[tuple[str, str]] = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                fields = stripped.split("\t")
                if len(fields) == 2:
                    sentence.append((fields[0], fields[1]))
            elif sentence:
                # A blank line terminates the current sentence.
                sentences.append(sentence)
                sentence = []
    # The file may not end with a blank line; flush the last sentence.
    if sentence:
        sentences.append(sentence)
    return sentences
def load_train() -> list[list[tuple[str, str]]]:
    """Load the VLSP 2013 WTK training split from ``TRAIN_PATH``."""
    return load_sentences(TRAIN_PATH)
def load_test() -> list[list[tuple[str, str]]]:
    """Load the VLSP 2013 WTK test split from ``TEST_PATH``."""
    return load_sentences(TEST_PATH)