| """VLSP 2013 Word Tokenization (WTK) dataset. | |
| Source: https://github.com/undertheseanlp/underthesea/releases/download/resources/c7veardo0e.zip | |
| Format: syllable-level BIO tagging (B-W = beginning of word, I-W = inside word) | |
| Sentences separated by blank lines. | |
| Statistics: | |
| Train: 75,389 sentences | |
| Test: 2,120 sentences | |
| """ | |
from pathlib import Path

# Root of the extracted c7veardo0e archive, resolved relative to this module
# (three levels up from this file, then into datasets/).
DATASET_DIR = Path(__file__).resolve().parents[2] / "datasets" / "c7veardo0e"
TRAIN_PATH = DATASET_DIR / "train.txt"
TEST_PATH = DATASET_DIR / "test.txt"
| def load_sentences(path: Path) -> list[list[tuple[str, str]]]: | |
| """Load sentences from a BIO-tagged file. | |
| Returns a list of sentences, where each sentence is a list of | |
| (syllable, tag) tuples. | |
| """ | |
| sentences = [] | |
| current = [] | |
| with open(path, encoding="utf-8") as f: | |
| for line in f: | |
| line = line.strip() | |
| if not line: | |
| if current: | |
| sentences.append(current) | |
| current = [] | |
| else: | |
| parts = line.split("\t") | |
| if len(parts) == 2: | |
| current.append((parts[0], parts[1])) | |
| if current: | |
| sentences.append(current) | |
| return sentences | |
def load_train():
    """Return the parsed VLSP 2013 WTK training split (75,389 sentences)."""
    return load_sentences(TRAIN_PATH)
def load_test():
    """Return the parsed VLSP 2013 WTK test split (2,120 sentences)."""
    return load_sentences(TEST_PATH)