#!/usr/bin/env python3
"""
Augment training data by duplicating isolated atomic tokens.
The model struggles with single-token inputs because they appear rarely as isolated
training examples. This script identifies atomic tokens and adds more isolated
training pairs for them.
Run this AFTER train_tokeniser.py and BEFORE train_t5.py:
1. generate_syr_lat_pairs.py -> syriac_*_corpus.jsonl
2. generate_clean_corpus.sh -> syriac_*_clean_corpus.jsonl
3. train_tokeniser.py -> src/tokeniser/
4. augment_atomic_tokens.py -> syriac_*_augmented_corpus.jsonl (this script)
5. train_t5.py -> model
"""
import argparse
import json
from collections import Counter
from pathlib import Path
from transformers import AutoTokenizer
# Configuration
# Target number of isolated training pairs for every atomic token.
MIN_ISOLATED_COUNT = 200
# Directory that holds this script; corpus files are looked up here.
_SCRIPT_DIR = Path(__file__).resolve().parents[0]
# Tokenizer directory named "tokeniser", sibling of the script directory
# (src/tokeniser).
DEFAULT_TOKENIZER_PATH = str(_SCRIPT_DIR.parent.joinpath("tokeniser"))
def load_corpus(path: Path, strip_augmented: bool = False) -> list[dict]:
    """Load a JSONL corpus into a list of records.

    The file is always decoded as UTF-8 (the corpus contains non-ASCII text,
    so relying on the platform's locale encoding is not safe), and blank
    lines -- e.g. a trailing empty line at the end of the file -- are skipped
    instead of raising a JSON decode error.

    Args:
        path: Path to the corpus file, one JSON object per line.
        strip_augmented: If True, drop records whose
            ``transliteration["source"]`` equals ``"augmented-atomic"``,
            i.e. entries added by a previous augmentation run.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``strip_augmented`` is True and a record has no
            ``"transliteration"`` key.
    """
    with open(path, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    if strip_augmented:
        # Remove entries that were added by previous augmentation runs so
        # that re-running the script does not pile up duplicates.
        data = [
            d for d in data if d["transliteration"].get("source") != "augmented-atomic"
        ]
    return data
def save_corpus(data: list[dict], path: Path) -> None:
    """Write records to ``path`` as JSONL, one JSON object per line.

    Records are serialized with ``ensure_ascii=False``, so non-ASCII text is
    written verbatim. The file is therefore opened explicitly as UTF-8;
    relying on the platform's locale encoding could raise
    ``UnicodeEncodeError`` or produce a file that cannot be read back.

    Args:
        data: Records to write, in order.
        path: Destination file; overwritten if it already exists.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in data)
def get_atomic_tokens(tokenizer, corpus: list[dict]) -> set[str]:
    """Find all inputs that tokenize to a single content token.

    SentencePiece adds a leading space token (▁), so an input counts as
    atomic when it yields either exactly 1 token, or 2 tokens where the
    first is that space prefix.

    Each distinct (whitespace-stripped) source string is tokenized only
    once, so duplicated corpus entries do not cost extra tokenizer calls.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            which provides ``convert_tokens_to_ids``. The last id of every
            encoding is dropped -- this assumes the tokenizer appends one
            end-of-sequence id (``</s>``).
        corpus: Records with a ``["transliteration"]["src"]`` string.

    Returns:
        The set of stripped source strings that are atomic.
    """
    space_token_id = tokenizer.convert_tokens_to_ids("▁")
    unique_sources = {item["transliteration"]["src"].strip() for item in corpus}

    atomic: set[str] = set()
    for src in unique_sources:
        ids = tokenizer(src).input_ids[:-1]  # Remove </s>
        is_single = len(ids) == 1
        is_prefixed_single = len(ids) == 2 and ids[0] == space_token_id
        if is_single or is_prefixed_single:
            atomic.add(src)
    return atomic
def augment_corpus(
    corpus: list[dict], tokenizer, min_count: int = MIN_ISOLATED_COUNT
) -> list[dict]:
    """Augment corpus with more isolated atomic token examples.

    For every atomic source string (see ``get_atomic_tokens``) that occurs
    fewer than ``min_count`` times as a whole entry, copies of its first
    corpus entry are appended until the count reaches ``min_count``. The
    copies are tagged with ``"source": "augmented-atomic"`` and
    ``"title": "word"``.

    Atomic tokens are processed in sorted order so that the appended entries
    come out in the same order on every run; iterating the raw set would
    make the output order depend on string hash randomisation.

    Args:
        corpus: Records with a ``"transliteration"`` dict holding at least
            ``"src"`` and ``"tgt"``.
        tokenizer: Tokenizer handed to ``get_atomic_tokens``.
        min_count: Minimum number of isolated occurrences per atomic token.

    Returns:
        A new list: the original records followed by the augmented ones.
        The input list itself is not modified.
    """
    # Get atomic tokens
    atomic_tokens = get_atomic_tokens(tokenizer, corpus)
    print(f"Found {len(atomic_tokens)} atomic tokens")

    # Count current isolated occurrences
    src_counts = Counter(item["transliteration"]["src"].strip() for item in corpus)

    # Find atomic tokens that need augmentation (sorted for reproducibility)
    need_augmentation = {
        src: min_count - src_counts[src]
        for src in sorted(atomic_tokens)
        if src_counts[src] < min_count
    }
    print(f"Need to augment {len(need_augmentation)} tokens")

    # Build lookup: src -> first transliteration entry seen for that source
    src_to_entry = {}
    for item in corpus:
        entry = item["transliteration"]
        src = entry["src"].strip()
        if src in need_augmentation and src not in src_to_entry:
            src_to_entry[src] = entry

    # Create augmentation entries
    augmented = []
    for src, copies_needed in need_augmentation.items():
        entry = src_to_entry.get(src)
        if entry is None:
            continue  # Skip if we couldn't find the entry
        # Build a fresh dict per copy so the records never share state.
        augmented.extend(
            {
                "transliteration": {
                    "src": entry["src"],
                    "tgt": entry["tgt"],
                    "title": "word",
                    "dialect": entry.get("dialect", "unknown"),
                    "source": "augmented-atomic",
                }
            }
            for _ in range(copies_needed)
        )
    print(f"Adding {len(augmented)} augmented entries")
    return corpus + augmented
def main():
    """Command-line entry point: augment the West and East Syriac corpora.

    Parses ``--tokenizer`` and ``--min-count``, loads the tokenizer, and for
    each dialect reads the existing augmented corpus (with old augmented
    entries stripped) or, failing that, the clean corpus from the script
    directory. The augmented result is written back to
    ``syriac_<dialect>_augmented_corpus.jsonl``. A dialect with neither file
    present is reported and skipped.
    """
    cli = argparse.ArgumentParser(
        description="Augment corpus with atomic token examples"
    )
    cli.add_argument(
        "--tokenizer",
        default=DEFAULT_TOKENIZER_PATH,
        help=f"Path to tokenizer (default: {DEFAULT_TOKENIZER_PATH})",
    )
    cli.add_argument(
        "--min-count",
        type=int,
        default=MIN_ISOLATED_COUNT,
        help=f"Minimum isolated occurrences per atomic token (default: {MIN_ISOLATED_COUNT})",
    )
    args = cli.parse_args()

    print(f"Loading tokenizer from {args.tokenizer}...")
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

    for dialect in ("west", "east"):
        clean_path = _SCRIPT_DIR / f"syriac_{dialect}_clean_corpus.jsonl"
        augmented_path = _SCRIPT_DIR / f"syriac_{dialect}_augmented_corpus.jsonl"
        print(f"\n=== Processing {dialect.capitalize()} corpus ===")

        # Prefer a previous augmented file (minus its augmented entries);
        # fall back to the clean corpus; otherwise skip this dialect.
        reuse_augmented = augmented_path.exists()
        if not reuse_augmented and not clean_path.exists():
            print(f"ERROR: Neither {clean_path.name} nor {augmented_path.name} found!")
            continue

        if reuse_augmented:
            print(
                f"Loading from {augmented_path.name} (stripping old augmented entries)..."
            )
            corpus = load_corpus(augmented_path, strip_augmented=True)
        else:
            print(f"Loading from {clean_path.name}...")
            corpus = load_corpus(clean_path)
        print(f"Base corpus size: {len(corpus)}")

        augmented = augment_corpus(corpus, tokenizer, min_count=args.min_count)
        print(f"Augmented size: {len(augmented)}")

        save_corpus(augmented, augmented_path)
        print(f"Saved to {augmented_path}")

    print("\nDone! Run train_t5.py to train the T5 model using the augmented corpus.")


# Run the augmentation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|