aramt5 / src /data /augment_atomic_tokens.py
crossroderick's picture
Data augmentation and balancing updates for a re-run of v3
11632a3
#!/usr/bin/env python3
"""
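Augment training data by duplicating isolated atomic tokens.
The model struggles with single-token inputs because they rarely appear as isolated
training examples. This script identifies atomic tokens (inputs that tokenise to a
single content token) and duplicates their training pairs until each one occurs at
least MIN_ISOLATED_COUNT times (configurable via --min-count).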
Run this AFTER train_tokeniser.py and BEFORE train_t5.py:
1. generate_syr_lat_pairs.py -> syriac_*_corpus.jsonl
2. generate_clean_corpus.sh -> syriac_*_clean_corpus.jsonl
3. train_tokeniser.py -> src/tokeniser/
4. augment_atomic_tokens.py -> syriac_*_augmented_corpus.jsonl (this script)
5. train_t5.py -> model
"""
import argparse
import json
from collections import Counter
from pathlib import Path
from transformers import AutoTokenizer
# Configuration
# Default for --min-count: atomic tokens with fewer isolated examples than this
# are duplicated until they reach it.
MIN_ISOLATED_COUNT = 200 # Ensure each atomic token appears at least this many times
# Directory containing this script; main() looks for the corpus files here.
_SCRIPT_DIR = Path(__file__).resolve().parent
# Default for --tokenizer: the "tokeniser" directory next to this script's directory.
DEFAULT_TOKENIZER_PATH = str(_SCRIPT_DIR.parent / "tokeniser") # src/tokeniser
def load_corpus(path: Path, strip_augmented: bool = False) -> list[dict]:
    """Load a JSONL corpus into a list of records.

    The file is always decoded as UTF-8 so the result does not depend on the
    platform's default encoding. Blank lines (for example a trailing empty
    line) are skipped instead of aborting the load.

    Args:
        path: Path to corpus file
        strip_augmented: If True, filter out previously augmented entries,
            i.e. records whose ``transliteration["source"]`` equals
            ``"augmented-atomic"``.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If ``strip_augmented`` is True and a record has no
            ``"transliteration"`` key.
    """
    with open(path, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    if strip_augmented:
        # Remove entries that were added by previous augmentation runs
        data = [
            d for d in data if d["transliteration"].get("source") != "augmented-atomic"
        ]
    return data
def save_corpus(data: list[dict], path: Path) -> None:
    """Save a corpus as UTF-8 JSONL, one JSON object per line.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is encoded explicitly as UTF-8 rather than with the platform default.

    The records are first written to a sibling ``<name>.tmp`` file which is
    then renamed over ``path``. The destination may be the very file the
    corpus was loaded from, so an interrupted write must not leave it
    truncated.

    Args:
        data: Records to serialise, written in list order.
        path: Destination file; replaced if it already exists.
    """
    target = Path(path)
    scratch = target.with_name(target.name + ".tmp")
    try:
        with open(scratch, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(item, ensure_ascii=False) + "\n" for item in data
            )
        # Same-directory rename: the destination is swapped in one step.
        scratch.replace(target)
    finally:
        # After a successful rename the scratch file is gone already; this
        # only cleans up when writing or renaming failed.
        scratch.unlink(missing_ok=True)
def get_atomic_tokens(tokenizer, corpus: list[dict]) -> set[str]:
    """Find all inputs that tokenize to a single content token.

    SentencePiece adds a leading space token (▁), so an input counts as
    atomic when it encodes to either 1 token, or 2 tokens where the first is
    the space prefix.

    Each distinct (whitespace-stripped) source string is tokenized only once,
    however many corpus entries share it.

    Args:
        tokenizer: Callable returning an object with ``input_ids`` and
            providing ``convert_tokens_to_ids``. The last id of every
            encoding is discarded unconditionally -- this assumes the
            tokenizer appends exactly one end-of-sequence id (TODO confirm
            for tokenizers other than the project's own).
        corpus: Records with a ``transliteration["src"]`` string.

    Returns:
        The set of stripped source strings that are atomic.
    """
    space_token_id = tokenizer.convert_tokens_to_ids("▁")
    # Deduplicate first: tokenization is the expensive step and the verdict
    # depends only on the string itself.
    unique_sources = {item["transliteration"]["src"].strip() for item in corpus}
    atomic = set()
    for src in unique_sources:
        ids = tokenizer(src).input_ids[:-1]  # Remove </s>
        is_single = len(ids) == 1
        is_prefixed_single = len(ids) == 2 and ids[0] == space_token_id
        if is_single or is_prefixed_single:
            atomic.add(src)
    return atomic
def augment_corpus(
    corpus: list[dict], tokenizer, min_count: int = MIN_ISOLATED_COUNT
) -> list[dict]:
    """Augment corpus with more isolated atomic token examples.

    For every atomic source string (see ``get_atomic_tokens``) that occurs
    fewer than ``min_count`` times in ``corpus``, copies of its first
    occurrence are appended until the total reaches ``min_count``. Copies are
    tagged with ``"source": "augmented-atomic"`` and ``"title": "word"``.

    The appended entries follow the order in which their source strings first
    appear in ``corpus``, so the result is reproducible from run to run
    (iterating the set of atomic tokens directly would depend on string hash
    randomisation).

    Args:
        corpus: Base records with ``transliteration["src"]`` and
            ``transliteration["tgt"]``; not modified.
        tokenizer: Tokenizer forwarded to ``get_atomic_tokens``.
        min_count: Minimum number of occurrences each atomic source should
            have after augmentation.

    Returns:
        A new list: the original records followed by the augmented ones.
    """
    # Get atomic tokens
    atomic_tokens = get_atomic_tokens(tokenizer, corpus)
    print(f"Found {len(atomic_tokens)} atomic tokens")

    # One pass: count isolated occurrences and remember the first
    # transliteration entry seen for each stripped source string.
    src_counts = Counter()
    first_entry = {}
    for item in corpus:
        entry = item["transliteration"]
        src = entry["src"].strip()
        src_counts[src] += 1
        first_entry.setdefault(src, entry)

    # Find atomic tokens that need augmentation. Iterating first_entry (a
    # dict, hence insertion-ordered) instead of the set keeps corpus order.
    need_augmentation = {
        src: min_count - src_counts[src]
        for src in first_entry
        if src in atomic_tokens and src_counts[src] < min_count
    }
    print(f"Need to augment {len(need_augmentation)} tokens")

    # Create augmentation entries; every copy is a fresh dict so later
    # mutation of one record cannot affect the others.
    augmented = []
    for src, copies_needed in need_augmentation.items():
        entry = first_entry[src]
        augmented.extend(
            {
                "transliteration": {
                    "src": entry["src"],
                    "tgt": entry["tgt"],
                    "title": "word",
                    "dialect": entry.get("dialect", "unknown"),
                    "source": "augmented-atomic",
                }
            }
            for _ in range(copies_needed)
        )
    print(f"Adding {len(augmented)} augmented entries")
    return corpus + augmented
def main():
    """Command-line entry point: augment the west and east corpora.

    For each dialect the base corpus is read from the existing augmented file
    (with previously augmented entries stripped) or, if that file is absent,
    from the clean corpus. The augmented result is written to
    ``syriac_<dialect>_augmented_corpus.jsonl`` next to this script.

    A dialect whose files are missing is reported and skipped.

    Raises:
        SystemExit: With a non-zero status when no corpus could be processed
            for any dialect, so calling scripts do not mistake that for
            success.
    """
    parser = argparse.ArgumentParser(
        description="Augment corpus with atomic token examples"
    )
    # %(default)s lets argparse insert the default itself. Interpolating the
    # path with an f-string would break --help for paths containing "%",
    # because argparse %-formats help strings.
    parser.add_argument(
        "--tokenizer",
        default=DEFAULT_TOKENIZER_PATH,
        help="Path to tokenizer (default: %(default)s)",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=MIN_ISOLATED_COUNT,
        help="Minimum isolated occurrences per atomic token (default: %(default)s)",
    )
    args = parser.parse_args()

    print(f"Loading tokenizer from {args.tokenizer}...")
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)

    data_dir = _SCRIPT_DIR
    processed = 0
    for dialect in ["west", "east"]:
        clean_path = data_dir / f"syriac_{dialect}_clean_corpus.jsonl"
        augmented_path = data_dir / f"syriac_{dialect}_augmented_corpus.jsonl"
        print(f"\n=== Processing {dialect.capitalize()} corpus ===")

        # Try to load from augmented file (stripping old augmented entries) or clean file
        # NOTE(review): an existing augmented file takes precedence, so a
        # regenerated clean corpus is ignored until the augmented file is
        # removed -- confirm this is intended.
        if augmented_path.exists():
            print(
                f"Loading from {augmented_path.name} (stripping old augmented entries)..."
            )
            corpus = load_corpus(augmented_path, strip_augmented=True)
        elif clean_path.exists():
            print(f"Loading from {clean_path.name}...")
            corpus = load_corpus(clean_path)
        else:
            print(f"ERROR: Neither {clean_path.name} nor {augmented_path.name} found!")
            continue

        print(f"Base corpus size: {len(corpus)}")
        augmented = augment_corpus(corpus, tokenizer, min_count=args.min_count)
        print(f"Augmented size: {len(augmented)}")
        save_corpus(augmented, augmented_path)
        print(f"Saved to {augmented_path}")
        processed += 1

    if not processed:
        # Nothing was written: fail loudly rather than print "Done!".
        raise SystemExit(
            f"ERROR: no corpus files found in {data_dir}; nothing was augmented."
        )
    print("\nDone! Run train_t5.py to train the T5 model using the augmented corpus.")
# Run the augmentation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()