"""
Byte-Pair Encoding (BPE) Tokenizer for English-Malay Translation
=================================================================

We support two modes:

1. **Shared tokenizer** (preferred for the 10+2 Tied Transformer):
   A single BPE tokenizer trained on the concatenated en+ms corpus.
   Both encoder and decoder share the same vocabulary.

2. **Separate tokenizers** (legacy):
   Two independent BPE tokenizers, one per language.

Why BPE?
• Handles subword units, so rare / unseen words are decomposed into
  known subword pieces instead of mapping to [UNK].
• Malay is morphologically rich (prefixes: me-, ber-, di-; suffixes:
  -kan, -an, -i). BPE naturally learns these affixes as subword units,
  giving much better coverage than a word-level tokenizer.
• Keeps the vocabulary compact while still reaching high coverage on both
  English and Malay.

Why a shared vocabulary for en-ms?
• Both languages use the Latin script and share significant lexical overlap
  (loanwords: "teknologi", "matematik", "universiti"; numbers; proper nouns).
• A joint BPE captures cross-lingual subword patterns and enables tied
  embeddings in the model (Press & Wolf, 2017), saving ~26M parameters.

Design choices:
• NFKC normalisation + lowercasing – ensures consistent encoding of Unicode
  characters and removes casing noise.
• Whitespace pre-tokeniser – splits on spaces before BPE merges; simple and
  effective for Latin-script languages.
• Special tokens:
  [PAD]  – padding for uniform sequence lengths in batches
  [UNK]  – fallback for unknown characters
  [CLS]  – beginning-of-sequence / classification token
  [SEP]  – separator (unused in basic seq2seq but reserved)
  [MASK] – reserved for masked-LM pretraining objectives
  [BOS]  – beginning of sentence (fed to the decoder at step 0)
  [EOS]  – end of sentence (signals the decoder to stop)
"""

from __future__ import annotations

import os
import tempfile
from pathlib import Path
from typing import Iterator, List, Optional, Union

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFKC, Lowercase
from tokenizers.processors import TemplateProcessing


SPECIAL_TOKENS: List[str] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[BOS]", "[EOS]"]
PAD_TOKEN = "[PAD]"
UNK_TOKEN = "[UNK]"
CLS_TOKEN = "[CLS]"
SEP_TOKEN = "[SEP]"
MASK_TOKEN = "[MASK]"
BOS_TOKEN = "[BOS]"
EOS_TOKEN = "[EOS]"

DEFAULT_VOCAB_SIZE = 50_000
DEFAULT_MIN_FREQUENCY = 2
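# Note: BpeTrainer inserts `special_tokens` into the vocabulary before any
# learned merges, so after training they typically occupy the first IDs in the
# order listed above (e.g. [PAD] → 0, [UNK] → 1, …, [EOS] → 6). Code that needs
# an ID should still look it up via `tokenizer.token_to_id(...)` rather than
# hard-coding it.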


def _write_texts_to_tmpfile(texts: Iterator[str]) -> str:
    """Write an iterable of strings to a temp file, one per line. Returns path."""
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8")
    for line in texts:
        line = line.strip()
        if line:
            tmp.write(line + "\n")
    tmp.close()
    return tmp.name


def build_tokenizer(
    vocab_size: int = DEFAULT_VOCAB_SIZE,
    min_frequency: int = DEFAULT_MIN_FREQUENCY,
) -> tuple[Tokenizer, BpeTrainer]:
    """
    Create an *untrained* BPE tokenizer and its trainer.

    Returns
    -------
    tokenizer : Tokenizer
        Ready to call ``tokenizer.train(files, trainer)``.
    trainer : BpeTrainer
        Configured trainer instance.
    """
    tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))

    # NFKC-normalise and lowercase before any tokenisation.
    tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

    # Split on whitespace before applying BPE merges.
    tokenizer.pre_tokenizer = Whitespace()

    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=SPECIAL_TOKENS,
        show_progress=True,
    )

    return tokenizer, trainer
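

# Example (illustrative sketch): training the pieces returned by
# `build_tokenizer` directly on plain-text files. The file paths here are
# hypothetical placeholders, not files shipped with this repo.
#
#     tokenizer, trainer = build_tokenizer(vocab_size=32_000)
#     tokenizer.train(["corpus.en.txt", "corpus.ms.txt"], trainer)
#     tokenizer.save("tokenizer/tokenizer_shared.json")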


def train_tokenizer(
    texts: Union[List[str], Iterator[str]],
    vocab_size: int = DEFAULT_VOCAB_SIZE,
    min_frequency: int = DEFAULT_MIN_FREQUENCY,
    files: Optional[List[str]] = None,
) -> Tokenizer:
    """
    Train a BPE tokenizer on the given texts **or** files.

    Parameters
    ----------
    texts : list[str] or iterator of str
        Raw sentences. Ignored when *files* is provided.
    vocab_size : int
        Target vocabulary size (default 50,000).
    min_frequency : int
        Minimum frequency for a pair to be merged.
    files : list[str], optional
        Paths to plain-text files (one sentence per line).

    Returns
    -------
    Tokenizer
        Trained tokenizer ready for encoding / decoding.
    """
    tokenizer, trainer = build_tokenizer(vocab_size, min_frequency)

    if files is not None:
        tokenizer.train(files, trainer)
    else:
        # ``Tokenizer.train`` expects file paths, so spool the in-memory texts
        # to a temporary file and delete it afterwards.
        tmp_path = _write_texts_to_tmpfile(iter(texts))
        try:
            tokenizer.train([tmp_path], trainer)
        finally:
            os.remove(tmp_path)

    # Post-processor: wrap every encoded sequence in [BOS] … [EOS].
    bos_id = tokenizer.token_to_id(BOS_TOKEN)
    eos_id = tokenizer.token_to_id(EOS_TOKEN)
    tokenizer.post_processor = TemplateProcessing(
        single="[BOS]:0 $A:0 [EOS]:0",
        pair="[BOS]:0 $A:0 [EOS]:0 [BOS]:1 $B:1 [EOS]:1",
        special_tokens=[
            ("[BOS]", bos_id),
            ("[EOS]", eos_id),
        ],
    )

    return tokenizer
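

# Example (illustrative sketch with made-up sentences): after training, every
# encoded sequence is wrapped in [BOS] … [EOS] by the post-processor.
#
#     tok = train_tokenizer(["hello world", "selamat pagi dunia"], vocab_size=1_000)
#     enc = tok.encode("hello dunia")
#     # enc.tokens is expected to start with '[BOS]' and end with '[EOS]';
#     # tok.decode(enc.ids, skip_special_tokens=True) drops them again.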


def save_tokenizer(tokenizer: Tokenizer, path: Union[str, Path]) -> None:
    """Save a trained tokenizer to a JSON file."""
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(path))
    print(f"[✓] Tokenizer saved → {path}")


def load_tokenizer(path: Union[str, Path]) -> Tokenizer:
    """Load a previously saved tokenizer from a JSON file."""
    tokenizer = Tokenizer.from_file(str(path))
    print(f"[✓] Tokenizer loaded ← {path}")
    return tokenizer


def encode(tokenizer: Tokenizer, text: str) -> List[int]:
    """Encode a single string and return token IDs (includes [BOS]/[EOS])."""
    return tokenizer.encode(text).ids


def decode(tokenizer: Tokenizer, ids: List[int]) -> str:
    """Decode token IDs back to a string, skipping special tokens."""
    return tokenizer.decode(ids, skip_special_tokens=True)


def get_vocab_size(tokenizer: Tokenizer) -> int:
    """Return the size of the tokenizer's vocabulary."""
    return tokenizer.get_vocab_size()


def token_to_id(tokenizer: Tokenizer, token: str) -> Optional[int]:
    """Look up the integer ID for a single token string."""
    return tokenizer.token_to_id(token)


def id_to_token(tokenizer: Tokenizer, token_id: int) -> Optional[str]:
    """Look up the token string for a single integer ID."""
    return tokenizer.id_to_token(token_id)
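

# Example (illustrative sketch; `tok` is assumed to be a tokenizer previously
# trained and loaded with `load_tokenizer`): a simple encode/decode round trip.
#
#     ids = encode(tok, "machine translation is fun")
#     text = decode(tok, ids)               # special tokens are stripped
#     pad_id = token_to_id(tok, PAD_TOKEN)  # use this for batch padding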


def train_shared_tokenizer_from_dataset(
    dataset,
    src_lang: str = "en",
    tgt_lang: str = "ms",
    vocab_size: int = DEFAULT_VOCAB_SIZE,
    save_dir: Union[str, Path] = "tokenizer",
) -> Tokenizer:
    """
    Train a single shared BPE tokenizer on the concatenated en+ms corpus.

    This is used with the 10+2 Tied Transformer architecture, where both
    encoder and decoder share the same vocabulary and embedding matrix.

    Parameters
    ----------
    dataset : datasets.Dataset
        A HuggingFace dataset split where each example has a ``'translation'``
        dict with keys for each language code.
    src_lang : str
        Source language code (default ``'en'``).
    tgt_lang : str
        Target language code (default ``'ms'``).
    vocab_size : int
        Vocabulary size for the shared tokenizer.
    save_dir : str or Path
        Directory in which to save the trained tokenizer JSON file.

    Returns
    -------
    Tokenizer
        A single shared tokenizer for both languages.
    """
    save_dir = Path(save_dir)

    # Concatenate both sides of the parallel corpus into one training set.
    src_texts = [example["translation"][src_lang] for example in dataset]
    tgt_texts = [example["translation"][tgt_lang] for example in dataset]
    all_texts = src_texts + tgt_texts

    print(f"Training shared BPE tokenizer on {len(all_texts):,} sentences "
          f"({len(src_texts):,} {src_lang} + {len(tgt_texts):,} {tgt_lang}) …")
    shared_tokenizer = train_tokenizer(all_texts, vocab_size=vocab_size)
    save_tokenizer(shared_tokenizer, save_dir / "tokenizer_shared.json")

    # Sanity check: encode one sample from each language with the shared vocab.
    for name, sample in [(src_lang, src_texts[0]), (tgt_lang, tgt_texts[0])]:
        enc = shared_tokenizer.encode(sample)
        print(f"\n[{name}] Sample: {sample[:80]}…")
        print(f"  Tokens : {enc.tokens[:15]}…")
        print(f"  IDs    : {enc.ids[:15]}…")
        print(f"  Decoded: {shared_tokenizer.decode(enc.ids, skip_special_tokens=True)[:80]}…")

    print(f"\n[✓] Shared tokenizer trained and saved to {save_dir}/tokenizer_shared.json")
    return shared_tokenizer
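

# Example (illustrative sketch): training the shared tokenizer, the mode the
# module docstring recommends. The dataset path mirrors the one used in the
# __main__ block below; adjust it to wherever your dataset actually lives.
#
#     from datasets import load_from_disk
#     ds = load_from_disk("dataset/en_ms_2016")
#     shared_tok = train_shared_tokenizer_from_dataset(ds, vocab_size=DEFAULT_VOCAB_SIZE)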


def train_tokenizers_from_dataset(
    dataset,
    src_lang: str = "en",
    tgt_lang: str = "ms",
    vocab_size: int = DEFAULT_VOCAB_SIZE,
    save_dir: Union[str, Path] = "tokenizer",
) -> tuple[Tokenizer, Tokenizer]:
    """
    Train separate BPE tokenizers for the source and target languages.

    Parameters
    ----------
    dataset : datasets.Dataset
        A HuggingFace dataset split (e.g. ``dataset['train']``) where each
        example has a ``'translation'`` dict with keys for each language code.
    src_lang : str
        Source language code (default ``'en'``).
    tgt_lang : str
        Target language code (default ``'ms'``).
    vocab_size : int
        Vocabulary size for each tokenizer.
    save_dir : str or Path
        Directory in which to save the trained tokenizer JSON files.

    Returns
    -------
    (src_tokenizer, tgt_tokenizer)
    """
    save_dir = Path(save_dir)

    src_texts = [example["translation"][src_lang] for example in dataset]
    tgt_texts = [example["translation"][tgt_lang] for example in dataset]

    print(f"Training source tokenizer ({src_lang}) on {len(src_texts):,} sentences …")
    src_tokenizer = train_tokenizer(src_texts, vocab_size=vocab_size)
    save_tokenizer(src_tokenizer, save_dir / f"tokenizer_{src_lang}.json")

    print(f"Training target tokenizer ({tgt_lang}) on {len(tgt_texts):,} sentences …")
    tgt_tokenizer = train_tokenizer(tgt_texts, vocab_size=vocab_size)
    save_tokenizer(tgt_tokenizer, save_dir / f"tokenizer_{tgt_lang}.json")

    # Sanity check: encode one sample per language with its own tokenizer.
    for name, tok, sample in [
        (src_lang, src_tokenizer, src_texts[0]),
        (tgt_lang, tgt_tokenizer, tgt_texts[0]),
    ]:
        enc = tok.encode(sample)
        print(f"\n[{name}] Sample: {sample[:80]}…")
        print(f"  Tokens : {enc.tokens[:15]}…")
        print(f"  IDs    : {enc.ids[:15]}…")
        print(f"  Decoded: {tok.decode(enc.ids, skip_special_tokens=True)[:80]}…")

    print(f"\n[✓] Both tokenizers trained and saved to {save_dir}/")
    return src_tokenizer, tgt_tokenizer


if __name__ == "__main__":
    from datasets import load_from_disk

    print("Loading TED Talks IWSLT dataset (en ↔ ms, 2016) …")
    ds = load_from_disk("dataset/en_ms_2016")

    src_tok, tgt_tok = train_tokenizers_from_dataset(
        ds,
        src_lang="en",
        tgt_lang="ms",
        vocab_size=DEFAULT_VOCAB_SIZE,
        save_dir="tokenizer",
    )

    print(f"\nEnglish vocab size : {get_vocab_size(src_tok):,}")
    print(f"Malay vocab size   : {get_vocab_size(tgt_tok):,}")
    print(f"[PAD] id (en)      : {token_to_id(src_tok, PAD_TOKEN)}")
    print(f"[EOS] id (ms)      : {token_to_id(tgt_tok, EOS_TOKEN)}")