# Web-page header residue (not part of the script): bbkdevops's picture · download · raw · 4.88 kB
"""
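Phase 2: Train a BPE tokenizer from scratch
- Vocabulary of 65,536 tokens (Thai + EN + special tokens)
- Uses the `tokenizers` library (Rust-based, very fast)
- Handles Thai text by splitting on Unicode script boundaries (Thai vs. Latin)
  and then applying byte-level pre-tokenization; no dictionary-based Thai
  word segmentation is performed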
"""
import json
from pathlib import Path
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
from tokenizers.normalizers import NFC
# Directory layout: this script's own directory receives the outputs, and its
# parent directory is the data root holding the raw and filtered datasets.
OUT_DIR = Path(__file__).parent
DATA_DIR = OUT_DIR.parent
RAW_DIR = DATA_DIR / "raw"
FILTERED = DATA_DIR.joinpath("filtered", "clean_qa.jsonl")
OUT_DIR.mkdir(exist_ok=True)

# Target vocabulary size handed to the BPE trainer (2**16).
VOCAB_SIZE = 65_536

# Special tokens. The position of each entry in this list is used elsewhere
# in the file as that token's id, so the order matters.
SPECIAL_TOKENS = [
    # padding / unknown / sequence boundaries
    "<pad>",
    "<unk>",
    "<bos>",
    "<eos>",
    # chat roles
    "<user>",
    "<assistant>",
    "<system>",
    # language tokens
    "<th>",
    "<en>",
    # chain-of-thought
    "<think>",
    "</think>",
    # segment separator
    "<sep>",
]
def iter_texts():
    """Stream every text field from the dataset files for tokenizer training.

    Reads each ``*.jsonl`` file found in ``RAW_DIR`` (in sorted order, so the
    stream is reproducible from run to run), followed by ``FILTERED`` when
    that file exists. Each line is parsed as JSON and the non-empty string
    values stored under ``question``, ``answer`` and ``context`` are yielded
    in that order.

    Lines that are not valid JSON, records that are not JSON objects, and
    field values that are not strings are skipped, so a single bad record
    does not abort the run. Only JSON parsing is guarded: I/O errors and any
    exception raised by the consumer propagate normally.

    Yields:
        str: one text field at a time.
    """
    # sorted(): Path.glob order depends on the filesystem, and a truncated
    # corpus would otherwise differ between machines.
    sources = sorted(RAW_DIR.glob("*.jsonl"))
    if FILTERED.exists():
        sources.append(FILTERED)

    for path in sources:
        with open(path, encoding="utf-8") as f:
            for line in f:
                try:
                    item = json.loads(line)
                except ValueError:  # includes json.JSONDecodeError (blank/bad lines)
                    continue
                if not isinstance(item, dict):
                    continue
                for key in ("question", "answer", "context"):
                    value = item.get(key)
                    # Downstream code calls str methods on every yielded item.
                    if isinstance(value, str) and value:
                        yield value
def save_texts_for_training(path: Path, max_lines: int = 5_000_000):
    """Write the tokenizer training corpus to *path*, one stripped text per write.

    Texts come from ``iter_texts()``. Each one is stripped of surrounding
    whitespace; empty results and non-string items are skipped. Writing stops
    once ``max_lines`` texts have been written.

    The corpus is first written to a sibling ``<name>.tmp`` file and only
    renamed onto *path* after the whole stream has been written. A run that
    fails or is interrupted therefore never leaves a partial file at *path*
    (callers that reuse an existing corpus would otherwise pick it up).

    Args:
        path: destination file for the corpus.
        max_lines: maximum number of texts to write.
    """
    print(f"Writing corpus to {path} ...")
    path = Path(path)  # tolerate plain string paths, as open() did
    tmp_path = path.with_name(path.name + ".tmp")
    count = 0
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for text in iter_texts():
                if count >= max_lines:
                    break
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if text:
                    f.write(text + "\n")
                    count += 1
        # Publish the finished corpus in one step.
        tmp_path.replace(path)
    finally:
        # Only still present when something above failed.
        if tmp_path.exists():
            tmp_path.unlink()
    print(f" Written {count:,} lines")
def build_tokenizer() -> Tokenizer:
    """Assemble an untrained BPE tokenizer pipeline for Thai + English text.

    The pipeline consists of:
      * a BPE model whose unknown token is ``<unk>``;
      * NFC Unicode normalization;
      * a pre-tokenizer that first splits on Unicode script changes
        (e.g. Thai vs. Latin) and then applies byte-level pre-tokenization
        without adding a prefix space;
      * a byte-level decoder;
      * a template post-processor that wraps a single sequence as
        ``<bos> A <eos>`` and a pair as ``<bos> A <sep> B <eos>``.

    Returns:
        The configured, not yet trained, ``Tokenizer``.
    """
    tok = Tokenizer(models.BPE(unk_token="<unk>"))

    # Unicode NFC normalization.
    tok.normalizer = NFC()  # type: ignore

    # Thai has no spaces between words, so the text is split where the
    # script changes and then handed to the byte-level pre-tokenizer.
    script_split = pre_tokenizers.UnicodeScripts()
    byte_level = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.pre_tokenizer = pre_tokenizers.Sequence([script_split, byte_level])  # type: ignore

    tok.decoder = decoders.ByteLevel()  # type: ignore

    # Ids of the template tokens are their positions in SPECIAL_TOKENS.
    # NOTE(review): assumes training assigns special-token ids in list
    # order - confirm against the trainer configuration.
    template_specials = [
        (name, SPECIAL_TOKENS.index(name)) for name in ("<bos>", "<eos>", "<sep>")
    ]
    tok.post_processor = processors.TemplateProcessing(  # type: ignore
        single="<bos> $A <eos>",
        pair="<bos> $A <sep> $B:1 <eos>:1",
        special_tokens=template_specials,
    )
    return tok
def train():
    """Run the full tokenizer training pipeline and return the tokenizer.

    Steps:
      1. Print a banner with the target vocab size and special-token count.
      2. Build ``corpus.txt`` in ``OUT_DIR`` unless it already exists.
      3. Train the BPE tokenizer from ``build_tokenizer()`` on that corpus.
      4. Save it as ``tokenizer.json`` in ``OUT_DIR`` and report its size.
      5. Encode a few sample sentences and print their token counts.

    Returns:
        The trained ``Tokenizer``.
    """
    banner = "=" * 60
    print(banner)
    print("TinyMind — Phase 2: Train BPE Tokenizer")
    print(f"Vocab size: {VOCAB_SIZE:,} | Specials: {len(SPECIAL_TOKENS)}")
    print(banner)

    # The corpus file doubles as a cache: it is only generated when missing.
    corpus_path = OUT_DIR / "corpus.txt"
    if corpus_path.exists():
        print(f"Using existing corpus: {corpus_path}")
    else:
        save_texts_for_training(corpus_path)

    tokenizer = build_tokenizer()
    bpe_trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    print("\nTraining BPE ...")
    corpus_files = [str(corpus_path)]
    tokenizer.train(corpus_files, trainer=bpe_trainer)

    # Persist the trained tokenizer next to this script.
    tok_path = OUT_DIR / "tokenizer.json"
    tokenizer.save(str(tok_path))
    print(f"\nTokenizer saved → {tok_path}")
    print(f"Actual vocab size: {tokenizer.get_vocab_size():,}")

    # Smoke test: Thai, English, mixed, and special-token-bearing samples.
    samples = [
        "สวัสดีครับ ผมชื่อ TinyMind",
        "What is artificial intelligence?",
        "AI คืออะไร และมันทำงานอย่างไร?",
        "<th> ดาวอังคารอยู่ห่างจากโลกเท่าไร? <sep> ประมาณ 225 ล้านกิโลเมตร <eos>",
    ]
    print("\n--- Tokenizer Test ---")
    for sentence in samples:
        n_tokens = len(tokenizer.encode(sentence).ids)
        print(f" '{sentence[:40]}...' → {n_tokens} tokens")

    return tokenizer
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()

# Web-page footer residue (not part of the script):
# Xet Storage Details
# Size: 4.88 kB · Xet hash: d7779945598bdbd88e9a061fba6a7301bdc58becdf1f6de82da4e4bf58ffb13e
# Xet efficiently stores files, intelligently splitting them into unique chunks
# and accelerating uploads and downloads.