Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /tokenizer /train_tokenizer.py

bbkdevops

30 days ago

download

raw

4.88 kB

	"""
	Phase 2: Train a BPE tokenizer from scratch.
	- Vocabulary of 65,536 tokens (Thai + EN + special tokens)
	- Uses the `tokenizers` library (Rust-based, very fast)
	- Handles Thai text via a pre-tokenizer that splits on Unicode script and then applies byte-level encoding
	"""

	import json
	from pathlib import Path
	from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders
	from tokenizers.normalizers import NFC

	# All paths are resolved relative to this script's own location.
	DATA_DIR = Path(__file__).parent.parent
	FILTERED = DATA_DIR / "filtered" / "clean_qa.jsonl"
	RAW_DIR = DATA_DIR / "raw"
	OUT_DIR = Path(__file__).parent
	# Runs at import time; a no-op when the directory already exists.
	OUT_DIR.mkdir(exist_ok=True)

	VOCAB_SIZE = 65_536
	# Order is significant: build_tokenizer() uses each token's list index as
	# its token ID in the post-processor template.
	SPECIAL_TOKENS = [
	"<pad>", "<unk>", "<bos>", "<eos>",
	"<user>", "<assistant>", "<system>",
	"<th>", "<en>", # language tokens
	"<think>", "</think>", # chain-of-thought
	"<sep>",
	]


	def iter_texts(sources=None):
	    """Stream every text field from the JSONL dataset files.

	    Each line of each source file is parsed as one JSON value.  For JSON
	    objects, the non-empty string values under ``question``, ``answer`` and
	    ``context`` are yielded in that order.  Lines that are not valid JSON,
	    are not JSON objects, or whose fields are not strings are skipped.

	    Args:
	        sources: Optional iterable of JSONL file paths to read.  When
	            omitted, every ``*.jsonl`` file in ``RAW_DIR`` is read in sorted
	            order, followed by ``FILTERED`` if that file exists.

	    Yields:
	        str: One text field at a time.
	    """
	    if sources is None:
	        # glob() order is filesystem-dependent; sorting keeps the stream
	        # (and any line cap applied by the consumer) reproducible.
	        sources = sorted(RAW_DIR.glob("*.jsonl"))
	        if FILTERED.exists():
	            sources.append(FILTERED)

	    for path in sources:
	        with open(path, encoding="utf-8") as f:
	            for line in f:
	                try:
	                    item = json.loads(line)
	                except json.JSONDecodeError:
	                    # Blank or malformed line: skipping is deliberate
	                    # best-effort behaviour, but only for parse errors.
	                    continue
	                if not isinstance(item, dict):
	                    continue
	                for key in ("question", "answer", "context"):
	                    value = item.get(key)
	                    # Non-string values (numbers, lists, ...) cannot be
	                    # used as training text, so they are dropped here.
	                    if value and isinstance(value, str):
	                        yield value


	def save_texts_for_training(path: Path, max_lines: int = 5_000_000, texts=None):
	    """Write the tokenizer training corpus to ``path``, one text per record.

	    Each text is stripped of surrounding whitespace; empty and non-string
	    items are skipped.  Writing stops once ``max_lines`` texts have been
	    written.  The corpus is written to a sibling ``<name>.tmp`` file and
	    renamed into place only on success, so an interrupted run never leaves
	    a truncated corpus at ``path``.

	    Args:
	        path: Destination corpus file.
	        max_lines: Maximum number of texts to write.
	        texts: Optional iterable of strings to write.  When omitted, the
	            texts come from ``iter_texts()``.
	    """
	    print(f"Writing corpus to {path} ...")
	    if texts is None:
	        texts = iter_texts()

	    tmp_path = path.with_name(path.name + ".tmp")
	    count = 0
	    try:
	        with open(tmp_path, "w", encoding="utf-8") as f:
	            for text in texts:
	                if count >= max_lines:
	                    break
	                if not isinstance(text, str):
	                    continue  # cannot strip/write non-text values
	                text = text.strip()
	                if text:
	                    f.write(text + "\n")
	                    count += 1
	        # Publish the finished corpus in one step.
	        tmp_path.replace(path)
	    finally:
	        # Only still present if writing failed before the rename.
	        if tmp_path.exists():
	            tmp_path.unlink()
	    print(f" Written {count:,} lines")


	def build_tokenizer() -> Tokenizer:
	    """Assemble an untrained BPE tokenizer configured for Thai + English text.

	    Pipeline: NFC normalization, a pre-tokenizer that splits on Unicode
	    script changes and then applies byte-level encoding, a byte-level
	    decoder, and a template post-processor that wraps sequences in
	    ``<bos>``/``<eos>`` and joins pairs with ``<sep>``.

	    Returns:
	        Tokenizer: the configured, not-yet-trained tokenizer.
	    """
	    # The template refers to special tokens by ID, taken here as each
	    # token's position in SPECIAL_TOKENS.
	    # NOTE(review): assumes the trainer assigns special-token IDs in list
	    # order — confirm against the trained vocabulary.
	    template_specials = [
	        (token, SPECIAL_TOKENS.index(token))
	        for token in ("<bos>", "<eos>", "<sep>")
	    ]

	    # Thai has no spaces between words, so split by script (Thai vs Latin)
	    # first and let byte-level encoding cover everything else.
	    script_then_bytes = pre_tokenizers.Sequence([
	        pre_tokenizers.UnicodeScripts(),
	        pre_tokenizers.ByteLevel(add_prefix_space=False),
	    ])

	    bpe = Tokenizer(models.BPE(unk_token="<unk>"))
	    bpe.normalizer = NFC()  # type: ignore
	    bpe.pre_tokenizer = script_then_bytes  # type: ignore
	    bpe.decoder = decoders.ByteLevel()  # type: ignore
	    bpe.post_processor = processors.TemplateProcessing(  # type: ignore
	        single="<bos> $A <eos>",
	        pair="<bos> $A <sep> $B:1 <eos>:1",
	        special_tokens=template_specials,
	    )
	    return bpe


	def train():
	    """Train the BPE tokenizer, save it, and run a quick smoke test.

	    Reuses ``OUT_DIR / "corpus.txt"`` when it already exists; otherwise the
	    corpus is generated first.  The trained tokenizer is saved to
	    ``OUT_DIR / "tokenizer.json"``.

	    Returns:
	        Tokenizer: the trained tokenizer.
	    """
	    print("=" * 60)
	    print("TinyMind — Phase 2: Train BPE Tokenizer")
	    # Plain "|" here: "\|" is not a valid escape sequence and would print
	    # a stray backslash (and warn on newer Python versions).
	    print(f"Vocab size: {VOCAB_SIZE:,} | Specials: {len(SPECIAL_TOKENS)}")
	    print("=" * 60)

	    corpus_path = OUT_DIR / "corpus.txt"
	    if not corpus_path.exists():
	        save_texts_for_training(corpus_path)
	    else:
	        print(f"Using existing corpus: {corpus_path}")

	    tokenizer = build_tokenizer()

	    trainer = trainers.BpeTrainer(
	        vocab_size=VOCAB_SIZE,
	        special_tokens=SPECIAL_TOKENS,
	        min_frequency=2,
	        show_progress=True,
	        # Seed the vocabulary with the full byte-level alphabet.
	        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
	    )

	    print("\nTraining BPE ...")
	    tokenizer.train([str(corpus_path)], trainer=trainer)

	    # Save
	    tok_path = OUT_DIR / "tokenizer.json"
	    tokenizer.save(str(tok_path))
	    print(f"\nTokenizer saved → {tok_path}")
	    print(f"Actual vocab size: {tokenizer.get_vocab_size():,}")

	    # Quick test
	    test_sentences = [
	        "สวัสดีครับ ผมชื่อ TinyMind",
	        "What is artificial intelligence?",
	        "AI คืออะไร และมันทำงานอย่างไร?",
	        "<th> ดาวอังคารอยู่ห่างจากโลกเท่าไร? <sep> ประมาณ 225 ล้านกิโลเมตร <eos>",
	    ]
	    print("\n--- Tokenizer Test ---")
	    for s in test_sentences:
	        enc = tokenizer.encode(s)
	        # Show an ellipsis only when the sentence was actually cut short.
	        preview = s if len(s) <= 40 else s[:40] + "..."
	        print(f" '{preview}' → {len(enc.ids)} tokens")

	    return tokenizer


	# Run the training pipeline only when executed as a script, not on import.
	if __name__ == "__main__":
	train()

Xet Storage Details

Size: 4.88 kB
Xet hash: d7779945598bdbd88e9a061fba6a7301bdc58becdf1f6de82da4e4bf58ffb13e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.