Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-8b-remote-handoff/bundle/data/tokenizer/train_tokenizer.py
| """ | |
| Phase 2: Train BPE Tokenizer จากศูนย์ | |
| - Vocab 65,536 tokens (Thai + EN + special) | |
| - ใช้ tokenizers library (Rust-based, เร็วมาก) | |
| - รองรับ Thai word boundary ด้วย pre-tokenizer พิเศษ | |
| """ | |
| import json | |
| from pathlib import Path | |
| from tokenizers import Tokenizer, models, pre_tokenizers, trainers, processors, decoders | |
| from tokenizers.normalizers import NFC | |
# Directory layout: this script sits in <data>/tokenizer/ and writes its
# outputs (corpus and tokenizer files) next to itself.
OUT_DIR = Path(__file__).parent
OUT_DIR.mkdir(exist_ok=True)

# Inputs live one level up, in the data directory.
DATA_DIR = OUT_DIR.parent
RAW_DIR = DATA_DIR / "raw"
FILTERED = DATA_DIR / "filtered" / "clean_qa.jsonl"

# Target vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 65_536

# Order matters: build_tokenizer() uses each token's position in this list
# as its id in the post-processing template.
SPECIAL_TOKENS = [
    "<pad>",
    "<unk>",
    "<bos>",
    "<eos>",
    "<user>",
    "<assistant>",
    "<system>",
    # Language tokens.
    "<th>",
    "<en>",
    # Chain-of-thought delimiters.
    "<think>",
    "</think>",
    "<sep>",
]
def iter_texts(sources=None):
    """Stream every non-empty text field from the JSONL training sources.

    Each line is parsed as one JSON object; its "question", "answer" and
    "context" values are yielded in that order when they are non-empty
    strings. Lines that are not valid JSON, records that are not JSON
    objects, and field values that are not strings are skipped.

    Args:
        sources: Optional iterable of JSONL file paths to read. When omitted,
            every ``*.jsonl`` file in ``RAW_DIR`` is read in sorted order
            (so the stream is reproducible), followed by ``FILTERED`` if
            that file exists.

    Yields:
        str: One text field at a time.
    """
    if sources is None:
        sources = sorted(RAW_DIR.glob("*.jsonl"))
        if FILTERED.exists():
            sources.append(FILTERED)

    for path in sources:
        with open(path, encoding="utf-8") as f:
            for line in f:
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    # Blank or corrupt line: skip it and keep streaming.
                    continue
                if not isinstance(item, dict):
                    # Valid JSON but not a record (e.g. a list or a number).
                    continue
                for key in ("question", "answer", "context"):
                    value = item.get(key)
                    # Consumers call str methods on the result, so only
                    # non-empty strings are passed on.
                    if value and isinstance(value, str):
                        yield value
def save_texts_for_training(path: Path, max_lines: int = 5_000_000, texts=None):
    """Write the tokenizer training corpus to ``path``, one text per line.

    Each text is stripped of surrounding whitespace and empty results are
    dropped. The corpus is written to a temporary sibling file and renamed
    into place only once complete, so an interrupted run never leaves a
    truncated corpus behind for a later run to pick up.

    Args:
        path: Destination corpus file.
        max_lines: Maximum number of texts to write.
        texts: Optional iterable of strings to write. Defaults to
            ``iter_texts()``.

    Raises:
        OSError: If the corpus cannot be written or renamed into place.
    """
    print(f"Writing corpus to {path} ...")
    if texts is None:
        texts = iter_texts()

    path = Path(path)
    tmp_path = path.with_name(path.name + ".tmp")
    count = 0
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for text in texts:
                if count >= max_lines:
                    break
                text = text.strip()
                if not text:
                    continue
                f.write(text + "\n")
                count += 1
        # Rename only after the file is complete and closed.
        tmp_path.replace(path)
    except BaseException:
        # Includes KeyboardInterrupt: remove the partial file, then re-raise.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise
    print(f" Written {count:,} lines")
def build_tokenizer() -> Tokenizer:
    """Assemble an untrained BPE tokenizer for mixed Thai and English text.

    The pipeline is: NFC normalization, a pre-tokenizer that first splits on
    Unicode script changes (Thai vs. Latin) and then applies byte-level
    encoding, a byte-level decoder, and a template post-processor that wraps
    sequences in ``<bos>``/``<eos>`` and joins pairs with ``<sep>``.

    Returns:
        Tokenizer: The configured tokenizer, ready to be trained.
    """
    tok = Tokenizer(models.BPE(unk_token="<unk>"))
    tok.normalizer = NFC()  # type: ignore

    # Thai is written without spaces between words, so instead of relying on
    # whitespace the text is split by script and then handled at byte level.
    script_split = pre_tokenizers.UnicodeScripts()
    byte_level = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tok.pre_tokenizer = pre_tokenizers.Sequence([script_split, byte_level])  # type: ignore
    tok.decoder = decoders.ByteLevel()  # type: ignore

    # Each template token's id is its position in SPECIAL_TOKENS.
    # NOTE(review): this assumes the trainer assigns special-token ids in
    # list order starting at 0 -- confirm against the trained vocab.
    template_specials = [
        (name, SPECIAL_TOKENS.index(name)) for name in ("<bos>", "<eos>", "<sep>")
    ]
    tok.post_processor = processors.TemplateProcessing(  # type: ignore
        single="<bos> $A <eos>",
        pair="<bos> $A <sep> $B:1 <eos>:1",
        special_tokens=template_specials,
    )
    return tok
def train():
    """Train the BPE tokenizer, save it, and print a short smoke test.

    Steps: reuse ``corpus.txt`` in ``OUT_DIR`` if it exists and is non-empty,
    otherwise (re)generate it; train a BPE model of ``VOCAB_SIZE`` tokens
    with ``SPECIAL_TOKENS`` reserved; save ``tokenizer.json`` to ``OUT_DIR``;
    then encode a few sample sentences and print their token counts.

    Returns:
        The trained tokenizer.

    Raises:
        RuntimeError: If the corpus is still empty after generation, i.e. no
            training text was found.
    """
    print("=" * 60)
    print("TinyMind — Phase 2: Train BPE Tokenizer")
    print(f"Vocab size: {VOCAB_SIZE:,} | Specials: {len(SPECIAL_TOKENS)}")
    print("=" * 60)

    corpus_path = OUT_DIR / "corpus.txt"
    # A zero-byte corpus (left by a run that found no data) is treated as
    # missing; otherwise it would be reused forever.
    if corpus_path.exists() and corpus_path.stat().st_size > 0:
        print(f"Using existing corpus: {corpus_path}")
    else:
        save_texts_for_training(corpus_path)

    if corpus_path.stat().st_size == 0:
        # Fail before training so a tokenizer trained on no data does not
        # overwrite a previously saved tokenizer.json.
        raise RuntimeError(
            f"Training corpus {corpus_path} is empty: no text found in "
            f"{RAW_DIR} or {FILTERED}"
        )

    tokenizer = build_tokenizer()
    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocabulary with the full byte-level alphabet.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    print("\nTraining BPE ...")
    tokenizer.train([str(corpus_path)], trainer=trainer)

    # Save the trained tokenizer next to this script.
    tok_path = OUT_DIR / "tokenizer.json"
    tokenizer.save(str(tok_path))
    print(f"\nTokenizer saved → {tok_path}")
    print(f"Actual vocab size: {tokenizer.get_vocab_size():,}")

    # Smoke test: Thai, English, mixed, and a sample with special tokens.
    test_sentences = [
        "สวัสดีครับ ผมชื่อ TinyMind",
        "What is artificial intelligence?",
        "AI คืออะไร และมันทำงานอย่างไร?",
        "<th> ดาวอังคารอยู่ห่างจากโลกเท่าไร? <sep> ประมาณ 225 ล้านกิโลเมตร <eos>",
    ]
    print("\n--- Tokenizer Test ---")
    for s in test_sentences:
        enc = tokenizer.encode(s)
        print(f" '{s[:40]}...' → {len(enc.ids)} tokens")

    return tokenizer
# Script entry point: run the full training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train()
Xet Storage Details
- Size:
- 4.88 kB
- Xet hash:
- d7779945598bdbd88e9a061fba6a7301bdc58becdf1f6de82da4e4bf58ffb13e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.