| """
|
| Faz 0 / Adım 2 — SentencePiece BPE 48K eğit (EN+TR).
|
|
|
| Kararlar (kılavuz Bölüm 3 + TildeOpen):
|
| model_type=bpe, vocab_size=48000, byte_fallback=True, split_digits=True,
|
| split_by_unicode_script=True, character_coverage=0.9999, normalization=identity
|
| (text normalization kapalı; byte_fallback bilinmeyenleri bayt seviyesinde yakalar).
|
|
|
| Çıktı: kod/data/smartcore_v1_tok.model + .vocab
|
| Kullanım: python kod/faz0_02_train_tokenizer.py
|
| """
|
| import os, time, argparse
|
| import sentencepiece as spm
|
|
|
|
|
def main():
    """Train the SentencePiece BPE tokenizer and print a short sanity check.

    Command-line options:
        --corpus      Training text handed to SentencePiece as ``input``
                      (default ``kod/data/tokenizer_corpus.txt``).
        --prefix      Output path prefix; the trainer produces
                      ``<prefix>.model`` and ``<prefix>.vocab``
                      (default ``kod/data/smartcore_v1_tok``).
        --vocab_size  Target vocabulary size (default 48000).

    After training, the saved model is loaded back, its piece count is
    printed, and three sample sentences (Turkish and English) are encoded
    so their token counts can be eyeballed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--corpus", default="kod/data/tokenizer_corpus.txt")
    ap.add_argument("--prefix", default="kod/data/smartcore_v1_tok")
    ap.add_argument("--vocab_size", type=int, default=48000)
    args = ap.parse_args()

    # Create the output directory up front so that a missing directory
    # cannot make the save step fail after a long training run.
    # A bare prefix (no directory part) yields "" and needs no directory.
    out_dir = os.path.dirname(args.prefix)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    t0 = time.perf_counter()
    spm.SentencePieceTrainer.train(
        input=args.corpus,
        model_prefix=args.prefix,
        vocab_size=args.vocab_size,
        model_type="bpe",
        character_coverage=0.9999,
        # Unknown characters fall back to byte-level pieces.
        byte_fallback=True,
        split_digits=True,
        split_by_unicode_script=True,
        allow_whitespace_only_pieces=True,
        remove_extra_whitespaces=False,
        # "identity" keeps text normalization switched off.
        normalization_rule_name="identity",
        max_sentence_length=8192,
        input_sentence_size=3_000_000,
        shuffle_input_sentence=True,
        num_threads=6,
        # Fixed special-token ids: <unk>=0, <s>=1, </s>=2, <pad>=3.
        unk_id=0, bos_id=1, eos_id=2, pad_id=3,
        unk_piece="<unk>", bos_piece="<s>", eos_piece="</s>", pad_piece="<pad>",
        train_extremely_large_corpus=False,
    )
    print(f"\nEĞİTİM BİTTİ ({time.perf_counter()-t0:.0f}s) -> {args.prefix}.model / .vocab")

    # Sanity check: reload the freshly written model and tokenize a few
    # sample sentences.
    sp = spm.SentencePieceProcessor(model_file=args.prefix + ".model")
    print(f"vocab_size = {sp.get_piece_size()}")
    for s in ["Merhaba dünya, bugün hava çok güzel!",
              "The quick brown fox jumps over 1234 lazy dogs.",
              "Yapay zeka modelleri 2026 yılında 180 milyon parametreyle eğitiliyor."]:
        ids = sp.encode(s, out_type=int)
        print(f" '{s[:45]}...' -> {len(ids)} token")
|
|
|
|
|
# Script entry point: run the training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|