""" Corpus'u tokenize edip nanoGPT formatinda .bin olarak kaydet. HIZLANDIRMA: - tokenizer.encode_batch (Rust + multi-threaded, single encode'dan ~5-8x hizli) - Dosyaya inkremental yazma (RAM'de tum array tutulmuyor) - Buyuk batch (5000 satir) — ic icine girmeden ust uste tokenize Cikti: data/train.bin (uint16 token id'leri) data/val.bin data/meta.pkl """ import argparse import pickle import time from pathlib import Path import numpy as np from tokenizers import Tokenizer from tqdm import tqdm DATA_DIR = Path(__file__).parent / "data" def encode_file(tokenizer, in_path: Path, out_path: Path, eot_id: int, batch_size: int = 5000, dtype=np.uint16): print(f"\n{in_path.name} -> {out_path.name}") t0 = time.time() # Satir sayisini once say (progress bar icin) print(" satir sayiliyor...", end=" ", flush=True) n_lines = 0 with open(in_path, "r", encoding="utf-8") as f: for _ in f: n_lines += 1 print(f"{n_lines:,}") total_tokens = 0 # Append modunda binary yaz — RAM'de tum array tutmuyoruz out_path.unlink(missing_ok=True) with open(in_path, "r", encoding="utf-8") as f, \ open(out_path, "ab", buffering=1024*1024) as out_f: pbar = tqdm(total=n_lines, desc="tokenize", smoothing=0.05) batch = [] def flush(batch): if not batch: return 0 # encode_batch Rust + multi-threaded, ic icine birden fazla cumle alir encs = tokenizer.encode_batch(batch) # Tum id'leri ve EOT'leri tek bir array'de birlestir all_ids = [] for enc in encs: all_ids.extend(enc.ids) all_ids.append(eot_id) arr = np.array(all_ids, dtype=dtype) out_f.write(arr.tobytes()) return len(arr) for line in f: line = line.strip() if not line: pbar.update(1) continue batch.append(line) if len(batch) >= batch_size: total_tokens += flush(batch) pbar.update(len(batch)) pbar.set_postfix(tokens=f"{total_tokens/1e6:.1f}M") batch.clear() # Son kalan if batch: total_tokens += flush(batch) pbar.update(len(batch)) pbar.close() elapsed = time.time() - t0 size_mb = out_path.stat().st_size / 1e6 speed = total_tokens / elapsed / 1e6 print(f" [OK] {total_tokens:,} token, {size_mb:.1f} MB, " f"{elapsed:.1f}s ({speed:.2f}M token/s)") return total_tokens def main(): parser = argparse.ArgumentParser() parser.add_argument("--tokenizer", type=str, default=str(DATA_DIR / "tokenizer-tr-16k.json")) parser.add_argument("--train-in", type=str, default=str(DATA_DIR / "corpus_train_v3.txt")) parser.add_argument("--val-in", type=str, default=None, help="Val corpus (yoksa val atlanir)") parser.add_argument("--train-out", type=str, default=None, help="Train .bin cikti yolu (yoksa data/train.bin)") parser.add_argument("--val-out", type=str, default=None, help="Val .bin cikti yolu (yoksa data/val.bin)") parser.add_argument("--meta-out", type=str, default=None, help="Meta pickle yolu (yoksa data/meta.pkl)") parser.add_argument("--batch-size", type=int, default=5000) args = parser.parse_args() # DATA_DIR yoksa oluştur (Lightning AI fresh env) DATA_DIR.mkdir(parents=True, exist_ok=True) train_out = Path(args.train_out) if args.train_out else (DATA_DIR / "train.bin") val_out = Path(args.val_out) if args.val_out else (DATA_DIR / "val.bin") meta_out = Path(args.meta_out) if args.meta_out else (DATA_DIR / "meta.pkl") train_out.parent.mkdir(parents=True, exist_ok=True) val_out.parent.mkdir(parents=True, exist_ok=True) tokenizer = Tokenizer.from_file(args.tokenizer) vocab_size = tokenizer.get_vocab_size() eot_id = tokenizer.token_to_id("<|endoftext|>") print(f"Vocab: {vocab_size} EOT id: {eot_id} Batch: {args.batch_size}") print(f"Train: {args.train_in} -> {train_out}") if args.val_in: print(f"Val: {args.val_in} -> {val_out}") else: print(f"Val: atlandi") if vocab_size > 65535: raise ValueError("Vocab 65535'ten buyuk, uint16 yetmez. uint32 kullan.") in_path = Path(args.train_in) if not in_path.exists(): raise FileNotFoundError(f"Train input yok: {in_path}") train_tokens = encode_file(tokenizer, in_path, train_out, eot_id, batch_size=args.batch_size) val_tokens = 0 if args.val_in: val_in = Path(args.val_in) if not val_in.exists(): print(f"UYARI: Val input yok ({val_in}), atlandi") else: val_tokens = encode_file(tokenizer, val_in, val_out, eot_id, batch_size=args.batch_size) meta = { "vocab_size": vocab_size, "eot_id": eot_id, "tokenizer_path": args.tokenizer, "train_tokens": train_tokens, "val_tokens": val_tokens, "train_out": str(train_out), "val_out": str(val_out) if val_tokens else None, } with open(meta_out, "wb") as f: pickle.dump(meta, f) print(f"\n[OK] Hazir.") print(f" Train: {train_tokens:,} token -> {train_out}") if val_tokens: print(f" Val: {val_tokens:,} token -> {val_out}") print(f" Meta: {meta_out}") if __name__ == "__main__": main()