| """ |
| Corpus'u tokenize edip nanoGPT formatinda .bin olarak kaydet. |
| |
| HIZLANDIRMA: |
| - tokenizer.encode_batch (Rust + multi-threaded, single encode'dan ~5-8x hizli) |
| - Dosyaya inkremental yazma (RAM'de tum array tutulmuyor) |
| - Buyuk batch (5000 satir) — ic icine girmeden ust uste tokenize |
| |
| Cikti: |
| data/train.bin (uint16 token id'leri) |
| data/val.bin |
| data/meta.pkl |
| """ |
|
|
| import argparse |
| import pickle |
| import time |
| from pathlib import Path |
|
|
| import numpy as np |
| from tokenizers import Tokenizer |
| from tqdm import tqdm |
|
|
# Default location for the tokenizer file, the input corpora and the generated
# outputs: a "data" directory that sits next to this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
|
|
|
|
def encode_file(tokenizer, in_path: Path, out_path: Path, eot_id: int,
                batch_size: int = 5000, dtype=np.uint16):
    """Tokenize a text file line by line and write the ids to a flat binary file.

    Every non-blank line (after stripping surrounding whitespace) is encoded
    with ``tokenizer.encode_batch`` and followed by ``eot_id``. Lines are
    processed in batches and appended to ``out_path`` as raw ``dtype`` values,
    so the complete token array is never held in memory. An existing
    ``out_path`` is replaced.

    Args:
        tokenizer: Object exposing ``encode_batch(list_of_str)`` whose results
            each carry an ``ids`` sequence.
        in_path: UTF-8 text file; each line is treated as a separate sequence.
        out_path: Destination binary file.
        eot_id: Token id appended after every encoded line.
        batch_size: Number of non-blank lines encoded per ``encode_batch`` call.
        dtype: NumPy dtype used to serialize the token ids.

    Returns:
        Total number of tokens written, including the appended ``eot_id`` values.

    Raises:
        ValueError: If ``eot_id`` is ``None``.
    """
    if eot_id is None:
        # Fail before the previous output is deleted and before the (possibly
        # long) line-counting pass, instead of crashing inside np.array later.
        raise ValueError("eot_id None: tokenizer'da EOT tokeni bulunamadi.")

    print(f"\n{in_path.name} -> {out_path.name}")
    # perf_counter is monotonic and high resolution, unlike time.time().
    t0 = time.perf_counter()

    # First pass: count lines so the progress bar has a total.
    print(" satir sayiliyor...", end=" ", flush=True)
    with open(in_path, "r", encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f"{n_lines:,}")

    total_tokens = 0

    # Drop any previous output so that append mode starts from an empty file.
    out_path.unlink(missing_ok=True)

    def write_batch(lines, sink):
        """Encode ``lines``, append ``eot_id`` after each, write; return count."""
        ids = []
        for enc in tokenizer.encode_batch(lines):
            ids.extend(enc.ids)
            ids.append(eot_id)
        arr = np.array(ids, dtype=dtype)
        sink.write(arr.tobytes())
        return len(arr)

    # The progress bar is a context manager so it is closed even when encoding
    # or writing raises part-way through.
    with open(in_path, "r", encoding="utf-8") as f, \
            open(out_path, "ab", buffering=1024 * 1024) as out_f, \
            tqdm(total=n_lines, desc="tokenize", smoothing=0.05) as pbar:
        batch = []
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines produce no tokens but still count as progress.
                pbar.update(1)
                continue
            batch.append(line)
            if len(batch) >= batch_size:
                total_tokens += write_batch(batch, out_f)
                pbar.update(len(batch))
                pbar.set_postfix(tokens=f"{total_tokens/1e6:.1f}M")
                batch.clear()

        # Remaining lines that did not fill a whole batch.
        if batch:
            total_tokens += write_batch(batch, out_f)
            pbar.update(len(batch))
            pbar.set_postfix(tokens=f"{total_tokens/1e6:.1f}M")

    elapsed = time.perf_counter() - t0
    size_mb = out_path.stat().st_size / 1e6
    # Guard against a zero interval on tiny inputs / coarse clocks.
    speed = total_tokens / elapsed / 1e6 if elapsed > 0 else 0.0
    print(f" [OK] {total_tokens:,} token, {size_mb:.1f} MB, "
          f"{elapsed:.1f}s ({speed:.2f}M token/s)")
    return total_tokens
|
|
|
|
def main():
    """Parse CLI arguments, tokenize the corpora and write the metadata pickle.

    The train corpus is always encoded. The validation corpus is encoded only
    when ``--val-in`` is given and the file exists. Output paths default to
    ``train.bin``, ``val.bin`` and ``meta.pkl`` inside ``DATA_DIR``.

    Raises:
        ValueError: If the tokenizer has no ``<|endoftext|>`` token, or its
            vocabulary is larger than 65535 entries.
        FileNotFoundError: If the train input file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer", type=str,
                        default=str(DATA_DIR / "tokenizer-tr-16k.json"))
    parser.add_argument("--train-in", type=str,
                        default=str(DATA_DIR / "corpus_train_v3.txt"))
    parser.add_argument("--val-in", type=str, default=None,
                        help="Val corpus (yoksa val atlanir)")
    parser.add_argument("--train-out", type=str, default=None,
                        help="Train .bin cikti yolu (yoksa data/train.bin)")
    parser.add_argument("--val-out", type=str, default=None,
                        help="Val .bin cikti yolu (yoksa data/val.bin)")
    parser.add_argument("--meta-out", type=str, default=None,
                        help="Meta pickle yolu (yoksa data/meta.pkl)")
    parser.add_argument("--batch-size", type=int, default=5000)
    args = parser.parse_args()

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    train_out = Path(args.train_out) if args.train_out else (DATA_DIR / "train.bin")
    val_out = Path(args.val_out) if args.val_out else (DATA_DIR / "val.bin")
    meta_out = Path(args.meta_out) if args.meta_out else (DATA_DIR / "meta.pkl")
    # Create every output directory up front, including the one for the meta
    # pickle, so a missing directory cannot fail the run after tokenization.
    for target in (train_out, val_out, meta_out):
        target.parent.mkdir(parents=True, exist_ok=True)

    # Cheap input check before the tokenizer is loaded.
    in_path = Path(args.train_in)
    if not in_path.exists():
        raise FileNotFoundError(f"Train input yok: {in_path}")

    tokenizer = Tokenizer.from_file(args.tokenizer)
    vocab_size = tokenizer.get_vocab_size()
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    print(f"Vocab: {vocab_size} EOT id: {eot_id} Batch: {args.batch_size}")
    print(f"Train: {args.train_in} -> {train_out}")
    if args.val_in:
        print(f"Val: {args.val_in} -> {val_out}")
    else:
        print("Val: atlandi")

    # token_to_id yields None for a token the vocabulary does not contain;
    # stop here rather than failing later inside the encoding loop.
    if eot_id is None:
        raise ValueError(
            f"Tokenizer'da '<|endoftext|>' tokeni yok: {args.tokenizer}")

    # Token ids are stored as uint16 by encode_file's default dtype.
    if vocab_size > 65535:
        raise ValueError("Vocab 65535'ten buyuk, uint16 yetmez. uint32 kullan.")

    train_tokens = encode_file(tokenizer, in_path, train_out, eot_id,
                               batch_size=args.batch_size)

    val_tokens = 0
    if args.val_in:
        val_in = Path(args.val_in)
        if val_in.exists():
            val_tokens = encode_file(tokenizer, val_in, val_out, eot_id,
                                     batch_size=args.batch_size)
        else:
            print(f"UYARI: Val input yok ({val_in}), atlandi")

    meta = {
        "vocab_size": vocab_size,
        "eot_id": eot_id,
        "tokenizer_path": args.tokenizer,
        "train_tokens": train_tokens,
        "val_tokens": val_tokens,
        "train_out": str(train_out),
        "val_out": str(val_out) if val_tokens else None,
    }
    with open(meta_out, "wb") as f:
        pickle.dump(meta, f)

    print("\n[OK] Hazir.")
    print(f" Train: {train_tokens:,} token -> {train_out}")
    if val_tokens:
        print(f" Val: {val_tokens:,} token -> {val_out}")
    print(f" Meta: {meta_out}")
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|