nanogpt-tr-v5-code / 04_tokenize.py
musabc's picture
upload 04_tokenize.py
fd87777 verified
Raw
History Blame Contribute Delete
5.71 kB
"""
Tokenize the corpus and save it as .bin files in nanoGPT format.
Speed-ups:
- tokenizer.encode_batch (Rust + multi-threaded, ~5-8x faster than single encode calls)
- Incremental writes to the output file (the whole array is never held in RAM)
- Large batches (5000 lines), tokenized back to back without nesting
Output:
data/train.bin (uint16 token ids)
data/val.bin
data/meta.pkl
"""
import argparse
import pickle
import time
from pathlib import Path
import numpy as np
from tokenizers import Tokenizer
from tqdm import tqdm
# Directory named "data" next to this script; used below as the default
# location for the tokenizer file, the input corpus and the output files.
DATA_DIR = Path(__file__).parent / "data"
def _write_batch(out_f, tokenizer, lines, eot_id, dtype):
    """Encode ``lines`` in one ``encode_batch`` call and append them to ``out_f``.

    The ids of every line are followed by ``eot_id``; the flattened sequence
    is converted to ``dtype`` and written as raw bytes.

    Returns:
        Number of tokens written, EOT markers included.
    """
    ids = []
    for enc in tokenizer.encode_batch(lines):
        ids.extend(enc.ids)
        ids.append(eot_id)
    arr = np.array(ids, dtype=dtype)
    out_f.write(arr.tobytes())
    return len(arr)


def encode_file(tokenizer, in_path: Path, out_path: Path, eot_id: int,
                batch_size: int = 5000, dtype=np.uint16):
    """Tokenize a text file line by line into a flat binary token file.

    Every line is stripped; blank lines are skipped. Each remaining line is
    encoded and followed by ``eot_id``. Tokens are written batch by batch, so
    only one batch of ids is held in memory at a time.

    Args:
        tokenizer: Object exposing ``encode_batch(list_of_str)`` whose results
            have an ``ids`` attribute.
        in_path: UTF-8 text file, one document per line.
        out_path: Destination file; any existing file is removed first.
        eot_id: Token id appended after each line.
        batch_size: Number of non-blank lines encoded per ``encode_batch`` call.
        dtype: NumPy dtype used for the on-disk token ids.

    Returns:
        Total number of tokens written, EOT markers included.
    """
    print(f"\n{in_path.name} -> {out_path.name}")
    # perf_counter is monotonic and high resolution, unlike time.time().
    started = time.perf_counter()

    # First pass: count the lines so the progress bar has a total.
    print(" satir sayiliyor...", end=" ", flush=True)
    with open(in_path, "r", encoding="utf-8") as f:
        n_lines = sum(1 for _ in f)
    print(f"{n_lines:,}")

    total_tokens = 0

    # Remove any previous output so a fresh file is created instead of
    # truncating (or appending to) an existing one in place.
    out_path.unlink(missing_ok=True)

    # tqdm is used as a context manager so the bar is closed even when
    # tokenizing or writing raises.
    with open(in_path, "r", encoding="utf-8") as f, \
            open(out_path, "ab", buffering=1024 * 1024) as out_f, \
            tqdm(total=n_lines, desc="tokenize", smoothing=0.05) as pbar:
        batch = []
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines still count towards progress.
                pbar.update(1)
                continue
            batch.append(line)
            if len(batch) >= batch_size:
                total_tokens += _write_batch(out_f, tokenizer, batch,
                                             eot_id, dtype)
                pbar.update(len(batch))
                pbar.set_postfix(tokens=f"{total_tokens/1e6:.1f}M")
                batch.clear()

        # Trailing partial batch.
        if batch:
            total_tokens += _write_batch(out_f, tokenizer, batch,
                                         eot_id, dtype)
            pbar.update(len(batch))

    elapsed = time.perf_counter() - started
    size_mb = out_path.stat().st_size / 1e6
    # Guard against a zero elapsed time on tiny/empty inputs.
    speed = total_tokens / elapsed / 1e6 if elapsed > 0 else 0.0
    print(f" [OK] {total_tokens:,} token, {size_mb:.1f} MB, "
          f"{elapsed:.1f}s ({speed:.2f}M token/s)")
    return total_tokens
def main():
    """Command-line entry point: tokenize the train (and optional val) corpus.

    Parses the CLI arguments, loads the tokenizer, writes the train/val token
    files via ``encode_file`` and finally pickles a small metadata dict
    (vocab size, EOT id, token counts, paths).

    Raises:
        ValueError: If the tokenizer has no ``<|endoftext|>`` token, or the
            vocabulary is too large for uint16.
        FileNotFoundError: If the train input file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--tokenizer", type=str,
                        default=str(DATA_DIR / "tokenizer-tr-16k.json"))
    parser.add_argument("--train-in", type=str,
                        default=str(DATA_DIR / "corpus_train_v3.txt"))
    parser.add_argument("--val-in", type=str, default=None,
                        help="Val corpus (yoksa val atlanir)")
    parser.add_argument("--train-out", type=str, default=None,
                        help="Train .bin cikti yolu (yoksa data/train.bin)")
    parser.add_argument("--val-out", type=str, default=None,
                        help="Val .bin cikti yolu (yoksa data/val.bin)")
    parser.add_argument("--meta-out", type=str, default=None,
                        help="Meta pickle yolu (yoksa data/meta.pkl)")
    parser.add_argument("--batch-size", type=int, default=5000)
    args = parser.parse_args()

    # Create DATA_DIR if it is missing (e.g. a fresh Lightning AI environment).
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    train_out = Path(args.train_out) if args.train_out else (DATA_DIR / "train.bin")
    val_out = Path(args.val_out) if args.val_out else (DATA_DIR / "val.bin")
    meta_out = Path(args.meta_out) if args.meta_out else (DATA_DIR / "meta.pkl")

    # Create every output directory up front, including the one for the meta
    # file, so a bad --meta-out path cannot fail only after tokenization.
    for out_file in (train_out, val_out, meta_out):
        out_file.parent.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer.from_file(args.tokenizer)
    vocab_size = tokenizer.get_vocab_size()
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    # token_to_id returns None for unknown tokens; fail early with a clear
    # message instead of crashing later while building the token array.
    if eot_id is None:
        raise ValueError(
            "Tokenizer'da '<|endoftext|>' tokeni yok, EOT id bulunamadi.")

    print(f"Vocab: {vocab_size} EOT id: {eot_id} Batch: {args.batch_size}")
    print(f"Train: {args.train_in} -> {train_out}")
    if args.val_in:
        print(f"Val: {args.val_in} -> {val_out}")
    else:
        print("Val: atlandi")

    # Token ids are stored as uint16 by encode_file's default dtype.
    if vocab_size > 65535:
        raise ValueError("Vocab 65535'ten buyuk, uint16 yetmez. uint32 kullan.")

    in_path = Path(args.train_in)
    if not in_path.exists():
        raise FileNotFoundError(f"Train input yok: {in_path}")

    train_tokens = encode_file(tokenizer, in_path, train_out, eot_id,
                               batch_size=args.batch_size)

    # Validation data is optional: a missing file is reported and skipped.
    val_tokens = 0
    if args.val_in:
        val_in = Path(args.val_in)
        if not val_in.exists():
            print(f"UYARI: Val input yok ({val_in}), atlandi")
        else:
            val_tokens = encode_file(tokenizer, val_in, val_out, eot_id,
                                     batch_size=args.batch_size)

    meta = {
        "vocab_size": vocab_size,
        "eot_id": eot_id,
        "tokenizer_path": args.tokenizer,
        "train_tokens": train_tokens,
        "val_tokens": val_tokens,
        "train_out": str(train_out),
        "val_out": str(val_out) if val_tokens else None,
    }
    with open(meta_out, "wb") as f:
        pickle.dump(meta, f)

    print("\n[OK] Hazir.")
    print(f" Train: {train_tokens:,} token -> {train_out}")
    if val_tokens:
        print(f" Val: {val_tokens:,} token -> {val_out}")
    print(f" Meta: {meta_out}")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()