import os, time, json, math, numpy as np, torch
from gpt2 import GPT2

torch.manual_seed(1337)
torch.set_num_threads(2)

# ---------- config: "big but safe" 18M-param GPT-2 (BPE) ----------
cfg = dict(vocab_size=8192, n_embd=448, n_head=7, n_layer=6,
           block_size=256, dropout=0.1)
batch_size = 10
grad_accum = 2          # effective batch 40 for stability
lr         = 6e-4
min_lr     = 6e-5
warmup     = 300
max_iters  = 12000      # long run; checkpoints every eval so we can stop anytime
eval_iter  = 500
eval_batches = 40
ckpt_path  = 'big.pt'

meta = json.load(open('data/meta.json'))
eot_id = meta['eot']
data = np.memmap('data/train.bin', dtype=np.uint16, mode='r')
# hold out a random interior slice for val (TinyStories region, not the Alpaca tail)
val_lo = int(0.90 * len(data)); val_hi = int(0.92 * len(data))
train_data = data            # sample train from anywhere; val region is tiny
val_data = data[val_lo:val_hi]
print(f"corpus tokens: {len(data):,} | vocab {cfg['vocab_size']} | model ready")

def get_batch(split):
    blk = cfg['block_size']
    if split == 'train':
        ix = torch.randint(len(train_data) - blk, (batch_size,))
        d = train_data
    else:
        ix = torch.randint(len(val_data) - blk, (batch_size,))
        d = val_data
    x = torch.stack([torch.from_numpy(d[i:i+blk].astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy(d[i+1:i+1+blk].astype(np.int64)) for i in ix])
    return x, y

@torch.no_grad()
def est():
    out = {}; model.eval()
    for sp in ['train', 'val']:
        L = torch.zeros(eval_batches)
        for k in range(eval_batches):
            x, y = get_batch(sp); _, l = model(x, y); L[k] = l.item()
        out[sp] = L.mean().item()
    model.train(); return out

def get_lr(it):
    if it < warmup: return lr * it / warmup
    if it > max_iters: return min_lr
    r = (it - warmup) / (max_iters - warmup)
    return min_lr + 0.5 * (1 + math.cos(math.pi * r)) * (lr - min_lr)

model = GPT2(cfg)
print(f"params: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")
opt = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1)

start_iter = 0; hist = []
if os.path.exists(ckpt_path):
    ck = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ck['model']); opt.load_state_dict(ck['opt'])
    start_iter = ck['iter']; hist = ck.get('hist', [])
    print(f"RESUMED from iter {start_iter}")

t0 = time.time(); best_val = 1e9
for it in range(start_iter, max_iters + 1):
    for g in opt.param_groups: g['lr'] = get_lr(it)
    if it % eval_iter == 0:
        l = est(); el = time.time() - t0
        print(f"iter {it:5d} | train {l['train']:.3f} | val {l['val']:.3f} | lr {get_lr(it):.1e} | {el/60:.1f}min", flush=True)
        hist.append({"iter": it, **l, "t": el})
        torch.save({'model': model.state_dict(), 'opt': opt.state_dict(),
                    'cfg': cfg, 'iter': it, 'hist': hist, 'eot': eot_id}, ckpt_path)
    # grad accumulation
    opt.zero_grad(set_to_none=True)
    for _ in range(grad_accum):
        x, y = get_batch('train'); _, loss = model(x, y)
        (loss / grad_accum).backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()

print(f"DONE {(time.time()-t0)/60:.1f}min")