"""Build the conversational corpus + train a BPE tokenizer, then tokenize to a
binary token stream for fast training.
"""
import os, json, numpy as np
from tokenizers import ByteLevelBPETokenizer

# Input locations: everything lives under DATA. SLICE is the TinyStories slice
# (coherent prose+dialogue); ALPACA is the Alpaca JSON file.
DATA = 'data'
SLICE = DATA + '/ts_slice.txt'
ALPACA = DATA + '/alpaca.json'
# Target vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 8_192
# Separator token: registered as a special token and appended after each document.
EOT = '<|endoftext|>'

# ---------- 1. assemble a small mixed text for TOKENIZER TRAINING ----------
# (tokenizer only needs a representative sample, not the whole corpus)
sample_path = f'{DATA}/tok_sample.txt'
if not os.path.exists(sample_path):
    # Build under a temporary name and rename at the end, so an interrupted run
    # never leaves a truncated sample that the existence check above would
    # accept on the next run.
    sample_tmp = sample_path + '.tmp'
    # Explicit UTF-8 so the result does not depend on the platform's default
    # text encoding.
    with open(sample_tmp, 'w', encoding='utf-8') as out:
        # first 80M characters of stories (~80MB for mostly-ASCII text)
        with open(SLICE, encoding='utf-8') as f:
            out.write(f.read(80_000_000))
        # + Alpaca Q&A so the tokenizer knows the chat format tokens
        with open(ALPACA, encoding='utf-8') as f:
            alp = json.load(f)
        for ex in alp[:20000]:
            # only examples without an extra 'input' field are used
            if ex['input'].strip():
                continue
            out.write(f"\nUser: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}\n")
    os.replace(sample_tmp, sample_path)
    print("tokenizer sample built")

# ---------- 2. train BPE ----------
tok_dir = 'tokenizer_bpe'
# The tokenizer is later loaded from both vocab.json and merges.txt, so retrain
# when either one is missing (e.g. after an interrupted save) instead of
# checking vocab.json alone.
if not all(os.path.exists(f'{tok_dir}/{name}') for name in ('vocab.json', 'merges.txt')):
    os.makedirs(tok_dir, exist_ok=True)
    tk = ByteLevelBPETokenizer()
    # EOT and '<pad>' are passed as special tokens so they get their own ids.
    tk.train(files=[sample_path], vocab_size=VOCAB_SIZE, min_frequency=2,
             special_tokens=[EOT, '<pad>'])
    tk.save_model(tok_dir)
    print(f"BPE trained -> {tok_dir} (vocab {VOCAB_SIZE})")

# ---------- 3. tokenize the full corpus to a uint16 .bin ----------
# Reload the tokenizer from the files on disk so this step behaves the same
# whether or not training ran in this process. (T is an alias of the class.)
from tokenizers import ByteLevelBPETokenizer as T
tk = T(f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt')
eot_id = tk.token_to_id(EOT)
print('eot id', eot_id, 'vocab', tk.get_vocab_size())
# token_to_id returns None for a token that is not in the vocabulary; fail here
# with a clear message rather than later inside the encoding loop.
if eot_id is None:
    raise RuntimeError(f"{EOT!r} is not in the vocabulary loaded from {tok_dir}")

out_bin = f'{DATA}/train.bin'
if not os.path.exists(out_bin):
    # Token ids are stored as uint16, so every id must fit in 16 bits.
    if tk.get_vocab_size() > np.iinfo(np.uint16).max + 1:
        raise ValueError(
            f"vocab size {tk.get_vocab_size()} does not fit in uint16 token ids")

    max_tokens = 400_000_000  # capacity of the preallocated token buffer
    # Build under a temporary name and rename only once complete, so an
    # interrupted run never leaves a partial train.bin that the existence
    # check above would accept on the next run.
    tmp_bin = out_bin + '.tmp'

    # stories
    with open(SLICE, encoding='utf-8') as f:
        text = f.read()
    # TinyStories already uses <|endoftext|> between stories
    parts = text.split(EOT)
    del text  # only the split pieces are needed from here on
    print(f"encoding {len(parts):,} stories...")
    token_buf = np.memmap(tmp_bin, dtype=np.uint16, mode='w+', shape=(max_tokens,))
    pos = 0  # number of tokens written so far
    for i, p in enumerate(parts):
        p = p.strip()
        if not p:
            continue
        ids = tk.encode(p).ids + [eot_id]
        # stop at the first story that would overflow the buffer
        if pos + len(ids) > max_tokens:
            break
        token_buf[pos:pos+len(ids)] = ids
        pos += len(ids)
        if i % 50000 == 0:
            print(f"  {i:,} stories, {pos:,} tokens")
    # append Alpaca Q&A as User/Bot turns
    with open(ALPACA, encoding='utf-8') as f:
        alp = json.load(f)
    for ex in alp:
        # only examples without an extra 'input' field are used
        if ex['input'].strip():
            continue
        s = f"User: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}"
        ids = tk.encode(s).ids + [eot_id]
        if pos + len(ids) > max_tokens:
            break
        token_buf[pos:pos+len(ids)] = ids
        pos += len(ids)
    token_buf.flush()
    del token_buf  # release the mapping before resizing the file
    # trim to actual size in place (no need to copy the tokens through RAM)
    os.truncate(tmp_bin, pos * np.dtype(np.uint16).itemsize)
    with open(f'{DATA}/meta.json', 'w', encoding='utf-8') as f:
        json.dump({'tokens': int(pos), 'vocab_size': tk.get_vocab_size(), 'eot': eot_id}, f)
    os.replace(tmp_bin, out_bin)
    print(f"DONE: {pos:,} tokens written to {out_bin}")