"""Build the conversational corpus, train a byte-level BPE tokenizer on a
sample of it, then encode the corpus into a binary token stream for fast
training.

Each stage is skipped when its output file already exists.
"""
| import os, json, numpy as np |
| from tokenizers import ByteLevelBPETokenizer |
|
|
# Input locations, relative to the working directory.
DATA = "data"
SLICE = DATA + "/ts_slice.txt"    # story text; documents are separated by EOT
ALPACA = DATA + "/alpaca.json"    # records with 'instruction' / 'input' / 'output'

# Tokenizer settings.
VOCAB_SIZE = 2 ** 13              # 8192
EOT = "<|endoftext|>"             # document separator token
|
|
| |
| |
# Build the text sample the tokenizer is trained on: a prefix of the story
# file followed by up to 20,000 Alpaca records rendered as "User:/Bot:" turns.
sample_path = f'{DATA}/tok_sample.txt'
if not os.path.exists(sample_path):
    # Write under a temporary name and rename at the end, so an interrupted
    # run never leaves a truncated sample that the existence check accepts.
    sample_tmp = sample_path + '.tmp'
    with open(sample_tmp, 'w', encoding='utf-8') as out:
        # Only the first 80M characters of the story file go into the sample.
        with open(SLICE, encoding='utf-8') as f:
            out.write(f.read(80_000_000))

        with open(ALPACA, encoding='utf-8') as f:
            alp = json.load(f)
        for ex in alp[:20000]:
            # Records that carry a non-empty 'input' field are skipped.
            if ex['input'].strip():
                continue
            out.write(f"\nUser: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}\n")
    os.replace(sample_tmp, sample_path)
    print("tokenizer sample built")
|
|
| |
# Train the byte-level BPE tokenizer on the sample file.
# Training is skipped only when BOTH model files are present: the tokenizer is
# later reloaded from vocab.json and merges.txt, so a directory holding just
# one of them (e.g. after an interrupted save) must be rebuilt.
tok_dir = 'tokenizer_bpe'
if not (os.path.exists(f'{tok_dir}/vocab.json')
        and os.path.exists(f'{tok_dir}/merges.txt')):
    os.makedirs(tok_dir, exist_ok=True)
    tk = ByteLevelBPETokenizer()
    tk.train(
        files=[sample_path],
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=[EOT, '<pad>'],
    )
    tk.save_model(tok_dir)
    print(f"BPE trained -> {tok_dir} (vocab {VOCAB_SIZE})")
|
|
| |
# Reload the tokenizer from the saved model files so that a fresh training run
# and a cached run use exactly the same object.
tk = ByteLevelBPETokenizer(f'{tok_dir}/vocab.json', f'{tok_dir}/merges.txt')
eot_id = tk.token_to_id(EOT)
if eot_id is None:
    # Fail here with a clear message instead of much later, when the missing
    # id would be appended to every encoded document.
    raise RuntimeError(
        f"token {EOT!r} is missing from {tok_dir}/vocab.json; "
        f"delete {tok_dir} and retrain the tokenizer"
    )
print('eot id', eot_id, 'vocab', tk.get_vocab_size())
|
|
def _append_ids(buf, pos, ids):
    """Copy ``ids`` into ``buf`` starting at index ``pos``.

    Returns the new write position, or ``None`` (leaving ``buf`` untouched)
    when ``ids`` does not fit in the space that remains.
    """
    end = pos + len(ids)
    if end > buf.shape[0]:
        return None
    buf[pos:end] = ids
    return end


# Encode every story and every input-free Alpaca record, each followed by the
# EOT id, into one flat uint16 file.
out_bin = f'{DATA}/train.bin'
if not os.path.exists(out_bin):
    max_tokens = 400_000_000  # scratch capacity: 400M uint16 slots (800 MB)

    # Ids are stored as uint16, so the vocabulary must fit in 16 bits.
    if tk.get_vocab_size() > np.iinfo(np.uint16).max + 1:
        raise ValueError(
            f"vocab size {tk.get_vocab_size()} does not fit in uint16 token ids"
        )

    with open(SLICE, encoding='utf-8') as f:
        parts = f.read().split(EOT)
    print(f"encoding {len(parts):,} stories...")

    # Work in a temporary file and rename it at the very end, so a crash never
    # leaves an oversized, zero-filled train.bin that the existence check
    # above would treat as finished.
    tmp_bin = out_bin + '.tmp'
    BIN = np.memmap(tmp_bin, dtype=np.uint16, mode='w+', shape=(max_tokens,))
    pos = 0

    for i, p in enumerate(parts):
        p = p.strip()
        if not p:
            continue
        new_pos = _append_ids(BIN, pos, tk.encode(p).ids + [eot_id])
        if new_pos is None:
            print(f"  token buffer full at story {i:,}; remaining stories dropped")
            break
        pos = new_pos
        if i % 50000 == 0:
            print(f" {i:,} stories, {pos:,} tokens")
    del parts  # release the story text before loading the next dataset

    with open(ALPACA, encoding='utf-8') as f:
        alp = json.load(f)
    for ex in alp:
        # Records that carry a non-empty 'input' field are skipped.
        if ex['input'].strip():
            continue
        s = f"User: {ex['instruction'].strip()}\nBot: {ex['output'].strip()}"
        new_pos = _append_ids(BIN, pos, tk.encode(s).ids + [eot_id])
        if new_pos is None:
            print("  token buffer full; remaining Alpaca records dropped")
            break
        pos = new_pos

    BIN.flush()
    # Drop the mapping before resizing the file underneath it, then cut the
    # file down to the tokens actually written instead of copying them
    # through memory and rewriting the file.
    del BIN
    os.truncate(tmp_bin, pos * np.dtype(np.uint16).itemsize)
    os.replace(tmp_bin, out_bin)

    print(f"DONE: {pos:,} tokens written to {out_bin}")
    with open(f'{DATA}/meta.json', 'w', encoding='utf-8') as f:
        json.dump(
            {'tokens': int(pos), 'vocab_size': tk.get_vocab_size(), 'eot': eot_id},
            f,
        )
|
|