| import os |
| import json |
| import numpy as np |
| import tiktoken |
| from tqdm import tqdm |
|
|
# --- Load the raw JSONL training data --------------------------------------
train_file = "train.jsonl"
print(f"Loading data from {train_file}...")

# Each non-empty line of the file is one JSON-encoded example.
dataset = []
with open(train_file, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():
            dataset.append(json.loads(line))

print(f"Loaded {len(dataset)} examples.")

# GPT-2 BPE tokenizer; its token ids (< 50257) fit in uint16 below.
enc = tiktoken.get_encoding("gpt2")

# Hold out up to 1000 examples (at most 10%) for validation.
val_size = min(1000, int(len(dataset) * 0.1))
# BUG FIX: when val_size == 0 (fewer than 10 examples), the old slices
# `dataset[:-0]` / `dataset[-0:]` made the train split EMPTY and put every
# example into val. Handle the zero case explicitly instead.
if val_size > 0:
    train_data = dataset[:-val_size]
    val_data = dataset[-val_size:]
else:
    train_data = dataset
    val_data = []
|
|
def process_data(data_list):
    """Tokenize conversation examples into flat token-id and loss-mask streams.

    Each example is rendered as ``"Question: {user}\nAnswer: "`` followed by
    the assistant reply plus an end-of-text token.  The mask is 0 over prompt
    tokens and 1 over response tokens (so loss is applied only to responses).

    Returns a ``(token_ids, loss_mask)`` pair of equal-length flat lists.
    """
    all_ids, all_masks = [], []

    for example in tqdm(data_list):
        turns = example.get("conversations", [])

        # Need at least a user turn and an assistant turn.
        if len(turns) < 2:
            continue

        # NOTE(review): entries are used directly as text here — assumes each
        # element is a raw string, not a {role, content} dict; confirm schema.
        question, answer = turns[0], turns[1]

        prompt_ids = enc.encode_ordinary(f"Question: {question}\nAnswer: ")
        answer_ids = enc.encode_ordinary(f"{answer}") + [enc.eot_token]

        all_ids += prompt_ids + answer_ids
        all_masks += [0] * len(prompt_ids) + [1] * len(answer_ids)

    return all_ids, all_masks
|
|
print("Processing train split...")
train_ids, train_masks = process_data(train_data)
print("Processing val split...")
val_ids, val_masks = process_data(val_data)

print(f"Train has {len(train_ids):,} tokens")
print(f"Val has {len(val_ids):,} tokens")

# Persist each stream as a raw uint16 binary file (token ids < 50257 fit).
outputs = (
    ("train.bin", train_ids),
    ("train_mask.bin", train_masks),
    ("val.bin", val_ids),
    ("val_mask.bin", val_masks),
)
for filename, values in outputs:
    np.array(values, dtype=np.uint16).tofile(filename)

print("Done! train.bin, train_mask.bin, val.bin, and val_mask.bin created successfully.")