# reFlow/data/sft-lima/prepare_sft_jsonl.py
# (uploaded by reuAC via huggingface_hub, commit 672259a)
import os
import json
import numpy as np
import tiktoken
from tqdm import tqdm
# --- Load the SFT dataset (one JSON object per line) ------------------------
train_file = "train.jsonl"
print(f"Loading data from {train_file}...")
dataset = []
with open(train_file, 'r', encoding='utf-8') as f:
    for line in f:
        if line.strip():  # tolerate blank lines in the JSONL
            dataset.append(json.loads(line))
print(f"Loaded {len(dataset)} examples.")

# GPT-2 BPE tokenizer; vocab size 50257, so ids fit in uint16 when serialized.
enc = tiktoken.get_encoding("gpt2")

# Hold out 10% of examples (capped at 1000) for validation.
# BUG FIX: when len(dataset) < 10, val_size is 0 and `dataset[:-0]` is the
# EMPTY slice — the original silently discarded the entire training set.
# Handle the zero case explicitly; behavior is unchanged for val_size >= 1.
val_size = min(1000, int(len(dataset) * 0.1))
if val_size > 0:
    train_data = dataset[:-val_size]
    val_data = dataset[-val_size:]
else:
    train_data = dataset
    val_data = []
def process_data(data_list):
    """Tokenize conversation examples into one flat token stream plus a loss mask.

    Each example's first two conversation turns are rendered as
    "Question: <user>\nAnswer: <assistant>" and the GPT-2 EOT token is
    appended after the answer.  A parallel mask marks prompt tokens with 0
    and response tokens (including EOT) with 1, so training can compute
    loss only on the response positions.

    Returns:
        (token_ids, loss_mask): two equal-length flat lists of ints.
    """
    all_ids = []
    all_mask = []
    for example in tqdm(data_list):
        turns = example.get("conversations", [])
        # Need at least one user turn and one assistant turn.
        if len(turns) < 2:
            continue
        prompt_ids = enc.encode_ordinary(f"Question: {turns[0]}\nAnswer: ")
        response_ids = enc.encode_ordinary(f"{turns[1]}") + [enc.eot_token]
        all_ids += prompt_ids + response_ids
        all_mask += [0] * len(prompt_ids) + [1] * len(response_ids)
    return all_ids, all_mask
# --- Tokenize both splits and serialize as flat binary files ----------------
print("Processing train split...")
train_ids, train_masks = process_data(train_data)
print("Processing val split...")
val_ids, val_masks = process_data(val_data)
print(f"Train has {len(train_ids):,} tokens")
print(f"Val has {len(val_ids):,} tokens")

# uint16 is sufficient: GPT-2 ids top out at 50256 and mask values are 0/1.
# The resulting files can be np.memmap'd by a training data loader.
for out_path, values in [
    ("train.bin", train_ids),
    ("train_mask.bin", train_masks),
    ("val.bin", val_ids),
    ("val_mask.bin", val_masks),
]:
    np.asarray(values, dtype=np.uint16).tofile(out_path)
print("Done! train.bin, train_mask.bin, val.bin, and val_mask.bin created successfully.")