# Apex-1.5-Coder-Instruct-350M — prepare-apex-1.5-coder-data.py
# (Hugging Face upload by LH-Tech-AI; commit b136fe1, verified; "Upload 2 files")
import os
import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
import random
# --- CONFIGURATION ---
OUTPUT_DIR = "data/apex_code_boost" # new name!
TOKENIZER_NAME = "gpt2"
SEED = 1337
# Gentle mix for the follow-up training:
# we take less FineWeb so that code carries more weight
FINEWEB_SAMPLES = 50000
# We additionally load a code dataset (Python focus)
# NOTE(review): this download runs at import time (module-level side effect);
# consider moving it into main() next to the other load_dataset calls.
print("📥 Lade 'sahil2801/CodeAlpaca-20k'...")
code_dataset = load_dataset("sahil2801/CodeAlpaca-20k", split='train')
# GPT-2 BPE tokenizer used for all three data sources below.
enc = tiktoken.get_encoding(TOKENIZER_NAME)
# End-of-text marker appended after every sample; must be passed to
# enc.encode via allowed_special so it is encoded as one special token.
EOS_TOKEN = "<|endoftext|>"
def format_prompt_with_mask(instruction, input_text, output):
    """Tokenize one instruction example and build its loss mask.

    The prompt part (instruction, optional input, "Response:" header) is
    masked with 0 so it does not contribute to the training loss; the
    completion (output + EOS) is masked with 1. Relies on the module-level
    `enc` tokenizer and `EOS_TOKEN` constant.

    Returns a tuple (token_ids, mask) of two equal-length lists.
    """
    has_input = bool(input_text and input_text.strip())
    if has_input:
        prompt = f"Instruction:\n{instruction}\n\nInput:\n{input_text}\n\nResponse:\n"
    else:
        prompt = f"Instruction:\n{instruction}\n\nResponse:\n"
    answer = f"{output}{EOS_TOKEN}"

    # Allow the EOS marker to be encoded as a single special token.
    special = {'<|endoftext|>'}
    prompt_ids = enc.encode(prompt, allowed_special=special)
    answer_ids = enc.encode(answer, allowed_special=special)

    return prompt_ids + answer_ids, [0] * len(prompt_ids) + [1] * len(answer_ids)
def main():
    """Build the mixed Alpaca / Code-Alpaca / FineWeb corpus and write it to disk.

    Produces two flat binary files in OUTPUT_DIR:
      - train.bin       uint16 token ids, all samples concatenated
      - train_mask.bin  uint8 loss mask (0 = prompt, 1 = completion/web text),
                        aligned 1:1 with train.bin
    """
    np.random.seed(SEED)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    alpaca = load_dataset("yahma/alpaca-cleaned", split='train')
    # Streaming so the 10BT FineWeb sample is never fully materialized.
    fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)
    all_samples = []
    # 1. Process Alpaca
    for ex in tqdm(alpaca, desc="Alpaca"):
        all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))
    # 2. Process Code-Alpaca (IMPORTANT!)
    for ex in tqdm(code_dataset, desc="Code-Alpaca"):
        all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))
    # 3. Process FineWeb (knowledge retention)
    fw_iter = iter(fineweb)
    for _ in tqdm(range(FINEWEB_SAMPLES), desc="FineWeb"):
        try:
            ex = next(fw_iter)
            text = ex['text'] + EOS_TOKEN
            ids = enc.encode(text, allowed_special={EOS_TOKEN})
            # Plain web text: every token contributes to the loss (mask = 1).
            all_samples.append((ids, [1] * len(ids)))
        except StopIteration:
            # Stream exhausted before FINEWEB_SAMPLES were drawn; keep what we have.
            break
    # SHUFFLE for anti-forgetting: interleave the three sources so training
    # never sees a long single-source run.
    random.seed(SEED)
    random.shuffle(all_samples)
    # Flatten the (ids, mask) pairs into two parallel token/mask streams.
    all_tokens = []
    all_masks = []
    for ids, mask in all_samples:
        all_tokens.extend(ids)
        all_masks.extend(mask)
    # Save
    print(f"💾 Speichere in '{OUTPUT_DIR}'...")
    # uint16 assumes all token ids fit in 16 bits — holds for the GPT-2
    # vocabulary (~50k ids), presumably intentional; verify if the tokenizer changes.
    np.array(all_tokens, dtype=np.uint16).tofile(os.path.join(OUTPUT_DIR, "train.bin"))
    np.array(all_masks, dtype=np.uint8).tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))
    print("✅ Datensatz für Code-Boost fertig!")

if __name__ == "__main__":
    main()