File size: 2,812 Bytes
b136fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import itertools
import os
import random

import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm

# --- CONFIGURATION ---
OUTPUT_DIR = "data/apex_code_boost" # New name (keep separate from earlier runs)!
TOKENIZER_NAME = "gpt2" 
SEED = 1337

# Gentle mixture for the follow-up training:
# we take fewer FineWeb samples so that code gets more weight.
FINEWEB_SAMPLES = 50000 
# Additionally load a code dataset (Python focus).
print("📥 Lade 'sahil2801/CodeAlpaca-20k'...")
code_dataset = load_dataset("sahil2801/CodeAlpaca-20k", split='train')

enc = tiktoken.get_encoding(TOKENIZER_NAME)
EOS_TOKEN = "<|endoftext|>"

def format_prompt_with_mask(instruction, input_text, output):
    """Tokenize one instruction example into ``(token_ids, loss_mask)``.

    The loss mask is 0 over the prompt tokens (no gradient) and 1 over the
    completion tokens (including the trailing EOS), so training supervises
    only the model's response.

    Args:
        instruction: the task description shown to the model.
        input_text: optional additional context; an empty/whitespace-only
            string omits the ``Input:`` section entirely.
        output: the target completion text.

    Returns:
        Tuple of (list of token ids, list of 0/1 mask values), same length.
    """
    if input_text and input_text.strip():
        prompt_text = f"Instruction:\n{instruction}\n\nInput:\n{input_text}\n\nResponse:\n"
    else:
        prompt_text = f"Instruction:\n{instruction}\n\nResponse:\n"
    completion_text = f"{output}{EOS_TOKEN}"
    # Use the EOS_TOKEN constant (not a duplicated string literal) so the
    # special token allowed here always matches the one appended above —
    # this mirrors how main() encodes the FineWeb documents.
    prompt_ids = enc.encode(prompt_text, allowed_special={EOS_TOKEN})
    completion_ids = enc.encode(completion_text, allowed_special={EOS_TOKEN})
    full_ids = prompt_ids + completion_ids
    mask = [0] * len(prompt_ids) + [1] * len(completion_ids)
    return full_ids, mask

def main():
    """Build the mixed instruction/code/web training dataset.

    Tokenizes Alpaca (general instructions), CodeAlpaca (code boost) and a
    streamed FineWeb-Edu sample (knowledge retention), shuffles the documents,
    then writes flat token and loss-mask binaries into OUTPUT_DIR.
    """
    np.random.seed(SEED)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    alpaca = load_dataset("yahma/alpaca-cleaned", split='train')
    fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)

    all_samples = []

    # 1. Process Alpaca (general instruction following).
    for ex in tqdm(alpaca, desc="Alpaca"):
        all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))

    # 2. Process Code-Alpaca (the actual code boost — important!).
    for ex in tqdm(code_dataset, desc="Code-Alpaca"):
        all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))

    # 3. Process FineWeb (knowledge retention). islice stops cleanly if the
    # stream runs dry before FINEWEB_SAMPLES documents — no manual
    # iter()/next()/StopIteration bookkeeping needed.
    for ex in tqdm(itertools.islice(fineweb, FINEWEB_SAMPLES),
                   desc="FineWeb", total=FINEWEB_SAMPLES):
        ids = enc.encode(ex['text'] + EOS_TOKEN, allowed_special={EOS_TOKEN})
        # Plain web text is fully supervised: mask is all ones.
        all_samples.append((ids, [1] * len(ids)))

    # Shuffle at document level so the three sources interleave
    # (anti-forgetting during the follow-up training).
    random.seed(SEED)
    random.shuffle(all_samples)

    # Flatten documents into one contiguous token/mask stream.
    all_tokens = []
    all_masks = []
    for ids, mask in all_samples:
        all_tokens.extend(ids)
        all_masks.extend(mask)

    # Save. uint16 is safe here: the GPT-2 vocab (50257 ids) fits below 65536.
    print(f"💾 Speichere in '{OUTPUT_DIR}'...")
    np.array(all_tokens, dtype=np.uint16).tofile(os.path.join(OUTPUT_DIR, "train.bin"))
    np.array(all_masks, dtype=np.uint8).tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))
    print("✅ Datensatz für Code-Boost fertig!")

if __name__ == "__main__":
    main()