| import os |
| import numpy as np |
| import tiktoken |
| from datasets import load_dataset |
| from tqdm import tqdm |
|
|
| |
# --- Configuration ----------------------------------------------------------
# Directory the packed token/mask binaries are written to.
OUTPUT_DIR = "data/alpaca_cleaned_mixed_NEW"

# BPE encoding name passed to tiktoken (GPT-2 vocabulary).
TOKENIZER_NAME = "gpt2"
# RNG seed for reproducibility (only numpy's RNG is seeded in main()).
SEED = 1337

# Number of FineWeb-Edu documents to mix in as anti-forgetting pre-training data.
FINEWEB_SAMPLES = 520000

# Shared tokenizer instance and the end-of-sequence marker appended to every
# document/response so boundaries survive token packing.
enc = tiktoken.get_encoding(TOKENIZER_NAME)
EOS_TOKEN = "<|endoftext|>"
|
|
def format_prompt_with_mask(instruction, input_text, output):
    """
    Format one Alpaca example into token ids plus a parallel loss mask.

    Prompt layout:
        Instruction: ...
        Input: ...        (only when input_text is non-empty)
        Response: ... <|endoftext|>

    Args:
        instruction: Task description shown to the model.
        input_text: Optional extra context; the "Input" section is omitted
            when this is empty or whitespace-only.
        output: Target response; EOS_TOKEN is appended to it.

    Returns:
        (full_ids, mask): token ids for prompt + completion, and a list of
        the same length where 0 marks prompt tokens (loss ignored) and
        1 marks completion tokens (loss applied).
    """
    if input_text and input_text.strip():
        prompt_text = f"Instruction:\n{instruction}\n\nInput:\n{input_text}\n\nResponse:\n"
    else:
        prompt_text = f"Instruction:\n{instruction}\n\nResponse:\n"

    # Terminate the completion with EOS so document boundaries survive packing.
    completion_text = f"{output}{EOS_TOKEN}"

    # EOS must be whitelisted or tiktoken raises on special tokens.
    # Fix: use the shared EOS_TOKEN constant instead of a duplicated literal
    # (keeps this consistent with the FineWeb encoding path in main()).
    prompt_ids = enc.encode(prompt_text, allowed_special={EOS_TOKEN})
    completion_ids = enc.encode(completion_text, allowed_special={EOS_TOKEN})

    full_ids = prompt_ids + completion_ids

    # 0 = prompt (ignored by the loss), 1 = completion (trained on).
    mask = [0] * len(prompt_ids) + [1] * len(completion_ids)

    return full_ids, mask
|
|
def main():
    """Prepare a mixed SFT dataset for SmaLLMPro (350M).

    Pipeline:
      1. Tokenize all of 'yahma/alpaca-cleaned' with a loss mask that
         ignores prompt tokens (mask=0) and trains on responses (mask=1).
      2. Stream up to FINEWEB_SAMPLES documents from FineWeb-Edu and append
         them with mask=1 everywhere (plain LM loss, anti-forgetting mix).
      3. Write token ids (uint16) and mask (uint8) as parallel flat binary
         files 'train.bin' / 'train_mask.bin' in OUTPUT_DIR.
      4. Print a colorized sanity check of the first tokens.
    """
    np.random.seed(SEED)
    print("🚀 Starte Prepare-Script für SmaLLMPro (350M SFT)...")
    print(f"📚 Tokenizer: {TOKENIZER_NAME}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # --- Load datasets ------------------------------------------------------
    print("📥 Lade 'yahma/alpaca-cleaned' (Chat-Instruktionen)...")
    alpaca = load_dataset("yahma/alpaca-cleaned", split='train')

    print(f"📥 Lade 'HuggingFaceFW/fineweb-edu' (Sample-10BT) für {FINEWEB_SAMPLES} Samples...")
    # Streaming avoids materializing the full 10BT sample locally.
    fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)

    all_tokens = []
    all_masks = []

    # --- Alpaca: masked instruction-tuning data -----------------------------
    print("⚙️ Verarbeite Alpaca...")
    for ex in tqdm(alpaca, desc="Alpaca"):
        ids, mask = format_prompt_with_mask(ex['instruction'], ex['input'], ex['output'])
        all_tokens.extend(ids)
        all_masks.extend(mask)

    alpaca_len = len(all_tokens)
    print(f" -> Alpaca Tokens: {alpaca_len:,}")

    # --- FineWeb: plain LM data, full loss (anti-forgetting) ----------------
    print("⚙️ Verarbeite FineWeb (Anti-Forgetting)...")
    fw_iter = iter(fineweb)
    fw_count = 0
    fw_tokens_count = 0

    for _ in tqdm(range(FINEWEB_SAMPLES), desc="FineWeb"):
        # Keep the try body minimal: only next() can raise StopIteration.
        try:
            ex = next(fw_iter)
        except StopIteration:
            # Stream exhausted before FINEWEB_SAMPLES documents were read.
            break

        text = ex['text'] + EOS_TOKEN
        ids = enc.encode(text, allowed_special={EOS_TOKEN})

        all_tokens.extend(ids)
        # Pre-training style: every token contributes to the loss.
        all_masks.extend([1] * len(ids))

        fw_tokens_count += len(ids)
        fw_count += 1

    print(f" -> FineWeb Tokens: {fw_tokens_count:,} (aus {fw_count} Dokumenten)")

    # --- Persist ------------------------------------------------------------
    total_tokens = len(all_tokens)
    print(f"\n💾 Speichere {total_tokens:,} Tokens in '{OUTPUT_DIR}'...")

    # GPT-2 vocab (50257 ids) fits in uint16; uint8 suffices for the 0/1 mask.
    token_arr = np.array(all_tokens, dtype=np.uint16)
    token_arr.tofile(os.path.join(OUTPUT_DIR, "train.bin"))

    mask_arr = np.array(all_masks, dtype=np.uint8)
    mask_arr.tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))

    # --- Sanity check -------------------------------------------------------
    check_len = 100
    print("\n🔍 --- SANITY CHECK ---")
    # Fix: the message previously hard-coded "50 Tokens" while check_len is
    # 100; derive the count from check_len so they cannot drift apart again.
    print(f"Ich dekodiere die ersten {check_len} Tokens des ersten Beispiels, um zu prüfen, ob alles stimmt.")
    print("Grün (TRAIN) = Was das Modell lernt. Grau (IGNORE) = Was das Modell nur liest.")

    sample_ids = all_tokens[:check_len]
    sample_mask = all_masks[:check_len]

    # ANSI colors: 92 = green (trained tokens), 90 = grey (ignored tokens).
    decoded_parts = []
    for t_id, m_val in zip(sample_ids, sample_mask):
        token_str = enc.decode([t_id])
        if m_val == 1:
            decoded_parts.append(f"\033[92m{token_str}\033[0m")
        else:
            decoded_parts.append(f"\033[90m{token_str}\033[0m")

    print("".join(decoded_parts))
    print("\n(Legende: \033[90mGrau=Prompt/Ignoriert\033[0m, \033[92mGrün=Response/Gelernt\033[0m)")

    if len(token_arr) != len(mask_arr):
        print("\n❌ ACHTUNG: Token und Mask Array sind unterschiedlich lang! Irgendwas stimmt nicht!")
    else:
        print("\n✅ Alles perfekt. Arrays sind synchron. Du kannst trainieren.")


if __name__ == "__main__":
    main()
|
|