# LH-Tech-AI / Apex-1-Instruct-350M

import os
from itertools import islice

import numpy as np
import tiktoken
from datasets import load_dataset
from tqdm import tqdm
# Directory that receives train.bin and train_mask.bin.
OUTPUT_DIR = "data/alpaca_cleaned_mixed"
# Name of the tiktoken encoding used for every tokenization call in this file.
TOKENIZER_NAME = "gpt2"
# Seed passed to np.random.seed() at the start of main().
SEED = 1337
# Upper bound on the number of FineWeb-Edu documents appended after Alpaca.
FINEWEB_SAMPLES = 2500
# Shared encoder instance, built once at import time.
enc = tiktoken.get_encoding(TOKENIZER_NAME)
# Text form of the end-of-sequence marker that closes every sample.
EOS_TOKEN = "<|endoftext|>"
def format_prompt_with_mask(instruction, input_text, output):
    """Tokenize one instruction example and build its loss mask.

    The sample is laid out as::

        Instruction:
        <instruction>

        Input:
        <input_text>

        Response:
        <output><|endoftext|>

    The ``Input:`` section is left out when ``input_text`` is ``None``,
    empty, or whitespace-only.

    Args:
        instruction: The task description.
        input_text: Optional additional context for the task.
        output: The target response.

    Returns:
        A ``(full_ids, mask)`` tuple of two equal-length lists. ``full_ids``
        holds the token ids of prompt followed by response; ``mask`` is 0 for
        every prompt token and 1 for every response token, including the
        closing end-of-sequence token.
    """
    if input_text and input_text.strip():
        prompt_text = f"Instruction:\n{instruction}\n\nInput:\n{input_text}\n\nResponse:\n"
    else:
        prompt_text = f"Instruction:\n{instruction}\n\nResponse:\n"

    # Dataset text is encoded as ordinary text so that a literal
    # "<|endoftext|>" inside a sample cannot turn into a real EOS token in the
    # middle of a sequence. The single genuine EOS is appended by id instead.
    prompt_ids = enc.encode_ordinary(prompt_text)
    completion_ids = enc.encode_ordinary(output)
    completion_ids.append(enc.encode_single_token(EOS_TOKEN))

    full_ids = prompt_ids + completion_ids
    mask = [0] * len(prompt_ids) + [1] * len(completion_ids)
    return full_ids, mask
def main():
    """Build the mixed instruction/web training set and write it to disk.

    Steps:
      1. Tokenize every example of the 'yahma/alpaca-cleaned' train split
         with format_prompt_with_mask (prompt tokens masked out).
      2. Stream up to FINEWEB_SAMPLES documents from the 'sample-10BT'
         config of 'HuggingFaceFW/fineweb-edu' and append them with an
         all-ones mask.
      3. Write the token ids as uint16 to OUTPUT_DIR/train.bin and the mask
         as uint8 to OUTPUT_DIR/train_mask.bin.
      4. Print a colorized preview of the first tokens and a final
         length-consistency message.

    Raises:
        ValueError: If the tokenizer's largest token id does not fit into
            uint16, the dtype used for train.bin.
    """
    # Seeds NumPy's global RNG; nothing below draws from it at present.
    np.random.seed(SEED)
    print("🚀 Starting Prepare-Script for SmaLLMPro (350M Instruct)...")
    print(f"📚 Tokenizer: {TOKENIZER_NAME}")

    # train.bin is stored as uint16; fail before any download if the chosen
    # tokenizer could produce ids that would not fit.
    if enc.max_token_value >= 2 ** 16:
        raise ValueError(
            f"Tokenizer '{TOKENIZER_NAME}' has token ids up to "
            f"{enc.max_token_value}, which do not fit into uint16."
        )

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("📥 Loading 'yahma/alpaca-cleaned' (Chat-Instructions)...")
    alpaca = load_dataset("yahma/alpaca-cleaned", split='train')
    print(f"📥 Loading 'HuggingFaceFW/fineweb-edu' (Sample-10BT) for {FINEWEB_SAMPLES} Samples...")
    fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)

    all_tokens = []
    all_masks = []

    print("⚙️  Processing Alpaca...")
    for ex in tqdm(alpaca, desc="Alpaca"):
        ids, mask = format_prompt_with_mask(ex['instruction'], ex['input'], ex['output'])
        all_tokens.extend(ids)
        all_masks.extend(mask)
    alpaca_len = len(all_tokens)
    print(f"   -> Alpaca Tokens: {alpaca_len:,}")

    print("⚙️  Processing FineWeb (Anti-Forgetting)...")
    eos_id = enc.encode_single_token(EOS_TOKEN)
    fw_count = 0
    fw_tokens_count = 0
    # islice ends the loop after FINEWEB_SAMPLES documents or when the stream
    # runs dry, whichever comes first.
    fw_docs = islice(fineweb, FINEWEB_SAMPLES)
    for ex in tqdm(fw_docs, total=FINEWEB_SAMPLES, desc="FineWeb"):
        # Web text is encoded as ordinary text so a literal "<|endoftext|>"
        # inside a document cannot become a real EOS token; the document
        # separator is appended by id.
        ids = enc.encode_ordinary(ex['text'])
        ids.append(eos_id)
        all_tokens.extend(ids)
        # Plain documents are trained on in full.
        all_masks.extend([1] * len(ids))
        fw_tokens_count += len(ids)
        fw_count += 1
    print(f"   -> FineWeb Tokens: {fw_tokens_count:,} (from {fw_count} documents)")

    total_tokens = len(all_tokens)
    print(f"\n💾 Saving {total_tokens:,} Tokens in '{OUTPUT_DIR}'...")
    token_arr = np.array(all_tokens, dtype=np.uint16)
    token_arr.tofile(os.path.join(OUTPUT_DIR, "train.bin"))
    mask_arr = np.array(all_masks, dtype=np.uint8)
    mask_arr.tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))

    print("\n🔍 --- SANITY CHECK ---")
    check_len = 100
    print(f"I decode the first {check_len} tokens of the token stream, to check, if everything is okay.")
    print("Green (TRAIN) = The things the model learns. Grey (IGNORE) = The things the model only reads.")
    decoded_parts = []
    for t_id, m_val in zip(all_tokens[:check_len], all_masks[:check_len]):
        token_str = enc.decode([t_id])
        # ANSI SGR 92 = bright green (trained on), 90 = grey (ignored).
        color = "\033[92m" if m_val == 1 else "\033[90m"
        decoded_parts.append(f"{color}{token_str}\033[0m")
    print("".join(decoded_parts))
    print("\n(Legend: \033[90mGrey=Prompt/Ignored\033[0m, \033[92mGreen=Response/Learned\033[0m)")

    if len(token_arr) != len(mask_arr):
        print("\n❌ Warning: Token and Mask Array have different lengths! Something has gone wrong!")
    else:
        print("\n✅ Everything seems to be fine. The arrays are synchronized. You can now start the training.")
# Run the preparation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()