{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "trusted": true }, "outputs": [], "source": [ "!git clone https://github.com/karpathy/nanoGPT.git\n", "%cd nanoGPT\n", "# %pip (not !pip) installs into the kernel's own environment\n", "%pip install -U tiktoken datasets tqdm transformers huggingface_hub" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import tiktoken\n", "from datasets import load_dataset\n", "from tqdm import tqdm\n", "from huggingface_hub import login\n", "\n", "# --- SETUP ---\n", "# SECURITY FIX: never hardcode credentials in a notebook. Read the HF token\n", "# from the environment (e.g. Kaggle secrets exported as HF_TOKEN) instead.\n", "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")\n", "if HF_TOKEN:\n", "    login(token=HF_TOKEN)\n", "\n", "# Config\n", "target_tokens = 250_000_000  # total size of the mixed dataset (uint16 token ids)\n", "sft_ratio = 0.15             # fraction of instruction (SFT) tokens at the front\n", "data_dir = os.path.join('data', 'html_v2_mixed')\n", "os.makedirs(data_dir, exist_ok=True)\n", "\n", "enc = tiktoken.get_encoding(\"gpt2\")\n", "\n", "def process_and_save():\n", "    \"\"\"Tokenize SFT + raw-HTML data into train.bin / val.bin (uint16 GPT-2 ids).\"\"\"\n", "    train_file = os.path.join(data_dir, 'train.bin')\n", "    val_file = os.path.join(data_dir, 'val.bin')\n", "    \n", "    print(\"Lade Datensätze...\")\n", "    ds_stack = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/html\", split=\"train\", streaming=True, token=HF_TOKEN or None)\n", "    ds_sft = load_dataset(\"ttbui/html_alpaca\", split=\"train\", streaming=True)\n", "    \n", "    all_tokens_np = np.zeros(target_tokens, dtype=np.uint16)\n", "    curr_idx = 0\n", "    \n", "    sft_target = int(target_tokens * sft_ratio)\n", "    print(f\"Tokenisiere SFT-Daten (Ziel: {sft_target} Tokens)...\")\n", "    \n", "    pbar_sft = tqdm(total=sft_target)\n", "    for ex in ds_sft:\n", "        instr = ex.get('instruction', '')\n", "        resp = ex.get('output', ex.get('code', ''))\n", "        if not resp: continue\n", "        \n", "        # BUGFIX: encode_ordinary() ignores special tokens, so a literal\n", "        # '<|endoftext|>' embedded in the text was tokenized as plain characters\n", "        # instead of the single EOT id. Append enc.eot_token explicitly,\n", "        # matching the raw-HTML loop below.\n", "        text = f\"### Instruction:\\n{instr}\\n\\n### Response:\\n{resp}\"\n", "        tokens = enc.encode_ordinary(text)\n", "        tokens.append(enc.eot_token)\n", "        \n", "        take = min(len(tokens), sft_target - curr_idx)\n", "        all_tokens_np[curr_idx:curr_idx+take] = 
np.array(tokens[:take], dtype=np.uint16)\n", "        curr_idx += take\n", "        pbar_sft.update(take)\n", "        \n", "        if curr_idx >= sft_target:\n", "            break\n", "    pbar_sft.close()\n", "    \n", "    print(f\"Tokenisiere Raw HTML (Rest bis {target_tokens} Tokens)...\")\n", "    pbar_stack = tqdm(total=target_tokens - curr_idx)\n", "    \n", "    for entry in ds_stack:\n", "        text = entry.get('content', '')\n", "        if not text: continue\n", "        \n", "        tokens = enc.encode_ordinary(text)\n", "        tokens.append(enc.eot_token)\n", "        \n", "        take = min(len(tokens), target_tokens - curr_idx)\n", "        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n", "        \n", "        curr_idx += take\n", "        pbar_stack.update(take)\n", "        \n", "        if curr_idx >= target_tokens:\n", "            break\n", "    pbar_stack.close()\n", "\n", "    # BUGFIX: the old message claimed \"Sorting and Shuffling\" although nothing\n", "    # is shuffled here: SFT tokens occupy the start of the buffer and raw HTML\n", "    # the rest, so the 5% validation tail contains raw HTML only.\n", "    print(\"\\nSplitting into train/val (no shuffling; val split is raw HTML only)...\")\n", "    \n", "    n = curr_idx\n", "    split_idx = int(n * 0.95) # 5% Validation\n", "    train_data = all_tokens_np[:split_idx]\n", "    val_data = all_tokens_np[split_idx:n]\n", "\n", "    print(f\"Speichere train.bin ({len(train_data)} Tokens)...\")\n", "    train_data.tofile(train_file)\n", "    print(f\"Speichere val.bin ({len(val_data)} Tokens)...\")\n", "    val_data.tofile(val_file)\n", "    \n", "    print(f\"Success! v2 dataset ready in {data_dir}\")\n", "    print(f\"Verhältnis: {sft_ratio*100}% SFT / {(1-sft_ratio)*100}% Raw HTML\")\n", "\n", "if __name__ == \"__main__\":\n", "    process_and_save()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "import os\n", "import shutil\n", "\n", "# 1. 
Adjust paths\n", "UPLOADED_MODEL_PATH = '/kaggle/input/notebooks/leoheinrich/htmllm-v2-124m-base/nanoGPT/out-html/ckpt.pt'\n", "os.makedirs(\"out-html\", exist_ok=True)\n", "\n", "# Copy the checkpoint so nanoGPT can find it (init_from='resume')\n", "if os.path.exists(UPLOADED_MODEL_PATH):\n", "    shutil.copy(UPLOADED_MODEL_PATH, os.path.join(\"out-html\", 'ckpt.pt'))\n", "    print(\"Checkpoint erfolgreich geladen.\")\n", "else:\n", "    print(\"FEHLER: Checkpoint nicht gefunden! Pfad prüfen.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "config_content = \"\"\"\n", "out_dir = 'out-html'\n", "eval_interval = 500\n", "eval_iters = 40\n", "log_interval = 1\n", "always_save_checkpoint = False\n", "\n", "dataset = 'html_v2_mixed'\n", "gradient_accumulation_steps = 8\n", "batch_size = 16\n", "block_size = 1024\n", "\n", "# Architektur (~124M Params)\n", "n_layer = 12\n", "n_head = 12\n", "n_embd = 768\n", "dropout = 0.1\n", "\n", "learning_rate = 6e-4\n", "max_iters = 15000\n", "lr_decay_iters = 15000\n", "min_lr = 6e-5\n", "beta2 = 0.99\n", "warmup_iters = 500\n", "device = 'cuda'\n", "compile = True\n", "dtype = 'float16'\n", "\n", "# BUGFIX: the cell above stages out-html/ckpt.pt precisely so that training can\n", "# resume from it, but init_from was hardcoded to 'scratch', which ignores the\n", "# staged checkpoint and overwrites it at the first evaluation. nanoGPT exec()s\n", "# this config file, so resume when the checkpoint exists, else train from scratch.\n", "import os as _os\n", "init_from = 'resume' if _os.path.exists(_os.path.join(out_dir, 'ckpt.pt')) else 'scratch'\n", "\"\"\"\n", "\n", "with open('config/train_html.py', 'w') as f:\n", "    f.write(config_content)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "with open('train.py', 'r') as f:\n", "    lines = f.readlines()\n", "\n", "new_lines = []\n", "for line in lines:\n", "    new_lines.append(line)\n", "    if \"if losses['val'] < best_val_loss or always_save_checkpoint:\" in line:\n", "        # Injected code: live sampling at every eval checkpoint (v2)\n", "        new_lines.append(\"            print('\\\\n--- v2 LIVE SAMPLES ---')\\n\")\n", "        new_lines.append(\"            import tiktoken\\n\")\n", "        new_lines.append(\"            enc = tiktoken.get_encoding('gpt2')\\n\")\n", "        new_lines.append(\"            # Test 1: Klassischer Autocomplete\\n\")\n", "        new_lines.append(\"            s1 
= enc.encode('\\\\n', allowed_special={'<|endoftext|>'})\\n\")\n", "        new_lines.append(\"            # Test 2: SFT-Modus\\n\")\n", "        new_lines.append(\"            s2 = enc.encode('### Instruction:\\\\nCreate a blue button.\\\\n\\\\n### Response:\\\\n', allowed_special={'<|endoftext|>'})\\n\")\n", "        # BUGFIX: sample in eval mode — otherwise dropout (0.1) stays active\n", "        # during generation and degrades the samples; restore train mode after.\n", "        new_lines.append(\"            model.eval()\\n\")\n", "        new_lines.append(\"            for prompt_ids, label in [(s1, 'AUTOCOMPLETE'), (s2, 'INSTRUCT')]:\\n\")\n", "        new_lines.append(\"                print(f'>> Mode: {label}')\\n\")\n", "        new_lines.append(\"                x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]\\n\")\n", "        new_lines.append(\"                with torch.no_grad():\\n\")\n", "        new_lines.append(\"                    y = model.generate(x, 250, temperature=0.5, top_k=40)\\n\")\n", "        new_lines.append(\"                print(enc.decode(y[0].tolist()))\\n\")\n", "        new_lines.append(\"                print('-----------------------\\\\n')\\n\")\n", "        new_lines.append(\"            model.train()\\n\")\n", "\n", "with open('train_modified.py', 'w') as f:\n", "    f.writelines(new_lines)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "import tiktoken\n", "from model import GPT, GPTConfig\n", "\n", "def generate_with_penalty(model, idx, max_new_tokens, temperature=0.4, repetition_penalty=1.5, top_k=40):\n", "    \"\"\"Autoregressive sampling with top-k filtering and a subtractive repetition penalty.\"\"\"\n", "    for _ in range(max_new_tokens):\n", "        idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:, -model.config.block_size:]\n", "        logits, _ = model(idx_cond)\n", "        logits = logits[:, -1, :] / temperature\n", "        \n", "        # Stronger penalty for tokens that already appeared in the context\n", "        for token_id in set(idx[0].tolist()):\n", "            logits[0, token_id] -= repetition_penalty # subtraction is often more stable than division\n", "\n", "        # Top-K filter\n", "        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n", "        logits[logits < v[:, [-1]]] = -float('Inf')\n", "        \n", "        probs = F.softmax(logits, dim=-1)\n", "        idx_next = torch.multinomial(probs, num_samples=1)\n", "        idx = torch.cat((idx, idx_next), dim=1)\n", "        \n", "        if idx_next.item() == 50256: # 
<|endoftext|>\n", " break\n", " return idx\n", "\n", "def test_v2(checkpoint_path, prompt, max_tokens=512, temp=0.7, penalty=1.3):\n", " device = 'cuda'\n", " checkpoint = torch.load(checkpoint_path, map_location=device)\n", " model = GPT(GPTConfig(**checkpoint['model_args']))\n", " \n", " state_dict = checkpoint['model']\n", " unwanted_prefix = '_orig_mod.'\n", " for k,v in list(state_dict.items()):\n", " if k.startswith(unwanted_prefix):\n", " state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)\n", " \n", " model.load_state_dict(state_dict)\n", " model.to(device).eval()\n", " \n", " enc = tiktoken.get_encoding(\"gpt2\")\n", " start_ids = enc.encode(prompt, allowed_special={'<|endoftext|>'})\n", " x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\n", " \n", " print(f\"\\n--- TESTING WITH TEMP {temp} AND PENALTY {penalty} ---\")\n", " with torch.no_grad():\n", " y = generate_with_penalty(model, x, max_tokens, temperature=temp, top_k=40, repetition_penalty=penalty)\n", " print(enc.decode(y[0].tolist()))\n", "\n", "# --- DEIN TEST-PLAN ---\n", "path = 'out-html/ckpt.pt'\n", "\n", "# Test 1: Design-Fokus (Bootstrap)\n", "#test_v2(path, \"### Instruction:\\nCreate a hero section with a dark background and a 'Learn More' button.\\n\\n### Response:\\n\", temp=0.8, penalty=1.4)\n", "\n", "# Test 2: CSS-Logik (Inline Styles)\n", "#test_v2(path, \"\\n\\n\\n