File size: 16,358 Bytes

03224f5

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "!git clone https://github.com/karpathy/nanoGPT.git\n",
    "%cd nanoGPT\n",
    "!pip install -U tiktoken datasets tqdm transformers huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import tiktoken\n",
    "from datasets import load_dataset\n",
    "from tqdm import tqdm\n",
    "from huggingface_hub import login\n",
    "\n",
    "# --- SETUP ---\n",
    "HF_TOKEN = \"TOKEN_HERE\"\n",
    "login(token=HF_TOKEN)\n",
    "\n",
    "# Config\n",
    "target_tokens = 250_000_000\n",
    "sft_ratio = 0.15\n",
    "data_dir = os.path.join('data', 'html_v2_mixed')\n",
    "os.makedirs(data_dir, exist_ok=True)\n",
    "\n",
    "enc = tiktoken.get_encoding(\"gpt2\")\n",
    "\n",
    "def process_and_save():\n",
    "    train_file = os.path.join(data_dir, 'train.bin')\n",
    "    val_file = os.path.join(data_dir, 'val.bin')\n",
    "    \n",
    "    print(\"Lade Datensätze...\")\n",
    "    ds_stack = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/html\", split=\"train\", streaming=True, token=HF_TOKEN)\n",
    "    ds_sft = load_dataset(\"ttbui/html_alpaca\", split=\"train\", streaming=True)\n",
    "    \n",
    "    all_tokens_np = np.zeros(target_tokens, dtype=np.uint16)\n",
    "    curr_idx = 0\n",
    "    \n",
    "    sft_target = int(target_tokens * sft_ratio)\n",
    "    print(f\"Tokenisiere SFT-Daten (Ziel: {sft_target} Tokens)...\")\n",
    "    \n",
    "    pbar_sft = tqdm(total=sft_target)\n",
    "    for ex in ds_sft:\n",
    "        instr = ex.get('instruction', '')\n",
    "        resp = ex.get('output', ex.get('code', ''))\n",
    "        if not resp: continue\n",
    "            \n",
    "        text = f\"### Instruction:\\n{instr}\\n\\n### Response:\\n{resp}<|endoftext|>\"\n",
    "        tokens = enc.encode_ordinary(text)\n",
    "        \n",
    "        take = min(len(tokens), sft_target - curr_idx)\n",
    "        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n",
    "        curr_idx += take\n",
    "        pbar_sft.update(take)\n",
    "        \n",
    "        if curr_idx >= sft_target:\n",
    "            break\n",
    "    pbar_sft.close()\n",
    "    \n",
    "    print(f\"Tokenisiere Raw HTML (Rest bis {target_tokens} Tokens)...\")\n",
    "    pbar_stack = tqdm(total=target_tokens - curr_idx)\n",
    "    \n",
    "    for entry in ds_stack:\n",
    "        text = entry.get('content', '')\n",
    "        if not text: continue\n",
    "        \n",
    "        tokens = enc.encode_ordinary(text)\n",
    "        tokens.append(enc.eot_token)\n",
    "        \n",
    "        take = min(len(tokens), target_tokens - curr_idx)\n",
    "        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n",
    "        \n",
    "        curr_idx += take\n",
    "        pbar_stack.update(take)\n",
    "        \n",
    "        if curr_idx >= target_tokens:\n",
    "            break\n",
    "    pbar_stack.close()\n",
    "\n",
    "    print(f\"\\nSorting and Shuffling (Simulation via Block-Shuffling)...\")\n",
    "    \n",
    "    n = curr_idx\n",
    "    split_idx = int(n * 0.95) # 5% Validation\n",
    "    train_data = all_tokens_np[:split_idx]\n",
    "    val_data = all_tokens_np[split_idx:n]\n",
    "\n",
    "    print(f\"Speichere train.bin ({len(train_data)} Tokens)...\")\n",
    "    train_data.tofile(train_file)\n",
    "    print(f\"Speichere val.bin ({len(val_data)} Tokens)...\")\n",
    "    val_data.tofile(val_file)\n",
    "    \n",
    "    print(f\"Success! v2 dataset ready in {data_dir}\")\n",
    "    print(f\"Verhältnis: {sft_ratio*100}% SFT / {(1-sft_ratio)*100}% Raw HTML\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    process_and_save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "# 1. Pfade anpassen\n",
    "UPLOADED_MODEL_PATH = '/kaggle/input/notebooks/leoheinrich/htmllm-v2-124m-base/nanoGPT/out-html/ckpt.pt'\n",
    "os.makedirs(\"out-html\", exist_ok=True)\n",
    "\n",
    "# Checkpoint kopieren, damit nanoGPT ihn findet (init_from='resume')\n",
    "if os.path.exists(UPLOADED_MODEL_PATH):\n",
    "    shutil.copy(UPLOADED_MODEL_PATH, os.path.join(\"out-html\", 'ckpt.pt'))\n",
    "    print(\"Checkpoint erfolgreich geladen.\")\n",
    "else:\n",
    "    print(\"FEHLER: Checkpoint nicht gefunden! Pfad prüfen.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "config_content = \"\"\"\n",
    "out_dir = 'out-html'\n",
    "eval_interval = 500\n",
    "eval_iters = 40\n",
    "log_interval = 1\n",
    "always_save_checkpoint = False\n",
    "\n",
    "dataset = 'html_v2_mixed'\n",
    "gradient_accumulation_steps = 8\n",
    "batch_size = 16\n",
    "block_size = 1024\n",
    "\n",
    "# Architektur (~124M Params)\n",
    "n_layer = 12\n",
    "n_head = 12\n",
    "n_embd = 768\n",
    "dropout = 0.1\n",
    "\n",
    "learning_rate = 6e-4\n",
    "max_iters = 15000\n",
    "lr_decay_iters = 15000\n",
    "min_lr = 6e-5\n",
    "beta2 = 0.99\n",
    "warmup_iters = 500\n",
    "device = 'cuda'\n",
    "compile = True\n",
    "dtype = 'float16'\n",
    "\n",
    "init_from = 'scratch'\n",
    "\"\"\"\n",
    "\n",
    "with open('config/train_html.py', 'w') as f:\n",
    "    f.write(config_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "with open('train.py', 'r') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "new_lines = []\n",
    "for line in lines:\n",
    "    new_lines.append(line)\n",
    "    if \"if losses['val'] < best_val_loss or always_save_checkpoint:\" in line:\n",
    "        # Code-Einschub für Live-Sampling v2\n",
    "        new_lines.append(\"            print('\\\\n--- v2 LIVE SAMPLES ---')\\n\")\n",
    "        new_lines.append(\"            import tiktoken\\n\")\n",
    "        new_lines.append(\"            enc = tiktoken.get_encoding('gpt2')\\n\")\n",
    "        new_lines.append(\"            # Test 1: Klassischer Autocomplete\\n\")\n",
    "        new_lines.append(\"            s1 = enc.encode('<!DOCTYPE html>\\\\n<html>', allowed_special={'<|endoftext|>'})\\n\")\n",
    "        new_lines.append(\"            # Test 2: SFT-Modus\\n\")\n",
    "        new_lines.append(\"            s2 = enc.encode('### Instruction:\\\\nCreate a blue button.\\\\n\\\\n### Response:\\\\n', allowed_special={'<|endoftext|>'})\\n\")\n",
    "        new_lines.append(\"            for prompt_ids, label in [(s1, 'AUTOCOMPLETE'), (s2, 'INSTRUCT')]:\\n\")\n",
    "        new_lines.append(\"                print(f'>> Mode: {label}')\\n\")\n",
    "        new_lines.append(\"                x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]\\n\")\n",
    "        new_lines.append(\"                with torch.no_grad():\\n\")\n",
    "        new_lines.append(\"                    y = model.generate(x, 250, temperature=0.5, top_k=40)\\n\")\n",
    "        new_lines.append(\"                print(enc.decode(y[0].tolist()))\\n\")\n",
    "        new_lines.append(\"            print('-----------------------\\\\n')\\n\")\n",
    "\n",
    "with open('train_modified.py', 'w') as f:\n",
    "    f.writelines(new_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import tiktoken\n",
    "from model import GPT, GPTConfig\n",
    "\n",
    "def generate_with_penalty(model, idx, max_new_tokens, temperature=0.4, repetition_penalty=1.5, top_k=40):\n",
    "    for _ in range(max_new_tokens):\n",
    "        idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:, -model.config.block_size:]\n",
    "        logits, _ = model(idx_cond)\n",
    "        logits = logits[:, -1, :] / temperature\n",
    "        \n",
    "        # Stärkere Penalty für bereits erschienene Tokens\n",
    "        for token_id in set(idx[0].tolist()):\n",
    "            logits[0, token_id] -= repetition_penalty # Subtraktion statt Division ist oft stabiler\n",
    "\n",
    "        # Top-K Filter\n",
    "        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n",
    "        logits[logits < v[:, [-1]]] = -float('Inf')\n",
    "        \n",
    "        probs = F.softmax(logits, dim=-1)\n",
    "        idx_next = torch.multinomial(probs, num_samples=1)\n",
    "        idx = torch.cat((idx, idx_next), dim=1)\n",
    "        \n",
    "        if idx_next.item() == 50256: # <|endoftext|>\n",
    "            break\n",
    "    return idx\n",
    "\n",
    "def test_v2(checkpoint_path, prompt, max_tokens=512, temp=0.7, penalty=1.3):\n",
    "    device = 'cuda'\n",
    "    checkpoint = torch.load(checkpoint_path, map_location=device)\n",
    "    model = GPT(GPTConfig(**checkpoint['model_args']))\n",
    "    \n",
    "    state_dict = checkpoint['model']\n",
    "    unwanted_prefix = '_orig_mod.'\n",
    "    for k,v in list(state_dict.items()):\n",
    "        if k.startswith(unwanted_prefix):\n",
    "            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)\n",
    "            \n",
    "    model.load_state_dict(state_dict)\n",
    "    model.to(device).eval()\n",
    "    \n",
    "    enc = tiktoken.get_encoding(\"gpt2\")\n",
    "    start_ids = enc.encode(prompt, allowed_special={'<|endoftext|>'})\n",
    "    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\n",
    "    \n",
    "    print(f\"\\n--- TESTING WITH TEMP {temp} AND PENALTY {penalty} ---\")\n",
    "    with torch.no_grad():\n",
    "        y = generate_with_penalty(model, x, max_tokens, temperature=temp, top_k=40, repetition_penalty=penalty)\n",
    "        print(enc.decode(y[0].tolist()))\n",
    "\n",
    "# --- DEIN TEST-PLAN ---\n",
    "path = 'out-html/ckpt.pt'\n",
    "\n",
    "# Test 1: Design-Fokus (Bootstrap)\n",
    "#test_v2(path, \"### Instruction:\\nCreate a hero section with a dark background and a 'Learn More' button.\\n\\n### Response:\\n\", temp=0.8, penalty=1.4)\n",
    "\n",
    "# Test 2: CSS-Logik (Inline Styles)\n",
    "#test_v2(path, \"<!DOCTYPE html>\\n<html>\\n<head>\\n<style>\\n  .card { background-color:\", max_tokens=200, temp=0.5, penalty=1.2)\n",
    "\n",
    "test_v2(path, \"<form class=\\\"p-4 border rounded\\\">\\n  <div class=\\\"mb-3\\\">\\n    <label class=\\\"form-label\\\">Email</label>\", 300, 0.8, 1.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "!python train_modified.py config/train_html.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-13T13:58:30.581852Z",
     "iopub.status.busy": "2026-03-13T13:58:30.581555Z",
     "iopub.status.idle": "2026-03-13T13:58:30.586681Z",
     "shell.execute_reply": "2026-03-13T13:58:30.585908Z",
     "shell.execute_reply.started": "2026-03-13T13:58:30.581821Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# NOW: Resume after iteration 2500:\n",
    "config_content = \"\"\"\n",
    "out_dir = 'out-html'\n",
    "eval_interval = 500\n",
    "eval_iters = 40\n",
    "log_interval = 1\n",
    "always_save_checkpoint = False\n",
    "\n",
    "dataset = 'html_v2_mixed'\n",
    "gradient_accumulation_steps = 8\n",
    "batch_size = 16\n",
    "block_size = 1024\n",
    "\n",
    "# Architektur (~124M Params)\n",
    "n_layer = 12\n",
    "n_head = 12\n",
    "n_embd = 768\n",
    "dropout = 0.1\n",
    "\n",
    "learning_rate = 1e-4\n",
    "max_iters = 5000\n",
    "lr_decay_iters = 5000\n",
    "min_lr = 1e-6\n",
    "beta2 = 0.99\n",
    "warmup_iters = 500\n",
    "device = 'cuda'\n",
    "compile = True\n",
    "dtype = 'float16'\n",
    "\n",
    "init_from = 'resume'\n",
    "\"\"\"\n",
    "\n",
    "with open('config/train_html.py', 'w') as f:\n",
    "    f.write(config_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-03-13T13:58:34.059934Z",
     "iopub.status.busy": "2026-03-13T13:58:34.059199Z"
    },
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overriding config with config/train_html.py:\n",
      "\n",
      "out_dir = 'out-html'\n",
      "eval_interval = 500\n",
      "eval_iters = 40\n",
      "log_interval = 1\n",
      "always_save_checkpoint = False\n",
      "\n",
      "dataset = 'html_v2_mixed'\n",
      "gradient_accumulation_steps = 8\n",
      "batch_size = 16\n",
      "block_size = 1024\n",
      "\n",
      "# Architektur (~124M Params)\n",
      "n_layer = 12\n",
      "n_head = 12\n",
      "n_embd = 768\n",
      "dropout = 0.1\n",
      "\n",
      "learning_rate = 1e-4\n",
      "max_iters = 5000\n",
      "lr_decay_iters = 5000\n",
      "min_lr = 1e-6\n",
      "beta2 = 0.99\n",
      "warmup_iters = 500\n",
      "device = 'cuda'\n",
      "compile = True\n",
      "dtype = 'float16'\n",
      "\n",
      "init_from = 'resume'\n",
      "\n",
      "tokens per iteration will be: 131,072\n",
      "/usr/local/lib/python3.12/dist-packages/torch/backends/__init__.py:46: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n",
      "  self.setter(val)\n",
      "Resuming training from out-html\n",
      "number of parameters: 123.59M\n",
      "/kaggle/working/nanoGPT/train_modified.py:196: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n",
      "  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))\n",
      "num decayed parameter tensors: 50, with 124,354,560 parameters\n",
      "num non-decayed parameter tensors: 25, with 19,200 parameters\n",
      "using fused AdamW: True\n",
      "compiling the model... (takes a ~minute)\n",
      "step 2500: train loss 0.8834, val loss 1.0695\n",
      "iter 2500: loss 1.0691, time 32466.99ms, mfu -100.00%\n",
      "iter 2501: loss 0.6317, time 5796.15ms, mfu -100.00%\n",
      "iter 2502: loss 1.0973, time 7771.45ms, mfu -100.00%\n",
      "iter 2503: loss 0.8611, time 7912.30ms, mfu -100.00%\n"
     ]
    }
   ],
   "source": [
    "!python train_modified.py config/train_html.py"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [
    {
     "isSourceIdPinned": false,
     "sourceId": 303214451,
     "sourceType": "kernelVersion"
    }
   ],
   "isGpuEnabled": true,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}