LH-Tech-AI
/

htmLLM-124M

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
+    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "# Fetch nanoGPT and make its checkout the working directory for all later cells.\n",
+    "!git clone https://github.com/karpathy/nanoGPT.git\n",
+    "%cd nanoGPT\n",
+    "# %pip (instead of !pip) installs into the environment of the running kernel.\n",
+    "%pip install -U tiktoken datasets tqdm transformers huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "import tiktoken\n",
+    "from datasets import load_dataset\n",
+    "from tqdm import tqdm\n",
+    "from huggingface_hub import login\n",
+    "\n",
+    "# --- SETUP ---\n",
+    "# Prefer a token from the HF_TOKEN environment variable so the secret never has to be\n",
+    "# pasted into (and saved with) the notebook; the placeholder is only a fallback.\n",
+    "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"TOKEN_HERE\")\n",
+    "login(token=HF_TOKEN)\n",
+    "\n",
+    "# Config\n",
+    "target_tokens = 250_000_000  # size of the token buffer (SFT + raw HTML)\n",
+    "sft_ratio = 0.15  # fraction of target_tokens reserved for instruction data\n",
+    "# Folder name matches `dataset = 'html_v2_mixed'` in the training config.\n",
+    "data_dir = os.path.join('data', 'html_v2_mixed')\n",
+    "os.makedirs(data_dir, exist_ok=True)\n",
+    "\n",
+    "# GPT-2 BPE tokenizer; its ids are stored as uint16 by process_and_save().\n",
+    "enc = tiktoken.get_encoding(\"gpt2\")\n",
+    "\n",
+    "def process_and_save():\n",
+    "    \"\"\"Tokenize the SFT and raw-HTML streams and write train.bin / val.bin.\n",
+    "\n",
+    "    A uint16 buffer of `target_tokens` ids is filled in two passes: first up to\n",
+    "    `target_tokens * sft_ratio` tokens of instruction/response pairs, then raw\n",
+    "    HTML documents until the buffer is full or the stream ends. Every document\n",
+    "    is terminated with the tokenizer's end-of-text id (`enc.eot_token`). The\n",
+    "    first 95% of the collected tokens are written to train.bin and the rest to\n",
+    "    val.bin, both inside `data_dir`.\n",
+    "    \"\"\"\n",
+    "    train_file = os.path.join(data_dir, 'train.bin')\n",
+    "    val_file = os.path.join(data_dir, 'val.bin')\n",
+    "\n",
+    "    print(\"Lade Datensätze...\")\n",
+    "    ds_stack = load_dataset(\"bigcode/the-stack-smol\", data_dir=\"data/html\", split=\"train\", streaming=True, token=HF_TOKEN)\n",
+    "    ds_sft = load_dataset(\"ttbui/html_alpaca\", split=\"train\", streaming=True)\n",
+    "\n",
+    "    # Pre-allocated output buffer; curr_idx is the number of slots filled so far.\n",
+    "    all_tokens_np = np.zeros(target_tokens, dtype=np.uint16)\n",
+    "    curr_idx = 0\n",
+    "\n",
+    "    sft_target = int(target_tokens * sft_ratio)\n",
+    "    print(f\"Tokenisiere SFT-Daten (Ziel: {sft_target} Tokens)...\")\n",
+    "\n",
+    "    pbar_sft = tqdm(total=sft_target)\n",
+    "    for ex in ds_sft:\n",
+    "        instr = ex.get('instruction', '')\n",
+    "        resp = ex.get('output', ex.get('code', ''))\n",
+    "        if not resp: continue\n",
+    "\n",
+    "        # encode_ordinary() ignores special tokens, so a literal '<|endoftext|>' in\n",
+    "        # the text would be split into several ordinary tokens. Append the real\n",
+    "        # end-of-text id instead, exactly as done for the raw HTML documents below.\n",
+    "        text = f\"### Instruction:\\n{instr}\\n\\n### Response:\\n{resp}\"\n",
+    "        tokens = enc.encode_ordinary(text)\n",
+    "        tokens.append(enc.eot_token)\n",
+    "\n",
+    "        # Clip the last example so the SFT share never exceeds sft_target.\n",
+    "        take = min(len(tokens), sft_target - curr_idx)\n",
+    "        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n",
+    "        curr_idx += take\n",
+    "        pbar_sft.update(take)\n",
+    "\n",
+    "        if curr_idx >= sft_target:\n",
+    "            break\n",
+    "    pbar_sft.close()\n",
+    "\n",
+    "    print(f\"Tokenisiere Raw HTML (Rest bis {target_tokens} Tokens)...\")\n",
+    "    pbar_stack = tqdm(total=target_tokens - curr_idx)\n",
+    "\n",
+    "    for entry in ds_stack:\n",
+    "        text = entry.get('content', '')\n",
+    "        if not text: continue\n",
+    "\n",
+    "        tokens = enc.encode_ordinary(text)\n",
+    "        tokens.append(enc.eot_token)\n",
+    "\n",
+    "        # Clip the last document so the buffer is never overrun.\n",
+    "        take = min(len(tokens), target_tokens - curr_idx)\n",
+    "        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n",
+    "\n",
+    "        curr_idx += take\n",
+    "        pbar_stack.update(take)\n",
+    "\n",
+    "        if curr_idx >= target_tokens:\n",
+    "            break\n",
+    "    pbar_stack.close()\n",
+    "\n",
+    "    # NOTE(review): despite the message, nothing is sorted or shuffled here. The\n",
+    "    # split below is a plain prefix/suffix cut, so val.bin is the tail of the\n",
+    "    # stream (raw HTML whenever that pass produced enough tokens).\n",
+    "    print(f\"\\nSorting and Shuffling (Simulation via Block-Shuffling)...\")\n",
+    "\n",
+    "    # Only the filled part of the buffer is used (a stream may end early).\n",
+    "    n = curr_idx\n",
+    "    split_idx = int(n * 0.95) # 5% validation\n",
+    "    train_data = all_tokens_np[:split_idx]\n",
+    "    val_data = all_tokens_np[split_idx:n]\n",
+    "\n",
+    "    print(f\"Speichere train.bin ({len(train_data)} Tokens)...\")\n",
+    "    train_data.tofile(train_file)\n",
+    "    print(f\"Speichere val.bin ({len(val_data)} Tokens)...\")\n",
+    "    val_data.tofile(val_file)\n",
+    "\n",
+    "    print(f\"Success! v2 dataset ready in {data_dir}\")\n",
+    "    print(f\"Verhältnis: {sft_ratio*100}% SFT / {(1-sft_ratio)*100}% Raw HTML\")\n",
+    "\n",
+    "# Entry point: build the dataset files when this code runs as the main module.\n",
+    "if __name__ == \"__main__\":\n",
+    "    process_and_save()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "\n",
+    "# 1. Adjust the paths\n",
+    "UPLOADED_MODEL_PATH = '/kaggle/input/notebooks/leoheinrich/htmllm-v2-124m-base/nanoGPT/out-html/ckpt.pt'\n",
+    "os.makedirs(\"out-html\", exist_ok=True)\n",
+    "\n",
+    "# 2. Copy the checkpoint into out-html so nanoGPT can find it (init_from='resume')\n",
+    "if not os.path.exists(UPLOADED_MODEL_PATH):\n",
+    "    # Missing input: report it and leave out-html empty.\n",
+    "    print(\"FEHLER: Checkpoint nicht gefunden! Pfad prüfen.\")\n",
+    "else:\n",
+    "    local_ckpt = os.path.join(\"out-html\", 'ckpt.pt')\n",
+    "    shutil.copy(UPLOADED_MODEL_PATH, local_ckpt)\n",
+    "    print(\"Checkpoint erfolgreich geladen.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "# Training configuration for a run from scratch (init_from = 'scratch').\n",
+    "# The text below is written verbatim to config/train_html.py and passed to the\n",
+    "# training script on its command line.\n",
+    "config_content = \"\"\"\n",
+    "out_dir = 'out-html'\n",
+    "eval_interval = 500\n",
+    "eval_iters = 40\n",
+    "log_interval = 1\n",
+    "always_save_checkpoint = False\n",
+    "\n",
+    "dataset = 'html_v2_mixed'\n",
+    "gradient_accumulation_steps = 8\n",
+    "batch_size = 16\n",
+    "block_size = 1024\n",
+    "\n",
+    "# Architektur (~124M Params)\n",
+    "n_layer = 12\n",
+    "n_head = 12\n",
+    "n_embd = 768\n",
+    "dropout = 0.1\n",
+    "\n",
+    "learning_rate = 6e-4\n",
+    "max_iters = 15000\n",
+    "lr_decay_iters = 15000\n",
+    "min_lr = 6e-5\n",
+    "beta2 = 0.99\n",
+    "warmup_iters = 500\n",
+    "device = 'cuda'\n",
+    "compile = True\n",
+    "dtype = 'float16'\n",
+    "\n",
+    "init_from = 'scratch'\n",
+    "\"\"\"\n",
+    "\n",
+    "# Overwrites any existing config/train_html.py.\n",
+    "with open('config/train_html.py', 'w') as f:\n",
+    "    f.write(config_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "# Create train_modified.py: a copy of nanoGPT's train.py that prints two sample\n",
+    "# generations right after the line that decides whether a checkpoint is saved.\n",
+    "with open('train.py', 'r') as f:\n",
+    "    lines = f.readlines()\n",
+    "\n",
+    "# Line of train.py after which the sampling code is injected.\n",
+    "anchor = \"if losses['val'] < best_val_loss or always_save_checkpoint:\"\n",
+    "\n",
+    "new_lines = []\n",
+    "patched = False\n",
+    "for line in lines:\n",
+    "    new_lines.append(line)\n",
+    "    if anchor in line:\n",
+    "        patched = True\n",
+    "        # Injected code for v2 live sampling. The 12-space indent places it inside\n",
+    "        # the body of the anchor `if` (assumes the anchor itself sits at 8 spaces).\n",
+    "        new_lines.append(\"            print('\\\\n--- v2 LIVE SAMPLES ---')\\n\")\n",
+    "        new_lines.append(\"            import tiktoken\\n\")\n",
+    "        new_lines.append(\"            enc = tiktoken.get_encoding('gpt2')\\n\")\n",
+    "        new_lines.append(\"            # Test 1: classic autocomplete\\n\")\n",
+    "        new_lines.append(\"            s1 = enc.encode('<!DOCTYPE html>\\\\n<html>', allowed_special={'<|endoftext|>'})\\n\")\n",
+    "        new_lines.append(\"            # Test 2: SFT mode\\n\")\n",
+    "        new_lines.append(\"            s2 = enc.encode('### Instruction:\\\\nCreate a blue button.\\\\n\\\\n### Response:\\\\n', allowed_special={'<|endoftext|>'})\\n\")\n",
+    "        new_lines.append(\"            # Sample in eval mode so dropout does not distort the samples\\n\")\n",
+    "        new_lines.append(\"            model.eval()\\n\")\n",
+    "        new_lines.append(\"            for prompt_ids, label in [(s1, 'AUTOCOMPLETE'), (s2, 'INSTRUCT')]:\\n\")\n",
+    "        new_lines.append(\"                print(f'>> Mode: {label}')\\n\")\n",
+    "        new_lines.append(\"                x = torch.tensor(prompt_ids, dtype=torch.long, device=device)[None, ...]\\n\")\n",
+    "        new_lines.append(\"                with torch.no_grad():\\n\")\n",
+    "        new_lines.append(\"                    y = model.generate(x, 250, temperature=0.5, top_k=40)\\n\")\n",
+    "        new_lines.append(\"                print(enc.decode(y[0].tolist()))\\n\")\n",
+    "        new_lines.append(\"            # Back to training mode before the loop continues\\n\")\n",
+    "        new_lines.append(\"            model.train()\\n\")\n",
+    "        new_lines.append(\"            print('-----------------------\\\\n')\\n\")\n",
+    "\n",
+    "# Make a missing anchor visible instead of silently producing an unpatched copy.\n",
+    "if not patched:\n",
+    "    print(\"WARNING: anchor line not found in train.py - train_modified.py is an unmodified copy (no live samples).\")\n",
+    "\n",
+    "with open('train_modified.py', 'w') as f:\n",
+    "    f.writelines(new_lines)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "import tiktoken\n",
+    "from model import GPT, GPTConfig\n",
+    "\n",
+    "def generate_with_penalty(model, idx, max_new_tokens, temperature=0.4, repetition_penalty=1.5, top_k=40, eos_token_id=50256):\n",
+    "    \"\"\"Sample up to `max_new_tokens` tokens with an additive repetition penalty.\n",
+    "\n",
+    "    Args:\n",
+    "        model: callable returning `(logits, _)` for a batch of token ids and\n",
+    "            exposing `model.config.block_size`.\n",
+    "        idx: LongTensor of shape (batch, time) holding the prompt ids.\n",
+    "        max_new_tokens: upper bound on the number of generated tokens.\n",
+    "        temperature: the last-position logits are divided by this value.\n",
+    "        repetition_penalty: amount subtracted from the temperature-scaled logit\n",
+    "            of every token already present in the same row (prompt included).\n",
+    "        top_k: keep only the k highest logits; None disables the filter.\n",
+    "        eos_token_id: generation stops early once every row sampled this id in\n",
+    "            the same step (default 50256, the gpt2 <|endoftext|> id).\n",
+    "\n",
+    "    Returns:\n",
+    "        LongTensor of shape (batch, time + generated): prompt plus samples.\n",
+    "    \"\"\"\n",
+    "    for _ in range(max_new_tokens):\n",
+    "        # Feed at most block_size tokens of context to the model.\n",
+    "        idx_cond = idx if idx.size(1) <= model.config.block_size else idx[:, -model.config.block_size:]\n",
+    "        logits, _ = model(idx_cond)\n",
+    "        logits = logits[:, -1, :] / temperature\n",
+    "\n",
+    "        # Penalise tokens that already occurred, independently for every row.\n",
+    "        # Subtraction (rather than division) treats positive and negative logits alike.\n",
+    "        seen = torch.zeros_like(logits, dtype=torch.bool)\n",
+    "        seen.scatter_(1, idx, True)\n",
+    "        logits[seen] -= repetition_penalty\n",
+    "\n",
+    "        # Top-k filter: everything below the k-th largest logit becomes impossible.\n",
+    "        if top_k is not None:\n",
+    "            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n",
+    "            logits[logits < v[:, [-1]]] = -float('Inf')\n",
+    "\n",
+    "        probs = F.softmax(logits, dim=-1)\n",
+    "        idx_next = torch.multinomial(probs, num_samples=1)\n",
+    "        idx = torch.cat((idx, idx_next), dim=1)\n",
+    "\n",
+    "        # Stop at end-of-text; with several rows, only when all of them hit it.\n",
+    "        if bool((idx_next == eos_token_id).all()):\n",
+    "            break\n",
+    "    return idx\n",
+    "\n",
+    "def test_v2(checkpoint_path, prompt, max_tokens=512, temp=0.7, penalty=1.3, device=None):\n",
+    "    \"\"\"Load a checkpoint and print one sampled continuation of `prompt`.\n",
+    "\n",
+    "    Args:\n",
+    "        checkpoint_path: path to a checkpoint file with 'model_args' and 'model' entries.\n",
+    "        prompt: text to continue; '<|endoftext|>' in it is encoded as the special token.\n",
+    "        max_tokens: maximum number of tokens to generate.\n",
+    "        temp: sampling temperature passed to generate_with_penalty.\n",
+    "        penalty: repetition penalty passed to generate_with_penalty.\n",
+    "        device: torch device string; defaults to 'cuda' when available, else 'cpu'.\n",
+    "    \"\"\"\n",
+    "    if device is None:\n",
+    "        device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.\n",
+    "    checkpoint = torch.load(checkpoint_path, map_location=device)\n",
+    "    model = GPT(GPTConfig(**checkpoint['model_args']))\n",
+    "\n",
+    "    # Strip the '_orig_mod.' key prefix (presumably added by torch.compile, which the\n",
+    "    # training config enables) so the keys match the plain model.\n",
+    "    state_dict = checkpoint['model']\n",
+    "    unwanted_prefix = '_orig_mod.'\n",
+    "    for k,v in list(state_dict.items()):\n",
+    "        if k.startswith(unwanted_prefix):\n",
+    "            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)\n",
+    "\n",
+    "    model.load_state_dict(state_dict)\n",
+    "    model.to(device).eval()\n",
+    "\n",
+    "    enc = tiktoken.get_encoding(\"gpt2\")\n",
+    "    start_ids = enc.encode(prompt, allowed_special={'<|endoftext|>'})\n",
+    "    # [None, ...] adds the batch dimension expected by the generator.\n",
+    "    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\n",
+    "\n",
+    "    print(f\"\\n--- TESTING WITH TEMP {temp} AND PENALTY {penalty} ---\")\n",
+    "    with torch.no_grad():\n",
+    "        y = generate_with_penalty(model, x, max_tokens, temperature=temp, top_k=40, repetition_penalty=penalty)\n",
+    "        print(enc.decode(y[0].tolist()))\n",
+    "\n",
+    "# --- YOUR TEST PLAN ---\n",
+    "# Checkpoint to evaluate (relative to the nanoGPT working directory).\n",
+    "path = 'out-html/ckpt.pt'\n",
+    "\n",
+    "# Test 1: Design focus (Bootstrap)\n",
+    "#test_v2(path, \"### Instruction:\\nCreate a hero section with a dark background and a 'Learn More' button.\\n\\n### Response:\\n\", temp=0.8, penalty=1.4)\n",
+    "\n",
+    "# Test 2: CSS logic (inline styles)\n",
+    "#test_v2(path, \"<!DOCTYPE html>\\n<html>\\n<head>\\n<style>\\n  .card { background-color:\", max_tokens=200, temp=0.5, penalty=1.2)\n",
+    "\n",
+    "# Test 3 (active): autocomplete of a partial form, 300 tokens, temp 0.8, penalty 1.5\n",
+    "test_v2(path, \"<form class=\\\"p-4 border rounded\\\">\\n  <div class=\\\"mb-3\\\">\\n    <label class=\\\"form-label\\\">Email</label>\", 300, 0.8, 1.5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "# Start training with the patched script and the config file written above.\n",
+    "!python train_modified.py config/train_html.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-13T13:58:30.581852Z",
+     "iopub.status.busy": "2026-03-13T13:58:30.581555Z",
+     "iopub.status.idle": "2026-03-13T13:58:30.586681Z",
+     "shell.execute_reply": "2026-03-13T13:58:30.585908Z",
+     "shell.execute_reply.started": "2026-03-13T13:58:30.581821Z"
+    },
+    "trusted": true
+   },
+   "outputs": [],
+   "source": [
+    "# NOW: Resume after iteration 2500:\n",
+    "# Same architecture and batch settings as the first config, but with\n",
+    "# init_from = 'resume', a lower learning rate (1e-4, min 1e-6) and a\n",
+    "# 5000-iteration schedule. Overwrites config/train_html.py.\n",
+    "config_content = \"\"\"\n",
+    "out_dir = 'out-html'\n",
+    "eval_interval = 500\n",
+    "eval_iters = 40\n",
+    "log_interval = 1\n",
+    "always_save_checkpoint = False\n",
+    "\n",
+    "dataset = 'html_v2_mixed'\n",
+    "gradient_accumulation_steps = 8\n",
+    "batch_size = 16\n",
+    "block_size = 1024\n",
+    "\n",
+    "# Architektur (~124M Params)\n",
+    "n_layer = 12\n",
+    "n_head = 12\n",
+    "n_embd = 768\n",
+    "dropout = 0.1\n",
+    "\n",
+    "learning_rate = 1e-4\n",
+    "max_iters = 5000\n",
+    "lr_decay_iters = 5000\n",
+    "min_lr = 1e-6\n",
+    "beta2 = 0.99\n",
+    "warmup_iters = 500\n",
+    "device = 'cuda'\n",
+    "compile = True\n",
+    "dtype = 'float16'\n",
+    "\n",
+    "init_from = 'resume'\n",
+    "\"\"\"\n",
+    "\n",
+    "with open('config/train_html.py', 'w') as f:\n",
+    "    f.write(config_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-03-13T13:58:34.059934Z",
+     "iopub.status.busy": "2026-03-13T13:58:34.059199Z"
+    },
+    "trusted": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overriding config with config/train_html.py:\n",
+      "\n",
+      "out_dir = 'out-html'\n",
+      "eval_interval = 500\n",
+      "eval_iters = 40\n",
+      "log_interval = 1\n",
+      "always_save_checkpoint = False\n",
+      "\n",
+      "dataset = 'html_v2_mixed'\n",
+      "gradient_accumulation_steps = 8\n",
+      "batch_size = 16\n",
+      "block_size = 1024\n",
+      "\n",
+      "# Architektur (~124M Params)\n",
+      "n_layer = 12\n",
+      "n_head = 12\n",
+      "n_embd = 768\n",
+      "dropout = 0.1\n",
+      "\n",
+      "learning_rate = 1e-4\n",
+      "max_iters = 5000\n",
+      "lr_decay_iters = 5000\n",
+      "min_lr = 1e-6\n",
+      "beta2 = 0.99\n",
+      "warmup_iters = 500\n",
+      "device = 'cuda'\n",
+      "compile = True\n",
+      "dtype = 'float16'\n",
+      "\n",
+      "init_from = 'resume'\n",
+      "\n",
+      "tokens per iteration will be: 131,072\n",
+      "/usr/local/lib/python3.12/dist-packages/torch/backends/__init__.py:46: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n",
+      "  self.setter(val)\n",
+      "Resuming training from out-html\n",
+      "number of parameters: 123.59M\n",
+      "/kaggle/working/nanoGPT/train_modified.py:196: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n",
+      "  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))\n",
+      "num decayed parameter tensors: 50, with 124,354,560 parameters\n",
+      "num non-decayed parameter tensors: 25, with 19,200 parameters\n",
+      "using fused AdamW: True\n",
+      "compiling the model... (takes a ~minute)\n",
+      "step 2500: train loss 0.8834, val loss 1.0695\n",
+      "iter 2500: loss 1.0691, time 32466.99ms, mfu -100.00%\n",
+      "iter 2501: loss 0.6317, time 5796.15ms, mfu -100.00%\n",
+      "iter 2502: loss 1.0973, time 7771.45ms, mfu -100.00%\n",
+      "iter 2503: loss 0.8611, time 7912.30ms, mfu -100.00%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Continue training from the checkpoint in out-html using the resume config.\n",
+    "!python train_modified.py config/train_html.py"
+   ]
+  }
+ ],
+ "metadata": {
+  "kaggle": {
+   "accelerator": "nvidiaTeslaT4",
+   "dataSources": [
+    {
+     "isSourceIdPinned": false,
+     "sourceId": 303214451,
+     "sourceType": "kernelVersion"
+    }
+   ],
+   "isGpuEnabled": true,
+   "isInternetEnabled": true,
+   "language": "python",
+   "sourceType": "notebook"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}