{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!git clone https://github.com/karpathy/nanoGPT.git\n%cd nanoGPT\n!pip install -U tiktoken datasets tqdm transformers huggingface_hub","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import os\nimport numpy as np\nimport tiktoken\nfrom datasets import load_dataset\nfrom tqdm import tqdm\nfrom huggingface_hub import login\n\n# Never hardcode credentials: read the token from the environment (Kaggle secret / env var).\nHF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")\nif HF_TOKEN:\n    login(token=HF_TOKEN)\n\ndata_dir = os.path.join('data', 'html_data')\nos.makedirs(data_dir, exist_ok=True)\n\nprint(\"Loading dataset 'the-stack-smol' (HTML)...\")\ndataset = load_dataset(\n    \"bigcode/the-stack-smol\",\n    data_dir=\"data/html\",\n    split=\"train\",\n    streaming=True,\n    token=HF_TOKEN or None\n)\n\nenc = tiktoken.get_encoding(\"gpt2\")\n\ndef process_and_save():\n    \"\"\"Stream the dataset, tokenize with the GPT-2 BPE, and write train/val .bin files.\n\n    Fills a fixed-size uint16 buffer up to target_tokens, then splits 95/5\n    into train.bin and val.bin under data_dir (nanoGPT's expected format).\n    \"\"\"\n    train_file = os.path.join(data_dir, 'train.bin')\n    val_file = os.path.join(data_dir, 'val.bin')\n\n    target_tokens = 150_000_000  # hard cap on corpus size\n\n    print(f\"Tokenizing up to {target_tokens} tokens. Please wait...\")\n\n    # GPT-2 vocab size (50257) fits in uint16.\n    all_tokens_np = np.zeros(target_tokens, dtype=np.uint16)\n    curr_idx = 0\n\n    pbar = tqdm(total=target_tokens)\n    for entry in dataset:\n        text = entry.get('content', entry.get('code', ''))\n        if not text:\n            continue\n\n        tokens = enc.encode_ordinary(text)\n        tokens.append(enc.eot_token)  # document separator between files\n\n        # Never write past the end of the preallocated buffer.\n        take = min(len(tokens), target_tokens - curr_idx)\n        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n\n        curr_idx += take\n        pbar.update(take)\n        if curr_idx >= target_tokens:\n            break\n    pbar.close()\n\n    n = curr_idx\n    split_idx = int(n * 0.95)  # 95/5 train/val split\n    train_data = all_tokens_np[:split_idx]\n    val_data = all_tokens_np[split_idx:n]\n\n    train_data.tofile(train_file)\n    val_data.tofile(val_file)\n    print(f\"\\nSuccess! {n} tokens saved in total.\")\n    print(f\"Train: {len(train_data)} Tokens, Val: {len(val_data)} Tokens\")\n\nprocess_and_save()","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"config_content = \"\"\"\nout_dir = 'out-html'\neval_interval = 100\neval_iters = 40\nlog_interval = 1\nalways_save_checkpoint = False\n\ndataset = 'html_data'\ngradient_accumulation_steps = 4\nbatch_size = 16\nblock_size = 512\n\n# Architecture (~50M params)\nn_layer = 8\nn_head = 8\nn_embd = 512\ndropout = 0.1\n\nlearning_rate = 5e-5\nmax_iters = 10000\nlr_decay_iters = 10000\nmin_lr = 5e-5\nbeta2 = 0.99\nwarmup_iters = 50\ndevice = 'cuda'\ncompile = True\ndtype = 'float16'\n\n# 'scratch' for a fresh run; switch to 'resume' only once out-html/ckpt.pt exists.\ninit_from = 'scratch'\n\"\"\"\n\nwith open('config/train_html.py', 'w') as f:\n    f.write(config_content)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# Patch train.py: print a live sample whenever the checkpoint-save branch is reached\nwith open('train.py', 'r') as f:\n    lines = f.readlines()\n\nnew_lines = []\nfor line in lines:\n    new_lines.append(line)\n    if 'if losses[\\'val\\'] < best_val_loss or always_save_checkpoint:' in line:\n        # Body of the matched `if` in nanoGPT's train.py is indented 12 spaces.\n        new_lines.append(\"            print('--- LIVE SAMPLE ---')\\n\")\n        new_lines.append(\"            import tiktoken\\n\")\n        new_lines.append(\"            enc = tiktoken.get_encoding('gpt2')\\n\")\n        new_lines.append(\"            start_ids = [enc.eot_token]\\n\")\n        new_lines.append(\"            x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\\n\")\n        new_lines.append(\"            y = model.generate(x, 500, temperature=0.45, top_k=40)\\n\")\n        new_lines.append(\"            print(enc.decode(y[0].tolist()))\\n\")\n        new_lines.append(\"            print('-------------------')\\n\")\n\nwith open('train_modified.py', 'w') as f:\n    f.writelines(new_lines)","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"!python train_modified.py config/train_html.py","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"import torch\nimport tiktoken\nfrom model import GPT, GPTConfig\n\ndef generate_final_site(checkpoint_path, start_prompt=\"\", max_tokens=512):\n    \"\"\"Load a nanoGPT checkpoint and sample an HTML continuation of start_prompt.\n\n    Args:\n        checkpoint_path: path to a nanoGPT ckpt.pt file.\n        start_prompt: text to condition generation on; empty falls back to <|endoftext|>.\n        max_tokens: number of new tokens to generate.\n    \"\"\"\n    device = 'cuda'\n    checkpoint = torch.load(checkpoint_path, map_location=device)\n\n    config = checkpoint['model_args']\n    model = GPT(GPTConfig(**config))\n    state_dict = checkpoint['model']\n\n    # torch.compile prefixes parameter names with '_orig_mod.' — strip before loading.\n    unwanted_prefix = '_orig_mod.'\n    for k, v in list(state_dict.items()):\n        if k.startswith(unwanted_prefix):\n            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)\n\n    model.load_state_dict(state_dict)\n    model.to(device)\n    model.eval()\n\n    enc = tiktoken.get_encoding(\"gpt2\")\n    start_ids = enc.encode(start_prompt, allowed_special={'<|endoftext|>'})\n    if not start_ids:\n        # model.generate cannot start from an empty context.\n        start_ids = [enc.eot_token]\n    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\n\n    print(\"Generating website...\")\n    with torch.no_grad():\n        y = model.generate(x, max_tokens, temperature=0.85, top_k=50)\n    decoded = enc.decode(y[0].tolist())\n\n    print(\"Output:\")\n    print(decoded)\n\ngenerate_final_site('out-html/ckpt.pt', 'My Portfolio')","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}