File size: 6,065 Bytes
2d3a714
1
{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.12.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[
{"cell_type":"code","source":"!git clone https://github.com/karpathy/nanoGPT.git\n%cd nanoGPT\n!pip install -U tiktoken datasets tqdm transformers huggingface_hub","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"import os\nimport numpy as np\nimport tiktoken\nfrom datasets import load_dataset\nfrom tqdm import tqdm\nfrom huggingface_hub import login\n\n# Never hardcode credentials: read the token from the environment\n# (set it via Kaggle secrets / an env var before running).\nHF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")\nif HF_TOKEN:\n    login(token=HF_TOKEN)\n\ndata_dir = os.path.join('data', 'html_data')\nos.makedirs(data_dir, exist_ok=True)\n\nprint(\"Loading dataset 'the-stack-smol' (HTML)...\")\ndataset = load_dataset(\n    \"bigcode/the-stack-smol\",\n    data_dir=\"data/html\",\n    split=\"train\",\n    streaming=True,\n    token=HF_TOKEN or None\n)\n\nenc = tiktoken.get_encoding(\"gpt2\")\n\ndef process_and_save():\n    \"\"\"Tokenize up to target_tokens GPT-2 tokens from the streamed HTML\n    corpus and write a 95/5 train/val split as raw uint16 .bin files\n    (the on-disk format nanoGPT's data loader expects).\"\"\"\n    train_file = os.path.join(data_dir, 'train.bin')\n    val_file = os.path.join(data_dir, 'val.bin')\n\n    target_tokens = 150_000_000\n\n    print(f\"Tokenizing up to {target_tokens} tokens. Please wait...\")\n\n    # GPT-2 vocab (50257 ids incl. EOT) fits in uint16.\n    all_tokens_np = np.zeros(target_tokens, dtype=np.uint16)\n    curr_idx = 0\n\n    pbar = tqdm(total=target_tokens)\n    for entry in dataset:\n        text = entry.get('content', entry.get('code', ''))\n        if not text:\n            continue\n\n        tokens = enc.encode_ordinary(text)\n        tokens.append(enc.eot_token)  # document separator\n\n        take = min(len(tokens), target_tokens - curr_idx)\n        all_tokens_np[curr_idx:curr_idx+take] = np.array(tokens[:take], dtype=np.uint16)\n\n        curr_idx += take\n        pbar.update(take)\n        if curr_idx >= target_tokens:\n            break\n    pbar.close()\n\n    n = curr_idx\n    split_idx = int(n * 0.95)\n    train_data = all_tokens_np[:split_idx]\n    val_data = all_tokens_np[split_idx:n]\n\n    train_data.tofile(train_file)\n    val_data.tofile(val_file)\n    print(f\"\\nSuccess! {n} tokens saved in total.\")\n    print(f\"Train: {len(train_data)} Tokens, Val: {len(val_data)} Tokens\")\n\nprocess_and_save()","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Write the nanoGPT training config.\n# init_from='resume' crashes on a fresh run (out-html/ckpt.pt does not\n# exist yet), so choose the init mode based on checkpoint presence.\nimport os\ninit_mode = 'resume' if os.path.exists('out-html/ckpt.pt') else 'scratch'\n\nconfig_content = f\"\"\"\nout_dir = 'out-html'\neval_interval = 100\neval_iters = 40\nlog_interval = 1\nalways_save_checkpoint = False\n\ndataset = 'html_data'\ngradient_accumulation_steps = 4\nbatch_size = 16\nblock_size = 512\n\n# Architektur (~50M Params)\nn_layer = 8\nn_head = 8\nn_embd = 512\ndropout = 0.1\n\nlearning_rate = 5e-5\nmax_iters = 10000\nlr_decay_iters = 10000\nmin_lr = 5e-5\nbeta2 = 0.99\nwarmup_iters = 50\ndevice = 'cuda'\ncompile = True\ndtype = 'float16'\n\ninit_from = '{init_mode}'\n\"\"\"\n\nwith open('config/train_html.py', 'w') as f:\n    f.write(config_content)","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"# Patch train.py: print a live sample whenever a checkpoint is evaluated.\n# Sampling uses raw_model (nanoGPT's unwrapped model) instead of the\n# possibly compiled/DDP-wrapped `model`, and toggles eval/train so\n# dropout is disabled while generating.\nwith open('train.py', 'r') as f:\n    lines = f.readlines()\n\nnew_lines = []\nfor line in lines:\n    new_lines.append(line)\n    if 'if losses[\\'val\\'] < best_val_loss or always_save_checkpoint:' in line:\n        new_lines.append(\"            print('--- LIVE SAMPLE ---')\\n\")\n        new_lines.append(\"            import tiktoken\\n\")\n        new_lines.append(\"            enc = tiktoken.get_encoding('gpt2')\\n\")\n        new_lines.append(\"            start_ids = enc.encode('<!DOCT', allowed_special={'<|endoftext|>'})\\n\")\n        new_lines.append(\"            x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\\n\")\n        new_lines.append(\"            raw_model.eval()  # disable dropout while sampling\\n\")\n        new_lines.append(\"            y = raw_model.generate(x, 500, temperature=0.45, top_k=40)\\n\")\n        new_lines.append(\"            raw_model.train()\\n\")\n        new_lines.append(\"            print(enc.decode(y[0].tolist()))\\n\")\n        new_lines.append(\"            print('-------------------')\\n\")\n\nwith open('train_modified.py', 'w') as f:\n    f.writelines(new_lines)","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"!python train_modified.py config/train_html.py","metadata":{"trusted":true},"outputs":[],"execution_count":null},
{"cell_type":"code","source":"import torch\nimport tiktoken\nfrom model import GPT, GPTConfig\n\ndef generate_final_site(checkpoint_path, start_prompt=\"<!DOCTYPE html>\", max_tokens=512):\n    \"\"\"Load a nanoGPT checkpoint and sample HTML continuing start_prompt.\"\"\"\n    device = 'cuda'\n    # weights_only=False: nanoGPT checkpoints contain config dicts, which\n    # torch >= 2.6 (weights_only=True by default) refuses to unpickle.\n    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)\n\n    config = checkpoint['model_args']\n    model = GPT(GPTConfig(**config))\n    state_dict = checkpoint['model']\n\n    # torch.compile prefixes parameter names with '_orig_mod.'; strip it\n    # so the keys match the uncompiled model.\n    unwanted_prefix = '_orig_mod.'\n    for k, v in list(state_dict.items()):\n        if k.startswith(unwanted_prefix):\n            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)\n\n    model.load_state_dict(state_dict)\n    model.to(device)\n    model.eval()\n\n    enc = tiktoken.get_encoding(\"gpt2\")\n    start_ids = enc.encode(start_prompt, allowed_special={'<|endoftext|>'})\n    x = torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]\n\n    print(\"Generating website...\")\n    with torch.no_grad():\n        y = model.generate(x, max_tokens, temperature=0.85, top_k=50)\n        decoded = enc.decode(y[0].tolist())\n\n    print(\"Output:\")\n    print(decoded)\n\ngenerate_final_site('out-html/ckpt.pt', '<!DOCTYPE html><html lang=\"de\"><head><meta charset=\"utf-8\"><title>My Portfolio</title><style>body { font-family: sans-serif; background: #f0f0f0; } .card {', 1024)","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}