{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "158eaa47",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.nn import functional as F\n",
    "import math, time, os\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import tiktoken\n",
    "# from torch.cuda.amp import autocast, GradScaler\n",
    "from torch.amp.autocast_mode import autocast\n",
    "from torch.amp.grad_scaler import GradScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "97d9467e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Her campaign emailed a fundraising pitch Tuesday evening warning of the dangers of a Trump presidency and of complacency among Democrats.\n",
      "{'text': \"Canonical, keeper of the Ubuntu Linux distribution, is a small company with big friends. The latest example: Dell, IBM and Intel each are taking new steps with Ubuntu. Here's the scoop.\"}\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# dataset = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n",
    "dataset = load_dataset(\"Bingsu/openwebtext_20p\")\n",
    "# This gives you cleaned, plain text articles1\n",
    "print(dataset['train'][100]['text'][:500])  # Print the first 500 characters of the first article\n",
    "print(dataset['train'][600000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "81b98c54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# class TextDataset(Dataset):\n",
    "#     def __init__(self, hf_dataset, tokenizer, block_size):\n",
    "#         self.dataset = hf_dataset\n",
    "#         self.tokenizer = tokenizer\n",
    "#         self.block_size = block_size\n",
    "\n",
    "#     def __len__(self):\n",
    "#         return len(self.dataset['train'])\n",
    "\n",
    "#     # def __getitem__(self, idx):\n",
    "#     #     tokens = self.tokenizer.encode(self.dataset['train'][idx]['text'])\n",
    "#     #     if len(tokens) < self.block_size + 1:\n",
    "#     #         tokens = F.pad(torch.tensor(tokens), (0, self.block_size + 1 - len(tokens)), value=0)\n",
    "#     #     else:\n",
    "#     #         tokens = torch.tensor(tokens[: self.block_size + 1])\n",
    "#     #     x = tokens[: self.block_size]\n",
    "#     #     y = tokens[1 : self.block_size + 1]\n",
    "#     #     return x.long(), y.long()\n",
    "#     def __getitem__(self, idx):\n",
    "#         # choose a random index instead of using the passed idx\n",
    "#         rand_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n",
    "#         tokens = self.tokenizer.encode(self.dataset['train'][rand_idx]['text'])\n",
    "\n",
    "#         if len(tokens) < self.block_size + 1:\n",
    "#             tokens = F.pad(torch.tensor(tokens), (0, self.block_size + 1 - len(tokens)), value=0)\n",
    "#         else:\n",
    "#             tokens = torch.tensor(tokens[: self.block_size + 1])\n",
    "\n",
    "#         x = tokens[: self.block_size]\n",
    "#         y = tokens[1 : self.block_size + 1]\n",
    "#         return x.long(), y.long()\n",
    "# ... existing code ...\n",
    "\n",
    "class TextDataset(Dataset):\n",
    "    def __init__(self, hf_dataset, tokenizer, block_size):\n",
    "        self.dataset = hf_dataset\n",
    "        self.tokenizer = tokenizer\n",
    "        self.block_size = block_size\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.dataset['train'])\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        # Start with a random index sample\n",
    "        rand_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n",
    "        text = self.dataset['train'][rand_idx]['text']\n",
    "        tokens = self.tokenizer.encode(text)\n",
    "\n",
    "        # Keep appending more samples if too short\n",
    "        while len(tokens) < self.block_size + 1:\n",
    "            next_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n",
    "            next_text = self.dataset['train'][next_idx]['text']\n",
    "            tokens.extend(self.tokenizer.encode(\" \" + next_text))\n",
    "            # Prevent runaway growth\n",
    "            if len(tokens) > self.block_size * 2:\n",
    "                break\n",
    "\n",
    "        # Truncate to block_size + 1\n",
    "        tokens = torch.tensor(tokens[: self.block_size + 1])\n",
    "\n",
    "        x = tokens[: self.block_size]\n",
    "        y = tokens[1 : self.block_size + 1]\n",
    "        return x.long(), y.long()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "599aa05a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#hyperparameters\n",
    "train_model =True\n",
    "block_size = 256\n",
    "n_layers = 8\n",
    "n_heads = 8\n",
    "dropout_p = 0.1\n",
    "batch_size =8\n",
    "learning_rate = 3e-4\n",
    "n_embedding = 512\n",
    "max_iters = 5000\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a69561e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
    "\n",
    "train_dataset = TextDataset(dataset, tokenizer, block_size=block_size)\n",
    "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ea5598ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "class GPTModel(nn.Module):\n",
    "    def __init__(self, vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size):\n",
    "        super(GPTModel, self).__init__()\n",
    "        self.token_embedding = nn.Embedding(vocab_size, n_embedding)\n",
    "        self.position_embedding = nn.Embedding(block_size, n_embedding)\n",
    "        self.layers = nn.ModuleList([\n",
    "            nn.TransformerEncoderLayer(d_model=n_embedding, nhead=n_heads, dropout=dropout_p)\n",
    "            for _ in range(n_layers)\n",
    "        ])\n",
    "        self.ln_f = nn.LayerNorm(n_embedding)\n",
    "        self.head = nn.Linear(n_embedding, vocab_size)\n",
    "        self.dropout = nn.Dropout(dropout_p)\n",
    "        self.block_size = block_size\n",
    "\n",
    "    def forward(self, x):\n",
    "        bsz, seq_len = x.size()\n",
    "        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(bsz, seq_len)\n",
    "        x = self.token_embedding(x) + self.position_embedding(positions)\n",
    "        x = self.dropout(x)\n",
    "\n",
    "        for layer in self.layers:\n",
    "            x = layer(x)\n",
    "\n",
    "        x = self.ln_f(x)\n",
    "        logits = self.head(x)\n",
    "        return logits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6a1344ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define objects\n",
    "vocab_size = tokenizer.n_vocab\n",
    "\n",
    "model = GPTModel(vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size).to(device)\n",
    "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
    "loss_fn = nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a0982489",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n",
      "  _C._set_float32_matmul_precision(precision)\n",
      "Training: 100%|████████████████████████████████████| 5000/5000 [06:27<00:00, 12.89it/s, loss=7.7995]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "# training loop\n",
    "torch.set_float32_matmul_precision('high')\n",
    "scaler = GradScaler(device)\n",
    "if train_model:\n",
    "    compiled_model = torch.compile(model)\n",
    "\n",
    "    pbar = tqdm(range(max_iters), desc=\"Training\", ncols=100)\n",
    "    data_iter = iter(train_dataloader)\n",
    "\n",
    "    for count in pbar:\n",
    "        xb, yb = next(data_iter)\n",
    "        # try:\n",
    "        #     if(count%100==0):\n",
    "        #         print(f\"Iteration {count}\")\n",
    "        #         xb, yb = next(data_iter)\n",
    "        #         print(f\"Batch shape: {xb.shape}, {yb.shape}\")\n",
    "        #         print('y decoded: ', tokenizer.decode(yb[0].tolist()))\n",
    "        #         print('y not decoded: ', yb[0].tolist())\n",
    "        #         print('x decoded: ', tokenizer.decode(xb[0].tolist()))\n",
    "        #         print('x not decoded: ', xb[0].tolist())\n",
    "    \n",
    "        # except StopIteration:\n",
    "        #     break  # dataloader exhausted before max_iters\n",
    "        \n",
    "        xb, yb = xb.to(device), yb.to(device)\n",
    "        # logits = compiled_model(xb)\n",
    "        # loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n",
    "\n",
    "        # optimizer.zero_grad()\n",
    "        # loss.backward()\n",
    "        # optimizer.step()\n",
    "        with autocast(device, dtype=torch.float16):\n",
    "            logits = compiled_model(xb)\n",
    "            loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n",
    "\n",
    "        # backward pass with gradient scaling\n",
    "        optimizer.zero_grad()\n",
    "        scaler.scale(loss).backward()\n",
    "        scaler.step(optimizer)\n",
    "        scaler.update()\n",
    "\n",
    "        # update bar text dynamically\n",
    "        pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6eb95580",
   "metadata": {},
   "outputs": [],
   "source": [
    "if train_model:\n",
    "  torch.save(model.state_dict(), \"checkpoints/gpt_model-1.pth\")\n",
    "else:\n",
    "  model.load_state_dict(torch.load(\"checkpoints/gpt_model-1.pth\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4371725d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model has 76.864593 million parameters.\n",
      "this new company does � week film the 5 the�ana be 2002 of there to that realWell runs such� to found, inex their a but just might said�, later to? vision candidate resultd agon if give continue anti information Beast find beer the I over\n"
     ]
    }
   ],
   "source": [
    "@torch.no_grad()\n",
    "def generate_text(model, tokenizer, prompt, max_new_tokens, block_size, device):\n",
    "    model.eval()\n",
    "    # Encode the prompt text into token IDs\n",
    "    tokens = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)\n",
    "\n",
    "    for _ in range(max_new_tokens):\n",
    "        # Only keep the last block_size tokens for context\n",
    "        input_tokens = tokens[:, -block_size:]\n",
    "\n",
    "        # Get logits and take the last token’s distribution\n",
    "        logits = model(input_tokens)\n",
    "        logits = logits[:, -1, :]  # (batch=1, vocab)\n",
    "        probs = F.softmax(logits, dim=-1)\n",
    "\n",
    "        # Sample from the distribution\n",
    "        next_token = torch.multinomial(probs, num_samples=1)\n",
    "        tokens = torch.cat((tokens, next_token), dim=1)\n",
    "\n",
    "    # Decode back into text\n",
    "    output_text = tokenizer.decode(tokens[0].tolist())\n",
    "    return output_text\n",
    "  \n",
    "# print model parameters\n",
    "print (f\"Model has {sum(p.numel() for p in model.parameters())/1000000} million parameters.\")\n",
    "prompt = \"this new company does \"\n",
    "print(generate_text(model, tokenizer, prompt, max_new_tokens=50, block_size=block_size, device=device))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56e9eb22",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chatbot",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}