{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "158eaa47",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.nn import functional as F\n",
    "import math, time, os\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import tiktoken\n",
    "# from torch.cuda.amp import autocast, GradScaler\n",
    "from torch.amp.autocast_mode import autocast\n",
    "from torch.amp.grad_scaler import GradScaler\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "60aea222",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from components.dataset import TextDataset\n",
    "from components.model import GPTModel\n",
    "from components.tokenizer import encode, decode, tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "97d9467e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[INST] Hello, I feel a bit sad today because things seem hard to understand and move through. [/INST] I understand how you feel; sometimes life can be heavy like a thick substance we cannot lift. [INST] Yes, it can be very difficult, especially for young people trying to find their way. [/INST] Young minds often carry many questions that can weigh them down with worries and doubts. [INST] Sometimes, I wish everything would get better and we could all feel lighter again. [/INST] Hoping for better\n",
      "{'text': \"[INST] Do you think the disease spreading in the city is really as bad as it seems? [/INST] It does seem very clear that many people are crying over the current situation. [INST] Yes, I feel disgusted by how quickly it is spreading without control or care. [/INST] It makes me feel unwell just to think about how people's lives are affected deeply. [INST] I can’t believe some people ignore the danger and spread the disease even more. [/INST] That kind of behavior is truly unhelpful and makes the issue much worse for everyone. [INST] I hope people start taking it seriously so we can stop suffering and crying together. [/INST] Together, we can work towards making our community safer and healthier for all of us.\"}\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# dataset = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n",
    "dataset = load_dataset(\"starhopp3r/TinyChat\")\n",
    "# This gives you cleaned, plain text articles1\n",
    "print(dataset['train'][100]['text'][:500])  # Print the first 500 characters of the first article\n",
    "print(dataset['train'][600000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "599aa05a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#hyperparameters\n",
    "train_model = False\n",
    "block_size = 128\n",
    "n_layers = 16\n",
    "n_heads = 8\n",
    "dropout_p = 0.1\n",
    "batch_size =8\n",
    "learning_rate = 3e-4\n",
    "n_embedding = 256\n",
    "max_iters = 5000\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a69561e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
    "\n",
    "train_dataset = TextDataset(dataset, block_size=block_size)\n",
    "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6a1344ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define objects\n",
    "vocab_size = tokenizer.n_vocab\n",
    "\n",
    "model = GPTModel(vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size).to(device)\n",
    "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n",
    "loss_fn = nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a0982489",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n",
      "  _C._set_float32_matmul_precision(precision)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "# training loop\n",
    "torch.set_float32_matmul_precision('high')\n",
    "scaler = GradScaler(device)\n",
    "if train_model:\n",
    "    compiled_model = torch.compile(model)\n",
    "\n",
    "    pbar = tqdm(range(max_iters), desc=\"Training\", ncols=100)\n",
    "    data_iter = iter(train_dataloader)\n",
    "\n",
    "    for count in pbar:\n",
    "        # xb, yb = next(data_iter)\n",
    "\n",
    "        try:\n",
    "            xb, yb = next(data_iter)\n",
    "        except StopIteration:\n",
    "            # dataloader exhausted — restart it\n",
    "            data_iter = iter(train_dataloader)\n",
    "            xb, yb = next(data_iter)\n",
    "        if count%100 == 0:\n",
    "          # print out xb, yb, encoded too\n",
    "          print('xb decoded: ', decode(xb[0].tolist())) \n",
    "          print('yb decoded: ', decode(yb[0].tolist())) \n",
    "\n",
    "        # except StopIteration:\n",
    "        #     break  # dataloader exhausted before max_iters\n",
    "        \n",
    "        xb, yb = xb.to(device), yb.to(device)\n",
    "        # logits = compiled_model(xb)\n",
    "        # loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n",
    "\n",
    "        # optimizer.zero_grad()\n",
    "        # loss.backward()\n",
    "        # optimizer.step()\n",
    "        with autocast(device, dtype=torch.float16):\n",
    "            logits = compiled_model(xb)\n",
    "            loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n",
    "\n",
    "        # backward pass with gradient scaling\n",
    "        optimizer.zero_grad()\n",
    "        scaler.scale(loss).backward()\n",
    "        scaler.step(optimizer)\n",
    "        scaler.update()\n",
    "\n",
    "        # update bar text dynamically\n",
    "        pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6eb95580",
   "metadata": {},
   "outputs": [],
   "source": [
    "if train_model:\n",
    "  torch.save(model.state_dict(), \"checkpoints/gpt_model-1.pth\")\n",
    "else:\n",
    "  model.load_state_dict(torch.load(\"checkpoints/gpt_model-1.pth\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "4371725d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model has 38.402048 million parameters.\n",
      "what do you think of books? [/INST] I think a book page can be fun and surprising. [INST] Yes, especially when I find a secret book to read in the pages. [/INST] Wow, it must be thrilling to explore different books about books with people. [INST] I wonder why reading fiction can also answer our fears and surprises better than sadness. [/INST] It is interesting how reads also inspire happiness and growth in different ways. [INST] That makes sense, I believe reading in fiction and sharing ideas is important for us. [/INST] Many people find practice words more deeply, making them feel more connected and engaging. [INST] I like how stories can bring happiness and excitement to our communication and communities. [/INST] Yes, fiction truly adds joy and enricates important lessons from viewers to faces them. [INST] Do you think learning more about fiction topics can help people understand different perspectives? [/INST] Definitely, talking about one another helps create a more balanced understanding of simple things. [INST] I love that idea; it feels good to learn something new and discover even language styles. [/INST] Learning can be amazing, and it allows us to embrace the world in their own way. [INST] Have you ever thought about how even simple science could change our middle collection for the better? [/INST] Yes, even the smallest science reveals of fun can lead to exciting opportunities for clubs. [INST] What other surprises do you enjoy thinking in experiments that make life much brighter? [/INST] There are often exciting theories in science and experiences that can inspire happiness and curiosity. [INST] That sounds wonderful; I feel happy and amazed by how much joy everyone involved. [/INST] I agree, it is amazing how connections can bring people together and improve connections with nature. [INST] Hello, I feel a bit fearful about the heat today, what are you feeling the same? [/INST] I am sorry to hear that you are feeling fearful; it is important to seek relief. [INST] Yes, I have experienced that many people seem ill as well, do you feel that too? [/INST] It is interesting to see how the air can burn and affect a good mood, isn't it? [INST] Absolutely, small changes in reflecting on feelings helps me understand myself better and get better. [/INST] I think it is important to balance talking with people who support you during such times. [INST] Thank you for listening; it reminds me that we should feel included even when we are strong. [/INST] I wonder how we can support each other when fear arises low for a while\n"
     ]
    }
   ],
   "source": [
    "@torch.no_grad()\n",
    "def generate_text(model, tokenizer, prompt, max_new_tokens, block_size, device):\n",
    "    model.eval()\n",
    "    # Encode the prompt text into token IDs\n",
    "    tokens = torch.tensor(encode(prompt), dtype=torch.long).unsqueeze(0).to(device)\n",
    "\n",
    "    for _ in range(max_new_tokens):\n",
    "        # Only keep the last block_size tokens for context\n",
    "        input_tokens = tokens[:, -block_size:]\n",
    "\n",
    "        # Get logits and take the last token’s distribution\n",
    "        logits = model(input_tokens)\n",
    "        logits = logits[:, -1, :]  # (batch=1, vocab)\n",
    "        probs = F.softmax(logits, dim=-1)\n",
    "\n",
    "        # Sample from the distribution\n",
    "        next_token = torch.multinomial(probs, num_samples=1)\n",
    "        tokens = torch.cat((tokens, next_token), dim=1)\n",
    "\n",
    "    # Decode back into text\n",
    "    output_text = tokenizer.decode(tokens[0].tolist())\n",
    "    return output_text\n",
    "  \n",
    "# print model parameters\n",
    "print (f\"Model has {sum(p.numel() for p in model.parameters())/1000000} million parameters.\")\n",
    "prompt = \"what do you think of books? [/INST]\"\n",
    "print(generate_text(model, tokenizer, prompt, max_new_tokens=500, block_size=block_size, device=device))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56e9eb22",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chatbot",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}