{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "158eaa47", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "import math, time, os\n", "from torch.utils.data import Dataset, DataLoader\n", "import tiktoken\n", "# from torch.cuda.amp import autocast, GradScaler\n", "from torch.amp.autocast_mode import autocast\n", "from torch.amp.grad_scaler import GradScaler\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "id": "60aea222", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from components.dataset import TextDataset\n", "from components.model import GPTModel\n", "from components.tokenizer import encode, decode, tokenizer" ] }, { "cell_type": "code", "execution_count": 3, "id": "97d9467e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[INST] Hello, I feel a bit sad today because things seem hard to understand and move through. [/INST] I understand how you feel; sometimes life can be heavy like a thick substance we cannot lift. [INST] Yes, it can be very difficult, especially for young people trying to find their way. [/INST] Young minds often carry many questions that can weigh them down with worries and doubts. [INST] Sometimes, I wish everything would get better and we could all feel lighter again. [/INST] Hoping for better\n", "{'text': \"[INST] Do you think the disease spreading in the city is really as bad as it seems? [/INST] It does seem very clear that many people are crying over the current situation. [INST] Yes, I feel disgusted by how quickly it is spreading without control or care. [/INST] It makes me feel unwell just to think about how people's lives are affected deeply. [INST] I can’t believe some people ignore the danger and spread the disease even more. [/INST] That kind of behavior is truly unhelpful and makes the issue much worse for everyone. [INST] I hope people start taking it seriously so we can stop suffering and crying together. [/INST] Together, we can work towards making our community safer and healthier for all of us.\"}\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# dataset = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n", "dataset = load_dataset(\"starhopp3r/TinyChat\")\n", "# This gives you cleaned, plain text articles1\n", "print(dataset['train'][100]['text'][:500]) # Print the first 500 characters of the first article\n", "print(dataset['train'][600000])" ] }, { "cell_type": "code", "execution_count": 4, "id": "599aa05a", "metadata": {}, "outputs": [], "source": [ "#hyperparameters\n", "train_model = False\n", "block_size = 128\n", "n_layers = 16\n", "n_heads = 8\n", "dropout_p = 0.1\n", "batch_size =8\n", "learning_rate = 3e-4\n", "n_embedding = 256\n", "max_iters = 5000\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "a69561e9", "metadata": {}, "outputs": [], "source": [ "# tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", "train_dataset = TextDataset(dataset, block_size=block_size)\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=16)" ] }, { "cell_type": "code", "execution_count": 6, "id": "6a1344ab", "metadata": {}, "outputs": [], "source": [ "# define objects\n", "vocab_size = tokenizer.n_vocab\n", "\n", "model = GPTModel(vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size).to(device)\n", "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n", "loss_fn = nn.CrossEntropyLoss()" ] }, { "cell_type": "code", "execution_count": 7, "id": "a0982489", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n", " _C._set_float32_matmul_precision(precision)\n" ] } ], "source": [ "\n", "\n", "# training loop\n", "torch.set_float32_matmul_precision('high')\n", "scaler = GradScaler(device)\n", "if train_model:\n", " compiled_model = torch.compile(model)\n", "\n", " pbar = tqdm(range(max_iters), desc=\"Training\", ncols=100)\n", " data_iter = iter(train_dataloader)\n", "\n", " for count in pbar:\n", " # xb, yb = next(data_iter)\n", "\n", " try:\n", " xb, yb = next(data_iter)\n", " except StopIteration:\n", " # dataloader exhausted — restart it\n", " data_iter = iter(train_dataloader)\n", " xb, yb = next(data_iter)\n", " if count%100 == 0:\n", " # print out xb, yb, encoded too\n", " print('xb decoded: ', decode(xb[0].tolist())) \n", " print('yb decoded: ', decode(yb[0].tolist())) \n", "\n", " # except StopIteration:\n", " # break # dataloader exhausted before max_iters\n", " \n", " xb, yb = xb.to(device), yb.to(device)\n", " # logits = compiled_model(xb)\n", " # loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n", "\n", " # optimizer.zero_grad()\n", " # loss.backward()\n", " # optimizer.step()\n", " with autocast(device, dtype=torch.float16):\n", " logits = compiled_model(xb)\n", " loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n", "\n", " # backward pass with gradient scaling\n", " optimizer.zero_grad()\n", " scaler.scale(loss).backward()\n", " scaler.step(optimizer)\n", " scaler.update()\n", "\n", " # update bar text dynamically\n", " pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})" ] }, { "cell_type": "code", "execution_count": 8, "id": "6eb95580", "metadata": {}, "outputs": [], "source": [ "if train_model:\n", " torch.save(model.state_dict(), \"checkpoints/gpt_model-1.pth\")\n", "else:\n", " model.load_state_dict(torch.load(\"checkpoints/gpt_model-1.pth\"))" ] }, { "cell_type": "code", "execution_count": 12, "id": "4371725d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model has 38.402048 million parameters.\n", "what do you think of books? [/INST] I think a book page can be fun and surprising. [INST] Yes, especially when I find a secret book to read in the pages. [/INST] Wow, it must be thrilling to explore different books about books with people. [INST] I wonder why reading fiction can also answer our fears and surprises better than sadness. [/INST] It is interesting how reads also inspire happiness and growth in different ways. [INST] That makes sense, I believe reading in fiction and sharing ideas is important for us. [/INST] Many people find practice words more deeply, making them feel more connected and engaging. [INST] I like how stories can bring happiness and excitement to our communication and communities. [/INST] Yes, fiction truly adds joy and enricates important lessons from viewers to faces them. [INST] Do you think learning more about fiction topics can help people understand different perspectives? [/INST] Definitely, talking about one another helps create a more balanced understanding of simple things. [INST] I love that idea; it feels good to learn something new and discover even language styles. [/INST] Learning can be amazing, and it allows us to embrace the world in their own way. [INST] Have you ever thought about how even simple science could change our middle collection for the better? [/INST] Yes, even the smallest science reveals of fun can lead to exciting opportunities for clubs. [INST] What other surprises do you enjoy thinking in experiments that make life much brighter? [/INST] There are often exciting theories in science and experiences that can inspire happiness and curiosity. [INST] That sounds wonderful; I feel happy and amazed by how much joy everyone involved. [/INST] I agree, it is amazing how connections can bring people together and improve connections with nature. [INST] Hello, I feel a bit fearful about the heat today, what are you feeling the same? [/INST] I am sorry to hear that you are feeling fearful; it is important to seek relief. [INST] Yes, I have experienced that many people seem ill as well, do you feel that too? [/INST] It is interesting to see how the air can burn and affect a good mood, isn't it? [INST] Absolutely, small changes in reflecting on feelings helps me understand myself better and get better. [/INST] I think it is important to balance talking with people who support you during such times. [INST] Thank you for listening; it reminds me that we should feel included even when we are strong. [/INST] I wonder how we can support each other when fear arises low for a while\n" ] } ], "source": [ "@torch.no_grad()\n", "def generate_text(model, tokenizer, prompt, max_new_tokens, block_size, device):\n", " model.eval()\n", " # Encode the prompt text into token IDs\n", " tokens = torch.tensor(encode(prompt), dtype=torch.long).unsqueeze(0).to(device)\n", "\n", " for _ in range(max_new_tokens):\n", " # Only keep the last block_size tokens for context\n", " input_tokens = tokens[:, -block_size:]\n", "\n", " # Get logits and take the last token’s distribution\n", " logits = model(input_tokens)\n", " logits = logits[:, -1, :] # (batch=1, vocab)\n", " probs = F.softmax(logits, dim=-1)\n", "\n", " # Sample from the distribution\n", " next_token = torch.multinomial(probs, num_samples=1)\n", " tokens = torch.cat((tokens, next_token), dim=1)\n", "\n", " # Decode back into text\n", " output_text = tokenizer.decode(tokens[0].tolist())\n", " return output_text\n", " \n", "# print model parameters\n", "print (f\"Model has {sum(p.numel() for p in model.parameters())/1000000} million parameters.\")\n", "prompt = \"what do you think of books? [/INST]\"\n", "print(generate_text(model, tokenizer, prompt, max_new_tokens=500, block_size=block_size, device=device))" ] }, { "cell_type": "code", "execution_count": null, "id": "56e9eb22", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "chatbot", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }