{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "158eaa47", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn as nn\n", "from torch.nn import functional as F\n", "import math, time, os\n", "from torch.utils.data import Dataset, DataLoader\n", "import tiktoken\n", "# from torch.cuda.amp import autocast, GradScaler\n", "from torch.amp.autocast_mode import autocast\n", "from torch.amp.grad_scaler import GradScaler" ] }, { "cell_type": "code", "execution_count": 2, "id": "97d9467e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Her campaign emailed a fundraising pitch Tuesday evening warning of the dangers of a Trump presidency and of complacency among Democrats.\n", "{'text': \"Canonical, keeper of the Ubuntu Linux distribution, is a small company with big friends. The latest example: Dell, IBM and Intel each are taking new steps with Ubuntu. Here's the scoop.\"}\n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "# dataset = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n", "dataset = load_dataset(\"Bingsu/openwebtext_20p\")\n", "# This gives you cleaned, plain text articles1\n", "print(dataset['train'][100]['text'][:500]) # Print the first 500 characters of the first article\n", "print(dataset['train'][600000])" ] }, { "cell_type": "code", "execution_count": 3, "id": "81b98c54", "metadata": {}, "outputs": [], "source": [ "# class TextDataset(Dataset):\n", "# def __init__(self, hf_dataset, tokenizer, block_size):\n", "# self.dataset = hf_dataset\n", "# self.tokenizer = tokenizer\n", "# self.block_size = block_size\n", "\n", "# def __len__(self):\n", "# return len(self.dataset['train'])\n", "\n", "# # def __getitem__(self, idx):\n", "# # tokens = self.tokenizer.encode(self.dataset['train'][idx]['text'])\n", "# # if len(tokens) < self.block_size + 1:\n", "# # tokens = F.pad(torch.tensor(tokens), (0, self.block_size + 1 - len(tokens)), value=0)\n", "# # else:\n", "# # tokens = torch.tensor(tokens[: self.block_size + 1])\n", "# # x = tokens[: self.block_size]\n", "# # y = tokens[1 : self.block_size + 1]\n", "# # return x.long(), y.long()\n", "# def __getitem__(self, idx):\n", "# # choose a random index instead of using the passed idx\n", "# rand_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n", "# tokens = self.tokenizer.encode(self.dataset['train'][rand_idx]['text'])\n", "\n", "# if len(tokens) < self.block_size + 1:\n", "# tokens = F.pad(torch.tensor(tokens), (0, self.block_size + 1 - len(tokens)), value=0)\n", "# else:\n", "# tokens = torch.tensor(tokens[: self.block_size + 1])\n", "\n", "# x = tokens[: self.block_size]\n", "# y = tokens[1 : self.block_size + 1]\n", "# return x.long(), y.long()\n", "# ... existing code ...\n", "\n", "class TextDataset(Dataset):\n", " def __init__(self, hf_dataset, tokenizer, block_size):\n", " self.dataset = hf_dataset\n", " self.tokenizer = tokenizer\n", " self.block_size = block_size\n", "\n", " def __len__(self):\n", " return len(self.dataset['train'])\n", "\n", " def __getitem__(self, idx):\n", " # Start with a random index sample\n", " rand_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n", " text = self.dataset['train'][rand_idx]['text']\n", " tokens = self.tokenizer.encode(text)\n", "\n", " # Keep appending more samples if too short\n", " while len(tokens) < self.block_size + 1:\n", " next_idx = torch.randint(0, len(self.dataset['train']), (1,)).item()\n", " next_text = self.dataset['train'][next_idx]['text']\n", " tokens.extend(self.tokenizer.encode(\" \" + next_text))\n", " # Prevent runaway growth\n", " if len(tokens) > self.block_size * 2:\n", " break\n", "\n", " # Truncate to block_size + 1\n", " tokens = torch.tensor(tokens[: self.block_size + 1])\n", "\n", " x = tokens[: self.block_size]\n", " y = tokens[1 : self.block_size + 1]\n", " return x.long(), y.long()" ] }, { "cell_type": "code", "execution_count": 4, "id": "599aa05a", "metadata": {}, "outputs": [], "source": [ "#hyperparameters\n", "train_model =True\n", "block_size = 256\n", "n_layers = 8\n", "n_heads = 8\n", "dropout_p = 0.1\n", "batch_size =8\n", "learning_rate = 3e-4\n", "n_embedding = 512\n", "max_iters = 5000\n", "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "a69561e9", "metadata": {}, "outputs": [], "source": [ "tokenizer = tiktoken.get_encoding(\"gpt2\")\n", "\n", "train_dataset = TextDataset(dataset, tokenizer, block_size=block_size)\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=16)" ] }, { "cell_type": "code", "execution_count": 6, "id": "ea5598ea", "metadata": {}, "outputs": [], "source": [ "class GPTModel(nn.Module):\n", " def __init__(self, vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size):\n", " super(GPTModel, self).__init__()\n", " self.token_embedding = nn.Embedding(vocab_size, n_embedding)\n", " self.position_embedding = nn.Embedding(block_size, n_embedding)\n", " self.layers = nn.ModuleList([\n", " nn.TransformerEncoderLayer(d_model=n_embedding, nhead=n_heads, dropout=dropout_p)\n", " for _ in range(n_layers)\n", " ])\n", " self.ln_f = nn.LayerNorm(n_embedding)\n", " self.head = nn.Linear(n_embedding, vocab_size)\n", " self.dropout = nn.Dropout(dropout_p)\n", " self.block_size = block_size\n", "\n", " def forward(self, x):\n", " bsz, seq_len = x.size()\n", " positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).expand(bsz, seq_len)\n", " x = self.token_embedding(x) + self.position_embedding(positions)\n", " x = self.dropout(x)\n", "\n", " for layer in self.layers:\n", " x = layer(x)\n", "\n", " x = self.ln_f(x)\n", " logits = self.head(x)\n", " return logits" ] }, { "cell_type": "code", "execution_count": 7, "id": "6a1344ab", "metadata": {}, "outputs": [], "source": [ "# define objects\n", "vocab_size = tokenizer.n_vocab\n", "\n", "model = GPTModel(vocab_size, n_embedding, n_layers, n_heads, dropout_p, block_size).to(device)\n", "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)\n", "loss_fn = nn.CrossEntropyLoss()" ] }, { "cell_type": "code", "execution_count": 8, "id": "a0982489", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/software/Documents/.rianstuff/chatbot/.venv/lib/python3.12/site-packages/torch/__init__.py:1617: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)\n", " _C._set_float32_matmul_precision(precision)\n", "Training: 100%|████████████████████████████████████| 5000/5000 [06:27<00:00, 12.89it/s, loss=7.7995]\n" ] } ], "source": [ "\n", "from tqdm import tqdm\n", "\n", "# training loop\n", "torch.set_float32_matmul_precision('high')\n", "scaler = GradScaler(device)\n", "if train_model:\n", " compiled_model = torch.compile(model)\n", "\n", " pbar = tqdm(range(max_iters), desc=\"Training\", ncols=100)\n", " data_iter = iter(train_dataloader)\n", "\n", " for count in pbar:\n", " xb, yb = next(data_iter)\n", " # try:\n", " # if(count%100==0):\n", " # print(f\"Iteration {count}\")\n", " # xb, yb = next(data_iter)\n", " # print(f\"Batch shape: {xb.shape}, {yb.shape}\")\n", " # print('y decoded: ', tokenizer.decode(yb[0].tolist()))\n", " # print('y not decoded: ', yb[0].tolist())\n", " # print('x decoded: ', tokenizer.decode(xb[0].tolist()))\n", " # print('x not decoded: ', xb[0].tolist())\n", " \n", " # except StopIteration:\n", " # break # dataloader exhausted before max_iters\n", " \n", " xb, yb = xb.to(device), yb.to(device)\n", " # logits = compiled_model(xb)\n", " # loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n", "\n", " # optimizer.zero_grad()\n", " # loss.backward()\n", " # optimizer.step()\n", " with autocast(device, dtype=torch.float16):\n", " logits = compiled_model(xb)\n", " loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))\n", "\n", " # backward pass with gradient scaling\n", " optimizer.zero_grad()\n", " scaler.scale(loss).backward()\n", " scaler.step(optimizer)\n", " scaler.update()\n", "\n", " # update bar text dynamically\n", " pbar.set_postfix({\"loss\": f\"{loss.item():.4f}\"})" ] }, { "cell_type": "code", "execution_count": 9, "id": "6eb95580", "metadata": {}, "outputs": [], "source": [ "if train_model:\n", " torch.save(model.state_dict(), \"checkpoints/gpt_model-1.pth\")\n", "else:\n", " model.load_state_dict(torch.load(\"checkpoints/gpt_model-1.pth\"))" ] }, { "cell_type": "code", "execution_count": 18, "id": "4371725d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model has 76.864593 million parameters.\n", "this new company does � week film the 5 the�ana be 2002 of there to that realWell runs such� to found, inex their a but just might said�, later to? vision candidate resultd agon if give continue anti information Beast find beer the I over\n" ] } ], "source": [ "@torch.no_grad()\n", "def generate_text(model, tokenizer, prompt, max_new_tokens, block_size, device):\n", " model.eval()\n", " # Encode the prompt text into token IDs\n", " tokens = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)\n", "\n", " for _ in range(max_new_tokens):\n", " # Only keep the last block_size tokens for context\n", " input_tokens = tokens[:, -block_size:]\n", "\n", " # Get logits and take the last token’s distribution\n", " logits = model(input_tokens)\n", " logits = logits[:, -1, :] # (batch=1, vocab)\n", " probs = F.softmax(logits, dim=-1)\n", "\n", " # Sample from the distribution\n", " next_token = torch.multinomial(probs, num_samples=1)\n", " tokens = torch.cat((tokens, next_token), dim=1)\n", "\n", " # Decode back into text\n", " output_text = tokenizer.decode(tokens[0].tolist())\n", " return output_text\n", " \n", "# print model parameters\n", "print (f\"Model has {sum(p.numel() for p in model.parameters())/1000000} million parameters.\")\n", "prompt = \"this new company does \"\n", "print(generate_text(model, tokenizer, prompt, max_new_tokens=50, block_size=block_size, device=device))" ] }, { "cell_type": "code", "execution_count": null, "id": "56e9eb22", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "chatbot", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" } }, "nbformat": 4, "nbformat_minor": 5 }