{ "cells": [ { "cell_type": "code", "execution_count": 112, "id": "5f1bb753", "metadata": {}, "outputs": [], "source": [ "with open(\"input.txt\", \"r\") as f:\n", " text = f.read()" ] }, { "cell_type": "code", "execution_count": 113, "id": "9cf7e7ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Length of text: 1115394 characters\n", "\n", " !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\n", "Vocab size: 65\n" ] } ], "source": [ "length = len(text)\n", "print(f\"Length of text: {length} characters\")\n", "char = sorted(list(set(text)))\n", "vocab_size = len(char)\n", "print(\"\".join(char))\n", "print(f\"Vocab size: {vocab_size}\")" ] }, { "cell_type": "code", "execution_count": 114, "id": "1b910dc7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]\n", "hello world\n" ] } ], "source": [ "stoi = {ch:i for i,ch in enumerate(char)}\n", "itos = {i:ch for i,ch in enumerate(char)}\n", "encode = lambda s: [stoi[c] for c in s]\n", "decode = lambda l: \"\".join([itos[i] for i in l])\n", "print(encode(\"hello world\"))\n", "print(decode(encode(\"hello world\"))) # note this is one of the simplest possible tokenizers, it just maps each character to an integer. everyone has their own tokenizer like google use sentencepiece, openai use bpe, etc. we will build our own tokenizer in the next notebook." 
] }, { "cell_type": "code", "execution_count": 115, "id": "3d287813", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "torch.manual_seed(42)" ] }, { "cell_type": "code", "execution_count": 116, "id": "4786dcce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1115394]) torch.int64\n", "tensor([18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44,\n", " 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63,\n", " 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1,\n", " 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49,\n", " 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47,\n", " 58, 47, 64, 43, 52, 10, 0, 37, 53, 59])\n" ] } ], "source": [ "data = torch.tensor(encode(text), dtype=torch.long)\n", "print(data.shape, data.dtype)\n", "print(data[:100])" ] }, { "cell_type": "code", "execution_count": 117, "id": "ee9c3b71", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([1003854]) torch.Size([111540])\n" ] } ], "source": [ "n = int(0.9*len(data))\n", "train_data = data[:n]\n", "val_data = data[n:]\n", "print(train_data.shape, val_data.shape)" ] }, { "cell_type": "code", "execution_count": 118, "id": "14d2fe85", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([18, 47, 56, 57, 58, 1, 15, 47, 58])\n" ] } ], "source": [ "block_size = 8\n", "train_data[:block_size+1] # we will use the first 8 characters to predict\n", "print(train_data[:block_size+1])" ] }, { "cell_type": "code", "execution_count": 119, "id": "a690a090", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "using mps device\n" ] } ], "source": [ "#use mps as i am using the mac 
with m4 \n", "batch_size = 64 # how many independent sequences will we process in parallel?\n", "block_size = 256 # what is the maximum context length for predictions?\n", "n_embeed = 384 \n", "max_iters = 20000\n", "eval_iters = 2000 \n", "lr_rate = 2e-4\n", "dropout = 0.2\n", "n_layer = 8\n", "n_head = 8\n", "device = \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n", "print(f\"using {device} device\")" ] }, { "cell_type": "code", "execution_count": 120, "id": "d90a7d94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "inputs:\n", "torch.Size([64, 256])\n", "tensor([[ 0, 26, 53, ..., 56, 43, 47],\n", " [60, 43, 56, ..., 56, 1, 41],\n", " [26, 21, 33, ..., 26, 21, 13],\n", " ...,\n", " [ 5, 57, 1, ..., 1, 35, 47],\n", " [56, 53, 53, ..., 59, 50, 42],\n", " [42, 47, 56, ..., 39, 56, 1]], device='mps:0')\n", "\n", "targets:\n", "torch.Size([64, 256])\n", "tensor([[26, 53, 58, ..., 43, 47, 45],\n", " [43, 56, 1, ..., 1, 41, 53],\n", " [21, 33, 31, ..., 21, 13, 10],\n", " ...,\n", " [57, 1, 52, ..., 35, 47, 50],\n", " [53, 53, 58, ..., 50, 42, 1],\n", " [47, 56, 43, ..., 56, 1, 51]], device='mps:0')\n" ] } ], "source": [ "torch.manual_seed(1337)\n", "def get_batch(split):\n", " data = train_data if split == 'train' else val_data\n", " ix = torch.randint(len(data) - block_size, (batch_size,))\n", " x = torch.stack([data[i:i+block_size] for i in ix])\n", " y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n", " x, y = x.to(device), y.to(device)\n", " return x, y\n", "xb, yb = get_batch('train')\n", "print(\"inputs:\")\n", "print(xb.shape)\n", "print(xb)\n", "print(\"\\ntargets:\")\n", "print(yb.shape)\n", "print(yb)" ] }, { "cell_type": "code", "execution_count": 121, "id": "27573f3f", "metadata": {}, "outputs": [], "source": [ "class Head(torch.nn.Module):\n", " def __init__(self, head_size):\n", " super().__init__()\n", " self.head_size = head_size\n", " self.key = nn.Linear(n_embeed, head_size, bias=False)\n", " 
self.query = nn.Linear(n_embeed, head_size, bias=False)\n",
 "        self.value = nn.Linear(n_embeed, head_size, bias=False)\n",
 "        # causal mask cached once as a non-trainable buffer (moves with .to(device))\n",
 "        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
 "        self.dropout = nn.Dropout(dropout)\n",
 "\n",
 "    def forward(self, x):\n",
 "        B,T,C = x.shape\n",
 "        k = self.key(x)   # (B,T,head_size)\n",
 "        q = self.query(x) # (B,T,head_size)\n",
 "        v = self.value(x) # (B,T,head_size)\n",
 "        # scaled dot-product attention scores\n",
 "        weights = q @ k.transpose(-2, -1) * (self.head_size ** -0.5) # (B,T,hs) @ (B,hs,T) -> (B,T,T)\n",
 "        # reuse the registered buffer instead of rebuilding the mask on every forward pass;\n",
 "        # slicing to [:T, :T] supports contexts shorter than block_size.\n",
 "        # decoder-only model: mask out future positions so each token attends only to the\n",
 "        # past (an encoder-style transformer would drop this mask and attend everywhere).\n",
 "        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n",
 "        weights = torch.softmax(weights, dim=-1)\n",
 "        weights = self.dropout(weights)\n",
 "        out = weights @ v # (B,T,T) @ (B,T,hs) -> (B,T,hs)\n",
 "        return out\n"
 ] }, { "cell_type": "code", "execution_count": 122, "id": "a776b854", "metadata": {}, "outputs": [], "source": [
 "class MultiHeadAttention(torch.nn.Module):\n",
 "    \"\"\"Several attention heads in parallel; outputs are concatenated and projected.\"\"\"\n",
 "    def __init__(self, num_heads, head_size):\n",
 "        super().__init__()\n",
 "        self.heads = torch.nn.ModuleList([Head(head_size) for _ in range(num_heads)])\n",
 "        self.proj = torch.nn.Linear(n_embeed, n_embeed)\n",
 "        self.dropout = nn.Dropout(dropout)\n",
 "    def forward(self, x):\n",
 "        # run all heads and concatenate along the channel dimension -> (B,T,n_embeed)\n",
 "        out = torch.cat([h(x) for h in self.heads], dim=-1)\n",
 "        # bug fix: self.dropout was constructed but never applied; regularize the\n",
 "        # projected output as in the reference architecture\n",
 "        out = self.dropout(self.proj(out))\n",
 "        return out\n"
 ] }, { "cell_type": "code", "execution_count": 123, "id": "da0d9201", "metadata": {}, "outputs": [], "source": [
 "class feedforward(torch.nn.Module):\n",
 "    \"\"\"Position-wise feed-forward network applied independently at every position.\"\"\"\n",
 "    def __init__(self, n_embeed):\n",
 "        super().__init__()\n",
 "        self.net = torch.nn.Sequential(\n",
 "            torch.nn.Linear(n_embeed, 4*n_embeed), # the paper uses a 4x expansion in the hidden layer\n",
 "            
torch.nn.ReLU(),\n",
 "            torch.nn.Linear(4*n_embeed, n_embeed),\n",
 "            nn.Dropout(dropout)\n",
 "        )\n",
 "    def forward(self, x):\n",
 "        return self.net(x)\n"
 ] }, { "cell_type": "code", "execution_count": 124, "id": "1b2fc012", "metadata": {}, "outputs": [], "source": [
 "class Block(torch.nn.Module):\n",
 "    \"\"\"Transformer block: multi-head self-attention then feed-forward, each wrapped\n",
 "    in a residual connection with pre-layer-norm.\"\"\"\n",
 "    def __init__(self, n_embeed , n_head):\n",
 "        super().__init__()\n",
 "        head_size = n_embeed // n_head\n",
 "        self.sa_head = MultiHeadAttention(num_heads=n_head, head_size=head_size)\n",
 "        self.ffwd = feedforward(n_embeed)\n",
 "        self.ln1 = nn.LayerNorm(n_embeed)\n",
 "        self.ln2 = nn.LayerNorm(n_embeed)\n",
 "        # (removed an unused nn.Dropout here: dropout is already applied inside the\n",
 "        # attention heads and the feed-forward net)\n",
 "    def forward(self, x):\n",
 "        # pre-norm residuals: a slight deviation from the original paper (which norms\n",
 "        # after the sublayer); pre-norm is common practice now and optimizes better.\n",
 "        x = x + self.sa_head(self.ln1(x))\n",
 "        x = x + self.ffwd(self.ln2(x))\n",
 "        return x\n"
 ] }, { "cell_type": "code", "execution_count": 125, "id": "7a3053a0", "metadata": {}, "outputs": [], "source": [
 "class BigramLanguageModel(torch.nn.Module):\n",
 "    \"\"\"Decoder-only transformer language model over character tokens.\"\"\"\n",
 "    def __init__(self, vocab_size, n_embeed):\n",
 "        super().__init__()\n",
 "        self.token_embedding_table = torch.nn.Embedding(vocab_size, n_embeed)\n",
 "        self.position_embedding_table = torch.nn.Embedding(block_size, n_embeed)\n",
 "        # NOTE: naively stacking plain blocks did not give enhanced results -- deep nets\n",
 "        # suffer optimisation issues without the residual connections and layer norms above.\n",
 "        self.blocks = nn.Sequential(*[Block(n_embeed, n_head) for _ in range(n_layer)])\n",
 "        self.ln_f = nn.LayerNorm(n_embeed)\n",
 "        self.lm_head = torch.nn.Linear(n_embeed, vocab_size)\n",
 "    def forward(self, idx, targets=None):\n",
 "        B,T = idx.shape\n",
 "        # idx and targets are both (B,T) tensors of integer token ids\n",
 "        token_emb = 
self.token_embedding_table(idx) # (B,T,C)\n", " pos_emb = self.position_embedding_table(torch.arange(idx.shape[1], device=idx.device)) # (T,C)\n", " x = token_emb + pos_emb # (B,T,C)\n", " x = self.blocks(x) # (B,T,C)\n", " x = self.ln_f(x) # (B,T,C)\n", " logits = self.lm_head(x) # (B,T,vocab_size)\n", " if targets is None:\n", " loss = None\n", " else:\n", " B,T,C = logits.shape\n", " logits = logits.view(B*T, C)\n", " targets = targets.view(B*T)\n", " loss = F.cross_entropy(logits, targets)\n", " return logits, loss\n", " \n", " def generate(self, idx, max_new_tokens):\n", " # idx is (B,T) array of indices in the current context\n", " for _ in range(max_new_tokens):\n", " idx_cond = idx[:, -block_size:] # crop idx to the last block_size tokens\n", " logits, loss = self(idx_cond)\n", " logits = logits[:, -1, :] # becomes (B,C) , as we only want to provide the last character as the input to predict the next character\n", " probs = F.softmax(logits, dim=-1) # (B,C)\n", " idx_next = torch.multinomial(probs, num_samples=1) # (B,1)\n", " idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)\n", " return idx" ] }, { "cell_type": "code", "execution_count": null, "id": "67e96f0b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 126, "id": "0e9d66e8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "logits shape: torch.Size([16384, 65])\n", "loss: 4.277037620544434\n", "\n", "tRNt'OUWzNdaNv;DZ!HWJxsg-rG$l.\n", "VXx;h&CEqoyJOlF.DmdMw;u;cjEIgcOQOID;$wig.tRIgazPSVyRpKBE-3UQBdJ'AIIxX\n" ] } ], "source": [ "model = BigramLanguageModel(vocab_size, n_embeed)\n", "model = model.to(device)\n", "logits, loss = model(xb, yb)\n", "print(\"logits shape:\", logits.shape)\n", "print(\"loss:\", loss.item())\n", "print(decode(model.generate(idx=torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))" ] }, { "cell_type": "code", "execution_count": 127, "id": "1da9dd4f", "metadata": {}, 
"outputs": [], "source": [ "@torch.no_grad()\n", "def estimate_loss():\n", " out = {}\n", " model.eval()\n", " for split in ['train', 'val']:\n", " losses = torch.zeros(eval_iters)\n", " for k in range(eval_iters):\n", " X, Y = get_batch(split)\n", " logits, loss = model(X, Y)\n", " losses[k] = loss.item()\n", " out[split] = losses.mean()\n", " model.train()\n", " return out" ] }, { "cell_type": "code", "execution_count": null, "id": "1e3fb308", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "step 0: train loss 4.2785, val loss 4.2821\n" ] } ], "source": [ "optimizer = torch.optim.AdamW(model.parameters(), lr=lr_rate)\n", "for steps in range(max_iters):\n", " if steps % eval_iters == 0:\n", " losses = estimate_loss()\n", " print(f\"step {steps}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n", "\n", " xb, yb = get_batch('train')\n", " logits, loss = model(xb, yb)\n", " optimizer.zero_grad(set_to_none=True)\n", " loss.backward()\n", " optimizer.step()\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9490a27b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "DUKE VINCENTIO:\n", "Stand brother, sir, here it, uncle he got.\n", "\n", "VIRGILIA:\n", "A dog of the yousician, let your good brother, sister,\n", "nor it to die.\n", "\n", "VOLUMNIA:\n", "She is in the mar, and the matter:\n", "there is! What say you, Juliet alone and bird.\n", "Is thy life?\n", "\n", "JULIET:\n", "Being a child! 
prompt fear: speak, and look fellow good?\n", "\n", "FLORIZEL:\n", "And rumour, by my man's tooth made.\n", "\n", "JULIET:\n", "Ay, if you doth make leave your retires,\n", "A mother tempt my todder dial should have\n", "So dear and let me 'gainst words out again,\n", "Savest with honour's princes to hear throught of,\n", "His perdom of preserve\n", "Is posterity and secut\n", "No god costerb: shall be more the Capitol,\n", "But court did this hoursest, do begg them buried.\n", "His apple and dreams on daughter, and we will,\n", "He were laddy's wounds. O mother!\n", "Dread!\n", "In it the whitest through thee grief: why, general,\n", "My heart play'd many fellows upon him.\n", "\n", "FRIAR LAURENCE:\n", "For traitor the mind: what the journey, rise!\n", "I serve, or I know the senate, and let my indeed\n", "Will on brave it so lone\n" ] } ], "source": [ "print(decode(model.generate(idx=torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=1000)[0].tolist()))" ] }, { "cell_type": "code", "execution_count": null, "id": "d717cdc1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "10.788929 M parameters\n" ] } ], "source": [ "print(sum(p.numel() for p in model.parameters())/1e6, \"M parameters\")" ] }, { "cell_type": "code", "execution_count": null, "id": "58991844", "metadata": {}, "outputs": [], "source": [ "import torch\n", "torch.save(model.state_dict(), \"shakespeare_transformer.pt\")" ] }, { "cell_type": "code", "execution_count": null, "id": "935f7d0e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.2" } }, "nbformat": 4, "nbformat_minor": 5 }