{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "03ab65cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.models import BPE\n",
    "from tokenizers import Tokenizer\n",
    "from tokenizers.decoders import ByteLevel as ByteLevelDecoder\n",
    "from tokenizers.normalizers import Sequence, Lowercase\n",
    "from tokenizers.pre_tokenizers import ByteLevel\n",
    "from tokenizers.trainers import BpeTrainer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "efe37f35",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(BPE())\n",
    "tokenizer.normalizer = Sequence([Lowercase()])\n",
    "tokenizer.pre_tokenizer = ByteLevel()\n",
    "tokenizer.decoder = ByteLevelDecoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "6a596c74",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = BpeTrainer(vocab_size = 50000, initial_alphabet=ByteLevel.alphabet(), special_tokens=['<s>', '<pad>', '</s>','</unk>', '<mask>'])\n",
    "tokenizer.train([\"../../datasets/austen-emma.txt\"], trainer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "00138f04",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(\"tokenizer_gpt/tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "ca9061de",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import (GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel)\n",
    "tokenizer_gpt = GPT2TokenizerFast.from_pretrained(\"tokenizer_gpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "4e5f77ea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.add_special_tokens({\n",
    "    \"eos_token\": \"</s>\",\n",
    "    \"bos_token\": \"<s>\",\n",
    "    \"unk_token\": \"<unk>\",\n",
    "    \"pad_token\": \"<pad>\",\n",
    "    \"mask_token\": \"<mask>\"\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "ab84b4f2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 469, 293, 225, 2]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.encode(\"<s> thisis </s>\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "2cda86e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 469, 361, 225, 2]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_gpt.encode(\"<s> this is </s>\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "5d7d2260",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = GPT2Config(\n",
    "    vocab_size = tokenizer_gpt.vocab_size,\n",
    "    bos_token_id = tokenizer_gpt.bos_token_id,\n",
    "    eos_token_id = tokenizer_gpt.eos_token_id\n",
    ")\n",
    "\n",
    "model = GPT2LMHeadModel(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "45d2da31",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GPT2Config {\n",
       "  \"activation_function\": \"gelu_new\",\n",
       "  \"attn_pdrop\": 0.1,\n",
       "  \"bos_token_id\": 0,\n",
       "  \"embd_pdrop\": 0.1,\n",
       "  \"eos_token_id\": 2,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"layer_norm_epsilon\": 1e-05,\n",
       "  \"model_type\": \"gpt2\",\n",
       "  \"n_embd\": 768,\n",
       "  \"n_head\": 12,\n",
       "  \"n_inner\": null,\n",
       "  \"n_layer\": 12,\n",
       "  \"n_positions\": 1024,\n",
       "  \"reorder_and_upcast_attn\": false,\n",
       "  \"resid_pdrop\": 0.1,\n",
       "  \"scale_attn_by_inverse_layer_idx\": false,\n",
       "  \"scale_attn_weights\": true,\n",
       "  \"summary_activation\": null,\n",
       "  \"summary_first_dropout\": 0.1,\n",
       "  \"summary_proj_to_labels\": true,\n",
       "  \"summary_type\": \"cls_index\",\n",
       "  \"summary_use_proj\": true,\n",
       "  \"transformers_version\": \"4.54.1\",\n",
       "  \"use_cache\": true,\n",
       "  \"vocab_size\": 11954\n",
       "}"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "692b12c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GPT2LMHeadModel(\n",
       "  (transformer): GPT2Model(\n",
       "    (wte): Embedding(11954, 768)\n",
       "    (wpe): Embedding(1024, 768)\n",
       "    (drop): Dropout(p=0.1, inplace=False)\n",
       "    (h): ModuleList(\n",
       "      (0-11): 12 x GPT2Block(\n",
       "        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "        (attn): GPT2Attention(\n",
       "          (c_attn): Conv1D(nf=2304, nx=768)\n",
       "          (c_proj): Conv1D(nf=768, nx=768)\n",
       "          (attn_dropout): Dropout(p=0.1, inplace=False)\n",
       "          (resid_dropout): Dropout(p=0.1, inplace=False)\n",
       "        )\n",
       "        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "        (mlp): GPT2MLP(\n",
       "          (c_fc): Conv1D(nf=3072, nx=768)\n",
       "          (c_proj): Conv1D(nf=768, nx=3072)\n",
       "          (act): NewGELUActivation()\n",
       "          (dropout): Dropout(p=0.1, inplace=False)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       "  (lm_head): Linear(in_features=768, out_features=11954, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "49b1f246",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"../../datasets/austen-emma.txt\", \"r\", encoding='utf-8') as f:\n",
    "    content = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "306508f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "content_p = []\n",
    "for c in content:\n",
    "    if len(c) > 10:\n",
    "        content_p.append(c.strip())\n",
    "content_p = ' '.join(content_p) + tokenizer_gpt.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "4536854a",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_content = tokenizer_gpt.encode(content_p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "a1b0aacc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "195221"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenized_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "b475fdde",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_len = 100\n",
    "examples = []\n",
    "for i in range(0, len(tokenized_content) - sample_len + 1):\n",
    "    examples.append(\n",
    "        tokenized_content[i:i+ sample_len]\n",
    "    )\n",
    "\n",
    "train_data = []\n",
    "labels = []\n",
    "for example in examples:\n",
    "    train_data.append(example[:-1])\n",
    "    labels.append(example[1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "ed046662",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "import torch.nn.functional as F\n",
    "\n",
    "buffer = 500\n",
    "batch_size = 64\n",
    "\n",
    "train_data = torch.Tensor(train_data).to(dtype=torch.long).cuda()\n",
    "labels = torch.Tensor(labels).to(dtype=torch.long).cuda()\n",
    "dataset = TensorDataset(train_data, labels)\n",
    "\n",
    "loader = DataLoader(dataset, batch_size=batch_size, drop_last=True, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "dc687478",
   "metadata": {},
   "outputs": [],
   "source": [
    "from torch.optim import Adam\n",
    "import torch.nn as nn\n",
    "from tqdm import tqdm\n",
    "\n",
    "model = model.cuda()\n",
    "optimizer = Adam(model.parameters(), lr=3e-5, eps=1e-08)\n",
    "criterion = nn.CrossEntropyLoss()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99b192b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/3048 [00:00<?, ?it/s]C:\\Users\\han\\AppData\\Local\\Temp\\ipykernel_32488\\611308887.py:17: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
      "  pred = F.softmax(pred)\n",
      "100%|██████████| 3048/3048 [07:58<00:00,  6.37it/s, loss: 0.0061, aucc: 0.4009]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor(0.0085, device='cuda:0', grad_fn=<DivBackward0>)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "len() takes exactly one argument (0 given)",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mTypeError\u001b[39m                                 Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[79]\u001b[39m\u001b[32m, line 26\u001b[39m\n\u001b[32m     23\u001b[39m     progress.set_postfix_str(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mloss: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mloss.cpu().detach().numpy()\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.04f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, aucc: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00maucc.cpu()/cnt\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.04f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m     25\u001b[39m \u001b[38;5;28mprint\u001b[39m(losses / \u001b[38;5;28mlen\u001b[39m(loader))\n\u001b[32m---> \u001b[39m\u001b[32m26\u001b[39m \u001b[38;5;28mprint\u001b[39m(aucc / \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n",
      "\u001b[31mTypeError\u001b[39m: len() takes exactly one argument (0 given)"
     ]
    }
   ],
   "source": [
    "model.train()\n",
    "# for epoch in tqdm(range(0, 1)):\n",
    "losses = 0.\n",
    "aucc = 0\n",
    "cnt = 0\n",
    "progress = tqdm(loader)\n",
    "for x, y in progress:\n",
    "    pred = model(x).logits\n",
    "    \n",
    "    y = F.one_hot(y, num_classes=tokenizer_gpt.vocab_size)\n",
    "    \n",
    "    loss = criterion(pred.to(dtype=torch.float32), y.to(dtype=torch.float32))\n",
    "    optimizer.zero_grad()\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "    \n",
    "    pred = F.softmax(pred)\n",
    "    y = torch.argmax(y, dim=2)\n",
    "    aucc += torch.sum(pred == y)\n",
    "    cnt += (batch_size*99)\n",
    "    losses += loss\n",
    "    progress.set_postfix_str(f\"loss: {loss.cpu().detach().numpy():.04f}, aucc: {aucc.cpu()/cnt:.04f}\")\n",
    "    \n",
    "print(losses / len(loader))\n",
    "print(aucc / cnt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "5f0c3c7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate(start, model):\n",
    "    input_token_ids = tokenizer_gpt.encode(start, return_tensors='pt').cuda()\n",
    "    output = model.generate(\n",
    "        input_token_ids,\n",
    "        max_length= 500,\n",
    "        num_beams = 5,\n",
    "        temperature=0.7,\n",
    "        no_repeat_ngram_size=2,\n",
    "        num_return_sequences=1\n",
    "    )\n",
    "    return tokenizer_gpt.decode(output[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "7acad8eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n",
      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'  yes!\" cried emma.--\"my dearest harriet, mr. weston, who had been walking away from hartfield, he would never have seen him before; and while she did not quite understand how it might be supposed that she could give up the idea of any body\\'s coming to such a thing by his manners. \"i am afraid,\" he replied, \"that you must be the greatest pleasure.  you do not know what your father would have heard.\" \"oh! yes, my dear--but i dare say i am sure i shall think they will think you are very much obliged to be sure.--but this is an excellent miss smith, however, indeed; but there is being _you are quite enough to make one of course.--i can hardly ever hear of every thing to see nothing else.\" she comes in love with me, sir?--well--a very bad.--he is coming over this morning and yet quite complete in the whole.\" emma could not likely to call upon the same glance at all these words, if they walked on that sort of his feelings:  i must take care about ten months ago, perhaps--what an old acquaintance with my father--and so far off in spite of the smallest degree or two men, i was forced to bring a few weeks ago as far as well, unless they are going to wait for _she is a great regard for a young lady\\'s conduct, though her husband and therefore.\" chapter xvi and mrs. frank churchill came out of having given me--he came back again, when they were all my mother might have done?  she walked up in fact.--the same evening it was sitting down again--the case at this day?\" \"miss woodhouse\\'s eyes to keep your feelings did you walked off, and then?\" emma\\'s situation. chapter xiv some young woman\\'s manners to give me!  he seemed to feel that period at weymouth.\"  and nobody else, papa) the worst of hearing every body else--\" she came over-morrow.--they are so pleased with all that moment.--a little boys appeared till within half-day or four weeks, or three times a beautiful creature into the oddest creature!\" chapter xvii when she might not making a moment--\"the change?--you must wait a mile off!--but if this evening without being taken place, moreover, but one subject after making no account.--it seems every other people could never saw me; he made her mind was giving themselves off; her eyes made up these young'"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate(\" \", model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "b169491a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import (WEIGHTS_NAME, CONFIG_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "690df94b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'pytorch_model.bin'"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "WEIGHTS_NAME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "b6d192d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'config.json'"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "CONFIG_NAME"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "e4abd96c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained(WEIGHTS_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "d719459b",
   "metadata": {},
   "outputs": [],
   "source": [
    "config.save_pretrained(CONFIG_NAME)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}