mohamedahraf273 committed on
Commit
e8aab00
·
1 Parent(s): 94ee9c6

add generator

Browse files
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c87d759052debb4e4adb62ef51c9d65671d04bfc6e1f9fd4b2130c66e69b9257
3
+ size 162038291
dataset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
from torch.utils.data import Dataset


class OpenMPDataset(Dataset):
    """Paired (code, pragma) samples tokenized to fixed-length id tensors.

    Each item bundles the padded input/output token-id sequences together
    with their unpadded lengths, where a length is the index of the first
    <PAD> token (or the configured maximum when no padding is present).
    """

    def __init__(self, inputs, outputs, tokenizer, max_input_len=500, max_output_len=100):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        # Padding id comes from the tokenizer's own vocabulary.
        self.pad_idx = tokenizer.char2idx['<PAD>']

    def __len__(self):
        return len(self.inputs)

    def _unpadded_length(self, ids, fallback):
        """Index of the first pad token in *ids*, or *fallback* if none."""
        for pos, tok_id in enumerate(ids):
            if tok_id == self.pad_idx:
                return pos
        return fallback

    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(
            self.inputs[idx],
            self.max_input_len,
            add_special_tokens=True
        )
        output_ids = self.tokenizer.encode(
            self.outputs[idx],
            self.max_output_len,
            add_special_tokens=True
        )

        in_len = self._unpadded_length(input_ids, self.max_input_len)
        out_len = self._unpadded_length(output_ids, self.max_output_len)

        return {
            'input': torch.tensor(input_ids, dtype=torch.long),
            'output': torch.tensor(output_ids, dtype=torch.long),
            'input_len': torch.tensor(in_len, dtype=torch.long),
            'output_len': torch.tensor(out_len, dtype=torch.long)
        }
generator.ipynb ADDED
@@ -0,0 +1,867 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "bae751d8",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import torch\n",
12
+ "import torch.nn as nn\n",
13
+ "import torch.optim as optim\n",
14
+ "import time\n",
15
+ "from tqdm import tqdm\n",
16
+ "\n",
17
+ "from torch.utils.data import DataLoader\n",
18
+ "from models.open_mp_gen.tokenizer import Tokenizer\n",
19
+ "from models.open_mp_gen.model.generator import Generator\n",
20
+ "from models.open_mp_gen.model.encoder import Encoder\n",
21
+ "from models.open_mp_gen.model.decoder import Decoder\n",
22
+ "from models.open_mp_gen.model.attn import BahdanauAttention\n",
23
+ "from models.open_mp_gen.dataset import OpenMPDataset\n",
24
+ "from accelera.src.utils.code_utils import pragma_to_class"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "id": "c0e30f61",
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "name": "stdout",
35
+ "output_type": "stream",
36
+ "text": [
37
+ "BPE Tokenizer loaded from tokenizer.json\n",
38
+ " - Vocab size: 8002\n",
39
+ " - BPE merges: 7888\n"
40
+ ]
41
+ },
42
+ {
43
+ "data": {
44
+ "text/plain": [
45
+ "<models.open_mp_gen.tokenizer.Tokenizer at 0x7a60237a60c0>"
46
+ ]
47
+ },
48
+ "execution_count": 3,
49
+ "metadata": {},
50
+ "output_type": "execute_result"
51
+ }
52
+ ],
53
+ "source": [
54
+ "tokenizer = Tokenizer(vocab_size=8000)\n",
55
+ "tokenizer.load(\"tokenizer.json\")"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 4,
61
+ "id": "db130c45",
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "Training samples: 15671\n",
69
+ "Validation samples: 1684\n",
70
+ "\n",
71
+ "Sample input (first 70 chars):\n",
72
+ "[CLS:parallel_for] for (int ix = 1; ix < (N + 1); ix++)\n",
73
+ "{\n",
74
+ " forces[ix] = forces[ix] * force_retention;\n",
75
+ "}\n",
76
+ "\n",
77
+ "Sample output:\n",
78
+ "omp parallel for\n"
79
+ ]
80
+ }
81
+ ],
82
+ "source": [
83
+ "train_inputs, train_outputs = [], []\n",
84
+ "val_inputs, val_outputs = [], []\n",
85
+ "\n",
86
+ "with open('../data/data.json', 'r') as f:\n",
87
+ " lines = f.readlines()\n",
88
+ " \n",
89
+ " split_idx = int(0.9 * len(lines))\n",
90
+ " train_lines = lines[:split_idx]\n",
91
+ " val_lines = lines[split_idx:]\n",
92
+ "\n",
93
+ "for line in train_lines:\n",
94
+ " item = json.loads(line.strip())\n",
95
+ " \n",
96
+ " if item['label'] == 'False':\n",
97
+ " continue\n",
98
+ " \n",
99
+ " cls = pragma_to_class(item['label'], item['pragma'])\n",
100
+ " if cls == 'none':\n",
101
+ " continue\n",
102
+ " \n",
103
+ " input_str = f\"[CLS:{cls}] {item['code']}\"\n",
104
+ " output_str = item['pragma'].strip()\n",
105
+ " \n",
106
+ " if not output_str:\n",
107
+ " continue\n",
108
+ " \n",
109
+ " train_inputs.append(input_str)\n",
110
+ " train_outputs.append(output_str)\n",
111
+ "\n",
112
+ "for line in val_lines:\n",
113
+ " item = json.loads(line.strip())\n",
114
+ " if item['label'] == 'False':\n",
115
+ " continue\n",
116
+ " \n",
117
+ " cls = pragma_to_class(item['label'], item['pragma'])\n",
118
+ " if cls == 'none':\n",
119
+ " continue\n",
120
+ " \n",
121
+ " input_str = f\"[CLS:{cls}] {item['code']}\"\n",
122
+ " output_str = item['pragma'].strip()\n",
123
+ " if not output_str:\n",
124
+ " continue\n",
125
+ " \n",
126
+ " val_inputs.append(input_str)\n",
127
+ " val_outputs.append(output_str)\n",
128
+ "\n",
129
+ "print(f\"Training samples: {len(train_inputs)}\")\n",
130
+ "print(f\"Validation samples: {len(val_inputs)}\")\n",
131
+ "print(f\"\\nSample input (first 70 chars):\\n{train_inputs[0]}\")\n",
132
+ "print(f\"Sample output:\\n{train_outputs[0]}\")"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 5,
138
+ "id": "d5747915",
139
+ "metadata": {},
140
+ "outputs": [
141
+ {
142
+ "name": "stdout",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "\n",
146
+ "Dataset shapes:\n",
147
+ " Train: 15671 samples\n",
148
+ " Val: 1684 samples\n",
149
+ " Sample input tensor shape: torch.Size([500])\n",
150
+ " Sample output tensor shape: torch.Size([100])\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "train_dataset = OpenMPDataset(\n",
156
+ " train_inputs, train_outputs, tokenizer,\n",
157
+ " max_input_len=500,\n",
158
+ " max_output_len=100\n",
159
+ ")\n",
160
+ "\n",
161
+ "val_dataset = OpenMPDataset(\n",
162
+ " val_inputs, val_outputs, tokenizer,\n",
163
+ " max_input_len=500,\n",
164
+ " max_output_len=100\n",
165
+ ")\n",
166
+ "\n",
167
+ "print(f\"\\nDataset shapes:\")\n",
168
+ "print(f\" Train: {len(train_dataset)} samples\")\n",
169
+ "print(f\" Val: {len(val_dataset)} samples\")\n",
170
+ "print(f\" Sample input tensor shape: {train_dataset[0]['input'].shape}\")\n",
171
+ "print(f\" Sample output tensor shape: {train_dataset[0]['output'].shape}\")"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 6,
177
+ "id": "5252d457",
178
+ "metadata": {},
179
+ "outputs": [
180
+ {
181
+ "name": "stdout",
182
+ "output_type": "stream",
183
+ "text": [
184
+ "\n",
185
+ "✓ Dataloaders ready!\n",
186
+ " Train batches: 490\n",
187
+ " Val batches: 53\n",
188
+ "\n",
189
+ "Sample batch structure:\n",
190
+ " input shape: torch.Size([32, 500])\n",
191
+ " output shape: torch.Size([32, 100])\n",
192
+ " input_len shape: torch.Size([32])\n",
193
+ " First sample input_len: 16\n",
194
+ "\n",
195
+ "Sample batch structure:\n",
196
+ " input shape: torch.Size([32, 500])\n",
197
+ " output shape: torch.Size([32, 100])\n",
198
+ " input_len shape: torch.Size([32])\n",
199
+ " First sample input_len: 16\n"
200
+ ]
201
+ }
202
+ ],
203
+ "source": [
204
+ "train_loader = DataLoader(\n",
205
+ " train_dataset,\n",
206
+ " batch_size=32,\n",
207
+ " shuffle=True,\n",
208
+ " pin_memory=True\n",
209
+ ")\n",
210
+ "\n",
211
+ "val_loader = DataLoader(\n",
212
+ " val_dataset,\n",
213
+ " batch_size=32,\n",
214
+ " shuffle=False,\n",
215
+ " pin_memory=True\n",
216
+ ")\n",
217
+ "\n",
218
+ "print(f\"\\n✓ Dataloaders ready!\")\n",
219
+ "print(f\" Train batches: {len(train_loader)}\")\n",
220
+ "print(f\" Val batches: {len(val_loader)}\")\n",
221
+ "\n",
222
+ "sample_batch = next(iter(train_loader))\n",
223
+ "print(f\"\\nSample batch structure:\")\n",
224
+ "print(f\" input shape: {sample_batch['input'].shape}\")\n",
225
+ "print(f\" output shape: {sample_batch['output'].shape}\")\n",
226
+ "print(f\" input_len shape: {sample_batch['input_len'].shape}\")\n",
227
+ "print(f\" First sample input_len: {sample_batch['input_len'][0]}\")"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": 7,
233
+ "id": "11631bed",
234
+ "metadata": {},
235
+ "outputs": [
236
+ {
237
+ "name": "stdout",
238
+ "output_type": "stream",
239
+ "text": [
240
+ "Model architecture:\n",
241
+ "Generator(\n",
242
+ " (encoder): Encoder(\n",
243
+ " (embedding): Embedding(8002, 128, padding_idx=0)\n",
244
+ " (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)\n",
245
+ " (dropout): Dropout(p=0.3, inplace=False)\n",
246
+ " )\n",
247
+ " (decoder): Decoder(\n",
248
+ " (attention): BahdanauAttention(\n",
249
+ " (W1): Linear(in_features=512, out_features=256, bias=True)\n",
250
+ " (W2): Linear(in_features=256, out_features=256, bias=True)\n",
251
+ " (V): Linear(in_features=256, out_features=1, bias=True)\n",
252
+ " )\n",
253
+ " (embedding): Embedding(8002, 128, padding_idx=0)\n",
254
+ " (lstm): LSTM(640, 256, num_layers=2, batch_first=True, dropout=0.3)\n",
255
+ " (fc_out): Linear(in_features=896, out_features=8002, bias=True)\n",
256
+ " (dropout): Dropout(p=0.3, inplace=False)\n",
257
+ " )\n",
258
+ " (hidden_projection): Linear(in_features=512, out_features=256, bias=True)\n",
259
+ " (cell_projection): Linear(in_features=512, out_features=256, bias=True)\n",
260
+ ")\n",
261
+ "\n",
262
+ "Total parameters: 13,499,715\n"
263
+ ]
264
+ }
265
+ ],
266
+ "source": [
267
+ "\n",
268
+ "VOCAB_SIZE = tokenizer.vocab_size\n",
269
+ "EMBED_SIZE = 128\n",
270
+ "HIDDEN_SIZE = 256\n",
271
+ "NUM_LAYERS = 2\n",
272
+ "DROPOUT = 0.3\n",
273
+ "\n",
274
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
275
+ "\n",
276
+ "encoder = Encoder(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT)\n",
277
+ "attention = BahdanauAttention(HIDDEN_SIZE)\n",
278
+ "decoder = Decoder(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, attention, NUM_LAYERS, DROPOUT)\n",
279
+ "model = Generator(encoder, decoder, device).to(device)\n",
280
+ "model.apply(model._init_weights)\n",
281
+ "\n",
282
+ "print(\"Model architecture:\")\n",
283
+ "print(model)\n",
284
+ "print(f\"\\nTotal parameters: {sum(p.numel() for p in model.parameters()):,}\")"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": 9,
290
+ "id": "2d3125a6",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "PAD_IDX = tokenizer.char2idx['<PAD>']\n",
295
+ "criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)\n",
296
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
297
+ "scheduler = optim.lr_scheduler.ReduceLROnPlateau(\n",
298
+ " optimizer, \n",
299
+ " mode='min', \n",
300
+ " factor=0.5, \n",
301
+ " patience=2, \n",
302
+ ")"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": null,
308
+ "id": "794c40e7",
309
+ "metadata": {},
310
+ "outputs": [],
311
+ "source": [
312
+ "def train(model, iterator, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.5):\n",
313
+ " model.train()\n",
314
+ " epoch_loss = 0\n",
315
+ " \n",
316
+ " for batch in tqdm(iterator, desc=\"Training\", leave=False):\n",
317
+ " src = batch['input'].to(device)\n",
318
+ " trg = batch['output'].to(device)\n",
319
+ " src_len = batch['input_len'].to(device)\n",
320
+ " optimizer.zero_grad()\n",
321
+ " output = model(src, src_len, trg, teacher_forcing_ratio)\n",
322
+ " output_dim = output.shape[-1]\n",
323
+ " output = output[1:].view(-1, output_dim)\n",
324
+ " trg = trg.transpose(0, 1) \n",
325
+ " trg = trg[1:].reshape(-1)\n",
326
+ " \n",
327
+ " loss = criterion(output, trg)\n",
328
+ " loss.backward()\n",
329
+ " \n",
330
+ " torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n",
331
+ " \n",
332
+ " optimizer.step()\n",
333
+ " epoch_loss += loss.item()\n",
334
+ " \n",
335
+ " return epoch_loss / len(iterator)\n",
336
+ "\n",
337
+ "\n",
338
+ "def evaluate(model, iterator, criterion):\n",
339
+ " model.eval()\n",
340
+ " epoch_loss = 0\n",
341
+ " \n",
342
+ " with torch.no_grad():\n",
343
+ " for batch in tqdm(iterator, desc=\"Evaluating\", leave=False):\n",
344
+ " src = batch['input'].to(device)\n",
345
+ " trg = batch['output'].to(device)\n",
346
+ " src_len = batch['input_len'].to(device)\n",
347
+ " \n",
348
+ " output = model(src, src_len, trg, 0)\n",
349
+ " \n",
350
+ " output_dim = output.shape[-1]\n",
351
+ " output = output[1:].view(-1, output_dim)\n",
352
+ " \n",
353
+ " trg = trg.transpose(0, 1)\n",
354
+ " trg = trg[1:].reshape(-1)\n",
355
+ " \n",
356
+ " loss = criterion(output, trg)\n",
357
+ " epoch_loss += loss.item()\n",
358
+ " \n",
359
+ " return epoch_loss / len(iterator)"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 11,
365
+ "id": "d4bb0e92",
366
+ "metadata": {},
367
+ "outputs": [
368
+ {
369
+ "name": "stderr",
370
+ "output_type": "stream",
371
+ "text": [
372
+ " "
373
+ ]
374
+ },
375
+ {
376
+ "name": "stdout",
377
+ "output_type": "stream",
378
+ "text": [
379
+ "Epoch: 01/15 | Time: 7m 39s | TF Ratio: 0.50\n",
380
+ "\tTrain Loss: 4.5316 | Val Loss: 4.2697 | Best Val: 4.2697 ✓ SAVED\n"
381
+ ]
382
+ },
383
+ {
384
+ "name": "stderr",
385
+ "output_type": "stream",
386
+ "text": [
387
+ " "
388
+ ]
389
+ },
390
+ {
391
+ "name": "stdout",
392
+ "output_type": "stream",
393
+ "text": [
394
+ "Epoch: 02/15 | Time: 7m 33s | TF Ratio: 0.45\n",
395
+ "\tTrain Loss: 3.6810 | Val Loss: 4.0286 | Best Val: 4.0286 ✓ SAVED\n"
396
+ ]
397
+ },
398
+ {
399
+ "name": "stderr",
400
+ "output_type": "stream",
401
+ "text": [
402
+ " "
403
+ ]
404
+ },
405
+ {
406
+ "name": "stdout",
407
+ "output_type": "stream",
408
+ "text": [
409
+ "Epoch: 03/15 | Time: 7m 40s | TF Ratio: 0.41\n",
410
+ "\tTrain Loss: 3.4275 | Val Loss: 3.8817 | Best Val: 3.8817 ✓ SAVED\n"
411
+ ]
412
+ },
413
+ {
414
+ "name": "stderr",
415
+ "output_type": "stream",
416
+ "text": [
417
+ " "
418
+ ]
419
+ },
420
+ {
421
+ "name": "stdout",
422
+ "output_type": "stream",
423
+ "text": [
424
+ "Epoch: 04/15 | Time: 7m 40s | TF Ratio: 0.36\n",
425
+ "\tTrain Loss: 3.2257 | Val Loss: 3.7254 | Best Val: 3.7254 ✓ SAVED\n"
426
+ ]
427
+ },
428
+ {
429
+ "name": "stderr",
430
+ "output_type": "stream",
431
+ "text": [
432
+ " "
433
+ ]
434
+ },
435
+ {
436
+ "name": "stdout",
437
+ "output_type": "stream",
438
+ "text": [
439
+ "Epoch: 05/15 | Time: 7m 38s | TF Ratio: 0.33\n",
440
+ "\tTrain Loss: 3.0585 | Val Loss: 3.6210 | Best Val: 3.6210 ✓ SAVED\n"
441
+ ]
442
+ },
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ " "
448
+ ]
449
+ },
450
+ {
451
+ "name": "stdout",
452
+ "output_type": "stream",
453
+ "text": [
454
+ "Epoch: 06/15 | Time: 7m 37s | TF Ratio: 0.30\n",
455
+ "\tTrain Loss: 2.9102 | Val Loss: 3.4103 | Best Val: 3.4103 ✓ SAVED\n"
456
+ ]
457
+ },
458
+ {
459
+ "name": "stderr",
460
+ "output_type": "stream",
461
+ "text": [
462
+ " "
463
+ ]
464
+ },
465
+ {
466
+ "name": "stdout",
467
+ "output_type": "stream",
468
+ "text": [
469
+ "Epoch: 07/15 | Time: 7m 39s | TF Ratio: 0.27\n",
470
+ "\tTrain Loss: 2.7814 | Val Loss: 3.3304 | Best Val: 3.3304 ✓ SAVED\n"
471
+ ]
472
+ },
473
+ {
474
+ "name": "stderr",
475
+ "output_type": "stream",
476
+ "text": [
477
+ " "
478
+ ]
479
+ },
480
+ {
481
+ "name": "stdout",
482
+ "output_type": "stream",
483
+ "text": [
484
+ "Epoch: 08/15 | Time: 7m 38s | TF Ratio: 0.24\n",
485
+ "\tTrain Loss: 2.6669 | Val Loss: 3.2644 | Best Val: 3.2644 ✓ SAVED\n"
486
+ ]
487
+ },
488
+ {
489
+ "name": "stderr",
490
+ "output_type": "stream",
491
+ "text": [
492
+ " "
493
+ ]
494
+ },
495
+ {
496
+ "name": "stdout",
497
+ "output_type": "stream",
498
+ "text": [
499
+ "Epoch: 09/15 | Time: 7m 38s | TF Ratio: 0.22\n",
500
+ "\tTrain Loss: 2.5686 | Val Loss: 3.2038 | Best Val: 3.2038 ✓ SAVED\n"
501
+ ]
502
+ },
503
+ {
504
+ "name": "stderr",
505
+ "output_type": "stream",
506
+ "text": [
507
+ " "
508
+ ]
509
+ },
510
+ {
511
+ "name": "stdout",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "Epoch: 10/15 | Time: 7m 38s | TF Ratio: 0.19\n",
515
+ "\tTrain Loss: 2.4794 | Val Loss: 3.0976 | Best Val: 3.0976 ✓ SAVED\n"
516
+ ]
517
+ },
518
+ {
519
+ "name": "stderr",
520
+ "output_type": "stream",
521
+ "text": [
522
+ " "
523
+ ]
524
+ },
525
+ {
526
+ "name": "stdout",
527
+ "output_type": "stream",
528
+ "text": [
529
+ "Epoch: 11/15 | Time: 7m 37s | TF Ratio: 0.17\n",
530
+ "\tTrain Loss: 2.4153 | Val Loss: 3.0713 | Best Val: 3.0713 ✓ SAVED\n"
531
+ ]
532
+ },
533
+ {
534
+ "name": "stderr",
535
+ "output_type": "stream",
536
+ "text": [
537
+ " "
538
+ ]
539
+ },
540
+ {
541
+ "name": "stdout",
542
+ "output_type": "stream",
543
+ "text": [
544
+ "Epoch: 12/15 | Time: 7m 35s | TF Ratio: 0.16\n",
545
+ "\tTrain Loss: 2.3247 | Val Loss: 2.9971 | Best Val: 2.9971 ✓ SAVED\n"
546
+ ]
547
+ },
548
+ {
549
+ "name": "stderr",
550
+ "output_type": "stream",
551
+ "text": [
552
+ " "
553
+ ]
554
+ },
555
+ {
556
+ "name": "stdout",
557
+ "output_type": "stream",
558
+ "text": [
559
+ "Epoch: 13/15 | Time: 7m 38s | TF Ratio: 0.14\n",
560
+ "\tTrain Loss: 2.2682 | Val Loss: 2.9529 | Best Val: 2.9529 ✓ SAVED\n"
561
+ ]
562
+ },
563
+ {
564
+ "name": "stderr",
565
+ "output_type": "stream",
566
+ "text": [
567
+ " "
568
+ ]
569
+ },
570
+ {
571
+ "name": "stdout",
572
+ "output_type": "stream",
573
+ "text": [
574
+ "Epoch: 14/15 | Time: 7m 38s | TF Ratio: 0.13\n",
575
+ "\tTrain Loss: 2.2045 | Val Loss: 2.9489 | Best Val: 2.9489 ✓ SAVED\n"
576
+ ]
577
+ },
578
+ {
579
+ "name": "stderr",
580
+ "output_type": "stream",
581
+ "text": [
582
+ " "
583
+ ]
584
+ },
585
+ {
586
+ "name": "stdout",
587
+ "output_type": "stream",
588
+ "text": [
589
+ "Epoch: 15/15 | Time: 7m 39s | TF Ratio: 0.11\n",
590
+ "\tTrain Loss: 2.1487 | Val Loss: 2.9050 | Best Val: 2.9050 ✓ SAVED\n",
591
+ "\n",
592
+ "======================================================================\n",
593
+ "✓ TRAINING COMPLETE!\n",
594
+ "Best validation loss: 2.9050\n",
595
+ "Model saved to 'best_model.pth'\n",
596
+ "======================================================================\n"
597
+ ]
598
+ }
599
+ ],
600
+ "source": [
601
+ "EPOCHS = 15\n",
602
+ "CLIP = 1.0\n",
603
+ "best_valid_loss = float('inf')\n",
604
+ "training_history = {'train_loss': [], 'valid_loss': []}\n",
605
+ "\n",
606
+ "for epoch in range(EPOCHS):\n",
607
+ " start_time = time.time()\n",
608
+ " \n",
609
+ " tf_ratio = max(0.1, 0.5 * (0.9 ** epoch))\n",
610
+ " train_loss = train(model, train_loader, optimizer, criterion, CLIP, tf_ratio)\n",
611
+ " valid_loss = evaluate(model, val_loader, criterion)\n",
612
+ " scheduler.step(valid_loss)\n",
613
+ " if valid_loss < best_valid_loss:\n",
614
+ " best_valid_loss = valid_loss\n",
615
+ " torch.save({\n",
616
+ " 'epoch': epoch,\n",
617
+ " 'model_state_dict': model.state_dict(),\n",
618
+ " 'optimizer_state_dict': optimizer.state_dict(),\n",
619
+ " 'valid_loss': valid_loss,\n",
620
+ " 'vocab_size': VOCAB_SIZE,\n",
621
+ " 'embed_size': EMBED_SIZE,\n",
622
+ " 'hidden_size': HIDDEN_SIZE,\n",
623
+ " 'num_layers': NUM_LAYERS\n",
624
+ " }, 'best_model.pth')\n",
625
+ " save_status = \"✓ SAVED\"\n",
626
+ " else:\n",
627
+ " save_status = \" \"\n",
628
+ " \n",
629
+ " training_history['train_loss'].append(train_loss)\n",
630
+ " training_history['valid_loss'].append(valid_loss)\n",
631
+ " \n",
632
+ " end_time = time.time()\n",
633
+ " epoch_mins = int((end_time - start_time) / 60)\n",
634
+ " epoch_secs = int((end_time - start_time) % 60)\n",
635
+ " \n",
636
+ " print(f'Epoch: {epoch+1:02}/{EPOCHS} | Time: {epoch_mins}m {epoch_secs}s | TF Ratio: {tf_ratio:.2f}')\n",
637
+ " print(f'\\tTrain Loss: {train_loss:.4f} | Val Loss: {valid_loss:.4f} | Best Val: {best_valid_loss:.4f} {save_status}')\n",
638
+ "\n",
639
+ "print(\"\\n\" + \"=\"*70)\n",
640
+ "print(f\"✓ TRAINING COMPLETE!\")\n",
641
+ "print(f\"Best validation loss: {best_valid_loss:.4f}\")\n",
642
+ "print(f\"Model saved to 'best_model.pth'\")\n",
643
+ "print(\"=\"*70)"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": 15,
649
+ "id": "a49bb85f",
650
+ "metadata": {},
651
+ "outputs": [
652
+ {
653
+ "name": "stdout",
654
+ "output_type": "stream",
655
+ "text": [
656
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
657
+ "IMPORTANT: If you haven't re-run the TRAINING loop (Cell 9)\n",
658
+ "after applying the Transpose fix, the results below will likely\n",
659
+ "be poor/incomplete because the model hasn't updated its weights\n",
660
+ "correctly yet.\n",
661
+ "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
662
+ "\n",
663
+ "Running generation tests on validation set (True Greedy Decoding):\n",
664
+ "\n",
665
+ "Example 0:\n",
666
+ "Input: [CLS:parallel_for] for (i = 0; i < 16; ++i)\n",
667
+ " ;\n",
668
+ "\n",
669
+ "Target: omp target parallel for simd simdlen(4 4)\n",
670
+ "Prediction: omp parallel for shared(,k,,,,,,,,,,,,,pr) shared(L,,,,,,,,,,,,,,,,\n",
671
+ "------------------------------------------------------------\n",
672
+ "Example 10:\n",
673
+ "Input: [CLS:reduction] for (i = 1; i < (500 - 1); i++)\n",
674
+ "{\n",
675
+ " iIndex = i * dim2;\n",
676
+ " jIndex = 0;\n",
677
+ " for (j = 1; j < (500 - 1); j++)\n",
678
+ " {\n",
679
+ " jIndex += 500;\n",
680
+ " for (k = 1; k < (500 - 1); k++)\n",
681
+ " {\n",
682
+ " index = (iIndex + jIndex) + k;\n",
683
+ " compute_it = old[index] * need;\n",
684
+ " aggregate += compute_it / gimmie;\n",
685
+ " accumulator = 0;\n",
686
+ " long subsum1 = 0;\n",
687
+ " long subsum2 = 0;\n",
688
+ " long subsum3 = 0;\n",
689
+ " for (z = 0; z < 27; z += 3)\n",
690
+ " {\n",
691
+ " subsum1 += old[index + arr[z]];\n",
692
+ " subsum2 += old[index + arr[z + 1]];\n",
693
+ " subsum3 += old[index + arr[z + 2]];\n",
694
+ " }\n",
695
+ "\n",
696
+ " accumulator += (subsum1 + subsum2) + subsum3;\n",
697
+ " long value = accumulator / 27;\n",
698
+ " int par = value / 100;\n",
699
+ " a0 += ((unsigned) par) >> 31;\n",
700
+ " a0 += !(par ^ 0);\n",
701
+ " a1 += !(par ^ 1);\n",
702
+ " a2 += !(par ^ 2);\n",
703
+ " a3 += !(par ^ 3);\n",
704
+ " a4 += !(par ^ 4);\n",
705
+ " a5 += !(par ^ 5);\n",
706
+ " a6 += !(par ^ 6);\n",
707
+ " a7 += !(par ^ 7);\n",
708
+ " a8 += !(par ^ 8);\n",
709
+ " int64_t tmp = ((int64_t) par) - 9;\n",
710
+ " a9 += (tmp >> 63) + 1;\n",
711
+ " new[index] = value;\n",
712
+ " }\n",
713
+ "\n",
714
+ " }\n",
715
+ "\n",
716
+ "}\n",
717
+ "\n",
718
+ "Target: omp parallel for private(j, k, z, accumulator, jIndex, index, iIndex, compute_it) reduction(+: aggregate, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9)\n",
719
+ "Prediction: omp parallel for reduction(+:data,,,,,,,,,,,,,,\n",
720
+ "------------------------------------------------------------\n",
721
+ "Example 20:\n",
722
+ "Input: [CLS:parallel_for] for (i = 0; i < 16; ++i)\n",
723
+ " ;\n",
724
+ "\n",
725
+ "Target: omp parallel for simd firstprivate(, )\n",
726
+ "Prediction: omp parallel for shared(,k,,,,,,,,,,,,,pr) shared(L,,,,,,,,,,,,,,,,\n",
727
+ "------------------------------------------------------------\n",
728
+ "Example 30:\n",
729
+ "Input: [CLS:parallel_for] for (i = 0; i < n; i++)\n",
730
+ "{\n",
731
+ " x[i] = 1.0;\n",
732
+ " y[i] = 2.0;\n",
733
+ "}\n",
734
+ "\n",
735
+ "Target: omp parallel for private(i)\n",
736
+ "Prediction: omp parallel for shared(gen,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n",
737
+ "------------------------------------------------------------\n"
738
+ ]
739
+ }
740
+ ],
741
+ "source": [
742
+ "model.eval()\n",
743
+ "\n",
744
+ "def generate_sentence(model, input_text, tokenizer, max_len=150, device='cuda'):\n",
745
+ " \"\"\"\n",
746
+ " Greedy decoding function that generates tokens until <EOS> or max_len.\n",
747
+ " This mimics the model's forward pass but allows dynamic length generation.\n",
748
+ " \"\"\"\n",
749
+ " model.eval()\n",
750
+ " \n",
751
+ " # Tokenize input\n",
752
+ " input_ids = tokenizer.encode(input_text, max_length=500, add_special_tokens=True)\n",
753
+ " src_tensor = torch.LongTensor(input_ids).unsqueeze(0).to(device) # [1, src_len]\n",
754
+ " src_len = torch.LongTensor([len(input_ids)]).to(device) # [1]\n",
755
+ " \n",
756
+ " with torch.no_grad():\n",
757
+ " # Encode\n",
758
+ " encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)\n",
759
+ " \n",
760
+ " # Create mask (same logic as in Generator.forward)\n",
761
+ " max_src_len = encoder_outputs.shape[1]\n",
762
+ " mask = torch.arange(max_src_len, device=device).unsqueeze(0) < src_len.unsqueeze(1)\n",
763
+ " mask = mask.float()\n",
764
+ " \n",
765
+ " # Project hidden/cell states from Encoder to Decoder size\n",
766
+ " # Reshape to [num_layers, 2, batch, hidden] to combine bidirectional states\n",
767
+ " hidden = hidden.view(model.encoder.num_layers, 2, 1, model.encoder.hidden_size)\n",
768
+ " hidden = torch.cat((hidden[:, 0], hidden[:, 1]), dim=2)\n",
769
+ " hidden = model.hidden_projection(hidden)\n",
770
+ " \n",
771
+ " cell = cell.view(model.encoder.num_layers, 2, 1, model.encoder.hidden_size)\n",
772
+ " cell = torch.cat((cell[:, 0], cell[:, 1]), dim=2)\n",
773
+ " cell = model.cell_projection(cell)\n",
774
+ " \n",
775
+ " # Start with <SOS>\n",
776
+ " trg_indexes = [tokenizer.char2idx['<SOS>']]\n",
777
+ " \n",
778
+ " for i in range(max_len):\n",
779
+ " trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device) # [1]\n",
780
+ " \n",
781
+ " output, hidden, cell, _ = model.decoder(\n",
782
+ " trg_tensor, hidden, cell, encoder_outputs, mask\n",
783
+ " )\n",
784
+ " \n",
785
+ " # Greedy prediction: take token with highest probability\n",
786
+ " pred_token = output.argmax(1).item()\n",
787
+ " trg_indexes.append(pred_token)\n",
788
+ " \n",
789
+ " if pred_token == tokenizer.char2idx['<EOS>']:\n",
790
+ " break\n",
791
+ " \n",
792
+ " # Decode integers back to string\n",
793
+ " return tokenizer.decode(trg_indexes)\n",
794
+ "\n",
795
+ "# ---------------------------------------------------------\n",
796
+ "print(\"!\"*60)\n",
797
+ "print(\"IMPORTANT: If you haven't re-run the TRAINING loop (Cell 9)\")\n",
798
+ "print(\"after applying the Transpose fix, the results below will likely\")\n",
799
+ "print(\"be poor/incomplete because the model hasn't updated its weights\")\n",
800
+ "print(\"correctly yet.\")\n",
801
+ "print(\"!\"*60 + \"\\n\")\n",
802
+ "\n",
803
+ "print(\"Running generation tests on validation set (True Greedy Decoding):\\n\")\n",
804
+ "test_indices = [0, 10, 20, 30]\n",
805
+ "# Ensure indices are within bounds\n",
806
+ "test_indices = [i for i in test_indices if i < len(val_inputs)]\n",
807
+ "\n",
808
+ "for i in test_indices:\n",
809
+ " input_text = val_inputs[i]\n",
810
+ " target_text = val_outputs[i]\n",
811
+ " \n",
812
+ " prediction = generate_sentence(model, input_text, tokenizer, device=device)\n",
813
+ " \n",
814
+ " print(f\"Example {i}:\")\n",
815
+ " print(f\"Input: {input_text}\")\n",
816
+ " print(f\"Target: {target_text}\")\n",
817
+ " print(f\"Prediction: {prediction}\")\n",
818
+ " print(\"-\" * 60)"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": null,
824
+ "id": "85bd9571",
825
+ "metadata": {},
826
+ "outputs": [],
827
+ "source": [
828
+ "# ---------------------------------------------------------\n",
829
+ "# RUN THIS CELL ONLY IF YOU WANT TO RESET TRAINING\n",
830
+ "# This initializes the model weights from scratch. \n",
831
+ "# Run this, and then run the TRAINING LOOP (Cell 9) again.\n",
832
+ "# ---------------------------------------------------------\n",
833
+ "\n",
834
+ "print(\"↺ RESETTING MODEL & OPTIMIZER...\")\n",
835
+ "model = Generator(encoder, decoder, device).to(device)\n",
836
+ "model.apply(model._init_weights)\n",
837
+ "\n",
838
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
839
+ "training_history = {'train_loss': [], 'valid_loss': []}\n",
840
+ "best_valid_loss = float('inf')\n",
841
+ "\n",
842
+ "print(\"✓ Model reset. Now scroll up and run the TRAINING LOOP again.\")"
843
+ ]
844
+ }
845
+ ],
846
+ "metadata": {
847
+ "kernelspec": {
848
+ "display_name": "env",
849
+ "language": "python",
850
+ "name": "python3"
851
+ },
852
+ "language_info": {
853
+ "codemirror_mode": {
854
+ "name": "ipython",
855
+ "version": 3
856
+ },
857
+ "file_extension": ".py",
858
+ "mimetype": "text/x-python",
859
+ "name": "python",
860
+ "nbconvert_exporter": "python",
861
+ "pygments_lexer": "ipython3",
862
+ "version": "3.12.3"
863
+ }
864
+ },
865
+ "nbformat": 4,
866
+ "nbformat_minor": 5
867
+ }
model/__pycache__/attn.cpython-312.pyc ADDED
Binary file (2.35 kB). View file
 
model/__pycache__/decoder.cpython-312.pyc ADDED
Binary file (3.12 kB). View file
 
model/__pycache__/encoder.cpython-312.pyc ADDED
Binary file (2.48 kB). View file
 
model/__pycache__/generator.cpython-312.pyc ADDED
Binary file (5.35 kB). View file
 
model/attn.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, Optional

class BahdanauAttention(nn.Module):
    """Additive (Bahdanau-style) attention over encoder states.

    Scores each source position as V · tanh(W1·enc_t + W2·dec), then
    softmaxes over the time axis; an optional 0/1 mask forces padded
    positions toward zero probability.
    """

    def __init__(self, hidden_size: int):
        super(BahdanauAttention, self).__init__()
        # Encoder outputs are bidirectional, hence the doubled input width.
        self.W1 = nn.Linear(hidden_size * 2, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.V = nn.Linear(hidden_size, 1)

    def forward(
        self,
        decoder_hidden: torch.Tensor,
        encoder_outputs: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return (context, attention_weights) for one decoder step.

        decoder_hidden:  [batch, hidden] query from the decoder's top layer.
        encoder_outputs: [batch, src_len, hidden*2] keys/values.
        mask:            optional [batch, src_len] 0/1 validity mask.
        """
        query = decoder_hidden.unsqueeze(1)                      # [B, 1, H]
        energy = torch.tanh(self.W1(encoder_outputs) + self.W2(query))
        logits = self.V(energy)                                  # [B, T, 1]

        if mask is not None:
            # Large negative logit => ~zero weight after softmax.
            logits = logits.masked_fill(mask.unsqueeze(-1) == 0, -1e9)

        weights = F.softmax(logits, dim=1).squeeze(2)            # [B, T]
        context = torch.bmm(
            weights.unsqueeze(1),                                # [B, 1, T]
            encoder_outputs                                      # [B, T, 2H]
        ).squeeze(1)                                             # [B, 2H]

        return context, weights
model/decoder.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Tuple, Optional
4
+
5
class Decoder(nn.Module):
    """Single-step attentional LSTM decoder.

    Each call consumes one target token, attends over the encoder
    outputs, and produces vocabulary logits plus the updated LSTM state.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_size: int,
        hidden_size: int,
        attention: nn.Module,
        num_layers: int = 2,
        dropout: float = 0.3
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.attention = attention

        # padding_idx=0 pins the <PAD> embedding at zero.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=0
        )

        # LSTM input = token embedding concatenated with the
        # (2 * hidden) context from the bidirectional encoder.
        self.lstm = nn.LSTM(
            input_size=embed_size + hidden_size * 2,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Prediction head sees LSTM output, context and embedding.
        self.fc_out = nn.Linear(
            hidden_size + hidden_size * 2 + embed_size,
            vocab_size
        )

        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        input_token: torch.Tensor,
        decoder_hidden: torch.Tensor,
        decoder_cell: torch.Tensor,
        encoder_outputs: torch.Tensor,
        mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run one decoding step.

        Returns ``(logits, hidden, cell, attention_weights)`` where
        ``logits`` are unnormalized scores over the vocabulary.
        """
        # (batch,) -> (batch, 1, embed_size)
        token_emb = self.dropout(self.embedding(input_token.unsqueeze(1)))

        # The top LSTM layer's hidden state is the attention query.
        context, attn_weights = self.attention(
            decoder_hidden[-1], encoder_outputs, mask
        )

        step_input = torch.cat((token_emb, context.unsqueeze(1)), dim=2)
        step_output, (decoder_hidden, decoder_cell) = self.lstm(
            step_input, (decoder_hidden, decoder_cell)
        )

        # Drop the singleton time dimension before the output head.
        features = torch.cat(
            (step_output.squeeze(1), context, token_emb.squeeze(1)),
            dim=1
        )
        logits = self.fc_out(features)

        return logits, decoder_hidden, decoder_cell, attn_weights
model/encoder.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Tuple
4
+
5
class Encoder(nn.Module):
    """Bidirectional LSTM encoder over padded token sequences.

    Packs the batch by true lengths so the LSTM skips padding, then
    re-pads the outputs for downstream attention.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_size: int,
        hidden_size: int,
        num_layers: int = 2,
        dropout: float = 0.3
    ):
        super().__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # padding_idx=0 pins the <PAD> embedding at zero.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=0
        )

        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )

        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        input_seq: torch.Tensor,
        input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode a padded batch.

        Returns ``(outputs, hidden, cell)`` where ``outputs`` has
        feature size ``2 * hidden_size`` (forward + backward) and the
        states stack both directions along dim 0.
        """
        embedded = self.dropout(self.embedding(input_seq))

        # pack_padded_sequence requires the lengths on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            input_lengths.cpu(),
            batch_first=True,
            enforce_sorted=False
        )
        packed_out, (hidden, cell) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out,
            batch_first=True
        )

        return outputs, hidden, cell
model/generator.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class Generator(nn.Module):
    """Seq2seq wrapper: bidirectional encoder + attentional decoder.

    Bridges the encoder's bidirectional LSTM state into the decoder's
    unidirectional state via linear projections, then decodes the target
    sequence step by step with optional teacher forcing.
    """

    def __init__(self, encoder: nn.Module, decoder: nn.Module, device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.hidden_size == decoder.hidden_size, \
            "Encoder and decoder hidden sizes must match!"

        # Map concatenated fwd/bwd encoder states (2H) down to H.
        self.hidden_projection = nn.Linear(
            encoder.hidden_size * 2, decoder.hidden_size
        )
        self.cell_projection = nn.Linear(
            encoder.hidden_size * 2, decoder.hidden_size
        )

    def _init_weights(self, module):
        """Weight-init hook intended for ``self.apply``.

        NOTE(review): never invoked inside this class — presumably the
        training script calls ``model.apply(model._init_weights)``;
        confirm, otherwise default PyTorch init is used.
        """
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight.data, mean=0, std=0.01)
            if module.bias is not None:
                nn.init.constant_(module.bias.data, 0)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight.data, mean=0, std=0.01)
        elif isinstance(module, nn.LSTM):
            for name, param in module.named_parameters():
                if 'weight' in name:
                    nn.init.orthogonal_(param.data)
                elif 'bias' in name:
                    nn.init.constant_(param.data, 0)

    def create_mask(self, input_seq: torch.Tensor) -> torch.Tensor:
        """Return 1.0 where the token is not <PAD> (index 0), else 0.0."""
        return (input_seq != 0).float()

    def forward(
        self,
        input_seq: torch.Tensor,
        input_lengths: torch.Tensor,
        target_seq: torch.Tensor,
        teacher_forcing_ratio: float = 0.5
    ) -> torch.Tensor:
        """Decode ``target_seq`` conditioned on ``input_seq``.

        Returns logits shaped (target_len, batch, vocab); slot 0 stays
        all-zero because position 0 (<SOS>) is never predicted.
        """
        batch_size = input_seq.shape[0]
        target_len = target_seq.shape[1]
        vocab_size = self.decoder.vocab_size

        outputs = torch.zeros(
            target_len, batch_size, vocab_size, device=self.device
        )

        encoder_outputs, enc_hidden, enc_cell = self.encoder(input_seq, input_lengths)

        # Length-based attention mask over the padded encoder width.
        time_steps = encoder_outputs.shape[1]
        mask = (
            torch.arange(time_steps, device=self.device).unsqueeze(0)
            < input_lengths.unsqueeze(1)
        ).float()

        def _bridge(state: torch.Tensor, projection: nn.Linear) -> torch.Tensor:
            # (layers * 2, B, H) -> (layers, B, 2H) -> (layers, B, H)
            state = state.view(
                self.encoder.num_layers, 2, batch_size, self.encoder.hidden_size
            )
            return projection(torch.cat((state[:, 0], state[:, 1]), dim=2))

        hidden = _bridge(enc_hidden, self.hidden_projection)
        cell = _bridge(enc_cell, self.cell_projection)

        # First decoder input is the <SOS> column of the target.
        input_token = target_seq[:, 0]

        for step in range(1, target_len):
            logits, hidden, cell, _ = self.decoder(
                input_token, hidden, cell, encoder_outputs, mask
            )
            outputs[step] = logits

            # Coin flip: feed ground truth or the model's own best guess.
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            input_token = target_seq[:, step] if use_teacher else logits.argmax(1)

        return outputs
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from collections import Counter
3
+ from collections import defaultdict
4
+ from typing import Dict
5
+ from typing import List
6
+ from typing import Tuple
7
+
8
+
9
class Tokenizer:
    """Byte-pair-encoding tokenizer built up from raw characters.

    Starts from the character inventory of the training texts plus four
    special tokens (``<PAD>``, ``<UNK>``, ``<SOS>``, ``<EOS>``), then
    greedily learns merge rules until the vocabulary reaches
    ``vocab_size``.
    """

    def __init__(self, vocab_size: int = 1000):
        self.special_tokens = ['<PAD>', '<UNK>', '<SOS>', '<EOS>']
        self.char2idx: Dict[str, int] = {}
        self.idx2char: Dict[int, str] = {}
        self.vocab_size: int = 0  # current size; grows as tokens are added
        self.target_vocab_size: int = vocab_size
        # (left, right) merge pair -> rank (lower rank = learned earlier)
        self.bpe_ranks: Dict[Tuple[str, str], int] = {}

        for idx, token in enumerate(self.special_tokens):
            self.char2idx[token] = idx
            self.idx2char[idx] = token
        self.vocab_size = len(self.special_tokens)

    def _get_stats(self, words: Dict[Tuple[str, ...], int]) -> Counter:
        """Count frequencies of adjacent symbol pairs across the corpus."""
        pairs = Counter()
        for word, freq in words.items():
            for i in range(len(word) - 1):
                pairs[(word[i], word[i + 1])] += freq
        return pairs

    def _merge_vocab(
        self, pair: Tuple[str, str], words: Dict[Tuple[str, ...], int]
    ) -> Dict[Tuple[str, ...], int]:
        """Apply one merge rule: fuse every occurrence of ``pair``."""
        new_words = {}
        replacement = "".join(pair)

        for word in words:
            new_word = []
            i = 0
            while i < len(word):
                if (
                    i < len(word) - 1
                    and word[i] == pair[0]
                    and word[i + 1] == pair[1]
                ):
                    new_word.append(replacement)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words[tuple(new_word)] = words[word]
        return new_words

    def build_vocab(self, texts: List[str]) -> None:
        """Learn the character vocabulary and BPE merges from ``texts``."""
        print(f"Building BPE vocabulary from {len(texts)} texts...")

        vocab = set()
        for text in texts:
            vocab.update(text)

        for char in sorted(vocab):
            if char not in self.char2idx:
                self.char2idx[char] = self.vocab_size
                self.idx2char[self.vocab_size] = char
                self.vocab_size += 1

        print(
            f"Initial character vocabulary: "
            f"{self.vocab_size - len(self.special_tokens)} characters"
        )

        # Each full text is treated as one "word" for merge counting.
        words = defaultdict(int)
        for text in texts:
            word = tuple(text)
            words[word] += 1

        num_merges = self.target_vocab_size - self.vocab_size
        print(f"Learning {num_merges} BPE merges...")

        for i in range(num_merges):
            pairs = self._get_stats(words)
            if not pairs:
                break

            best_pair = max(pairs, key=pairs.get)
            words = self._merge_vocab(best_pair, words)

            new_token = ''.join(best_pair)
            if new_token not in self.char2idx:
                self.char2idx[new_token] = self.vocab_size
                self.idx2char[self.vocab_size] = new_token
                self.vocab_size += 1

            self.bpe_ranks[best_pair] = i

            if (i + 1) % 100 == 0:
                print(
                    f"  Learned {i + 1} merges, "
                    f"vocab size: {self.vocab_size}"
                )

        print(f"BPE Vocabulary built! Total tokens: {self.vocab_size}")
        print(f"  - Special tokens: {len(self.special_tokens)}")
        print(f"  - Base characters: {len(vocab)}")
        print(f"  - BPE subwords: {len(self.bpe_ranks)}")
        print(f"  - Sample subwords: {list(self.bpe_ranks.keys())[:5]}")

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into subword tokens by replaying learned merges,
        always applying the lowest-ranked (earliest-learned) pair first."""
        if not text:
            return []

        word = tuple(text)

        while len(word) > 1:
            pairs = [(word[i], word[i + 1]) for i in range(len(word) - 1)]
            valid_pairs = [p for p in pairs if p in self.bpe_ranks]

            if not valid_pairs:
                break

            bigram = min(valid_pairs, key=lambda p: self.bpe_ranks[p])

            new_word = []
            i = 0
            while i < len(word):
                if (
                    i < len(word) - 1
                    and word[i] == bigram[0]
                    and word[i + 1] == bigram[1]
                ):
                    new_word.append("".join(bigram))
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)

        return list(word)

    def add_token(self, token: str) -> None:
        """Register ``token`` as a new vocabulary entry if unseen."""
        if token not in self.char2idx:
            idx = self.vocab_size
            self.char2idx[token] = idx
            self.idx2char[idx] = token
            self.vocab_size += 1

    def encode(
        self, text: str, max_length: int, add_special_tokens: bool = True
    ) -> List[int]:
        """Tokenize ``text`` and return exactly ``max_length`` indices,
        truncating (leaving room for <SOS>/<EOS> when requested) and
        padding with <PAD> as needed. Unknown tokens map to <UNK>."""
        tokens = self._tokenize(text)

        indices = []

        if add_special_tokens:
            indices.append(self.char2idx['<SOS>'])

        for token in tokens[:max_length - (2 if add_special_tokens else 0)]:
            indices.append(self.char2idx.get(token, self.char2idx['<UNK>']))

        if add_special_tokens:
            indices.append(self.char2idx['<EOS>'])

        while len(indices) < max_length:
            indices.append(self.char2idx['<PAD>'])

        return indices

    def decode(self, indices: List[int]) -> str:
        """Map indices back to text, stopping at <EOS> and dropping
        <PAD>/<SOS>/<UNK>."""
        chars = []
        for idx in indices:
            token = self.idx2char.get(idx, '<UNK>')
            if token == '<EOS>':
                break
            if token not in ['<PAD>', '<SOS>', '<UNK>']:
                chars.append(token)
        return ''.join(chars)

    def save(self, filepath: str) -> None:
        """Persist the tokenizer state as JSON.

        BUG FIX: merge pairs used to be stored only as "left_right"
        strings, which is ambiguous once a merged left token itself
        contains '_' (load split on the first '_' and corrupted the
        pair). Merges are now additionally stored as an unambiguous
        ``bpe_merges`` list of [left, right, rank] triples; the legacy
        ``bpe_ranks`` key is kept so older loaders still work.
        """
        state = {
            "char2idx": self.char2idx,
            "special_tokens": self.special_tokens,
            "vocab_size": self.vocab_size,
            "target_vocab_size": self.target_vocab_size,
            "bpe_ranks": {
                f"{k[0]}_{k[1]}": v for k, v in self.bpe_ranks.items()
            },
            "bpe_merges": [
                [left, right, rank]
                for (left, right), rank in self.bpe_ranks.items()
            ],
        }
        with open(filepath, "w") as f:
            json.dump(state, f, indent=2)
        print(f"BPE Tokenizer saved to {filepath}")

    def load(self, filepath: str) -> "Tokenizer":
        """Restore tokenizer state from JSON written by :meth:`save`.

        Prefers the unambiguous ``bpe_merges`` list; falls back to the
        legacy underscore-joined ``bpe_ranks`` keys for old files.
        Returns ``self`` for chaining.
        """
        with open(filepath, "r") as f:
            state = json.load(f)

        self.char2idx = state["char2idx"]
        self.special_tokens = state["special_tokens"]
        self.vocab_size = state["vocab_size"]
        self.target_vocab_size = state.get("target_vocab_size", 1000)
        self.idx2char = {v: k for k, v in self.char2idx.items()}

        self.bpe_ranks = {}
        if "bpe_merges" in state:
            for left, right, rank in state["bpe_merges"]:
                self.bpe_ranks[(left, right)] = rank
        elif "bpe_ranks" in state:
            # Legacy format: splitting on the first '_' is ambiguous when
            # the left token contains '_'; kept only for old files.
            for key, value in state["bpe_ranks"].items():
                parts = key.split("_", 1)
                if len(parts) == 2:
                    self.bpe_ranks[(parts[0], parts[1])] = value

        print(f"BPE Tokenizer loaded from {filepath}")
        print(f"  - Vocab size: {self.vocab_size}")
        print(f"  - BPE merges: {len(self.bpe_ranks)}")

        return self