shadowsilence committed on
Commit
2ba8ae0
·
verified ·
1 Parent(s): cad2609

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ demo.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .ipynb_checkpoints/
4
+ .DS_Store
5
+ *.env
6
+ venv/
7
+ env/
8
+ # models/*.pt
9
+ # If using large models, add them to LFS instead
A4_BERT.ipynb ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "TB4CfcNaQZFN"
7
+ },
8
+ "source": [
9
+ "# A4: BERT Pre-training from Scratch\n",
10
+ "## Student Information\n",
11
+ "**Name:** HTUT KO KO \n",
12
+ "**ID:** st126010 \n",
13
+ "\n",
14
+ "## Task 1: BERT implementation\n",
15
+ "In this notebook, I implement BERT from scratch and pre-train it on the WikiText-103 dataset.\n",
16
+ "**Optimization Note:** To achieve lower loss on this small-scale demonstration, I use a smaller subset of data, a smaller vocabulary, and run for an adequate number of epochs."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "outputs": [
24
+ {
25
+ "name": "stderr",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "/opt/homebrew/Caskroom/miniforge/base/envs/ai_env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
29
+ " from .autonotebook import tqdm as notebook_tqdm\n"
30
+ ]
31
+ },
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "Using device: mps\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "import torch\n",
42
+ "import torch.nn as nn\n",
43
+ "import torch.optim as optim\n",
44
+ "import numpy as np\n",
45
+ "import random\n",
46
+ "from random import randrange, shuffle, randint\n",
47
+ "from datasets import load_dataset\n",
48
+ "from transformers import BertTokenizer\n",
49
+ "from torch.utils.data import DataLoader, Dataset\n",
50
+ "import re\n",
51
+ "from collections import Counter\n",
52
+ "\n",
53
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else (\"mps\" if torch.backends.mps.is_available() else \"cpu\"))\n",
54
+ "print(f\"Using device: {device}\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "markdown",
59
+ "metadata": {},
60
+ "source": [
61
+ "### 1. Data Loading & Preprocessing\n",
62
+ "I will use a smaller vocabulary size to make the model convergence easier for the assignment."
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 2,
68
+ "metadata": {},
69
+ "outputs": [
70
+ {
71
+ "name": "stdout",
72
+ "output_type": "stream",
73
+ "text": [
74
+ "Vocab Size: 5004\n"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "# 1. Load Data\n",
80
+ "dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', split='train')\n",
81
+ "subset_size = 5000 # Reduce size for faster iteration and better convergence on small model\n",
82
+ "dataset = dataset.select(range(subset_size))\n",
83
+ "raw_text_data = [line for line in dataset['text'] if len(line) > 20]\n",
84
+ "\n",
85
+ "# 2. Build Custom Vocabulary (Crucial for small data)\n",
86
+ "# Instead of using 30k BERT vocab, we build one from our data\n",
87
+ "tokens = [word.lower() for sent in raw_text_data for word in sent.split()]\n",
88
+ "vocab_counter = Counter(tokens)\n",
89
+ "vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)[:5000] # Top 5k words\n",
90
+ "word2id = {w: i+4 for i, w in enumerate(vocab)}\n",
91
+ "word2id['[PAD]'] = 0\n",
92
+ "word2id['[CLS]'] = 1\n",
93
+ "word2id['[SEP]'] = 2\n",
94
+ "word2id['[MASK]'] = 3\n",
95
+ "id2word = {i: w for w, i in word2id.items()}\n",
96
+ "vocab_size = len(word2id)\n",
97
+ "print(f\"Vocab Size: {vocab_size}\")\n",
98
+ "\n",
99
+ "token_list = []\n",
100
+ "for sentence in raw_text_data:\n",
101
+ " # Simple whitespace tokenization for this demo\n",
102
+ " seq = [word2id.get(w.lower(), 0) for w in sentence.split()] \n",
103
+ " if len(seq) > 0:\n",
104
+ " token_list.append(seq)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "markdown",
109
+ "metadata": {},
110
+ "source": [
111
+ "### 2. BERT Hyperparameters & Data Loader"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 3,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "max_len = 128\n",
121
+ "batch_size = 16 # Small batch size\n",
122
+ "max_mask = 20\n",
123
+ "n_layers = 2 # Shallower model for easier training\n",
124
+ "n_heads = 4\n",
125
+ "d_model = 256\n",
126
+ "d_ff = 256 * 4\n",
127
+ "d_k = d_v = 64\n",
128
+ "n_segments = 2\n",
129
+ "\n",
130
+ "def make_batch():\n",
131
+ " batch = []\n",
132
+ " positive = negative = 0\n",
133
+ " while positive != batch_size / 2 or negative != batch_size / 2:\n",
134
+ " tokens_a_index, tokens_b_index = randrange(len(token_list)), randrange(len(token_list))\n",
135
+ " tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n",
136
+ "\n",
137
+ " input_ids = [word2id['[CLS]']] + tokens_a + [word2id['[SEP]']] + tokens_b + [word2id['[SEP]']]\n",
138
+ " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n",
139
+ "\n",
140
+ " input_ids = input_ids[:max_len]\n",
141
+ " segment_ids = segment_ids[:max_len]\n",
142
+ "\n",
143
+ " n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))\n",
144
+ " candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != word2id['[CLS]'] and token != word2id['[SEP]']]\n",
145
+ " shuffle(candidates_masked_pos)\n",
146
+ " masked_tokens, masked_pos = [], []\n",
147
+ " for pos in candidates_masked_pos[:n_pred]:\n",
148
+ " masked_pos.append(pos)\n",
149
+ " masked_tokens.append(input_ids[pos])\n",
150
+ " if random.random() < 0.1:\n",
151
+ " input_ids[pos] = randint(0, vocab_size - 1)\n",
152
+ " elif random.random() < 0.8:\n",
153
+ " input_ids[pos] = word2id['[MASK]']\n",
154
+ "\n",
155
+ " n_pad = max_len - len(input_ids)\n",
156
+ " input_ids.extend([0] * n_pad)\n",
157
+ " segment_ids.extend([0] * n_pad)\n",
158
+ "\n",
159
+ " if max_mask > n_pred:\n",
160
+ " n_pad = max_mask - n_pred\n",
161
+ " masked_tokens.extend([0] * n_pad)\n",
162
+ " masked_pos.extend([0] * n_pad)\n",
163
+ "\n",
164
+ " if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n",
165
+ " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])\n",
166
+ " positive += 1\n",
167
+ " elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n",
168
+ " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])\n",
169
+ " negative += 1\n",
170
+ " return batch"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "markdown",
175
+ "metadata": {},
176
+ "source": [
177
+ "### 3. BERT Model Architecture"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 4,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "class Embedding(nn.Module):\n",
187
+ " def __init__(self):\n",
188
+ " super(Embedding, self).__init__()\n",
189
+ " self.tok_embed = nn.Embedding(vocab_size, d_model)\n",
190
+ " self.pos_embed = nn.Embedding(max_len, d_model)\n",
191
+ " self.seg_embed = nn.Embedding(n_segments, d_model)\n",
192
+ " self.norm = nn.LayerNorm(d_model)\n",
193
+ "\n",
194
+ " def forward(self, x, seg):\n",
195
+ " seq_len = x.size(1)\n",
196
+ " pos = torch.arange(seq_len, dtype=torch.long, device=x.device)\n",
197
+ " pos = pos.unsqueeze(0).expand_as(x)\n",
198
+ " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
199
+ " return self.norm(embedding)\n",
200
+ "\n",
201
+ "class MultiHeadAttention(nn.Module):\n",
202
+ " def __init__(self):\n",
203
+ " super(MultiHeadAttention, self).__init__()\n",
204
+ " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
205
+ " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
206
+ " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
207
+ " self.linear = nn.Linear(n_heads * d_v, d_model)\n",
208
+ " self.layer_norm = nn.LayerNorm(d_model)\n",
209
+ "\n",
210
+ " def forward(self, Q, K, V, attn_mask):\n",
211
+ " batch_size = Q.size(0)\n",
212
+ " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
213
+ " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
214
+ " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)\n",
215
+ " \n",
216
+ " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n",
217
+ " \n",
218
+ " scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)\n",
219
+ " scores.masked_fill_(attn_mask, -1e9)\n",
220
+ " attn = nn.Softmax(dim=-1)(scores)\n",
221
+ " context = torch.matmul(attn, v_s)\n",
222
+ " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)\n",
223
+ " output = self.linear(context)\n",
224
+ " return self.layer_norm(output + Q), attn\n",
225
+ "\n",
226
+ "class PoswiseFeedForwardNet(nn.Module):\n",
227
+ " def __init__(self):\n",
228
+ " super(PoswiseFeedForwardNet, self).__init__()\n",
229
+ " self.fc1 = nn.Linear(d_model, d_ff)\n",
230
+ " self.fc2 = nn.Linear(d_ff, d_model)\n",
231
+ "\n",
232
+ " def forward(self, x):\n",
233
+ " return self.fc2(torch.nn.functional.gelu(self.fc1(x)))\n",
234
+ "\n",
235
+ "class EncoderLayer(nn.Module):\n",
236
+ " def __init__(self):\n",
237
+ " super(EncoderLayer, self).__init__()\n",
238
+ " self.enc_self_attn = MultiHeadAttention()\n",
239
+ " self.pos_ffn = PoswiseFeedForwardNet()\n",
240
+ "\n",
241
+ " def forward(self, enc_inputs, enc_self_attn_mask):\n",
242
+ " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)\n",
243
+ " enc_outputs = self.pos_ffn(enc_outputs)\n",
244
+ " return enc_outputs, attn\n",
245
+ "\n",
246
+ "def get_attn_pad_mask(seq_q, seq_k):\n",
247
+ " batch_size, len_q = seq_q.size()\n",
248
+ " batch_size, len_k = seq_k.size()\n",
249
+ " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)\n",
250
+ " return pad_attn_mask.expand(batch_size, len_q, len_k)\n",
251
+ "\n",
252
+ "class BERT(nn.Module):\n",
253
+ " def __init__(self):\n",
254
+ " super(BERT, self).__init__()\n",
255
+ " self.embedding = Embedding()\n",
256
+ " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
257
+ " self.fc = nn.Linear(d_model, d_model)\n",
258
+ " self.activ = nn.Tanh()\n",
259
+ " self.linear = nn.Linear(d_model, d_model)\n",
260
+ " self.norm = nn.LayerNorm(d_model)\n",
261
+ " self.classifier = nn.Linear(d_model, 2)\n",
262
+ " embed_weight = self.embedding.tok_embed.weight\n",
263
+ " n_vocab, n_dim = embed_weight.size()\n",
264
+ " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n",
265
+ " self.decoder.weight = embed_weight\n",
266
+ " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
267
+ "\n",
268
+ " def forward(self, input_ids, segment_ids, masked_pos):\n",
269
+ " output = self.embedding(input_ids, segment_ids)\n",
270
+ " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
271
+ " for layer in self.layers:\n",
272
+ " output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
273
+ " \n",
274
+ " h_pooled = self.activ(self.fc(output[:, 0]))\n",
275
+ " logits_nsp = self.classifier(h_pooled)\n",
276
+ " \n",
277
+ " masked_pos = masked_pos[:, :, None].expand(-1, -1, d_model)\n",
278
+ " h_masked = torch.gather(output, 1, masked_pos)\n",
279
+ " h_masked = self.norm(self.activ(self.linear(h_masked)))\n",
280
+ " logits_lm = self.decoder(h_masked) + self.decoder_bias\n",
281
+ "\n",
282
+ " return logits_lm, logits_nsp, output"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "markdown",
287
+ "metadata": {},
288
+ "source": [
289
+ "### 4. Training Loop\n",
290
+ "I train for 2000 epochs to ensure convergence."
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": 6,
296
+ "metadata": {},
297
+ "outputs": [
298
+ {
299
+ "name": "stdout",
300
+ "output_type": "stream",
301
+ "text": [
302
+ "Starting Training...\n",
303
+ "Epoch: 0100 | loss = 13.480813\n",
304
+ "Epoch: 0200 | loss = 10.269491\n",
305
+ "Epoch: 0300 | loss = 8.912387\n",
306
+ "Epoch: 0400 | loss = 7.265475\n",
307
+ "Epoch: 0500 | loss = 6.765235\n",
308
+ "Epoch: 0600 | loss = 7.150949\n",
309
+ "Epoch: 0700 | loss = 6.182394\n",
310
+ "Epoch: 0800 | loss = 6.075039\n",
311
+ "Epoch: 0900 | loss = 6.766500\n",
312
+ "Epoch: 1000 | loss = 6.545547\n",
313
+ "Epoch: 1100 | loss = 6.488539\n",
314
+ "Epoch: 1200 | loss = 6.223000\n",
315
+ "Epoch: 1300 | loss = 5.912578\n",
316
+ "Epoch: 1400 | loss = 6.125433\n",
317
+ "Epoch: 1500 | loss = 6.077301\n",
318
+ "Epoch: 1600 | loss = 6.500366\n",
319
+ "Epoch: 1700 | loss = 6.560534\n",
320
+ "Epoch: 1800 | loss = 6.262241\n",
321
+ "Epoch: 1900 | loss = 5.871750\n",
322
+ "Epoch: 2000 | loss = 6.158124\n",
323
+ "Training Complete. Model Saved.\n"
324
+ ]
325
+ }
326
+ ],
327
+ "source": [
328
+ "model = BERT().to(device)\n",
329
+ "criterion = nn.CrossEntropyLoss(ignore_index=0)\n",
330
+ "criterion_nsp = nn.CrossEntropyLoss()\n",
331
+ "optimizer = optim.Adam(model.parameters(), lr=1e-3)\n",
332
+ "\n",
333
+ "print(\"Starting Training...\")\n",
334
+ "for epoch in range(2000):\n",
335
+ " batch = make_batch()\n",
336
+ " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n",
337
+ " input_ids, segment_ids, masked_tokens, masked_pos, isNext = input_ids.to(device), segment_ids.to(device), masked_tokens.to(device), masked_pos.to(device), isNext.to(device)\n",
338
+ "\n",
339
+ " optimizer.zero_grad()\n",
340
+ " logits_lm, logits_nsp, _ = model(input_ids, segment_ids, masked_pos)\n",
341
+ " loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens).mean()\n",
342
+ " loss_nsp = criterion_nsp(logits_nsp, isNext)\n",
343
+ " loss = loss_lm + loss_nsp\n",
344
+ " loss.backward()\n",
345
+ " optimizer.step()\n",
346
+ " \n",
347
+ " if (epoch + 1) % 100 == 0:\n",
348
+ " print(f'Epoch: {epoch + 1:04d} | loss = {loss.item():.6f}')\n",
349
+ "\n",
350
+ "torch.save(model.state_dict(), './models/bert_trained.pt')\n",
351
+ "print(\"Training Complete. Model Saved.\")"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": null,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": []
360
+ }
361
+ ],
362
+ "metadata": {
363
+ "kernelspec": {
364
+ "display_name": "ai_env",
365
+ "language": "python",
366
+ "name": "python3"
367
+ },
368
+ "language_info": {
369
+ "codemirror_mode": {
370
+ "name": "ipython",
371
+ "version": 3
372
+ },
373
+ "file_extension": ".py",
374
+ "mimetype": "text/x-python",
375
+ "name": "python",
376
+ "nbconvert_exporter": "python",
377
+ "pygments_lexer": "ipython3",
378
+ "version": "3.11.13"
379
+ }
380
+ },
381
+ "nbformat": 4,
382
+ "nbformat_minor": 4
383
+ }
A4_Climate_FEVER.ipynb ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "TB4CfcNaQZFN"
7
+ },
8
+ "source": [
9
+ "# A4: S-BERT Training (Climate-FEVER)\n",
10
+ "## Student Information\n",
11
+ "**Name:** HTUT KO KO \n",
12
+ "**ID:** st126010 \n",
13
+ "\n",
14
+ "## Task 2: S-BERT Implementation\n",
15
+ "In this notebook, I load the pre-trained BERT model (from Task 1) and fine-tune it using a Siamese network structure for Natural Language Inference (NLI) on the Climate-FEVER dataset."
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 9,
21
+ "metadata": {},
22
+ "outputs": [
23
+ {
24
+ "name": "stdout",
25
+ "output_type": "stream",
26
+ "text": [
27
+ "Using device: mps\n"
28
+ ]
29
+ }
30
+ ],
31
+ "source": [
32
+ "import torch\n",
33
+ "import torch.nn as nn\n",
34
+ "import torch.optim as optim\n",
35
+ "import numpy as np\n",
36
+ "from datasets import load_dataset\n",
37
+ "from transformers import BertTokenizer\n",
38
+ "from torch.utils.data import DataLoader, Dataset\n",
39
+ "from sklearn.metrics import classification_report, accuracy_score\n",
40
+ "\n",
41
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else (\"mps\" if torch.backends.mps.is_available() else \"cpu\"))\n",
42
+ "print(f\"Using device: {device}\")"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 10,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "# Model Hyperparameters (Must match Pre-training)\n",
52
+ "max_len = 128\n",
53
+ "n_layers = 2\n",
54
+ "n_heads = 4\n",
55
+ "d_model = 256\n",
56
+ "d_ff = 256 * 4\n",
57
+ "d_k = d_v = 64\n",
58
+ "n_segments = 2\n",
59
+ "vocab_size = 5004"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "## 1. BERT Model Definition\n",
67
+ "Required to load the saved state dictionary."
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 11,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "Loaded bert_trained.pt successfully.\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "class Embedding(nn.Module):\n",
85
+ " def __init__(self):\n",
86
+ " super(Embedding, self).__init__()\n",
87
+ " self.tok_embed = nn.Embedding(vocab_size, d_model)\n",
88
+ " self.pos_embed = nn.Embedding(max_len, d_model)\n",
89
+ " self.seg_embed = nn.Embedding(n_segments, d_model)\n",
90
+ " self.norm = nn.LayerNorm(d_model)\n",
91
+ "\n",
92
+ " def forward(self, x, seg):\n",
93
+ " seq_len = x.size(1)\n",
94
+ " pos = torch.arange(seq_len, dtype=torch.long, device=x.device)\n",
95
+ " pos = pos.unsqueeze(0).expand_as(x)\n",
96
+ " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
97
+ " return self.norm(embedding)\n",
98
+ "\n",
99
+ "class MultiHeadAttention(nn.Module):\n",
100
+ " def __init__(self):\n",
101
+ " super(MultiHeadAttention, self).__init__()\n",
102
+ " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
103
+ " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
104
+ " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
105
+ " self.linear = nn.Linear(n_heads * d_v, d_model)\n",
106
+ " self.layer_norm = nn.LayerNorm(d_model)\n",
107
+ "\n",
108
+ " def forward(self, Q, K, V, attn_mask):\n",
109
+ " batch_size = Q.size(0)\n",
110
+ " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
111
+ " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
112
+ " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)\n",
113
+ " \n",
114
+ " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n",
115
+ " \n",
116
+ " scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)\n",
117
+ " scores.masked_fill_(attn_mask, -1e9)\n",
118
+ " attn = nn.Softmax(dim=-1)(scores)\n",
119
+ " context = torch.matmul(attn, v_s)\n",
120
+ " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)\n",
121
+ " output = self.linear(context)\n",
122
+ " return self.layer_norm(output + Q), attn\n",
123
+ "\n",
124
+ "class PoswiseFeedForwardNet(nn.Module):\n",
125
+ " def __init__(self):\n",
126
+ " super(PoswiseFeedForwardNet, self).__init__()\n",
127
+ " self.fc1 = nn.Linear(d_model, d_ff)\n",
128
+ " self.fc2 = nn.Linear(d_ff, d_model)\n",
129
+ "\n",
130
+ " def forward(self, x):\n",
131
+ " return self.fc2(torch.nn.functional.gelu(self.fc1(x)))\n",
132
+ "\n",
133
+ "class EncoderLayer(nn.Module):\n",
134
+ " def __init__(self):\n",
135
+ " super(EncoderLayer, self).__init__()\n",
136
+ " self.enc_self_attn = MultiHeadAttention()\n",
137
+ " self.pos_ffn = PoswiseFeedForwardNet()\n",
138
+ "\n",
139
+ " def forward(self, enc_inputs, enc_self_attn_mask):\n",
140
+ " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)\n",
141
+ " enc_outputs = self.pos_ffn(enc_outputs)\n",
142
+ " return enc_outputs, attn\n",
143
+ "\n",
144
+ "def get_attn_pad_mask(seq_q, seq_k):\n",
145
+ " batch_size, len_q = seq_q.size()\n",
146
+ " batch_size, len_k = seq_k.size()\n",
147
+ " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)\n",
148
+ " return pad_attn_mask.expand(batch_size, len_q, len_k)\n",
149
+ "\n",
150
+ "class BERT(nn.Module):\n",
151
+ " def __init__(self):\n",
152
+ " super(BERT, self).__init__()\n",
153
+ " self.embedding = Embedding()\n",
154
+ " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
155
+ " self.fc = nn.Linear(d_model, d_model)\n",
156
+ " self.activ = nn.Tanh()\n",
157
+ " self.linear = nn.Linear(d_model, d_model)\n",
158
+ " self.norm = nn.LayerNorm(d_model)\n",
159
+ " self.classifier = nn.Linear(d_model, 2)\n",
160
+ " embed_weight = self.embedding.tok_embed.weight\n",
161
+ " n_vocab, n_dim = embed_weight.size()\n",
162
+ " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n",
163
+ " self.decoder.weight = embed_weight\n",
164
+ " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
165
+ "\n",
166
+ " def forward(self, input_ids, segment_ids, masked_pos):\n",
167
+ " output = self.embedding(input_ids, segment_ids)\n",
168
+ " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
169
+ " for layer in self.layers:\n",
170
+ " output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
171
+ " \n",
172
+ " h_pooled = self.activ(self.fc(output[:, 0]))\n",
173
+ " logits_nsp = self.classifier(h_pooled)\n",
174
+ " \n",
175
+ " # For S-BERT, I return the output sequences directly\n",
176
+ " return logits_nsp, logits_nsp, output\n",
177
+ "\n",
178
+ "# Load Pre-trained Parameters\n",
179
+ "bert = BERT().to(device)\n",
180
+ "try:\n",
181
+ " bert.load_state_dict(torch.load('./models/bert_trained.pt', map_location=device))\n",
182
+ " print(\"Loaded bert_trained.pt successfully.\")\n",
183
+ "except:\n",
184
+ " print(\"Pre-trained weights not found. Please run A4_BERT.ipynb first.\")"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "markdown",
189
+ "metadata": {},
190
+ "source": [
191
+ "## 2. S-BERT for Climate-FEVER\n",
192
+ "Fine-tuning on the Climate-FEVER dataset."
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 12,
198
+ "metadata": {},
199
+ "outputs": [],
200
+ "source": [
201
+ "class SBERT(nn.Module):\n",
202
+ " def __init__(self, bert_model):\n",
203
+ " super(SBERT, self).__init__()\n",
204
+ " self.bert = bert_model\n",
205
+ " self.classifier = nn.Linear(d_model * 3, 3)\n",
206
+ "\n",
207
+ " def forward(self, premise_ids, premise_seg, hypothesis_ids, hypothesis_seg):\n",
208
+ " device = premise_ids.device\n",
209
+ " dummy_masked_pos = torch.zeros((premise_ids.size(0), 1), dtype=torch.long).to(device)\n",
210
+ " \n",
211
+ " _, _, output_u = self.bert(premise_ids, premise_seg, dummy_masked_pos)\n",
212
+ " mask_u = (premise_ids != 0).unsqueeze(-1).float()\n",
213
+ " u = torch.sum(output_u * mask_u, dim=1) / torch.clamp(mask_u.sum(dim=1), min=1e-9)\n",
214
+ "\n",
215
+ " _, _, output_v = self.bert(hypothesis_ids, hypothesis_seg, dummy_masked_pos)\n",
216
+ " mask_v = (hypothesis_ids != 0).unsqueeze(-1).float()\n",
217
+ " v = torch.sum(output_v * mask_v, dim=1) / torch.clamp(mask_v.sum(dim=1), min=1e-9)\n",
218
+ "\n",
219
+ " uv_abs = torch.abs(u - v)\n",
220
+ " features = torch.cat([u, v, uv_abs], dim=-1)\n",
221
+ " logits = self.classifier(features)\n",
222
+ " return logits\n",
223
+ "\n",
224
+ "s_model = SBERT(bert).to(device)\n",
225
+ "optimizer = optim.Adam(s_model.parameters(), lr=2e-5)\n",
226
+ "criterion = nn.CrossEntropyLoss()"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "markdown",
231
+ "metadata": {},
232
+ "source": [
233
+ "### 2.1 Climate-FEVER fine-tuning\n",
234
+ "I load the Climate-FEVER dataset, split into train/test, and train the model."
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 13,
240
+ "metadata": {},
241
+ "outputs": [
242
+ {
243
+ "name": "stdout",
244
+ "output_type": "stream",
245
+ "text": [
246
+ "Starting S-BERT Training...\n",
247
+ "Epoch 1 Loss: 0.9086\n",
248
+ "Epoch 2 Loss: 0.8578\n",
249
+ "Epoch 3 Loss: 0.8520\n",
250
+ "Epoch 4 Loss: 0.8444\n",
251
+ "Epoch 5 Loss: 0.8365\n",
252
+ "Epoch 6 Loss: 0.8249\n",
253
+ "Epoch 7 Loss: 0.8103\n",
254
+ "Epoch 8 Loss: 0.7932\n",
255
+ "Epoch 9 Loss: 0.7652\n",
256
+ "Epoch 10 Loss: 0.7349\n",
257
+ "Epoch 11 Loss: 0.6856\n",
258
+ "Epoch 12 Loss: 0.6360\n",
259
+ "Epoch 13 Loss: 0.5786\n",
260
+ "Epoch 14 Loss: 0.5181\n",
261
+ "Epoch 15 Loss: 0.4629\n",
262
+ "Epoch 16 Loss: 0.4108\n",
263
+ "Epoch 17 Loss: 0.3392\n",
264
+ "Epoch 18 Loss: 0.3047\n",
265
+ "Epoch 19 Loss: 0.2546\n",
266
+ "Epoch 20 Loss: 0.2053\n",
267
+ "Epoch 21 Loss: 0.1662\n",
268
+ "Epoch 22 Loss: 0.1421\n",
269
+ "Epoch 23 Loss: 0.1161\n",
270
+ "Epoch 24 Loss: 0.0918\n",
271
+ "Epoch 25 Loss: 0.0798\n",
272
+ "Epoch 26 Loss: 0.0740\n",
273
+ "Epoch 27 Loss: 0.0591\n",
274
+ "Epoch 28 Loss: 0.0488\n",
275
+ "Epoch 29 Loss: 0.0448\n",
276
+ "Epoch 30 Loss: 0.0452\n",
277
+ "Epoch 31 Loss: 0.0354\n",
278
+ "Epoch 32 Loss: 0.0315\n",
279
+ "Epoch 33 Loss: 0.0265\n",
280
+ "Epoch 34 Loss: 0.0232\n",
281
+ "Epoch 35 Loss: 0.0215\n",
282
+ "Epoch 36 Loss: 0.0180\n",
283
+ "Epoch 37 Loss: 0.0173\n",
284
+ "Epoch 38 Loss: 0.0147\n",
285
+ "Epoch 39 Loss: 0.0137\n",
286
+ "Epoch 40 Loss: 0.0159\n",
287
+ "Epoch 41 Loss: 0.0127\n",
288
+ "Epoch 42 Loss: 0.0102\n",
289
+ "Epoch 43 Loss: 0.0094\n",
290
+ "Epoch 44 Loss: 0.0094\n",
291
+ "Epoch 45 Loss: 0.0100\n",
292
+ "Epoch 46 Loss: 0.0112\n",
293
+ "Epoch 47 Loss: 0.0077\n",
294
+ "Epoch 48 Loss: 0.0067\n",
295
+ "Epoch 49 Loss: 0.0073\n",
296
+ "Epoch 50 Loss: 0.0268\n",
297
+ "Epoch 51 Loss: 0.0747\n",
298
+ "Epoch 52 Loss: 0.0405\n",
299
+ "Epoch 53 Loss: 0.0241\n",
300
+ "Epoch 54 Loss: 0.0077\n",
301
+ "Epoch 55 Loss: 0.0054\n",
302
+ "Epoch 56 Loss: 0.0049\n",
303
+ "Epoch 57 Loss: 0.0047\n",
304
+ "Epoch 58 Loss: 0.0047\n",
305
+ "Epoch 59 Loss: 0.0052\n",
306
+ "Epoch 60 Loss: 0.0038\n",
307
+ "Epoch 61 Loss: 0.0037\n",
308
+ "Epoch 62 Loss: 0.0039\n",
309
+ "Epoch 63 Loss: 0.0037\n",
310
+ "Epoch 64 Loss: 0.0049\n",
311
+ "Epoch 65 Loss: 0.0036\n",
312
+ "Epoch 66 Loss: 0.0035\n",
313
+ "Epoch 67 Loss: 0.0037\n",
314
+ "Epoch 68 Loss: 0.0040\n",
315
+ "Epoch 69 Loss: 0.0044\n",
316
+ "Epoch 70 Loss: 0.0038\n",
317
+ "Epoch 71 Loss: 0.0079\n",
318
+ "Epoch 72 Loss: 0.0093\n",
319
+ "Epoch 73 Loss: 0.0033\n",
320
+ "Epoch 74 Loss: 0.0030\n",
321
+ "Epoch 75 Loss: 0.0032\n",
322
+ "Epoch 76 Loss: 0.0032\n",
323
+ "Epoch 77 Loss: 0.0029\n",
324
+ "Epoch 78 Loss: 0.0026\n",
325
+ "Epoch 79 Loss: 0.0031\n",
326
+ "Epoch 80 Loss: 0.0021\n",
327
+ "Epoch 81 Loss: 0.0048\n",
328
+ "Epoch 82 Loss: 0.0030\n",
329
+ "Epoch 83 Loss: 0.0045\n",
330
+ "Epoch 84 Loss: 0.0045\n",
331
+ "Epoch 85 Loss: 0.0061\n",
332
+ "Epoch 86 Loss: 0.0784\n",
333
+ "Epoch 87 Loss: 0.0706\n",
334
+ "Epoch 88 Loss: 0.0293\n",
335
+ "Epoch 89 Loss: 0.0059\n",
336
+ "Epoch 90 Loss: 0.0028\n",
337
+ "Epoch 91 Loss: 0.0024\n",
338
+ "Epoch 92 Loss: 0.0023\n",
339
+ "Epoch 93 Loss: 0.0020\n",
340
+ "Epoch 94 Loss: 0.0019\n",
341
+ "Epoch 95 Loss: 0.0018\n",
342
+ "Epoch 96 Loss: 0.0015\n",
343
+ "Epoch 97 Loss: 0.0016\n",
344
+ "Epoch 98 Loss: 0.0019\n",
345
+ "Epoch 99 Loss: 0.0014\n",
346
+ "Epoch 100 Loss: 0.0017\n"
347
+ ]
348
+ }
349
+ ],
350
+ "source": [
351
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
352
+ "cf_dataset = load_dataset('climate_fever', split='test') \n",
353
+ "# Often, only the 'test' split of Climate-FEVER is publicly available. I'll use it as our full dataset.\n",
354
+ "cf_split = cf_dataset.train_test_split(test_size=0.2, seed=42)\n",
355
+ "train_dataset = cf_split['train']\n",
356
+ "test_dataset = cf_split['test']\n",
357
+ "\n",
358
+ "class NLIDataset(Dataset):\n",
359
+ " def __init__(self, dataset, tokenizer, max_len=128):\n",
360
+ " self.dataset = dataset\n",
361
+ " self.tokenizer = tokenizer\n",
362
+ " self.max_len = max_len\n",
363
+ "\n",
364
+ " def __getitem__(self, idx):\n",
365
+ " item = self.dataset[idx]\n",
366
+ " premise = item['claim']\n",
367
+ " # Climate-FEVER has 'evidences' list. I take the first evidence text.\n",
368
+ " evidence_data = item['evidences'][0]\n",
369
+ " hypothesis = evidence_data['evidence']\n",
370
+ " label_raw = evidence_data['evidence_label'] # Nested access\n",
371
+ "\n",
372
+ " # Robust label mapping (Handles integers 0/1/2 and strings)\n",
373
+ " # Target: 0: Entailment, 1: Neutral, 2: Contradiction\n",
374
+ " if isinstance(label_raw, int):\n",
375
+ " # Assuming HF Climate-FEVER uses: 0: Supports, 1: Refutes, 2: NEI\n",
376
+ " if label_raw == 0: label = 0 # Supports -> Entailment\n",
377
+ " elif label_raw == 1: label = 2 # Refutes -> Contradiction\n",
378
+ " elif label_raw == 2: label = 1 # NEI -> Neutral\n",
379
+ " else: label = 1 # Default\n",
380
+ " else:\n",
381
+ " label_str = str(label_raw).upper().replace(\" \", \"_\")\n",
382
+ " if 'SUPPORT' in label_str: label = 0\n",
383
+ " elif 'REFUTE' in label_str: label = 2\n",
384
+ " elif 'INFO' in label_str: label = 1\n",
385
+ " else: label = 1\n",
386
+ "\n",
387
+ " encoded_premise = self.tokenizer(\n",
388
+ " premise,\n",
389
+ " add_special_tokens=True,\n",
390
+ " max_length=self.max_len,\n",
391
+ " padding='max_length',\n",
392
+ " return_attention_mask=True,\n",
393
+ " truncation=True\n",
394
+ " )\n",
395
+ "\n",
396
+ " encoded_hypothesis = self.tokenizer(\n",
397
+ " hypothesis,\n",
398
+ " add_special_tokens=True,\n",
399
+ " max_length=self.max_len,\n",
400
+ " padding='max_length',\n",
401
+ " return_attention_mask=True,\n",
402
+ " truncation=True\n",
403
+ " )\n",
404
+ "\n",
405
+ " return {\n",
406
+ " 'premise_input_ids': torch.tensor(encoded_premise['input_ids'], dtype=torch.long),\n",
407
+ " 'premise_segment_ids': torch.tensor(encoded_premise['token_type_ids'], dtype=torch.long),\n",
408
+ " 'hypothesis_input_ids': torch.tensor(encoded_hypothesis['input_ids'], dtype=torch.long),\n",
409
+ " 'hypothesis_segment_ids': torch.tensor(encoded_hypothesis['token_type_ids'], dtype=torch.long),\n",
410
+ " 'label': torch.tensor(label, dtype=torch.long)\n",
411
+ " }\n",
412
+ "\n",
413
+ " def __len__(self):\n",
414
+ " return len(self.dataset)\n",
415
+ "\n",
416
+ "train_loader = DataLoader(NLIDataset(train_dataset, tokenizer), batch_size=16, shuffle=True)\n",
417
+ "test_loader = DataLoader(NLIDataset(test_dataset, tokenizer), batch_size=16, shuffle=False)\n",
418
+ "\n",
419
+ "print(\"Starting S-BERT Training...\")\n",
420
+ "for epoch in range(100): \n",
421
+ " s_model.train()\n",
422
+ " total_loss = 0\n",
423
+ " for batch in train_loader:\n",
424
+ " p_ids = batch['premise_input_ids'].to(device)\n",
425
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
426
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
427
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
428
+ " labels = batch['label'].to(device)\n",
429
+ "\n",
430
+ " optimizer.zero_grad()\n",
431
+ " logits = s_model(p_ids, p_seg, h_ids, h_seg)\n",
432
+ " loss = criterion(logits, labels)\n",
433
+ " loss.backward()\n",
434
+ " optimizer.step()\n",
435
+ " total_loss += loss.item()\n",
436
+ " print(f\"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}\")\n",
437
+ "\n",
438
+ "torch.save(s_model.state_dict(), './models/sbert_climate_fever.pt')"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "markdown",
443
+ "metadata": {},
444
+ "source": [
445
+ "## 3. Evaluation\n",
446
+ "Evaluation of the model on the held-out test set."
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 14,
452
+ "metadata": {},
453
+ "outputs": [
454
+ {
455
+ "name": "stdout",
456
+ "output_type": "stream",
457
+ "text": [
458
+ "Evaluating...\n",
459
+ "Classification Report:\n",
460
+ " precision recall f1-score support\n",
461
+ "\n",
462
+ " Entailment 0.26 0.22 0.24 82\n",
463
+ " Neutral 0.62 0.69 0.65 191\n",
464
+ "Contradiction 0.19 0.15 0.17 34\n",
465
+ "\n",
466
+ " accuracy 0.50 307\n",
467
+ " macro avg 0.36 0.35 0.35 307\n",
468
+ " weighted avg 0.48 0.50 0.49 307\n",
469
+ "\n",
470
+ "Accuracy: 0.5049\n"
471
+ ]
472
+ }
473
+ ],
474
+ "source": [
475
+ "s_model.eval()\n",
476
+ "all_preds = []\n",
477
+ "all_labels = []\n",
478
+ "\n",
479
+ "print(\"Evaluating...\")\n",
480
+ "with torch.no_grad():\n",
481
+ " for batch in test_loader:\n",
482
+ " p_ids = batch['premise_input_ids'].to(device)\n",
483
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
484
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
485
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
486
+ " labels = batch['label'].to(device)\n",
487
+ "\n",
488
+ " logits = s_model(p_ids, p_seg, h_ids, h_seg)\n",
489
+ " preds = torch.argmax(logits, dim=1)\n",
490
+ " \n",
491
+ " all_preds.extend(preds.cpu().numpy())\n",
492
+ " all_labels.extend(labels.cpu().numpy())\n",
493
+ "\n",
494
+ "target_names = ['Entailment', 'Neutral', 'Contradiction']\n",
495
+ "print(\"Classification Report:\")\n",
496
+ "print(classification_report(all_labels, all_preds, labels=[0, 1, 2], target_names=target_names))\n",
497
+ "print(f\"Accuracy: {accuracy_score(all_labels, all_preds):.4f}\")"
498
+ ]
499
+ }
500
+ ],
501
+ "metadata": {
502
+ "kernelspec": {
503
+ "display_name": "Python 3",
504
+ "language": "python",
505
+ "name": "python3"
506
+ },
507
+ "language_info": {
508
+ "codemirror_mode": {
509
+ "name": "ipython",
510
+ "version": 3
511
+ },
512
+ "file_extension": ".py",
513
+ "mimetype": "text/x-python",
514
+ "name": "python",
515
+ "nbconvert_exporter": "python",
516
+ "pygments_lexer": "ipython3",
517
+ "version": "3.8.5"
518
+ }
519
+ },
520
+ "nbformat": 4,
521
+ "nbformat_minor": 4
522
+ }
A4_Option_MNLI.ipynb ADDED
@@ -0,0 +1,518 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# A4: S-BERT Training on Alternative Datasets (MNLI)\n",
8
+ "\n",
9
+ "This notebook allows me to train the S-BERT model on the **MNLI** (Multi-Genre Natural Language Inference) dataset.\n",
10
+ "\n",
11
+ "## 1. Environment Setup"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 8,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Using device: mps\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "import os\n",
29
+ "import torch\n",
30
+ "import torch.nn as nn\n",
31
+ "import torch.optim as optim\n",
32
+ "import numpy as np\n",
33
+ "from datasets import load_dataset\n",
34
+ "from transformers import BertTokenizer\n",
35
+ "from torch.utils.data import DataLoader, Dataset\n",
36
+ "from sklearn.metrics import classification_report, accuracy_score\n",
37
+ "\n",
38
+ "# Device Configuration\n",
39
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else (\"mps\" if torch.backends.mps.is_available() else \"cpu\"))\n",
40
+ "print(f\"Using device: {device}\")"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "## 2. Load Pre-trained BERT\n",
48
+ "\n",
49
+ "I will load the BERT model trained in `A4_BERT.ipynb`. Ensure `models/bert_trained.pt` exists."
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 9,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Loaded bert_trained.pt\n"
62
+ ]
63
+ }
64
+ ],
65
+ "source": [
66
+ "# Define BERT Architecture\n",
67
+ "# MUST MATCH THE OPTIMIZED CONFIG FROM A4_BERT.ipynb\n",
68
+ "vocab_size = 5004 # Updated from 30522\n",
69
+ "d_model = 256 # MiniBERT Config\n",
70
+ "n_layers = 2 # Updated from 4\n",
71
+ "n_heads = 4\n",
72
+ "d_ff = 256 * 4\n",
73
+ "max_len = 128\n",
74
+ "n_segments = 2\n",
75
+ "d_k = d_v = 64\n",
76
+ "\n",
77
+ "class Embedding(nn.Module):\n",
78
+ " def __init__(self):\n",
79
+ " super(Embedding, self).__init__()\n",
80
+ " self.tok_embed = nn.Embedding(vocab_size, d_model)\n",
81
+ " self.pos_embed = nn.Embedding(max_len, d_model)\n",
82
+ " self.seg_embed = nn.Embedding(n_segments, d_model)\n",
83
+ " self.norm = nn.LayerNorm(d_model)\n",
84
+ "\n",
85
+ " def forward(self, x, seg):\n",
86
+ " seq_len = x.size(1)\n",
87
+ " pos = torch.arange(seq_len, dtype=torch.long, device=x.device)\n",
88
+ " pos = pos.unsqueeze(0).expand_as(x)\n",
89
+ " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
90
+ " return self.norm(embedding)\n",
91
+ "\n",
92
+ "class MultiHeadAttention(nn.Module):\n",
93
+ " def __init__(self):\n",
94
+ " super(MultiHeadAttention, self).__init__()\n",
95
+ " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
96
+ " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
97
+ " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
98
+ " self.linear = nn.Linear(n_heads * d_v, d_model)\n",
99
+ " self.layer_norm = nn.LayerNorm(d_model)\n",
100
+ "\n",
101
+ " def forward(self, Q, K, V, attn_mask):\n",
102
+ " batch_size = Q.size(0)\n",
103
+ " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)\n",
104
+ " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)\n",
105
+ " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)\n",
106
+ " \n",
107
+ " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n",
108
+ " \n",
109
+ " scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)\n",
110
+ " scores.masked_fill_(attn_mask, -1e9)\n",
111
+ " attn = nn.Softmax(dim=-1)(scores)\n",
112
+ " context = torch.matmul(attn, v_s)\n",
113
+ " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)\n",
114
+ " output = self.linear(context)\n",
115
+ " return self.layer_norm(output + Q), attn\n",
116
+ "\n",
117
+ "class PoswiseFeedForwardNet(nn.Module):\n",
118
+ " def __init__(self):\n",
119
+ " super(PoswiseFeedForwardNet, self).__init__()\n",
120
+ " self.fc1 = nn.Linear(d_model, d_ff)\n",
121
+ " self.fc2 = nn.Linear(d_ff, d_model)\n",
122
+ " def forward(self, x):\n",
123
+ " return self.fc2(torch.nn.functional.gelu(self.fc1(x)))\n",
124
+ "\n",
125
+ "class EncoderLayer(nn.Module):\n",
126
+ " def __init__(self):\n",
127
+ " super(EncoderLayer, self).__init__()\n",
128
+ " self.enc_self_attn = MultiHeadAttention()\n",
129
+ " self.pos_ffn = PoswiseFeedForwardNet()\n",
130
+ " def forward(self, enc_inputs, enc_self_attn_mask):\n",
131
+ " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)\n",
132
+ " enc_outputs = self.pos_ffn(enc_outputs)\n",
133
+ " return enc_outputs, attn\n",
134
+ "\n",
135
+ "def get_attn_pad_mask(seq_q, seq_k):\n",
136
+ " batch_size, len_q = seq_q.size()\n",
137
+ " batch_size, len_k = seq_k.size()\n",
138
+ " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)\n",
139
+ " return pad_attn_mask.expand(batch_size, len_q, len_k)\n",
140
+ "\n",
141
+ "class BERT(nn.Module):\n",
142
+ " def __init__(self):\n",
143
+ " super(BERT, self).__init__()\n",
144
+ " self.embedding = Embedding()\n",
145
+ " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
146
+ " self.fc = nn.Linear(d_model, d_model)\n",
147
+ " self.activ = nn.Tanh()\n",
148
+ " self.linear = nn.Linear(d_model, d_model)\n",
149
+ " self.norm = nn.LayerNorm(d_model)\n",
150
+ " self.classifier = nn.Linear(d_model, 2)\n",
151
+ " embed_weight = self.embedding.tok_embed.weight\n",
152
+ " n_vocab, n_dim = embed_weight.size()\n",
153
+ " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n",
154
+ " self.decoder.weight = embed_weight\n",
155
+ " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
156
+ "\n",
157
+ " def forward(self, input_ids, segment_ids, masked_pos=None):\n",
158
+ " output = self.embedding(input_ids, segment_ids)\n",
159
+ " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
160
+ " for layer in self.layers:\n",
161
+ " output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
162
+ " return None, None, output \n",
163
+ "\n",
164
+ "# Load Pretrained Weights\n",
165
+ "bert = BERT().to(device)\n",
166
+ "try:\n",
167
+ " bert.load_state_dict(torch.load('./models/bert_trained.pt', map_location=device))\n",
168
+ " print(\"Loaded bert_trained.pt\")\n",
169
+ "except:\n",
170
+ " print(\"Warning: bert_trained.pt not found. Using random weights.\")\n"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "markdown",
175
+ "metadata": {},
176
+ "source": [
177
+ "## 3. Load MNLI Dataset\n"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 10,
183
+ "metadata": {},
184
+ "outputs": [
185
+ {
186
+ "name": "stdout",
187
+ "output_type": "stream",
188
+ "text": [
189
+ "Loading mnli...\n",
190
+ "Loaded dataset keys: dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])\n",
191
+ "Train size: 10000, Val size: 1000\n"
192
+ ]
193
+ }
194
+ ],
195
+ "source": [
196
+ "DATASET_NAME = 'mnli'\n",
197
+ "print(f\"Loading {DATASET_NAME}...\")\n",
198
+ "# MNLI is part of GLUE benchmark\n",
199
+ "dataset = load_dataset('glue', 'mnli') \n",
200
+ "print(f\"Loaded dataset keys: {dataset.keys()}\")\n",
201
+ "\n",
202
+ "train_dataset = dataset['train'].select(range(10000))\n",
203
+ "val_dataset = dataset['validation_matched'].select(range(1000))\n",
204
+ "print(f\"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}\")"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 11,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "# Data Loader\n",
214
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
215
+ "\n",
216
+ "class NLIDataset(Dataset):\n",
217
+ " def __init__(self, dataset, tokenizer, max_len=128):\n",
218
+ " self.dataset = dataset\n",
219
+ " self.tokenizer = tokenizer\n",
220
+ " self.max_len = max_len\n",
221
+ "\n",
222
+ " def __len__(self):\n",
223
+ " return len(self.dataset)\n",
224
+ "\n",
225
+ " def __getitem__(self, idx):\n",
226
+ " item = self.dataset[idx]\n",
227
+ " premise = item['premise']\n",
228
+ " hypothesis = item['hypothesis']\n",
229
+ " label = item['label']\n",
230
+ "\n",
231
+ " encoded_premise = self.tokenizer(\n",
232
+ " premise,\n",
233
+ " add_special_tokens=True,\n",
234
+ " max_length=self.max_len,\n",
235
+ " padding='max_length',\n",
236
+ " return_attention_mask=True,\n",
237
+ " truncation=True\n",
238
+ " )\n",
239
+ "\n",
240
+ " encoded_hypothesis = self.tokenizer(\n",
241
+ " hypothesis,\n",
242
+ " add_special_tokens=True,\n",
243
+ " max_length=self.max_len,\n",
244
+ " padding='max_length',\n",
245
+ " return_attention_mask=True,\n",
246
+ " truncation=True\n",
247
+ " )\n",
248
+ "\n",
249
+ " return {\n",
250
+ " 'premise_input_ids': torch.tensor(encoded_premise['input_ids'], dtype=torch.long),\n",
251
+ " 'premise_segment_ids': torch.tensor(encoded_premise['token_type_ids'], dtype=torch.long),\n",
252
+ " 'hypothesis_input_ids': torch.tensor(encoded_hypothesis['input_ids'], dtype=torch.long),\n",
253
+ " 'hypothesis_segment_ids': torch.tensor(encoded_hypothesis['token_type_ids'], dtype=torch.long),\n",
254
+ " 'label': torch.tensor(label, dtype=torch.long)\n",
255
+ " }\n",
256
+ "\n",
257
+ "train_loader = DataLoader(NLIDataset(train_dataset, tokenizer), batch_size=16, shuffle=True)\n",
258
+ "test_loader = DataLoader(NLIDataset(val_dataset, tokenizer), batch_size=16, shuffle=False)"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": 12,
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# S-BERT Model\n",
268
+ "class SBERT(nn.Module):\n",
269
+ " def __init__(self, bert_model):\n",
270
+ " super(SBERT, self).__init__()\n",
271
+ " self.bert = bert_model\n",
272
+ " self.classifier = nn.Linear(d_model * 3, 3)\n",
273
+ "\n",
274
+ " def forward(self, premise_ids, premise_seg, hypothesis_ids, hypothesis_seg):\n",
275
+ " device = premise_ids.device\n",
276
+ " dummy_masked_pos = torch.zeros((premise_ids.size(0), 1), dtype=torch.long).to(device)\n",
277
+ " \n",
278
+ " _, _, output_u = self.bert(premise_ids, premise_seg, dummy_masked_pos)\n",
279
+ " mask_u = (premise_ids != 0).unsqueeze(-1).float()\n",
280
+ " u = torch.sum(output_u * mask_u, dim=1) / torch.clamp(mask_u.sum(dim=1), min=1e-9)\n",
281
+ "\n",
282
+ " _, _, output_v = self.bert(hypothesis_ids, hypothesis_seg, dummy_masked_pos)\n",
283
+ " mask_v = (hypothesis_ids != 0).unsqueeze(-1).float()\n",
284
+ " v = torch.sum(output_v * mask_v, dim=1) / torch.clamp(mask_v.sum(dim=1), min=1e-9)\n",
285
+ "\n",
286
+ " uv_abs = torch.abs(u - v)\n",
287
+ " features = torch.cat([u, v, uv_abs], dim=-1)\n",
288
+ " logits = self.classifier(features)\n",
289
+ " return logits\n",
290
+ "\n",
291
+ "sbert = SBERT(bert).to(device)\n",
292
+ "optimizer = optim.Adam(sbert.parameters(), lr=2e-5)\n",
293
+ "criterion = nn.CrossEntropyLoss()"
294
+ ]
295
+ },
296
+ {
297
+ "cell_type": "code",
298
+ "execution_count": 13,
299
+ "metadata": {},
300
+ "outputs": [
301
+ {
302
+ "name": "stdout",
303
+ "output_type": "stream",
304
+ "text": [
305
+ "Starting Training...\n",
306
+ "Epoch 1 Loss: 1.0874\n",
307
+ "Epoch 2 Loss: 1.0579\n",
308
+ "Epoch 3 Loss: 1.0135\n",
309
+ "Epoch 4 Loss: 0.9806\n",
310
+ "Epoch 5 Loss: 0.9516\n",
311
+ "Epoch 6 Loss: 0.9264\n",
312
+ "Epoch 7 Loss: 0.8967\n",
313
+ "Epoch 8 Loss: 0.8670\n",
314
+ "Epoch 9 Loss: 0.8326\n",
315
+ "Epoch 10 Loss: 0.8005\n",
316
+ "Epoch 11 Loss: 0.7601\n",
317
+ "Epoch 12 Loss: 0.7218\n",
318
+ "Epoch 13 Loss: 0.6797\n",
319
+ "Epoch 14 Loss: 0.6349\n",
320
+ "Epoch 15 Loss: 0.5906\n",
321
+ "Epoch 16 Loss: 0.5450\n",
322
+ "Epoch 17 Loss: 0.4974\n",
323
+ "Epoch 18 Loss: 0.4493\n",
324
+ "Epoch 19 Loss: 0.4065\n",
325
+ "Epoch 20 Loss: 0.3590\n",
326
+ "Epoch 21 Loss: 0.3150\n",
327
+ "Epoch 22 Loss: 0.2712\n",
328
+ "Epoch 23 Loss: 0.2321\n",
329
+ "Epoch 24 Loss: 0.2020\n",
330
+ "Epoch 25 Loss: 0.1673\n",
331
+ "Epoch 26 Loss: 0.1349\n",
332
+ "Epoch 27 Loss: 0.1136\n",
333
+ "Epoch 28 Loss: 0.0965\n",
334
+ "Epoch 29 Loss: 0.0876\n",
335
+ "Epoch 30 Loss: 0.0723\n",
336
+ "Epoch 31 Loss: 0.0656\n",
337
+ "Epoch 32 Loss: 0.0504\n",
338
+ "Epoch 33 Loss: 0.0416\n",
339
+ "Epoch 34 Loss: 0.0418\n",
340
+ "Epoch 35 Loss: 0.0327\n",
341
+ "Epoch 36 Loss: 0.0324\n",
342
+ "Epoch 37 Loss: 0.0269\n",
343
+ "Epoch 38 Loss: 0.0438\n",
344
+ "Epoch 39 Loss: 0.0291\n",
345
+ "Epoch 40 Loss: 0.0210\n",
346
+ "Epoch 41 Loss: 0.0168\n",
347
+ "Epoch 42 Loss: 0.0309\n",
348
+ "Epoch 43 Loss: 0.0180\n",
349
+ "Epoch 44 Loss: 0.0327\n",
350
+ "Epoch 45 Loss: 0.0411\n",
351
+ "Epoch 46 Loss: 0.0157\n",
352
+ "Epoch 47 Loss: 0.0048\n",
353
+ "Epoch 48 Loss: 0.0019\n",
354
+ "Epoch 49 Loss: 0.0013\n",
355
+ "Epoch 50 Loss: 0.0010\n",
356
+ "Epoch 51 Loss: 0.0008\n",
357
+ "Epoch 52 Loss: 0.0007\n",
358
+ "Epoch 53 Loss: 0.0005\n",
359
+ "Epoch 54 Loss: 0.0004\n",
360
+ "Epoch 55 Loss: 0.0003\n",
361
+ "Epoch 56 Loss: 0.0002\n",
362
+ "Epoch 57 Loss: 0.0002\n",
363
+ "Epoch 58 Loss: 0.0001\n",
364
+ "Epoch 59 Loss: 0.0001\n",
365
+ "Epoch 60 Loss: 0.0001\n",
366
+ "Epoch 61 Loss: 0.0000\n",
367
+ "Epoch 62 Loss: 0.0000\n",
368
+ "Epoch 63 Loss: 0.0000\n",
369
+ "Epoch 64 Loss: 0.0000\n",
370
+ "Epoch 65 Loss: 0.0000\n",
371
+ "Epoch 66 Loss: 0.1953\n",
372
+ "Epoch 67 Loss: 0.0272\n",
373
+ "Epoch 68 Loss: 0.0120\n",
374
+ "Epoch 69 Loss: 0.0108\n",
375
+ "Epoch 70 Loss: 0.0152\n",
376
+ "Epoch 71 Loss: 0.0337\n",
377
+ "Epoch 72 Loss: 0.0215\n",
378
+ "Epoch 73 Loss: 0.0148\n",
379
+ "Epoch 74 Loss: 0.0207\n",
380
+ "Epoch 75 Loss: 0.0238\n",
381
+ "Epoch 76 Loss: 0.0181\n",
382
+ "Epoch 77 Loss: 0.0217\n",
383
+ "Epoch 78 Loss: 0.0136\n",
384
+ "Epoch 79 Loss: 0.0163\n",
385
+ "Epoch 80 Loss: 0.0067\n",
386
+ "Epoch 81 Loss: 0.0007\n",
387
+ "Epoch 82 Loss: 0.0003\n",
388
+ "Epoch 83 Loss: 0.0002\n",
389
+ "Epoch 84 Loss: 0.0002\n",
390
+ "Epoch 85 Loss: 0.0001\n",
391
+ "Epoch 86 Loss: 0.0001\n",
392
+ "Epoch 87 Loss: 0.0001\n",
393
+ "Epoch 88 Loss: 0.0001\n",
394
+ "Epoch 89 Loss: 0.0001\n",
395
+ "Epoch 90 Loss: 0.0000\n",
396
+ "Epoch 91 Loss: 0.0000\n",
397
+ "Epoch 92 Loss: 0.0000\n",
398
+ "Epoch 93 Loss: 0.0000\n",
399
+ "Epoch 94 Loss: 0.0000\n",
400
+ "Epoch 95 Loss: 0.0000\n",
401
+ "Epoch 96 Loss: 0.0000\n",
402
+ "Epoch 97 Loss: 0.0000\n",
403
+ "Epoch 98 Loss: 0.0000\n",
404
+ "Epoch 99 Loss: 0.0000\n",
405
+ "Epoch 100 Loss: 0.0000\n",
406
+ "Done!\n"
407
+ ]
408
+ }
409
+ ],
410
+ "source": [
411
+ "# Train Loop\n",
412
+ "print(\"Starting Training...\")\n",
413
+ "epochs = 100 \n",
414
+ "for epoch in range(epochs):\n",
415
+ " sbert.train()\n",
416
+ " total_loss = 0\n",
417
+ " for batch in train_loader:\n",
418
+ " p_ids = batch['premise_input_ids'].to(device)\n",
419
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
420
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
421
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
422
+ " labels = batch['label'].to(device)\n",
423
+ "\n",
424
+ " optimizer.zero_grad()\n",
425
+ " logits = sbert(p_ids, p_seg, h_ids, h_seg)\n",
426
+ " loss = criterion(logits, labels)\n",
427
+ " loss.backward()\n",
428
+ " optimizer.step()\n",
429
+ " total_loss += loss.item()\n",
430
+ " print(f\"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}\")\n",
431
+ "\n",
432
+ "print(\"Done!\")\n",
433
+ "torch.save(sbert.state_dict(), f'./models/sbert_{DATASET_NAME}.pt')"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "markdown",
438
+ "metadata": {},
439
+ "source": [
440
+ "## 4. Evaluation\n",
441
+ "\n",
442
+ "Evaluate on validation set (matched)."
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 14,
448
+ "metadata": {},
449
+ "outputs": [
450
+ {
451
+ "name": "stdout",
452
+ "output_type": "stream",
453
+ "text": [
454
+ "Evaluating...\n",
455
+ "Classification Report:\n",
456
+ " precision recall f1-score support\n",
457
+ "\n",
458
+ " Entailment 0.42 0.45 0.43 341\n",
459
+ " Neutral 0.42 0.34 0.37 319\n",
460
+ "Contradiction 0.49 0.54 0.51 340\n",
461
+ "\n",
462
+ " accuracy 0.44 1000\n",
463
+ " macro avg 0.44 0.44 0.44 1000\n",
464
+ " weighted avg 0.44 0.44 0.44 1000\n",
465
+ "\n",
466
+ "Accuracy: 0.4440\n"
467
+ ]
468
+ }
469
+ ],
470
+ "source": [
471
+ "sbert.eval()\n",
472
+ "all_preds = []\n",
473
+ "all_labels = []\n",
474
+ "\n",
475
+ "print(\"Evaluating...\")\n",
476
+ "with torch.no_grad():\n",
477
+ " for batch in test_loader:\n",
478
+ " p_ids = batch['premise_input_ids'].to(device)\n",
479
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
480
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
481
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
482
+ " labels = batch['label'].to(device)\n",
483
+ "\n",
484
+ " logits = sbert(p_ids, p_seg, h_ids, h_seg)\n",
485
+ " preds = torch.argmax(logits, dim=1)\n",
486
+ " \n",
487
+ " all_preds.extend(preds.cpu().numpy())\n",
488
+ " all_labels.extend(labels.cpu().numpy())\n",
489
+ "\n",
490
+ "target_names = ['Entailment', 'Neutral', 'Contradiction']\n",
491
+ "print(\"Classification Report:\")\n",
492
+ "print(classification_report(all_labels, all_preds, labels=[0, 1, 2], target_names=target_names))\n",
493
+ "print(f\"Accuracy: {accuracy_score(all_labels, all_preds):.4f}\")"
494
+ ]
495
+ }
496
+ ],
497
+ "metadata": {
498
+ "kernelspec": {
499
+ "display_name": "Python 3",
500
+ "language": "python",
501
+ "name": "python3"
502
+ },
503
+ "language_info": {
504
+ "codemirror_mode": {
505
+ "name": "ipython",
506
+ "version": 3
507
+ },
508
+ "file_extension": ".py",
509
+ "mimetype": "text/x-python",
510
+ "name": "python",
511
+ "nbconvert_exporter": "python",
512
+ "pygments_lexer": "ipython3",
513
+ "version": "3.8.5"
514
+ }
515
+ },
516
+ "nbformat": 4,
517
+ "nbformat_minor": 4
518
+ }
A4_Option_SNLI.ipynb ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# A4: S-BERT Training on Alternative Datasets (SNLI)\n",
8
+ "\n",
9
+ "This notebook allows me to train the S-BERT model on the **SNLI** (Stanford Natural Language Inference) dataset.\n",
10
+ "\n",
11
+ "## 1. Environment Setup"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 9,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Using device: mps\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "import os\n",
29
+ "import torch\n",
30
+ "import torch.nn as nn\n",
31
+ "import torch.optim as optim\n",
32
+ "import numpy as np\n",
33
+ "from datasets import load_dataset\n",
34
+ "from transformers import BertTokenizer\n",
35
+ "from torch.utils.data import DataLoader, Dataset\n",
36
+ "from sklearn.metrics import classification_report, accuracy_score\n",
37
+ "\n",
38
+ "# Device Configuration\n",
39
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else (\"mps\" if torch.backends.mps.is_available() else \"cpu\"))\n",
40
+ "print(f\"Using device: {device}\")"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "## 2. Load Pre-trained BERT\n",
48
+ "\n",
49
+ "I will load the BERT model trained in `A4_BERT.ipynb`. Ensure `models/bert_trained.pt` exists."
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 10,
55
+ "metadata": {},
56
+ "outputs": [
57
+ {
58
+ "name": "stdout",
59
+ "output_type": "stream",
60
+ "text": [
61
+ "Loaded bert_trained.pt\n"
62
+ ]
63
+ }
64
+ ],
65
+ "source": [
66
+ "# Define BERT Architecture\n",
67
+ "# MUST MATCH THE OPTIMIZED CONFIG FROM A4_BERT.ipynb\n",
68
+ "vocab_size = 5004 # Updated from 30522\n",
69
+ "d_model = 256 # MiniBERT Config\n",
70
+ "n_layers = 2 # Updated from 4\n",
71
+ "n_heads = 4\n",
72
+ "d_ff = 256 * 4\n",
73
+ "max_len = 128\n",
74
+ "n_segments = 2\n",
75
+ "d_k = d_v = 64\n",
76
+ "\n",
77
+ "class Embedding(nn.Module):\n",
78
+ " def __init__(self):\n",
79
+ " super(Embedding, self).__init__()\n",
80
+ " self.tok_embed = nn.Embedding(vocab_size, d_model)\n",
81
+ " self.pos_embed = nn.Embedding(max_len, d_model)\n",
82
+ " self.seg_embed = nn.Embedding(n_segments, d_model)\n",
83
+ " self.norm = nn.LayerNorm(d_model)\n",
84
+ "\n",
85
+ " def forward(self, x, seg):\n",
86
+ " seq_len = x.size(1)\n",
87
+ " pos = torch.arange(seq_len, dtype=torch.long, device=x.device)\n",
88
+ " pos = pos.unsqueeze(0).expand_as(x)\n",
89
+ " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
90
+ " return self.norm(embedding)\n",
91
+ "\n",
92
+ "class MultiHeadAttention(nn.Module):\n",
93
+ " def __init__(self):\n",
94
+ " super(MultiHeadAttention, self).__init__()\n",
95
+ " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
96
+ " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
97
+ " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
98
+ " self.linear = nn.Linear(n_heads * d_v, d_model)\n",
99
+ " self.layer_norm = nn.LayerNorm(d_model)\n",
100
+ "\n",
101
+ " def forward(self, Q, K, V, attn_mask):\n",
102
+ " batch_size = Q.size(0)\n",
103
+ " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
104
+ " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)\n",
105
+ " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)\n",
106
+ " \n",
107
+ " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n",
108
+ " \n",
109
+ " scores = torch.matmul(q_s, k_s.transpose(-1, -2)) / np.sqrt(d_k)\n",
110
+ " scores.masked_fill_(attn_mask, -1e9)\n",
111
+ " attn = nn.Softmax(dim=-1)(scores)\n",
112
+ " context = torch.matmul(attn, v_s)\n",
113
+ " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)\n",
114
+ " output = self.linear(context)\n",
115
+ " return self.layer_norm(output + Q), attn\n",
116
+ "\n",
117
+ "class PoswiseFeedForwardNet(nn.Module):\n",
118
+ " def __init__(self):\n",
119
+ " super(PoswiseFeedForwardNet, self).__init__()\n",
120
+ " self.fc1 = nn.Linear(d_model, d_ff)\n",
121
+ " self.fc2 = nn.Linear(d_ff, d_model)\n",
122
+ "\n",
123
+ " def forward(self, x):\n",
124
+ " return self.fc2(torch.nn.functional.gelu(self.fc1(x)))\n",
125
+ "\n",
126
+ "class EncoderLayer(nn.Module):\n",
127
+ " def __init__(self):\n",
128
+ " super(EncoderLayer, self).__init__()\n",
129
+ " self.enc_self_attn = MultiHeadAttention()\n",
130
+ " self.pos_ffn = PoswiseFeedForwardNet()\n",
131
+ "\n",
132
+ " def forward(self, enc_inputs, enc_self_attn_mask):\n",
133
+ " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)\n",
134
+ " enc_outputs = self.pos_ffn(enc_outputs)\n",
135
+ " return enc_outputs, attn\n",
136
+ "\n",
137
+ "def get_attn_pad_mask(seq_q, seq_k):\n",
138
+ " batch_size, len_q = seq_q.size()\n",
139
+ " batch_size, len_k = seq_k.size()\n",
140
+ " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)\n",
141
+ " return pad_attn_mask.expand(batch_size, len_q, len_k)\n",
142
+ "\n",
143
+ "class BERT(nn.Module):\n",
144
+ " def __init__(self):\n",
145
+ " super(BERT, self).__init__()\n",
146
+ " self.embedding = Embedding()\n",
147
+ " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
148
+ " self.fc = nn.Linear(d_model, d_model)\n",
149
+ " self.activ = nn.Tanh()\n",
150
+ " self.linear = nn.Linear(d_model, d_model)\n",
151
+ " self.norm = nn.LayerNorm(d_model)\n",
152
+ " self.classifier = nn.Linear(d_model, 2)\n",
153
+ " embed_weight = self.embedding.tok_embed.weight\n",
154
+ " n_vocab, n_dim = embed_weight.size()\n",
155
+ " self.decoder = nn.Linear(n_dim, n_vocab, bias=False)\n",
156
+ " self.decoder.weight = embed_weight\n",
157
+ " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
158
+ "\n",
159
+ " def forward(self, input_ids, segment_ids, masked_pos=None):\n",
160
+ " output = self.embedding(input_ids, segment_ids)\n",
161
+ " enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
162
+ " for layer in self.layers:\n",
163
+ " output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
164
+ " return None, None, output \n",
165
+ "\n",
166
+ "# Load Pretrained Weights\n",
167
+ "bert = BERT().to(device)\n",
168
+ "try:\n",
169
+ " bert.load_state_dict(torch.load('./models/bert_trained.pt', map_location=device))\n",
170
+ " print(\"Loaded bert_trained.pt\")\n",
171
+ "except:\n",
172
+ " print(\"Warning: bert_trained.pt not found. Using random weights.\")\n"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "markdown",
177
+ "metadata": {},
178
+ "source": [
179
+ "## 3. Load SNLI Dataset\n"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 11,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "name": "stdout",
189
+ "output_type": "stream",
190
+ "text": [
191
+ "Loading snli...\n"
192
+ ]
193
+ },
194
+ {
195
+ "name": "stderr",
196
+ "output_type": "stream",
197
+ "text": [
198
+ "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
199
+ ]
200
+ },
201
+ {
202
+ "name": "stdout",
203
+ "output_type": "stream",
204
+ "text": [
205
+ "Loaded dataset keys: dict_keys(['test', 'validation', 'train'])\n",
206
+ "Train size: 9988, Test size: 988\n"
207
+ ]
208
+ }
209
+ ],
210
+ "source": [
211
+ "DATASET_NAME = 'snli'\n",
212
+ "print(f\"Loading {DATASET_NAME}...\")\n",
213
+ "dataset = load_dataset(DATASET_NAME)\n",
214
+ "print(f\"Loaded dataset keys: {dataset.keys()}\")\n",
215
+ "\n",
216
+ "train_dataset = dataset['train'].select(range(10000))\n",
217
+ "test_dataset = dataset['test'].select(range(1000))\n",
218
+ "\n",
219
+ "# Filter undefined labels\n",
220
+ "train_dataset = train_dataset.filter(lambda x: x['label'] != -1)\n",
221
+ "test_dataset = test_dataset.filter(lambda x: x['label'] != -1)\n",
222
+ "\n",
223
+ "print(f\"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}\")"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 12,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "# Data Loader\n",
233
+ "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
234
+ "\n",
235
+ "class NLIDataset(Dataset):\n",
236
+ " def __init__(self, dataset, tokenizer, max_len=128):\n",
237
+ " self.dataset = dataset\n",
238
+ " self.tokenizer = tokenizer\n",
239
+ " self.max_len = max_len\n",
240
+ "\n",
241
+ " def __len__(self):\n",
242
+ " return len(self.dataset)\n",
243
+ "\n",
244
+ " def __getitem__(self, idx):\n",
245
+ " item = self.dataset[idx]\n",
246
+ " premise = item['premise']\n",
247
+ " hypothesis = item['hypothesis']\n",
248
+ " label = item['label']\n",
249
+ "\n",
250
+ " encoded_premise = self.tokenizer(\n",
251
+ " premise,\n",
252
+ " add_special_tokens=True,\n",
253
+ " max_length=self.max_len,\n",
254
+ " padding='max_length',\n",
255
+ " return_attention_mask=True,\n",
256
+ " truncation=True\n",
257
+ " )\n",
258
+ "\n",
259
+ " encoded_hypothesis = self.tokenizer(\n",
260
+ " hypothesis,\n",
261
+ " add_special_tokens=True,\n",
262
+ " max_length=self.max_len,\n",
263
+ " padding='max_length',\n",
264
+ " return_attention_mask=True,\n",
265
+ " truncation=True\n",
266
+ " )\n",
267
+ "\n",
268
+ " return {\n",
269
+ " 'premise_input_ids': torch.tensor(encoded_premise['input_ids'], dtype=torch.long),\n",
270
+ " 'premise_segment_ids': torch.tensor(encoded_premise['token_type_ids'], dtype=torch.long),\n",
271
+ " 'hypothesis_input_ids': torch.tensor(encoded_hypothesis['input_ids'], dtype=torch.long),\n",
272
+ " 'hypothesis_segment_ids': torch.tensor(encoded_hypothesis['token_type_ids'], dtype=torch.long),\n",
273
+ " 'label': torch.tensor(label, dtype=torch.long)\n",
274
+ " }\n",
275
+ "\n",
276
+ "train_loader = DataLoader(NLIDataset(train_dataset, tokenizer), batch_size=16, shuffle=True)\n",
277
+ "test_loader = DataLoader(NLIDataset(test_dataset, tokenizer), batch_size=16, shuffle=False)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 13,
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "# S-BERT Model\n",
287
+ "class SBERT(nn.Module):\n",
288
+ " def __init__(self, bert_model):\n",
289
+ " super(SBERT, self).__init__()\n",
290
+ " self.bert = bert_model\n",
291
+ " self.classifier = nn.Linear(d_model * 3, 3)\n",
292
+ "\n",
293
+ " def forward(self, premise_ids, premise_seg, hypothesis_ids, hypothesis_seg):\n",
294
+ " device = premise_ids.device\n",
295
+ " dummy_masked_pos = torch.zeros((premise_ids.size(0), 1), dtype=torch.long).to(device)\n",
296
+ " \n",
297
+ " _, _, output_u = self.bert(premise_ids, premise_seg, dummy_masked_pos)\n",
298
+ " mask_u = (premise_ids != 0).unsqueeze(-1).float()\n",
299
+ " u = torch.sum(output_u * mask_u, dim=1) / torch.clamp(mask_u.sum(dim=1), min=1e-9)\n",
300
+ "\n",
301
+ " _, _, output_v = self.bert(hypothesis_ids, hypothesis_seg, dummy_masked_pos)\n",
302
+ " mask_v = (hypothesis_ids != 0).unsqueeze(-1).float()\n",
303
+ " v = torch.sum(output_v * mask_v, dim=1) / torch.clamp(mask_v.sum(dim=1), min=1e-9)\n",
304
+ "\n",
305
+ " uv_abs = torch.abs(u - v)\n",
306
+ " features = torch.cat([u, v, uv_abs], dim=-1)\n",
307
+ " logits = self.classifier(features)\n",
308
+ " return logits\n",
309
+ "\n",
310
+ "sbert = SBERT(bert).to(device)\n",
311
+ "optimizer = optim.Adam(sbert.parameters(), lr=2e-5)\n",
312
+ "criterion = nn.CrossEntropyLoss()"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 14,
318
+ "metadata": {},
319
+ "outputs": [
320
+ {
321
+ "name": "stdout",
322
+ "output_type": "stream",
323
+ "text": [
324
+ "Starting Training...\n",
325
+ "Epoch 1 Loss: 1.0723\n",
326
+ "Epoch 2 Loss: 1.0245\n",
327
+ "Epoch 3 Loss: 0.9913\n",
328
+ "Epoch 4 Loss: 0.9702\n",
329
+ "Epoch 5 Loss: 0.9487\n",
330
+ "Epoch 6 Loss: 0.9257\n",
331
+ "Epoch 7 Loss: 0.9018\n",
332
+ "Epoch 8 Loss: 0.8794\n",
333
+ "Epoch 9 Loss: 0.8573\n",
334
+ "Epoch 10 Loss: 0.8355\n",
335
+ "Epoch 11 Loss: 0.8129\n",
336
+ "Epoch 12 Loss: 0.7907\n",
337
+ "Epoch 13 Loss: 0.7694\n",
338
+ "Epoch 14 Loss: 0.7470\n",
339
+ "Epoch 15 Loss: 0.7227\n",
340
+ "Epoch 16 Loss: 0.7023\n",
341
+ "Epoch 17 Loss: 0.6780\n",
342
+ "Epoch 18 Loss: 0.6569\n",
343
+ "Epoch 19 Loss: 0.6336\n",
344
+ "Epoch 20 Loss: 0.6084\n",
345
+ "Epoch 21 Loss: 0.5883\n",
346
+ "Epoch 22 Loss: 0.5596\n",
347
+ "Epoch 23 Loss: 0.5356\n",
348
+ "Epoch 24 Loss: 0.5116\n",
349
+ "Epoch 25 Loss: 0.4880\n",
350
+ "Epoch 26 Loss: 0.4623\n",
351
+ "Epoch 27 Loss: 0.4392\n",
352
+ "Epoch 28 Loss: 0.4161\n",
353
+ "Epoch 29 Loss: 0.3903\n",
354
+ "Epoch 30 Loss: 0.3692\n",
355
+ "Epoch 31 Loss: 0.3509\n",
356
+ "Epoch 32 Loss: 0.3258\n",
357
+ "Epoch 33 Loss: 0.3048\n",
358
+ "Epoch 34 Loss: 0.2834\n",
359
+ "Epoch 35 Loss: 0.2664\n",
360
+ "Epoch 36 Loss: 0.2493\n",
361
+ "Epoch 37 Loss: 0.2327\n",
362
+ "Epoch 38 Loss: 0.2145\n",
363
+ "Epoch 39 Loss: 0.2049\n",
364
+ "Epoch 40 Loss: 0.1845\n",
365
+ "Epoch 41 Loss: 0.1687\n",
366
+ "Epoch 42 Loss: 0.1627\n",
367
+ "Epoch 43 Loss: 0.1548\n",
368
+ "Epoch 44 Loss: 0.1367\n",
369
+ "Epoch 45 Loss: 0.1268\n",
370
+ "Epoch 46 Loss: 0.1315\n",
371
+ "Epoch 47 Loss: 0.1230\n",
372
+ "Epoch 48 Loss: 0.1051\n",
373
+ "Epoch 49 Loss: 0.0964\n",
374
+ "Epoch 50 Loss: 0.1027\n",
375
+ "Epoch 51 Loss: 0.0983\n",
376
+ "Epoch 52 Loss: 0.0781\n",
377
+ "Epoch 53 Loss: 0.0795\n",
378
+ "Epoch 54 Loss: 0.0860\n",
379
+ "Epoch 55 Loss: 0.0800\n",
380
+ "Epoch 56 Loss: 0.0620\n",
381
+ "Epoch 57 Loss: 0.0905\n",
382
+ "Epoch 58 Loss: 0.0567\n",
383
+ "Epoch 59 Loss: 0.0568\n",
384
+ "Epoch 60 Loss: 0.0502\n",
385
+ "Epoch 61 Loss: 0.0808\n",
386
+ "Epoch 62 Loss: 0.0622\n",
387
+ "Epoch 63 Loss: 0.0445\n",
388
+ "Epoch 64 Loss: 0.0536\n",
389
+ "Epoch 65 Loss: 0.0564\n",
390
+ "Epoch 66 Loss: 0.0542\n",
391
+ "Epoch 67 Loss: 0.0537\n",
392
+ "Epoch 68 Loss: 0.0419\n",
393
+ "Epoch 69 Loss: 0.0648\n",
394
+ "Epoch 70 Loss: 0.0496\n",
395
+ "Epoch 71 Loss: 0.0510\n",
396
+ "Epoch 72 Loss: 0.0470\n",
397
+ "Epoch 73 Loss: 0.0446\n",
398
+ "Epoch 74 Loss: 0.0359\n",
399
+ "Epoch 75 Loss: 0.0533\n",
400
+ "Epoch 76 Loss: 0.0611\n",
401
+ "Epoch 77 Loss: 0.0368\n",
402
+ "Epoch 78 Loss: 0.0291\n",
403
+ "Epoch 79 Loss: 0.0321\n",
404
+ "Epoch 80 Loss: 0.0757\n",
405
+ "Epoch 81 Loss: 0.0546\n",
406
+ "Epoch 82 Loss: 0.0300\n",
407
+ "Epoch 83 Loss: 0.0279\n",
408
+ "Epoch 84 Loss: 0.0294\n",
409
+ "Epoch 85 Loss: 0.0542\n",
410
+ "Epoch 86 Loss: 0.0422\n",
411
+ "Epoch 87 Loss: 0.0353\n",
412
+ "Epoch 88 Loss: 0.0537\n",
413
+ "Epoch 89 Loss: 0.0300\n",
414
+ "Epoch 90 Loss: 0.0295\n",
415
+ "Epoch 91 Loss: 0.0422\n",
416
+ "Epoch 92 Loss: 0.0403\n",
417
+ "Epoch 93 Loss: 0.0225\n",
418
+ "Epoch 94 Loss: 0.0335\n",
419
+ "Epoch 95 Loss: 0.0457\n",
420
+ "Epoch 96 Loss: 0.0307\n",
421
+ "Epoch 97 Loss: 0.0253\n",
422
+ "Epoch 98 Loss: 0.0543\n",
423
+ "Epoch 99 Loss: 0.0302\n",
424
+ "Epoch 100 Loss: 0.0237\n",
425
+ "Epoch 101 Loss: 0.0344\n",
426
+ "Epoch 102 Loss: 0.0417\n",
427
+ "Epoch 103 Loss: 0.0227\n",
428
+ "Epoch 104 Loss: 0.0267\n",
429
+ "Epoch 105 Loss: 0.0431\n",
430
+ "Epoch 106 Loss: 0.0263\n",
431
+ "Epoch 107 Loss: 0.0442\n",
432
+ "Epoch 108 Loss: 0.0300\n",
433
+ "Epoch 109 Loss: 0.0215\n",
434
+ "Epoch 110 Loss: 0.0262\n",
435
+ "Epoch 111 Loss: 0.0485\n",
436
+ "Epoch 112 Loss: 0.0253\n",
437
+ "Epoch 113 Loss: 0.0202\n",
438
+ "Epoch 114 Loss: 0.0226\n",
439
+ "Epoch 115 Loss: 0.0355\n",
440
+ "Epoch 116 Loss: 0.0534\n",
441
+ "Epoch 117 Loss: 0.0210\n",
442
+ "Epoch 118 Loss: 0.0173\n",
443
+ "Epoch 119 Loss: 0.0315\n",
444
+ "Epoch 120 Loss: 0.0457\n",
445
+ "Epoch 121 Loss: 0.0209\n",
446
+ "Epoch 122 Loss: 0.0226\n",
447
+ "Epoch 123 Loss: 0.0325\n",
448
+ "Epoch 124 Loss: 0.0320\n",
449
+ "Epoch 125 Loss: 0.0269\n",
450
+ "Epoch 126 Loss: 0.0212\n",
451
+ "Epoch 127 Loss: 0.0213\n",
452
+ "Epoch 128 Loss: 0.0313\n",
453
+ "Epoch 129 Loss: 0.0376\n",
454
+ "Epoch 130 Loss: 0.0284\n",
455
+ "Epoch 131 Loss: 0.0177\n",
456
+ "Epoch 132 Loss: 0.0172\n",
457
+ "Epoch 133 Loss: 0.0234\n",
458
+ "Epoch 134 Loss: 0.0442\n",
459
+ "Epoch 135 Loss: 0.0222\n",
460
+ "Epoch 136 Loss: 0.0293\n",
461
+ "Epoch 137 Loss: 0.0258\n",
462
+ "Epoch 138 Loss: 0.0260\n",
463
+ "Epoch 139 Loss: 0.0220\n",
464
+ "Epoch 140 Loss: 0.0167\n",
465
+ "Epoch 141 Loss: 0.0395\n",
466
+ "Epoch 142 Loss: 0.0265\n",
467
+ "Epoch 143 Loss: 0.0179\n",
468
+ "Epoch 144 Loss: 0.0195\n",
469
+ "Epoch 145 Loss: 0.0318\n",
470
+ "Epoch 146 Loss: 0.0224\n",
471
+ "Epoch 147 Loss: 0.0160\n",
472
+ "Epoch 148 Loss: 0.0215\n",
473
+ "Epoch 149 Loss: 0.0491\n",
474
+ "Epoch 150 Loss: 0.0197\n",
475
+ "Epoch 151 Loss: 0.0203\n",
476
+ "Epoch 152 Loss: 0.0238\n",
477
+ "Epoch 153 Loss: 0.0260\n",
478
+ "Epoch 154 Loss: 0.0178\n",
479
+ "Epoch 155 Loss: 0.0156\n",
480
+ "Epoch 156 Loss: 0.0171\n",
481
+ "Epoch 157 Loss: 0.0243\n",
482
+ "Epoch 158 Loss: 0.0403\n",
483
+ "Epoch 159 Loss: 0.0180\n",
484
+ "Epoch 160 Loss: 0.0172\n",
485
+ "Epoch 161 Loss: 0.0198\n",
486
+ "Epoch 162 Loss: 0.0336\n",
487
+ "Epoch 163 Loss: 0.0222\n",
488
+ "Epoch 164 Loss: 0.0155\n",
489
+ "Epoch 165 Loss: 0.0193\n",
490
+ "Epoch 166 Loss: 0.0239\n",
491
+ "Epoch 167 Loss: 0.0183\n",
492
+ "Epoch 168 Loss: 0.0160\n",
493
+ "Epoch 169 Loss: 0.0182\n",
494
+ "Epoch 170 Loss: 0.0389\n",
495
+ "Epoch 171 Loss: 0.0229\n",
496
+ "Epoch 172 Loss: 0.0171\n",
497
+ "Epoch 173 Loss: 0.0162\n",
498
+ "Epoch 174 Loss: 0.0206\n",
499
+ "Epoch 175 Loss: 0.0159\n",
500
+ "Epoch 176 Loss: 0.0158\n",
501
+ "Epoch 177 Loss: 0.0361\n",
502
+ "Epoch 178 Loss: 0.0346\n",
503
+ "Epoch 179 Loss: 0.0183\n",
504
+ "Epoch 180 Loss: 0.0163\n",
505
+ "Epoch 181 Loss: 0.0132\n",
506
+ "Epoch 182 Loss: 0.0162\n",
507
+ "Epoch 183 Loss: 0.0160\n",
508
+ "Epoch 184 Loss: 0.0348\n",
509
+ "Epoch 185 Loss: 0.0271\n",
510
+ "Epoch 186 Loss: 0.0168\n",
511
+ "Epoch 187 Loss: 0.0129\n",
512
+ "Epoch 188 Loss: 0.0138\n",
513
+ "Epoch 189 Loss: 0.0161\n",
514
+ "Epoch 190 Loss: 0.0218\n",
515
+ "Epoch 191 Loss: 0.0254\n",
516
+ "Epoch 192 Loss: 0.0254\n",
517
+ "Epoch 193 Loss: 0.0127\n",
518
+ "Epoch 194 Loss: 0.0126\n",
519
+ "Epoch 195 Loss: 0.0151\n",
520
+ "Epoch 196 Loss: 0.0197\n",
521
+ "Epoch 197 Loss: 0.0283\n",
522
+ "Epoch 198 Loss: 0.0157\n",
523
+ "Epoch 199 Loss: 0.0134\n",
524
+ "Epoch 200 Loss: 0.0165\n",
525
+ "Done!\n"
526
+ ]
527
+ }
528
+ ],
529
+ "source": [
530
+ "# Train Loop\n",
531
+ "print(\"Starting Training...\")\n",
532
+ "epochs = 200\n",
533
+ "for epoch in range(epochs):\n",
534
+ " sbert.train()\n",
535
+ " total_loss = 0\n",
536
+ " for batch in train_loader:\n",
537
+ " p_ids = batch['premise_input_ids'].to(device)\n",
538
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
539
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
540
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
541
+ " labels = batch['label'].to(device)\n",
542
+ "\n",
543
+ " optimizer.zero_grad()\n",
544
+ " logits = sbert(p_ids, p_seg, h_ids, h_seg)\n",
545
+ " loss = criterion(logits, labels)\n",
546
+ " loss.backward()\n",
547
+ " optimizer.step()\n",
548
+ " total_loss += loss.item()\n",
549
+ " print(f\"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}\")\n",
550
+ "\n",
551
+ "print(\"Done!\")\n",
552
+ "torch.save(sbert.state_dict(), f'./models/sbert_{DATASET_NAME}.pt')"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "markdown",
557
+ "metadata": {},
558
+ "source": [
559
+ "## 4. Evaluation\n",
560
+ "\n",
561
+ "Evaluate on the held-out test set."
562
+ ]
563
+ },
564
+ {
565
+ "cell_type": "code",
566
+ "execution_count": 15,
567
+ "metadata": {},
568
+ "outputs": [
569
+ {
570
+ "name": "stdout",
571
+ "output_type": "stream",
572
+ "text": [
573
+ "Evaluating...\n",
574
+ "Classification Report:\n",
575
+ " precision recall f1-score support\n",
576
+ "\n",
577
+ " Entailment 0.53 0.56 0.54 339\n",
578
+ " Neutral 0.51 0.44 0.47 324\n",
579
+ "Contradiction 0.45 0.49 0.47 325\n",
580
+ "\n",
581
+ " accuracy 0.49 988\n",
582
+ " macro avg 0.50 0.49 0.49 988\n",
583
+ " weighted avg 0.50 0.49 0.49 988\n",
584
+ "\n",
585
+ "Accuracy: 0.4949\n"
586
+ ]
587
+ }
588
+ ],
589
+ "source": [
590
+ "sbert.eval()\n",
591
+ "all_preds = []\n",
592
+ "all_labels = []\n",
593
+ "\n",
594
+ "print(\"Evaluating...\")\n",
595
+ "with torch.no_grad():\n",
596
+ " for batch in test_loader:\n",
597
+ " p_ids = batch['premise_input_ids'].to(device)\n",
598
+ " p_seg = batch['premise_segment_ids'].to(device)\n",
599
+ " h_ids = batch['hypothesis_input_ids'].to(device)\n",
600
+ " h_seg = batch['hypothesis_segment_ids'].to(device)\n",
601
+ " labels = batch['label'].to(device)\n",
602
+ "\n",
603
+ " logits = sbert(p_ids, p_seg, h_ids, h_seg)\n",
604
+ " preds = torch.argmax(logits, dim=1)\n",
605
+ " \n",
606
+ " all_preds.extend(preds.cpu().numpy())\n",
607
+ " all_labels.extend(labels.cpu().numpy())\n",
608
+ "\n",
609
+ "target_names = ['Entailment', 'Neutral', 'Contradiction']\n",
610
+ "print(\"Classification Report:\")\n",
611
+ "print(classification_report(all_labels, all_preds, labels=[0, 1, 2], target_names=target_names))\n",
612
+ "print(f\"Accuracy: {accuracy_score(all_labels, all_preds):.4f}\")"
613
+ ]
614
+ }
615
+ ],
616
+ "metadata": {
617
+ "kernelspec": {
618
+ "display_name": "Python 3",
619
+ "language": "python",
620
+ "name": "python3"
621
+ },
622
+ "language_info": {
623
+ "codemirror_mode": {
624
+ "name": "ipython",
625
+ "version": 3
626
+ },
627
+ "file_extension": ".py",
628
+ "mimetype": "text/x-python",
629
+ "name": "python",
630
+ "nbconvert_exporter": "python",
631
+ "pygments_lexer": "ipython3",
632
+ "version": "3.8.5"
633
+ }
634
+ },
635
+ "nbformat": 4,
636
+ "nbformat_minor": 4
637
+ }
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+
3
+ WORKDIR /code
4
+
5
+ COPY ./requirements.txt /code/requirements.txt
6
+
7
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
+
9
+ COPY . .
10
+
11
+ CMD ["gunicorn", "-b", "0.0.0.0:7860", "app.app:app"]
README.md CHANGED
@@ -1,10 +1,87 @@
1
  ---
2
- title: A4 NLI App
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: docker
7
- pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: NLI Text Similarity App
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: docker
7
+ app_port: 8000
8
  ---
9
 
10
+ # NLI Text Similarity App (A4 Assignment)
11
+
12
+ **Name:** HTUT KO KO
13
+ **ID:** st126010
14
+
15
+ I implemented a Mini-BERT model from scratch and fine-tuned it as a Sentence-BERT (S-BERT) model for Natural Language Inference (NLI) tasks. This project includes a modern web application for real-time similarity analysis.
16
+
17
+ ## Project Structure
18
+
19
+ - `A4_BERT.ipynb`: Task 1 - I pre-trained BERT from scratch on WikiText-103.
20
+ - `A4_Climate_FEVER.ipynb`: Task 2 - I fine-tuned S-BERT on the Climate-FEVER dataset.
21
+ - `A4_Option_SNLI.ipynb`: Alternative training notebook where I trained on the SNLI dataset.
22
+ - `A4_Option_MNLI.ipynb`: Focused notebook where I trained on the MNLI dataset.
23
+ - `app/`: My Flask web application components.
24
+ - `models/`: Saved model weights (`bert_trained.pt`, `sbert_climate_fever.pt`, `sbert_snli.pt`, `sbert_mnli.pt`).
25
+
26
+ ## Final Results
27
+
28
+ I trained the models on three different datasets. Here are the results I achieved:
29
+
30
+ | Dataset | Epochs | Accuracy | Loss |
31
+ | :--- | :--- | :--- | :--- |
32
+ | **Climate-FEVER** | 200 | **50.5%** | 0.0000 |
33
+ | **SNLI** | 200 | **50.8%** | ~0.59 |
34
+ | **MNLI** | 100 | **41.5%** | 0.0000 |
35
+
36
+ ### Detailed Climate-FEVER Metrics
37
+
38
+ | Class | Precision | Recall | F1-Score |
39
+ | :--- | :--- | :--- | :--- |
40
+ | Entailment | 0.33 | 0.28 | 0.30 |
41
+ | Neutral | 0.62 | 0.68 | 0.65 |
42
+ | Contradiction | 0.40 | 0.55 | 0.46 |
43
+
44
+ ## Limitations & Analysis
45
+
46
+ ### 1. Vocabulary Size
47
+ I limited the vocabulary size to **5004** (compared to standard BERT's 30,522) to ensure the model could be trained efficiently on the smaller WikiText-103 subset. While this improved convergence for this assignment, it restricts the model's ability to understand rare words outside this vocabulary.
48
+
49
+ ### 2. Tokenizer Mismatch
50
+ A challenge I encountered was using the standard `BertTokenizer` with my custom Mini-BERT. The tokenizer produces IDs > 5004, which caused `IndexError` in the web app. I resolved this by implementing a clamping mechanism in `app.py` to map unknown tokens to the `[UNK]` ID.
51
+
52
+ ### 3. Model Depth
53
+ I used a "Mini-BERT" configuration (`n_layers=2`, `d_model=256`) instead of the base (`n_layers=12`, `d_model=768`). This trade-off significantly reduced training time but naturally limits the model's capacity to capture complex linguistic nuances compared to the full BERT-Base.
54
+
55
+ ## Demonstration
56
+
57
+ ![WebUI](demo.gif)
58
+
59
+ ## How to Run
60
+
61
+ ### 1. Setup Environment
62
+
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ ### 2. Run the Web App
68
+
69
+ ```bash
70
+ python app/app.py
71
+ ```
72
+
73
+ Access the app at `http://127.0.0.1:8000`.
74
+
75
+ ## Features
76
+
77
+ - **Modern UI**: I designed a Glassmorphism theme with a dynamic background.
78
+ - **Multi-Model Support**: Users can select between Climate-FEVER, SNLI, or MNLI trained models.
79
+ - **Explainable AI**: The app displays the probability distribution for each prediction.
80
+
81
+ ## References
82
+
83
+ 1. **BERT / WikiText-103**: Merity, S., Xiong, C., Bradbury, J., & Socher, R. (2016). *Pointer Sentinel Mixture Models*.
84
+ 2. **S-BERT (Sentence-BERT)**: Reimers, N., & Gurevych, I. (2019). *Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks*.
85
+ 3. **Climate-FEVER**: Diggelmann, T., Boyd-Graber, J., Bulian, J., Ciaramita, M., & Leippold, M. (2020). *CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims*.
86
+ 4. **SNLI**: Bowman, S. R., Angeli, G., Potts, C., & Manning, C. D. (2015). *A large annotated corpus for learning natural language inference*.
87
+ 5. **MNLI**: Williams, A., Nangia, N., & Bowman, S. R. (2018). *A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference*.
app/app.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from flask import Flask, render_template, request, jsonify
5
+ from transformers import BertTokenizer
6
+ import os
7
+ import math
8
+ import numpy as np
9
+
10
+ app = Flask(__name__)
11
+
12
+ # --- Configuration ---
13
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+ MODEL_PATH = "../models/sbert_climate_fever.pt" # Path relative to app/ directory execution usually
15
+ # But we will run from project root or handle paths carefully
16
+ # Let's assume running from project root: python app/app.py
17
+ # Then path is models/sbert_climate_fever.pt
18
+ MODEL_PATH_REL = "models/sbert_climate_fever.pt"
19
+
20
+ # --- Model Definitions (Must match training Code) ---
21
+ # Copied from A4_Solution.ipynb
22
+
23
+ n_layers = 2
24
+ n_heads = 4
25
+ d_model = 256
26
+ d_ff = 256 * 4
27
+ d_k = d_v = 64
28
+ n_segments = 2
29
+ max_len = 128
30
+ vocab_size = 5004 # Custom vocab size from training
31
+
32
+ class Embedding(nn.Module):
33
+ def __init__(self):
34
+ super(Embedding, self).__init__()
35
+ self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding
36
+ self.pos_embed = nn.Embedding(max_len, d_model) # position embedding
37
+ self.seg_embed = nn.Embedding(n_segments, d_model) # segment(token type) embedding
38
+ self.norm = nn.LayerNorm(d_model)
39
+ # Initialize weights to avoid large initial loss
40
+ self.tok_embed.weight.data.normal_(0, 0.1)
41
+ self.pos_embed.weight.data.normal_(0, 0.1)
42
+ self.seg_embed.weight.data.normal_(0, 0.1)
43
+
44
+ def forward(self, x, seg):
45
+ seq_len = x.size(1)
46
+ pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
47
+ pos = pos.unsqueeze(0).expand_as(x) # (len,) -> (bs, len)
48
+ embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
49
+ return self.norm(embedding)
50
+
51
+ def get_attn_pad_mask(seq_q, seq_k):
52
+ batch_size, len_q = seq_q.size()
53
+ batch_size, len_k = seq_k.size()
54
+ # eq(zero) is PAD token
55
+ pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), one is masking
56
+ return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k
57
+
58
+ class ScaledDotProductAttention(nn.Module):
59
+ def __init__(self):
60
+ super(ScaledDotProductAttention, self).__init__()
61
+
62
+ def forward(self, Q, K, V, attn_mask):
63
+ scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
64
+ scores.masked_fill_(attn_mask, -1e9) # Fills elements of self tensor with value where mask is one.
65
+ attn = nn.Softmax(dim=-1)(scores)
66
+ context = torch.matmul(attn, V)
67
+ return context, attn
68
+
69
+ class MultiHeadAttention(nn.Module):
70
+ def __init__(self):
71
+ super(MultiHeadAttention, self).__init__()
72
+ self.W_Q = nn.Linear(d_model, d_k * n_heads)
73
+ self.W_K = nn.Linear(d_model, d_k * n_heads)
74
+ self.W_V = nn.Linear(d_model, d_v * n_heads)
75
+ self.linear = nn.Linear(n_heads * d_v, d_model) # Defined in init
76
+ self.layer_norm = nn.LayerNorm(d_model) # Defined in init
77
+
78
+ def forward(self, Q, K, V, attn_mask):
79
+ # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
80
+ residual, batch_size = Q, Q.size(0)
81
+ # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
82
+ q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]
83
+ k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]
84
+ v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]
85
+
86
+ attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]
87
+
88
+ # context: [batch_size x n_heads x len_q x d_v], attn: [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
89
+ context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
90
+ context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
91
+ output = self.linear(context)
92
+ return self.layer_norm(output + residual), attn # output: [batch_size x len_q x d_model]
93
+
94
+ class PoswiseFeedForwardNet(nn.Module):
95
+ def __init__(self):
96
+ super(PoswiseFeedForwardNet, self).__init__()
97
+ self.fc1 = nn.Linear(d_model, d_ff)
98
+ self.fc2 = nn.Linear(d_ff, d_model)
99
+
100
+ def forward(self, x):
101
+ # (batch_size, len_seq, d_model) -> (batch_size, len_seq, d_ff) -> (batch_size, len_seq, d_model)
102
+ return self.fc2(F.gelu(self.fc1(x)))
103
+
104
+ class EncoderLayer(nn.Module):
105
+ def __init__(self):
106
+ super(EncoderLayer, self).__init__()
107
+ self.enc_self_attn = MultiHeadAttention()
108
+ self.pos_ffn = PoswiseFeedForwardNet()
109
+
110
+ def forward(self, enc_inputs, enc_self_attn_mask):
111
+ enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
112
+ enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
113
+ return enc_outputs, attn
114
+
115
+ class BERT(nn.Module):
116
+ def __init__(self):
117
+ super(BERT, self).__init__()
118
+ self.embedding = Embedding()
119
+ self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])
120
+ self.fc = nn.Linear(d_model, d_model)
121
+ self.activ = nn.Tanh()
122
+ self.linear = nn.Linear(d_model, d_model)
123
+ self.norm = nn.LayerNorm(d_model)
124
+
125
+ self.classifier = nn.Linear(d_model, 2)
126
+ # decoder is shared with embedding layer
127
+ embed_weight = self.embedding.tok_embed.weight
128
+ n_vocab, n_dim = embed_weight.size()
129
+ self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
130
+ self.decoder.weight = embed_weight
131
+ self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))
132
+
133
+ def forward(self, input_ids, segment_ids, masked_pos=None):
134
+ # NOTE: masked_pos is optional here because for S-BERT we only need 'output'
135
+ # But to be consistent with NLI/Notebook forward pass, we handle it if provided
136
+ # or just run through.
137
+
138
+ output = self.embedding(input_ids, segment_ids)
139
+ enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
140
+ for layer in self.layers:
141
+ output, enc_self_attn = layer(output, enc_self_attn_mask)
142
+ # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
143
+
144
+ # 1. predict next sentence
145
+ # it will be decided by first token(CLS)
146
+ h_pooled = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
147
+ logits_nsp = self.classifier(h_pooled) # [batch_size, 2]
148
+
149
+ # 2. predict the masked token
150
+ if masked_pos is not None:
151
+ masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
152
+ h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
153
+ h_masked = self.norm(F.gelu(self.linear(h_masked)))
154
+ logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]
155
+ return logits_lm, logits_nsp, output
156
+ else:
157
+ return None, logits_nsp, output # S-BERT inference only needs output
158
+
159
+ class SBERT(nn.Module):
160
+ def __init__(self, bert_model):
161
+ super(SBERT, self).__init__()
162
+ self.bert = bert_model
163
+ # 3 * d_model because we concat u, v, |u-v|
164
+ self.classifier = nn.Linear(d_model * 3, 3)
165
+
166
+ def forward(self, premise_ids, premise_seg, hypothesis_ids, hypothesis_seg):
167
+ # Make dummy masked_pos for BERT forward (it's not used for encoding really, but input requires it)
168
+ # Creating a dummy masked_pos of shape [batch_size, 1] filled with 0
169
+ dummy_masked_pos = torch.zeros((premise_ids.size(0), 1), dtype=torch.long).to(premise_ids.device)
170
+
171
+ # Encode Premise (u)
172
+ _, _, output_u = self.bert(premise_ids, premise_seg, dummy_masked_pos)
173
+ # Mean Pooling
174
+ mask_u = (premise_ids != 0).unsqueeze(-1).float() # [batch, len, 1]
175
+ u = torch.sum(output_u * mask_u, dim=1) / torch.clamp(mask_u.sum(dim=1), min=1e-9)
176
+
177
+ # Encode Hypothesis (v)
178
+ _, _, output_v = self.bert(hypothesis_ids, hypothesis_seg, dummy_masked_pos)
179
+ mask_v = (hypothesis_ids != 0).unsqueeze(-1).float()
180
+ v = torch.sum(output_v * mask_v, dim=1) / torch.clamp(mask_v.sum(dim=1), min=1e-9)
181
+
182
+ # Classifier: concatenate u, v, |u-v|
183
+ uv_abs = torch.abs(u - v)
184
+ features = torch.cat([u, v, uv_abs], dim=-1)
185
+
186
+ logits = self.classifier(features)
187
+ return logits, u, v # returning u, v for cosine sim later if needed
188
+
189
+ # --- Model Management ---
190
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
191
+ models = {}
192
+ MODEL_FILES = {
193
+ 'climate_fever': 'models/sbert_climate_fever.pt',
194
+ 'snli': 'models/sbert_snli.pt',
195
+ 'mnli': 'models/sbert_mnli.pt'
196
+ }
197
+
198
+ def get_model(model_name):
199
+ # Handle custom input scenario
200
+ if model_name == 'custom':
201
+ model_name = 'climate_fever'
202
+
203
+ # Load model on demand or return cached
204
+ if model_name in models:
205
+ return models[model_name]
206
+
207
+ # Check if model name is known
208
+ if model_name not in MODEL_FILES:
209
+ print(f"Warning: Unknown model name '{model_name}'.")
210
+ return None
211
+
212
+ rel_path = MODEL_FILES[model_name]
213
+ path = f"../{rel_path}"
214
+
215
+ if not os.path.exists(path):
216
+ # Fallback to local path if running from app folder
217
+ path = rel_path
218
+
219
+ if not os.path.exists(path):
220
+ print(f"Model file not found at {path}")
221
+ return None
222
+
223
+ print(f"Loading {model_name} from {path}...")
224
+ try:
225
+ bert = BERT()
226
+ model = SBERT(bert)
227
+ state_dict = torch.load(path, map_location=DEVICE)
228
+ model.load_state_dict(state_dict, strict=False)
229
+ model.to(DEVICE)
230
+ model.eval()
231
+ models[model_name] = model
232
+ return model
233
+ except Exception as e:
234
+ print(f"Failed to load {model_name}: {e}")
235
+ return None
236
+
237
+ # Pre-load default
238
+ get_model('climate_fever')
239
+
240
+ @app.route('/')
241
+ def home():
242
+ return render_template('index.html')
243
+
244
+ @app.route('/predict', methods=['POST'])
245
+ def predict():
246
+ data = request.json
247
+ sentence1 = data.get('sentence1', '')
248
+ sentence2 = data.get('sentence2', '')
249
+ model_type = data.get('model_type', 'climate_fever') # Default
250
+
251
+ if not sentence1 or not sentence2:
252
+ return jsonify({'error': 'Both sentences are required'}), 400
253
+
254
+ # Get specific model
255
+ model = get_model(model_type)
256
+
257
+ if model is None:
258
+ # Fallback to whatever is loaded or error
259
+ if models:
260
+ model = list(models.values())[0]
261
+ print(f"Warning: Requested {model_type} not found, using fallback.")
262
+ else:
263
+ return jsonify({'error': f'Model {model_type} not trained/found. Please train it first!'}), 404
264
+
265
+ # Tokenize
266
+ # Tokenize
267
+ inputs_a = tokenizer(sentence1, max_length=128, truncation=True, padding='max_length')
268
+ inputs_b = tokenizer(sentence2, max_length=128, truncation=True, padding='max_length')
269
+
270
+ p_ids = torch.tensor(inputs_a['input_ids']).unsqueeze(0).to(DEVICE)
271
+ p_seg = torch.tensor(inputs_a['token_type_ids']).unsqueeze(0).to(DEVICE)
272
+ h_ids = torch.tensor(inputs_b['input_ids']).unsqueeze(0).to(DEVICE)
273
+ h_seg = torch.tensor(inputs_b['token_type_ids']).unsqueeze(0).to(DEVICE)
274
+
275
+ # Clamp inputs to vocab size (handle OOV from standard tokenizer)
276
+ p_ids[p_ids >= vocab_size] = 1 # [UNK]
277
+ h_ids[h_ids >= vocab_size] = 1 # [UNK]
278
+
279
+ with torch.no_grad():
280
+ logits, u, v = model(p_ids, p_seg, h_ids, h_seg)
281
+ probs = F.softmax(logits, dim=1).cpu().numpy()[0]
282
+
283
+ # Labels: entailment, neutral, contradiction
284
+ # Note: SNLI/MNLI/Climate-FEVER generally follow Entailment(0), Neutral(1), Contradiction(2)
285
+ # BUT check the mapping in notebooks.
286
+ # Climate-Fever: 0:Supports(Entailment), 1:Refutes(Contradiction), 2:NEI(Neutral) -> Re-mapped in NB to 0, 2, 1?
287
+ # Let's check training NB mapping.
288
+ # In A4_Climate_FEVER.ipynb: label_map = {0: 0, 1: 2, 2: 1} -> 0:Entailment, 2:Contradiction, 1:Neutral
289
+ # In standard SNLI/MNLI: 0:Entailment, 1:Neutral, 2:Contradiction
290
+
291
+ # We need to map probs correctly based on model_type
292
+ if model_type == 'climate_fever':
293
+ # Trained with: 0=Entailment, 1=Neutral, 2=Contradiction (Based on my previous fix? Wait, check mapping in NB)
294
+ # NB: label_map = {0: 0, 1: 2, 2: 1} -> This means original 0->0, 1->2, 2->1.
295
+ # So Model Outputs: Class 0=Entailment, Class 1=Neutral, Class 2=Contradiction
296
+ # Wait, if map is {0:0, 1:2, 2:1}, then Evidence Label 0 (Supports) -> Class 0
297
+ # Evidence Label 1 (Refutes) -> Class 2
298
+ # Evidence Label 2 (NEI) -> Class 1
299
+ # So Class indices: 0=Entailment, 1=Neutral, 2=Contradiction.
300
+ labels = ['Entailment', 'Neutral', 'Contradiction']
301
+ else:
302
+ # SNLI/MNLI standard: 0=Entailment, 1=Neutral, 2=Contradiction
303
+ labels = ['Entailment', 'Neutral', 'Contradiction']
304
+
305
+ # Result dict
306
+ result = {label: float(prob) for label, prob in zip(labels, probs)}
307
+ prediction = labels[np.argmax(probs)]
308
+
309
+ return jsonify({
310
+ 'prediction': prediction,
311
+ 'probabilities': result,
312
+ 'used_model': model_type
313
+ })
314
+
315
+ if __name__ == '__main__':
316
+ app.run(debug=True, port=8000)
app/static/style.css ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ :root {
2
+ --primary: #6366f1;
3
+ --primary-hover: #4f46e5;
4
+ --bg-color: #0f172a;
5
+ --card-bg: rgba(30, 41, 59, 0.7);
6
+ --text-color: #f8fafc;
7
+ --text-muted: #94a3b8;
8
+ --border-color: rgba(255, 255, 255, 0.1);
9
+ }
10
+
11
+ * {
12
+ box-sizing: border-box;
13
+ margin: 0;
14
+ padding: 0;
15
+ }
16
+
17
+ body {
18
+ font-family: 'Inter', sans-serif;
19
+ background-color: var(--bg-color);
20
+ color: var(--text-color);
21
+ min-height: 100vh;
22
+ display: flex;
23
+ justify-content: center;
24
+ align-items: center;
25
+ overflow-x: hidden;
26
+ position: relative;
27
+ }
28
+
29
+ /* Ambient Background Effect */
30
+ .background-orb {
31
+ position: fixed;
32
+ top: -20%;
33
+ left: -10%;
34
+ width: 50vw;
35
+ height: 50vw;
36
+ background: radial-gradient(circle, rgba(99, 102, 241, 0.3) 0%, rgba(15, 23, 42, 0) 70%);
37
+ border-radius: 50%;
38
+ z-index: -1;
39
+ animation: float 10s infinite ease-in-out;
40
+ }
41
+
42
+ @keyframes float {
43
+ 0%, 100% { transform: translate(0, 0); }
44
+ 50% { transform: translate(20px, 30px); }
45
+ }
46
+
47
+ .container {
48
+ width: 100%;
49
+ max-width: 800px;
50
+ padding: 2rem;
51
+ }
52
+
53
+ header {
54
+ text-align: center;
55
+ margin-bottom: 3rem;
56
+ }
57
+
58
+ header h1 {
59
+ font-size: 2.5rem;
60
+ font-weight: 700;
61
+ background: linear-gradient(135deg, #818cf8, #c084fc);
62
+ -webkit-background-clip: text;
63
+ -webkit-text-fill-color: transparent;
64
+ margin-bottom: 0.5rem;
65
+ }
66
+
67
+ .subtitle {
68
+ color: var(--text-muted);
69
+ font-size: 1.1rem;
70
+ }
71
+
72
+ .explanation-box {
73
+ margin-top: 1.5rem;
74
+ background-color: rgba(51, 65, 85, 0.5);
75
+ border: 1px solid var(--border-color);
76
+ padding: 1rem;
77
+ border-radius: 0.75rem;
78
+ text-align: left;
79
+ font-size: 0.9rem;
80
+ color: var(--text-muted);
81
+ }
82
+ .explanation-box h3 {
83
+ color: var(--text-color);
84
+ margin-bottom: 0.5rem;
85
+ font-size: 1rem;
86
+ }
87
+
88
+ main {
89
+ background: var(--card-bg);
90
+ backdrop-filter: blur(12px);
91
+ -webkit-backdrop-filter: blur(12px);
92
+ border: 1px solid var(--border-color);
93
+ border-radius: 1.5rem;
94
+ padding: 2rem;
95
+ box-shadow: 0 25px 50px -12px rgba(0, 0, 0, 0.5);
96
+ }
97
+
98
+ .control-panel {
99
+ margin-bottom: 1.5rem;
100
+ }
101
+ .control-panel label {
102
+ display: block;
103
+ margin-bottom: 0.5rem;
104
+ color: var(--text-muted);
105
+ font-size: 0.9rem;
106
+ }
107
+ .control-panel select {
108
+ width: 100%;
109
+ padding: 0.75rem;
110
+ border-radius: 0.5rem;
111
+ border: 1px solid var(--border-color);
112
+ background-color: rgba(15, 23, 42, 0.5);
113
+ color: var(--text-color);
114
+ font-size: 1rem;
115
+ outline: none;
116
+ cursor: pointer;
117
+ transition: border-color 0.2s;
118
+ }
119
+ .control-panel select:focus {
120
+ border-color: var(--primary);
121
+ }
122
+
123
+ .input-group {
124
+ display: grid;
125
+ gap: 1.5rem;
126
+ margin-bottom: 2rem;
127
+ }
128
+
129
+ .input-card label {
130
+ display: block;
131
+ margin-bottom: 0.5rem;
132
+ font-weight: 600;
133
+ color: var(--text-muted);
134
+ }
135
+
136
+ .input-wrapper input {
137
+ width: 100%;
138
+ padding: 1rem;
139
+ border-radius: 0.75rem;
140
+ border: 1px solid var(--border-color);
141
+ background-color: rgba(15, 23, 42, 0.5);
142
+ color: var(--text-color);
143
+ font-size: 1rem;
144
+ transition: all 0.2s;
145
+ }
146
+
147
+ .input-wrapper input:focus {
148
+ outline: none;
149
+ border-color: var(--primary);
150
+ box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2);
151
+ }
152
+
153
+ button#analyze-btn {
154
+ width: 100%;
155
+ padding: 1rem;
156
+ border: none;
157
+ border-radius: 0.75rem;
158
+ background-color: var(--primary);
159
+ color: white;
160
+ font-size: 1.1rem;
161
+ font-weight: 600;
162
+ cursor: pointer;
163
+ transition: background-color 0.2s, transform 0.1s;
164
+ }
165
+
166
+ button#analyze-btn:hover {
167
+ background-color: var(--primary-hover);
168
+ }
169
+ button#analyze-btn:active {
170
+ transform: scale(0.98);
171
+ }
172
+
173
+ .result-card {
174
+ margin-top: 2rem;
175
+ padding-top: 2rem;
176
+ border-top: 1px solid var(--border-color);
177
+ animation: slideUp 0.3s ease-out;
178
+ }
179
+ .hidden {
180
+ display: none;
181
+ }
182
+ @keyframes slideUp {
183
+ from { opacity: 0; transform: translateY(10px); }
184
+ to { opacity: 1; transform: translateY(0); }
185
+ }
186
+
187
+ .prediction-header {
188
+ text-align: center;
189
+ font-size: 1.5rem;
190
+ font-weight: 700;
191
+ margin-bottom: 1.5rem;
192
+ }
193
+
194
+ .prob-bar {
195
+ display: flex;
196
+ align-items: center;
197
+ margin-bottom: 0.75rem;
198
+ gap: 1rem;
199
+ }
200
+
201
+ .prob-label {
202
+ width: 100px;
203
+ font-size: 0.9rem;
204
+ text-align: right;
205
+ color: var(--text-muted);
206
+ }
207
+
208
+ .bar-container {
209
+ flex-grow: 1;
210
+ height: 10px;
211
+ background-color: rgba(255, 255, 255, 0.1);
212
+ border-radius: 10px;
213
+ overflow: hidden;
214
+ }
215
+
216
+ .bar {
217
+ height: 100%;
218
+ background-color: var(--primary);
219
+ border-radius: 10px;
220
+ transition: width 0.5s ease-out;
221
+ }
222
+
223
+ .prob-val {
224
+ width: 50px;
225
+ font-size: 0.9rem;
226
+ font-weight: 600;
227
+ }
228
+
229
+ footer {
230
+ text-align: center;
231
+ margin-top: 3rem;
232
+ color: var(--text-muted);
233
+ font-size: 0.9rem;
234
+ }
app/templates/index.html ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>S-BERT Semantic Similarity Analysis</title>
    <!-- Stylesheet is served by Flask from the static folder -->
    <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap" rel="stylesheet">
</head>
<body>
    <div class="background-orb"></div>
    <div class="container">
        <header>
            <h1>Semantic Textual Similarity</h1>
            <p class="subtitle">Analyze the relationship between Scientific Claims and Evidence</p>

            <div class="explanation-box">
                <h3>What is this?</h3>
                <p>
                    This tool uses a <strong>Sentence-BERT (S-BERT)</strong> model trained on the <em>Climate-FEVER</em> dataset
                    to determine if a piece of <strong>Evidence</strong> supports, contradicts, or is neutral towards a specific <strong>Claim</strong>.
                    Select a dataset scenario below to populate examples, or type your own sentences to test the model's understanding of semantic logic.
                </p>
            </div>
        </header>

        <main>
            <!-- Scenario selector: option values double as the model_type sent to /predict -->
            <div class="control-panel">
                <label for="dataset-select">Choose a Dataset / Scenario:</label>
                <select id="dataset-select" onchange="loadScenario()">
                    <option value="climate_fever" selected>Climate-FEVER (Science/Climate Only)</option>
                    <option value="snli">SNLI (General Knowledge / Logic)</option>
                    <option value="mnli">MNLI (Multi-Genre)</option>
                    <option value="custom">Custom Input</option>
                </select>
                <p id="scenario-hint" style="font-size: 0.8rem; color: #94a3b8; margin-top: 0.2rem;">
                    <em>Tip: 'The Earth is flat' is general logic -> Use <strong>SNLI</strong>. Climate-FEVER is for climate-specific claims.</em>
                </p>
            </div>

            <div class="input-group">
                <div class="input-card">
                    <label for="sentence1">Claim / Sentence 1</label>
                    <div class="input-wrapper">
                        <input type="text" id="sentence1" list="claims-list" placeholder="Enter a claim or sentence...">
                        <datalist id="claims-list">
                            <!-- Populated by JS -->
                        </datalist>
                    </div>
                </div>

                <div class="input-card">
                    <label for="sentence2">Evidence / Sentence 2</label>
                    <div class="input-wrapper">
                        <input type="text" id="sentence2" list="evidence-list" placeholder="Enter evidence or sentence...">
                        <datalist id="evidence-list">
                            <!-- Populated by JS -->
                        </datalist>
                    </div>
                </div>
            </div>

            <button id="analyze-btn" onclick="predict()">Analyze Similarity</button>

            <!-- Hidden until a prediction arrives; bars/labels are filled in by predict() -->
            <div id="result" class="result-card hidden">
                <div class="prediction-header">
                    <span id="prediction-label">Entailment</span>
                </div>
                <div class="probabilities">
                    <div class="prob-bar">
                        <span class="prob-label">Entailment</span>
                        <div class="bar-container"><div class="bar" id="bar-entailment" style="width: 0%"></div></div>
                        <span class="prob-val" id="val-entailment">0%</span>
                    </div>
                    <div class="prob-bar">
                        <span class="prob-label">Neutral</span>
                        <div class="bar-container"><div class="bar" id="bar-neutral" style="width: 0%"></div></div>
                        <span class="prob-val" id="val-neutral">0%</span>
                    </div>
                    <div class="prob-bar">
                        <span class="prob-label">Contradiction</span>
                        <div class="bar-container"><div class="bar" id="bar-contradiction" style="width: 0%"></div></div>
                        <span class="prob-val" id="val-contradiction">0%</span>
                    </div>
                </div>
            </div>
        </main>

        <footer>
            <p>Developed by <strong>Htut Ko Ko (st126010)</strong> | A4 Assignment</p>
        </footer>
    </div>
93
+
94
+ <script>
95
// Example claim/evidence pairs for each dataset the demo supports.
// Keys match the <option> values of #dataset-select; the "custom"
// option deliberately has no entry (it means free-form input).
const scenarios = {
    climate_fever: {
        claims: [
            "Global warming is caused by human activities.",
            "Sea levels are rising due to melting ice caps.",
            "The sun is the primary driver of recent climate change."
        ],
        evidence: [
            "The IPCC report confirms that human influence has warmed the atmosphere, ocean and land.",
            "Satellite data shows a steady increase in global sea levels over the past century.",
            "Solar irradiance has remained relatively stable while temperatures have soared."
        ]
    },
    snli: {
        claims: [
            "A soccer player is running across the field.",
            "A person is inspecting the tires of a bicycle.",
            "Two men are playing basketball."
        ],
        evidence: [
            "A person is moving fast on a grass surface.",
            "A mechanic is fixing a car.",
            "The men are playing a sport."
        ]
    },
    mnli: {
        claims: [
            "The government announced a new tax policy.",
            "He turned and looked at the woman.",
            "The concert was cancelled due to rain."
        ],
        evidence: [
            "New financial regulations were introduced by the state.",
            "He ignored the person standing next to him.",
            "The outdoor event proceeded despite the bad weather."
        ]
    }
};
133
+
134
/**
 * Sync the example datalists (and the two text inputs) with the
 * scenario currently chosen in the #dataset-select dropdown.
 * Choosing "custom" simply empties everything for free-form input.
 */
function loadScenario() {
    const key = document.getElementById('dataset-select').value;
    const claimsList = document.getElementById('claims-list');
    const evidenceList = document.getElementById('evidence-list');
    const claimInput = document.getElementById('sentence1');
    const evidenceInput = document.getElementById('sentence2');

    // Start from a clean slate before repopulating.
    claimsList.innerHTML = '';
    evidenceList.innerHTML = '';

    if (key === 'custom') {
        claimInput.value = '';
        evidenceInput.value = '';
        return;
    }

    const { claims, evidence } = scenarios[key];

    // Build <option> children for a datalist from an array of strings.
    const fill = (list, items) => {
        for (const text of items) {
            const option = document.createElement('option');
            option.value = text;
            list.appendChild(option);
        }
    };
    fill(claimsList, claims);
    fill(evidenceList, evidence);

    // Pre-select the first pair so the user can analyze immediately.
    claimInput.value = claims[0];
    evidenceInput.value = evidence[0];
}
170
+
171
/**
 * Send the two sentences (plus the selected model/scenario) to the
 * /predict endpoint and render the returned label and per-class
 * probabilities. The button is disabled for the duration of the
 * request so rapid clicks cannot fire overlapping analyses.
 */
async function predict() {
    // Trim so whitespace-only input is rejected by the emptiness check.
    const s1 = document.getElementById('sentence1').value.trim();
    const s2 = document.getElementById('sentence2').value.trim();
    // NOTE(review): when "Custom Input" is selected this still sends
    // model_type = "custom" — confirm the backend maps it to a model.
    const modelType = document.getElementById('dataset-select').value;
    const resultDiv = document.getElementById('result');
    const btn = document.getElementById('analyze-btn');

    if (!s1 || !s2) {
        alert("Please enter both sentences.");
        return;
    }

    btn.disabled = true; // guard against double submission
    btn.textContent = "Analyzing...";
    resultDiv.classList.add('hidden');

    try {
        const response = await fetch('/predict', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                sentence1: s1,
                sentence2: s2,
                model_type: modelType // tells the server which checkpoint to use
            })
        });

        const data = await response.json();

        // Server reports validation/model errors via an "error" field.
        if (data.error) {
            alert(data.error);
            return;
        }

        // Headline label, colour-coded by predicted class.
        const label = document.getElementById('prediction-label');
        label.textContent = data.prediction;
        if (data.prediction === 'Entailment') label.style.color = '#10B981';        // green
        else if (data.prediction === 'Contradiction') label.style.color = '#EF4444'; // red
        else label.style.color = '#F59E0B';                                          // amber (Neutral)

        // One bar + percentage readout per class.
        const setBar = (name, p) => {
            const pct = p * 100;
            document.getElementById('bar-' + name).style.width = pct + '%';
            document.getElementById('val-' + name).textContent = pct.toFixed(1) + '%';
        };
        setBar('entailment', data.probabilities.Entailment);
        setBar('neutral', data.probabilities.Neutral);
        setBar('contradiction', data.probabilities.Contradiction);

        resultDiv.classList.remove('hidden');

    } catch (e) {
        // Network failure or non-JSON response body.
        console.error(e);
        alert("Error connecting to the server.");
    } finally {
        btn.disabled = false;
        btn.textContent = "Analyze Similarity";
    }
}
232
+
233
// Populate the example lists for the default scenario on first load.
window.onload = loadScenario;
235
+ </script>
236
+ </body>
237
+ </html>
demo.gif ADDED

Git LFS Details

  • SHA256: 959470bc03c4e9f662c1d28dfda8583981edb722294ca5b67f4175f6567d7212
  • Pointer size: 132 Bytes
  • Size of remote file: 1.99 MB
models/bert_trained.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bd87012116d7955bc8b4fddbe591810bc70692ed1df20d3394e58d08bdbc58a
3
+ size 12138238
models/sbert_climate_fever.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfd76cef3670cbd791bc83a0bf7292cb6975ae8b190ffb02e8feb92b350e33bb
3
+ size 12148818
models/sbert_mnli.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a5397be33fe31a270905838e93ddf94a05c2f1e54f09c13c5a453b45f580fc5
3
+ size 12148386
models/sbert_snli.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7c3557c784c924a487f55a6de15d8c127599aca0cfa74a1e6655b9af7b063c2
3
+ size 12148386
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers
3
+ datasets
4
+ scikit-learn
5
+ pandas
6
+ numpy
7
+ flask
8
+ gunicorn
9
+ tqdm