# %%
# ╔══════════════════════════════════════════════════════════════╗
# ║  CELL 1 — Install packages                                   ║
# ║  RUN THIS CELL ALONE FIRST — it will auto-restart runtime    ║
# ╚══════════════════════════════════════════════════════════════╝
import os
import subprocess
import sys


def pip(*pkgs):
    """Quietly install ``pkgs`` with pip into the interpreter running this kernel.

    Args:
        *pkgs: pip requirement specifiers (e.g. ``"tqdm>=4.66.0"``).
            Calling with no packages is a no-op instead of a failing
            ``pip install`` with no requirements.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    if not pkgs:
        return
    # sys.executable guarantees the packages land in the kernel's environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *pkgs])


# No pinned numpy/pandas — let Colab use its pre-installed compatible versions.
# Pinning old numpy/pandas causes the 'mtrand ABI mismatch' ValueError.
#
# `datasets` IS pinned: this notebook was executed with datasets==2.21.0, which
# used to be installed by a separate cell run out of order. Pinning it here
# makes "Restart & Run All" reproduce the same environment.
# NOTE(review): presumably 2.21.0 is needed because the data helpers load
# script-based datasets with trust_remote_code=True — confirm before bumping.
pip(
    "datasets==2.21.0",
    "transformers>=4.40.0",
    "sentence-transformers>=2.7.0",
    "scikit-learn>=1.4.0",
    "tqdm>=4.66.0",
    "accelerate>=0.26.0",
    "evaluate",
)

print("✅ Packages installed — restarting runtime now …")
# Flush first: the process is killed immediately and buffered output would be lost.
sys.stdout.flush()
os.kill(os.getpid(), 9)   # signal 9 = SIGKILL; auto-restart, Colab reconnects in ~5 s

# %%
# ╔══════════════════════════════════════════════════════════════╗
# ║  CELL 2 — Mount Drive + GPU check                            ║
# ║  Run AFTER the runtime has restarted                         ║
# ╚══════════════════════════════════════════════════════════════╝
from google.colab import drive
drive.mount("/content/drive")

import torch
print("CUDA available:", torch.cuda.is_available())
print("Device name: ", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
# Fail fast: the rest of the notebook is written for a GPU runtime.
assert torch.cuda.is_available(), "❌ No GPU — set Runtime type to T4 GPU!"
DEVICE = 0   # presumably the CUDA device index used by later cells — verify against usage
# %%
# ╔══════════════════════════════════════════════════════════════╗
# ║  CELL 3 — Config                                             ║
# ╚══════════════════════════════════════════════════════════════╝
import os

CNLI_SIZE  = 6820    # full ContractNLI train split
MNLI_SIZE  = 50000   # pool; genre filter keeps ~8-10k government rows
SYNTH_SIZE = 1000    # clauses sampled for synthetic contradictions (pairs produced may be fewer)

BASE_MODEL = "typeform/distilbert-base-uncased-mnli"
OUTPUT_DIR = "/content/drive/MyDrive/Athernex/nli_contract_model_final"
EPOCHS     = 5
BATCH_SIZE = 32      # T4 handles 32 at max_length=128
LR         = 2e-5
MAX_LEN    = 128

# NOTE(review): this id order may differ from the label order of BASE_MODEL's
# pretrained MNLI head — confirm against the checkpoint's config.
LABEL2ID = {"entailment": 0, "contradiction": 1, "neutral": 2}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

# Only create the output directory when its Drive parent really exists.
# Creating it under an unmounted /content/drive would write to the VM's local
# disk instead of Drive (and presumably makes a later drive.mount() refuse the
# non-empty mountpoint — confirm).
if OUTPUT_DIR.startswith("/content/drive/") and not os.path.isdir("/content/drive/MyDrive"):
    print("⚠️ Google Drive is not mounted at /content/drive — run CELL 2 first; "
          f"output directory NOT created: {OUTPUT_DIR}")
else:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"✅ Config ready | output → {OUTPUT_DIR}")

# %%
# ╔══════════════════════════════════════════════════════════════╗
# ║  CELL 4 — Data loading helpers                               ║
# ╚══════════════════════════════════════════════════════════════╝
import random
import re

import pandas as pd

# Column layout shared by every frame the helpers below return.
_PAIR_COLUMNS = ["clause1", "clause2", "label"]
# Clauses shorter than this (after cleaning) are discarded.
_MIN_CLAUSE_CHARS = 20


def clean_clause(text: str) -> str:
    """Normalise a clause to single-spaced, ASCII-only text.

    Whitespace (including Unicode whitespace such as non-breaking spaces) is
    collapsed to single spaces, non-ASCII characters are removed, and the
    result is re-collapsed so that a removed character sitting between two
    spaces cannot leave a double space or a dangling edge space behind.

    Args:
        text: raw clause text.

    Returns:
        The cleaned clause (possibly empty).
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Second pass: deleting e.g. an em dash from "a — b" leaves "a  b".
    return re.sub(r' {2,}', ' ', text).strip()


def load_contract_nli(split: str = "train", size: int = CNLI_SIZE):
    """Load ContractNLI — kiddothe2b/contract-nli, subset contractnli_a.

    Args:
        split: dataset split name.
        size: number of leading rows to request; a falsy value (0/None)
            requests the whole split.

    Returns:
        Whatever ``datasets.load_dataset`` returns for the requested slice.
    """
    # Imported lazily so the pure-Python helpers in this cell do not need
    # the `datasets` package just to be defined.
    from datasets import load_dataset

    slice_str = f"{split}[:{size}]" if size else split
    return load_dataset(
        "kiddothe2b/contract-nli", "contractnli_a",
        split=slice_str, trust_remote_code=True
    )


def _pairs_to_frame(dataset, label_map: dict) -> pd.DataFrame:
    """Convert premise/hypothesis samples into a clause1/clause2/label frame.

    Rows are skipped when either text is missing/empty, when the integer label
    is not in ``label_map`` (instead of being silently relabelled), or when
    either cleaned text is shorter than ``_MIN_CLAUSE_CHARS``.
    """
    records = []
    for sample in dataset:
        premise, hypothesis = sample["premise"], sample["hypothesis"]
        if not premise or not hypothesis:
            continue
        label = label_map.get(sample["label"])
        if label is None:
            # Unknown/unlabelled row: guessing a class would inject label noise.
            continue
        premise, hypothesis = clean_clause(premise), clean_clause(hypothesis)
        if len(premise) < _MIN_CLAUSE_CHARS or len(hypothesis) < _MIN_CLAUSE_CHARS:
            continue
        records.append({"clause1": premise, "clause2": hypothesis, "label": label})
    # Explicit columns keep the schema stable even when no row survives.
    return pd.DataFrame(records, columns=_PAIR_COLUMNS)


def process_contract_nli(dataset) -> pd.DataFrame:
    """Turn ContractNLI samples into a clause-pair frame.

    kiddothe2b schema: 0=contradiction, 1=entailment, 2=neutral.

    Args:
        dataset: iterable of mappings with "premise", "hypothesis", "label".

    Returns:
        DataFrame with columns clause1, clause2, label (string label names).
    """
    label_map = {0: "contradiction", 1: "entailment", 2: "neutral"}
    return _pairs_to_frame(dataset, label_map)


def load_mnli_government(split: str = "train", size: int = MNLI_SIZE):
    """Load MultiNLI and keep only rows whose genre is "government".

    Args:
        split: split name; "validation" is mapped to "validation_matched".
        size: number of leading rows to request *before* the genre filter;
            a falsy value requests the whole split.

    Returns:
        The filtered dataset.
    """
    from datasets import load_dataset

    if split == "validation":
        split = "validation_matched"
    slice_str = f"{split}[:{size}]" if size else split
    ds = load_dataset("nyu-mll/multi_nli", split=slice_str, trust_remote_code=True)
    return ds.filter(lambda x: x["genre"] == "government")


def process_mnli_government(dataset) -> pd.DataFrame:
    """Turn MultiNLI samples into a clause-pair frame.

    MultiNLI schema: 0=entailment, 1=neutral, 2=contradiction.

    Args:
        dataset: iterable of mappings with "premise", "hypothesis", "label".

    Returns:
        DataFrame with columns clause1, clause2, label (string label names).
    """
    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
    return _pairs_to_frame(dataset, label_map)


# Affirmative term -> negated form, tried in insertion order.
NEGATION_MAP = {
    "shall": "shall not", "must": "must not",
    "will": "will not",   "may": "may not",
    "is required to": "is not required to",
    "exclusive": "non-exclusive", "limited": "unlimited",
    "terminate": "not terminate",
}


def simulate_contradiction(clause: str):
    """Negate the first NEGATION_MAP term that occurs in ``clause`` as a whole word.

    A term is only replaced when it stands alone (not inside a longer word or
    a hyphenated compound such as "willing", "unlimited", "non-exclusive",
    "terminated") and is not already negated ("shall not", "not terminate"),
    so the rewrite never produces "will noting" or "shall not not".

    Args:
        clause: source clause.

    Returns:
        The clause with one term negated, or ``None`` when no term qualifies.
    """
    for term, negated in NEGATION_MAP.items():
        pattern = re.compile(
            r"(?<![\w-])(?<!not )"      # not inside a word/compound, not preceded by "not "
            + re.escape(term)
            + r"\b(?!\s+not\b)",        # whole word, not already followed by "not"
            re.IGNORECASE,
        )
        if pattern.search(clause):
            # Callable replacement: the negated text is inserted literally.
            return pattern.sub(lambda _m: negated, clause, count=1)
    return None


def build_synthetic_pairs(clauses: list, sample_size: int = SYNTH_SIZE) -> pd.DataFrame:
    """Build synthetic contradiction pairs from a reproducible sample of clauses.

    Up to ``sample_size`` clauses are drawn with a fixed seed (42); each one
    that ``simulate_contradiction`` can negate yields one row, so the result
    may hold fewer than ``sample_size`` rows.

    Args:
        clauses: candidate clauses.
        sample_size: maximum number of clauses to sample.

    Returns:
        DataFrame with columns clause1, clause2, label ("contradiction").
    """
    # Private generator: same draw as seeding the module RNG with 42, but
    # without resetting the global random state for the rest of the notebook.
    rng = random.Random(42)
    sampled = rng.sample(clauses, max(0, min(sample_size, len(clauses))))
    records = []
    for clause in sampled:
        negated = simulate_contradiction(clause)
        if negated:
            records.append({"clause1": clause, "clause2": negated, "label": "contradiction"})
    return pd.DataFrame(records, columns=_PAIR_COLUMNS)


print("✅ Data helpers defined.")
EpochTraining LossValidation LossAccuracyF1
10.3311080.2997530.8821970.883166
20.2872760.2499320.9094700.909536
30.1930820.2323980.9200760.919938
40.1819510.2296380.9234850.923234
50.1480840.2447990.9215910.921433

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "89c8b242441e433995dac74b6d960d3f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Writing model shards: 0%| | 0/1 [00:00\n", " \n", " \n", " [83/83 00:02]\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'eval_loss': 0.22964176535606384, 'eval_accuracy': 0.9234848484848485, 'eval_f1': 0.9232344709378449, 'eval_runtime': 2.2543, 'eval_samples_per_second': 1171.072, 'eval_steps_per_second': 36.818, 'epoch': 5.0}\n" ] } ], "source": [ "# ╔══════════════════════════════════════════════════════════════╗\n", "# ║ CELL 10 — Final eval on validation split ║\n", "# ╚══════════════════════════════════════════════════════════════╝\n", "print(\"\\n📊 Final evaluation on validation split ...\")\n", "eval_results = trainer.evaluate()\n", "print(eval_results)" ] }, { "cell_type": "code", "execution_count": 10, "id": "cell-11-heldout", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "[Held-out] Loading MultiNLI validation_matched (government) ...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2ee65214435a4afebff9dd1c35036c8e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Filter: 0%| | 0/5000 [00:00