{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "z3OlwGoEWzrh" }, "source": [ "## Step - 1\n", "### spplitting the dataset into training, testing and evaluation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "executionInfo": { "elapsed": 33845, "status": "ok", "timestamp": 1743507170906, "user": { "displayName": "", "userId": "" }, "user_tz": -330 }, "id": "9bCr_L7WXflr", "outputId": "1264cf41-65ee-42db-c7e6-0294d632b66d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yJwjC33HWzrj", "outputId": "51b04736-18a4-45f9-b439-22b1e77f993c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original subject distribution: Counter({'Electrostatics and Current Electricity': 76, 'Mechanics': 60, 'Kinematics': 55, 'Electromagnetism': 45, 'Thermodynamics': 44, 'Optics': 38, 'Atomic and Modern Physics': 30, 'Electronic Devices': 29, 'Periodic Motion': 13, 'Waves and Oscillations': 10})\n", "\n", "Dataset split into 280 training, 60 testing, and 60 evaluation samples.\n", "\n", "Subject distribution in each split:\n", "Train: Counter({'Electrostatics and Current Electricity': 53, 'Mechanics': 42, 'Kinematics': 39, 'Thermodynamics': 31, 'Electromagnetism': 31, 'Optics': 27, 'Atomic and Modern Physics': 21, 'Electronic Devices': 20, 'Periodic Motion': 9, 'Waves and Oscillations': 7})\n", "Test: Counter({'Electrostatics and Current Electricity': 11, 'Mechanics': 9, 'Kinematics': 8, 'Electromagnetism': 7, 'Thermodynamics': 6, 'Optics': 5, 'Atomic and Modern Physics': 5, 'Electronic Devices': 5, 'Periodic Motion': 2, 'Waves and Oscillations': 2})\n", "Eval: Counter({'Electrostatics and Current Electricity': 12, 'Mechanics': 9, 'Kinematics': 8, 'Thermodynamics': 7, 'Electromagnetism': 7, 'Optics': 6, 'Electronic Devices': 4, 'Atomic and Modern Physics': 4, 'Periodic Motion': 2, 'Waves and Oscillations': 1})\n" ] } ], "source": [ "import json\n", "import random\n", "from sklearn.model_selection import train_test_split\n", "from collections import Counter\n", "\n", "# Set random seed for reproducibility\n", "RANDOM_SEED = 42\n", "random.seed(RANDOM_SEED)\n", "\n", "# Load the dataset\n", "with open(r\"/content/drive/MyDrive/dataset/high_school_physics.json\", \"r\", encoding=\"utf-8\") as f:\n", " data = json.load(f)\n", "\n", "# Extract the 'subject' field for stratification\n", "subjects = [item[\"subject\"] for item in data]\n", "\n", "# Verify initial distribution\n", "print(\"Original subject distribution:\", Counter(subjects))\n", "\n", "# First split: 70% train, 30% temp (test + eval)\n", "train_data, temp_data = train_test_split(\n", " data,\n", " train_size=0.7,\n", " stratify=subjects,\n", " random_state=RANDOM_SEED\n", ")\n", "\n", "# Second split: Split the 30% temp into 15% test and 15% eval (50/50 of temp)\n", "test_data, eval_data = train_test_split(\n", " temp_data,\n", " test_size=0.5, # 50% of 30% = 15% of original\n", " stratify=[item[\"subject\"] for item in temp_data],\n", " random_state=RANDOM_SEED\n", ")\n", "\n", "# Save train, test, and evaluation sets\n", "with open(r\"/content/drive/MyDrive/dataset/train.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(train_data, f, indent=4)\n", "with open(r\"/content/drive/MyDrive/dataset/test.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(test_data, f, indent=4)\n", "with open(r\"/content/drive/MyDrive/dataset/eval.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(eval_data, f, indent=4)\n", "\n", "# Print split sizes and subject distribution\n", "print(f\"\\nDataset split into {len(train_data)} training, {len(test_data)} testing, and {len(eval_data)} evaluation samples.\")\n", "print(\"\\nSubject distribution in each split:\")\n", "print(\"Train:\", Counter([item[\"subject\"] for item in train_data]))\n", "print(\"Test:\", Counter([item[\"subject\"] for item in test_data]))\n", "print(\"Eval:\", Counter([item[\"subject\"] for item in eval_data]))" ] }, { "cell_type": "markdown", "metadata": { "id": "UKGUfjMv7qpa" }, "source": [ "### Analysis of the Output\n", "#### Original Distribution\n", "- **Total samples**: 400\n", "- **Subjects**:\n", " - Electrostatics and Current Electricity: 76\n", " - Mechanics: 60\n", " - Kinematics: 55\n", " - Electromagnetism: 45\n", " - Thermodynamics: 44\n", " - Optics: 38\n", " - Atomic and Modern Physics: 30\n", " - Electronic Devices: 29\n", " - Periodic Motion: 13\n", " - Waves and Oscillations: 10\n", "\n", "#### Split Results\n", "- **Train**: 280 samples (70%)\n", "- **Test**: 60 samples (15%)\n", "- **Eval**: 60 samples (15%)\n", "\n", "#### Subject Distribution Across Splits\n", "| Subject | Original | Train (70%) | Test (15%) | Eval (15%) |\n", "|----------------------------------|----------|-------------|------------|------------|\n", "| Electrostatics and Current Elec. | 76 | 53 (53.2) | 11 (11.4) | 12 (11.4) |\n", "| Mechanics | 60 | 42 (42) | 9 (9) | 9 (9) |\n", "| Kinematics | 55 | 39 (38.5) | 8 (8.25) | 8 (8.25) |\n", "| Electromagnetism | 45 | 31 (31.5) | 7 (6.75) | 7 (6.75) |\n", "| Thermodynamics | 44 | 31 (30.8) | 6 (6.6) | 7 (6.6) |\n", "| Optics | 38 | 27 (26.6) | 5 (5.7) | 6 (5.7) |\n", "| Atomic and Modern Physics | 30 | 21 (21) | 5 (4.5) | 4 (4.5) |\n", "| Electronic Devices | 29 | 20 (20.3) | 5 (4.35) | 4 (4.35) |\n", "| Periodic Motion | 13 | 9 (9.1) | 2 (1.95) | 2 (1.95) |\n", "| Waves and Oscillations | 10 | 7 (7) | 2 (1.5) | 1 (1.5) |\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "knFCJUypWzrk", "outputId": "3fd328bb-f797-4420-f5f3-b9b82b19b80d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'id': 298, 'question': 'What is the change in internal energy of an ideal gas during an isochoric process?', 'subject': 'Thermodynamics', 'choices': ['Zero', 'Positive', 'Negative', 'Depends on the process'], 'answer': 'D', 'explanation': 'In an isochoric process, the change in internal energy depends on the heat added to or removed from the gas.', 'dataset': 'high_school_physics'}\n", "{'id': 206, 'question': 'An ideal gas is heated at constant volume. What happens to its pressure?', 'subject': 'Thermodynamics', 'choices': ['Increases', 'Decreases', 'Remains constant', 'Doubles'], 'answer': 'A', 'explanation': \"According to Gay-Lussac's Law, P1/T1 = P2/T2. If temperature increases, pressure increases.\", 'dataset': 'high_school_physics'}\n" ] } ], "source": [ "print(test_data[0])\n", "print(test_data[3])" ] }, { "cell_type": "markdown", "metadata": { "id": "t15fKpYVWzrm" }, "source": [ "### Zero-Shot Evaluation of Flan-T5 from Hugging Face" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 400, "referenced_widgets": [ "d806ef0e3ef04aaa8ba107d84d1cd19c", "f8029b88425b4b278cc5e7bdcaa70932", "22d10edbd39b45f9a30b332018811101", "297f1efa21044e40a96a1c55e99055a5", "16096f6613aa4bea921acd5567e101ff", "6e2c262086d34d5692c7c687f4f7e1a3", "ba35ee2946834b75b5b507659b8fda73", "196706d308754b6cb11e831ab7289f8a", "ad258cbb563649f6958236548007747a", "f7c8049e0f584b148988950b239f7fa1", "e7e8ffb530584414aba8f6c333968e4b", "546f78d7533e4559bb6e4452d33ec446", "5bc7ccd2e58a4ebb8e71de995d17c9ba", "adaf2371062d4669a711722ed260c678", "c73f2e2aa4f041a5bf53ecc7a0c57a03", "6f06287d20744458940aea48f52e7c93", "d9cfbe41f4124ebc972661ef8a085faf", "1ab679315d0e4c6c806845d615af93b7", "9e34ac90fe9c40a79fa5c43d748d6a4c", "ac1101f3d95d4d2dba66c83a9d981461", "826b999f5aa347f49ee944ca0c2dbb48", "160d33b558884654876a995720f290b1", "4a8195191118488fb28080116944a436", "ec53e5faf7144f93bce59b0198191ef8", "d4bdf9e0921548959bcc4f8feff9eea1", "f73f821d096445e1974a1497f087b427", "49114e643789483f9bf91a27bdf17e22", "fa8e6d00f4414ea0b8de67357a321778", "425b7dd31317498282843cb9589f13b2", "a49b973f462e4193984ba5abda1b38c3", "c4113345537949f78a6c4acdf3474282", "8d9003fbe1074bfe89a449e5f2cc8a90", "20127094d14644399dceff5b0c185789", "72dea0bda607440aa1617505c80b50b8", "9a4be4d7ff3e4df69288d7df1415a0f2", "a0818b40009b4bf39c18f3fe825d0766", "61994a9982d5457ab27aef3f253c2d01", "8c65b13e4c6f4b77b63858df2955ff94", "5d70bcc545d7431cab2d2702d68d1810", "95fbe4d8d1824b4d967c97e393afc82a", "c050f583e8ba466c859d9b2c77ed4897", "fbb8350a029b405ca83a89ba30b94910", "62f0174bcbe2494bbee62208307fa6a5", "6f0d59374a4e4aa092f4d6467071f11d", "4165abe7acdc46e6a231d9e51d6f9e17", "466bbf52773940d9a1d852eed0f348bf", "de4c85a2f49e47e2b26f9b1ebf17abc6", "da9f6390036f4997a29e0313d625ba26", "8e36bd4ead0047efb8885c9bfbb3972f", "a33db019e2084bbea4b1039d8b336a17", "fdd75827b0c94dbaabefd4c1e94a69c5", "e0cef7c7d2d74ad39cb44c640b2d28f4", "6f4396941f1a4712936e1554b5166103", "af289bed7a0942bcb4e4a832ec829da8", "1493e6dc29094c1c86b8cf2934486ce1", "7536c0affafd4ecda647559827da4c8d", "d43dae81182346fcb1c0ac17ec298d8e", "d2048627398e4a85ae79d78bcc265b3e", "441e44c3877c466793224dbb192b7eef", "50dbe053d8d243a9ba4cbbc49b3f703b", "065c808cd1f14961b1bbae61357a9c87", "ebda6187cdf74f0b890d979a4d5707f7", "1fa6adfd7862444db70ae88207c14864", "a32288a5f4d0415e9ce753c47112536f", "b95f0f67d325463cb8116c8676fd1170", "d971a803cd174974b71c3d05b25900ac", "a50ce4a04cc84cfc80baac4cb7a5a5cd", "e984173af9f94aafacf39321893a8d20", "964f35b07bbd4d00b16c77cd8c9e7294", "e937dce6f825402aae694b341f2b4fa1", "753962218d74452292e25c7834aa4f68", "a11e6b25cf9243e7a47f9036113a694e", "61a996df59654f8b9963e91a30144e7b", "1067a8f774c1490eb378a09a082862a5", "0563ac72fce7404d864ef05c7495ace3", "73d678518d5244d7bdcad2921917e9b7", "9ca18ee7466f461f9afde1dc7c5bdd80" ] }, "id": "3jf9z-URWzrn", "outputId": "bf361b2e-adec-4072-d7fb-08155b89ba4c" }, "outputs": [], "source": [ "from transformers import pipeline\n", "\n", "qa_pipeline = pipeline(\"text2text-generation\", model=\"google/flan-t5-base\")\n", "\n", "def evaluate_model(model, dataset):\n", " correct = 0\n", " total = len(dataset)\n", "\n", " for item in dataset:\n", " question = item[\"question\"]\n", " idx = ord(item[\"answer\"])-ord(\"A\")\n", " correct_answer = item[\"choices\"][idx]\n", "\n", " prompt = f\"Give me the final answer without any explaination, just the couple of words with units that directly show the answer for the Question: {question} with Choices: {', '.join(item['choices'])} Answer:\"\n", " prediction = model(prompt, max_length=20, truncation=True)[0][\"generated_text\"]\n", "\n", " # print(correct_answer, prediction, item[\"id\"])\n", " if correct_answer in prediction:\n", " correct += 1\n", "\n", " accuracy = (correct / total) * 100\n", " return accuracy\n", "\n", "zero_shot_accuracy = evaluate_model(qa_pipeline, test_data)\n", "print(f\"Zero-Shot Accuracy: {zero_shot_accuracy:.2f}%\")\n" ] }, { "cell_type": "markdown", "metadata": { "id": "Eef1iSSngxg7" }, "source": [ "### Supervised Fine-Tuning Flan-T5-Base on Training Dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 490 }, "id": "f9mgCN8qg85c", "outputId": "4d965872-c4d5-44fd-aea5-e2655df60827" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n", "Using device: cpu\n", "Initializing fresh FLAN-T5-Large model...\n", "Starting epoch 1/3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [170/210 1:00:16 < 14:21, 0.05 it/s, Epoch 2.41/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation Loss
1No log28.344961
2No log18.623024

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [210/210 1:16:38, Epoch 3/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation Loss
1No log28.344961
2No log18.623024
3No log14.014906

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Model saved after epoch 1 to /content/drive/MyDrive/dataset/trained_model_epoch_1\n", "Starting epoch 2/3\n" ] }, { "data": { "text/html": [ "\n", "

\n", " \n", " \n", " [ 57/210 18:46 < 52:14, 0.05 it/s, Epoch 0.80/3]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
EpochTraining LossValidation Loss

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Mount Google Drive\n", "from google.colab import drive\n", "drive.mount('/content/drive', force_remount=True) # Force remount for fresh connection\n", "\n", "# Imports\n", "from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments\n", "from torch.utils.data import Dataset\n", "import json\n", "import os\n", "import time\n", "import torch\n", "\n", "# Disable W&B logging\n", "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", "\n", "# Use CPU explicitly\n", "device = torch.device(\"cpu\")\n", "print(f\"Using device: {device}\")\n", "\n", "# Load model and tokenizer\n", "model_name = \"google/flan-t5-base\"\n", "model_path = \"/content/drive/MyDrive/dataset/trained_model\"\n", "if os.path.exists(model_path):\n", " print(\"Loading previously fine-tuned model...\")\n", " model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)\n", " tokenizer = T5Tokenizer.from_pretrained(model_path, legacy=False)\n", "else:\n", " print(\"Initializing fresh FLAN-T5-Large model...\")\n", " model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)\n", " tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)\n", "\n", "# Custom dataset class\n", "class PhysicsDataset(Dataset):\n", " def __init__(self, data, tokenizer, max_length=128): # Reduced max_length\n", " self.data = data\n", " self.tokenizer = tokenizer\n", " self.max_length = max_length\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", " item = self.data[idx]\n", " prompt = f\"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:\"\n", " idx = ord(item[\"answer\"][0]) - ord(\"A\")\n", " target = item[\"choices\"][idx]\n", "\n", " encodings = self.tokenizer(prompt, truncation=True, padding=\"max_length\", max_length=self.max_length, return_tensors=\"pt\")\n", " target_encodings = self.tokenizer(target, truncation=True, padding=\"max_length\", max_length=self.max_length, return_tensors=\"pt\")\n", "\n", " return {\n", " \"input_ids\": encodings[\"input_ids\"].squeeze(),\n", " \"attention_mask\": encodings[\"attention_mask\"].squeeze(),\n", " \"labels\": target_encodings[\"input_ids\"].squeeze(),\n", " }\n", "\n", "# Load datasets\n", "train_file = \"/content/drive/MyDrive/dataset/train.json\"\n", "eval_file = \"/content/drive/MyDrive/dataset/eval.json\"\n", "test_file = \"/content/drive/MyDrive/dataset/test.json\"\n", "\n", "with open(train_file, \"r\", encoding=\"utf-8\") as f:\n", " train_data = json.load(f)\n", "with open(eval_file, \"r\", encoding=\"utf-8\") as f:\n", " eval_data = json.load(f)\n", "with open(test_file, \"r\", encoding=\"utf-8\") as f:\n", " test_data = json.load(f)\n", "\n", "train_dataset = PhysicsDataset(train_data, tokenizer)\n", "eval_dataset = PhysicsDataset(eval_data, tokenizer)\n", "\n", "# Define directories\n", "model_save_dir = \"/content/drive/MyDrive/dataset/trained_model\"\n", "results_dir = \"/content/drive/MyDrive/dataset/results\"\n", "os.makedirs(model_save_dir, exist_ok=True)\n", "os.makedirs(results_dir, exist_ok=True)\n", "\n", "# Training arguments optimized for CPU\n", "training_args = TrainingArguments(\n", " output_dir=results_dir,\n", " per_device_train_batch_size=1, # Reduced to minimize memory\n", " per_device_eval_batch_size=1,\n", " num_train_epochs=3,\n", " save_steps=70, # Save every ~1/4 epoch (280 samples / 1 batch = 280 steps, 70 steps ~ 1 epoch)\n", " eval_strategy=\"epoch\", # Updated from evaluation_strategy\n", " logging_dir=\"./logs\",\n", " run_name=f\"flan-t5-finetune-{time.strftime('%Y%m%d-%H%M%S')}\",\n", " report_to=\"none\",\n", " learning_rate=1e-5, # Lower LR for stability\n", " gradient_accumulation_steps=4, # Effective batch size = 1 * 4 = 4\n", " fp16=False, # Disabled for CPU\n", " save_total_limit=2, # Keep only 2 latest checkpoints to save space\n", ")\n", "\n", "# Initialize trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=eval_dataset,\n", ")\n", "\n", "# Train the model with manual checkpoint saving\n", "for epoch in range(int(training_args.num_train_epochs)):\n", " print(f\"Starting epoch {epoch + 1}/{int(training_args.num_train_epochs)}\")\n", " trainer.train()\n", " # Save model after each epoch\n", " model.save_pretrained(f\"{model_save_dir}_epoch_{epoch + 1}\")\n", " tokenizer.save_pretrained(f\"{model_save_dir}_epoch_{epoch + 1}\")\n", " print(f\"Model saved after epoch {epoch + 1} to {model_save_dir}_epoch_{epoch + 1}\")\n", "\n", "# Final save\n", "model.save_pretrained(model_save_dir)\n", "tokenizer.save_pretrained(model_save_dir)\n", "print(f\"Fine-tuned model saved to {model_save_dir}\")\n", "\n", "# Evaluate on test set\n", "correct = 0\n", "predictions = []\n", "for item in test_data:\n", " prompt = f\"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:\"\n", " inputs = tokenizer(prompt, return_tensors=\"pt\").to(device)\n", " outputs = model.generate(**inputs, max_length=10)\n", " prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()\n", " idx = ord(item[\"answer\"][0]) - ord(\"A\")\n", " target = item[\"choices\"][idx]\n", " predictions.append({\"question\": item[\"question\"], \"predicted\": prediction, \"target\": target})\n", " if prediction == target:\n", " correct += 1\n", "\n", "accuracy = correct / len(test_data) * 100\n", "print(f\"Fine-tuned accuracy on test set: {accuracy:.2f}% ({correct}/{len(test_data)})\")\n", "\n", "# Save predictions for analysis\n", "with open(\"/content/drive/MyDrive/dataset/finetune_test_results.json\", \"w\") as f:\n", " json.dump({\"accuracy\": accuracy, \"predictions\": predictions}, f, indent=4)\n", "print(\"Test results saved to /content/drive/MyDrive/dataset/finetune_test_results.json\")" ] }, { "cell_type": "markdown", "metadata": { "id": "fCuxI6ReWzrn" }, "source": [ "### Testing the trained model on test.json" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 834, "referenced_widgets": [ "c4fb253ad3f546a6972979cbf1b189f2", "af2b6ea6bd414bcba1f30ca0530f8440", "7a58748fe2444e7b9e0534a6433bbb34", "d07fe8a99c26406688177b284c9f51f1", "32f9346f82cd45a9bbd25fa39d9b7e2e", "14ffee7be9db46ada32485163d955b02", "d9caed934670439fbaaa892a22217f8c", "2e12eba30e954d139f27bff5c622e79e", "ac23565930f341c7ab2d725a6f3f1507", "2c166af962534276991315ce86c0f69a", "c90309148ab849079760efbb8234800c", "f623309fb05644c3ab5f6743ee163fb1", "887b5e5d18e243b591592b8f0b920c49", "1fa7f06f00fa48558f310270ef1e40c3", "c2d7052326e8410db4e6cd24e0813ab5", "1cad26a947d047ca84bbc61489aa1fe2", "ce6bd13d9add40da860e18614f17cb91", "d71acee438bc49a1892919b8abe87b74", "27f3cc9e8eac41a889c75958c882023e", "d09bfe60fe4d4b4d9b1d6ee6e2a7a583", "9ddd354b39564721b7f9560d606fb7c1", "16f91a1f86a34d1e8ac162aaa022284c", "8b6cd75bd6ce4a3e8e4178317169c9cb", "70f3420393064eb499a5aeb53f454784", "c01e550f8f0f49a3b0d8c5b95d60f03b", "2be465a78f4b4404a565b2771619450d", "5e4c34f2827a4459b853d2f707d148ef", "bb45397e34dc4c049aa49b1a6eedc42e", "cec5e628f3684a8fa8e35aea0ff4a5b5", "42562b72421c47788751d91941e5f34c", "3804146536f84feb8b04bca8adea9098", "01905115dcf846c785419594f6f13591", "cc60d501057f42acb3743e99e02a426e", "9ed15b88792045868b5073cc6dd634a3", "70c284ee884b473a847d1fa363575f5d", "d0b84424f1cb4377b5369501f75c9c90", "4f3e3870616147c88025a87d0d5fc6c6", "204e01fcdd4b4e42874af0d29c4df0f5", "59fe4e25f7774484bad001df52dbe78b", "b459645721f249f2a2da9c6bed1efb5a", "2beb6ea259674ac5a4fe538e0d2d46cf", "52365a9477fb484293220802a23c1094", "1ea36c10c40d425f8ccfdacf0d7dfd40", "5a7fa73520794ca8b540fe1174611e63", "f3e47b170a3e4834b8f685282cbfff18", "1b576483576945e08239f5613780ace6", "cd542179d79e45edab873601c94f61c6", "6264e2e12a4e4e44bec0f9ad86c4d45b", "31f49e47de18422fac7db71e15123eba", "675d7b9a68344d379a78e77e42294d4b", "2fdd59145efc4fbdaaeb4de08c729743", "5fda1a584d7448238891fee4f6fa26fc", "5932590500a140e3a1d8f8fa870ae820", "b2f2de31d9c84c37b0c68f7c78bbaf67", "1e8403c2ebe24dcea89412da4bee3f14", "a2b0e16dea024bf9b13518a9d4a90ed7", "992471be87d4415e843ebb77f3d6bb83", "4c60b150b3ab4e78b5ba361b174ad2bb", "e6e9c27b8e4347daaf4f71fe1787944b", "ac6fbca00eb2423fa310b6ca5442bec2", "0d21d5eb08d943bd90d4616e759f48ea", "eb0ecb93fb9a4e03b8a638edf027cac1", "519a0c67aab64b819f376e6880644433", "8a62187eb5b04765828e06e38e4a6e4e", "8afd1b1b2cf3442da161580679d695a6", "e02892f8b4bc4fc28365f5568437c0e9", "9dada24184ee4686adaac4922b1216df", "f35b94f584664feab34af1905ea93b6f", "a5c30ee65e864cb48fb2fdea5b969ee9", "86dc5c230e984c5ab9feb2ab3430994e", "52083f9146374909ae4169f7afdc4490", "e028dd60274d457a8a450736cbaef9a3", "68284c0f396047499a30c57018eaaf21", "e1b203a2a18244c68133be352b82bc35", "d4e138e4e44b4310acd99bfef1062070", "9f4fa4b3c79d4444b8c74870eee59164", "de5737a544e248d09ae12a0d48c4c6f5" ] }, "id": "jOcarEpxa5nP", "outputId": "360dcf1a-d877-411b-ca2c-c52812d85639" }, "outputs": [], "source": [ "from transformers import T5ForConditionalGeneration, T5Tokenizer\n", "import json\n", "import torch\n", "import os\n", "from tqdm import tqdm\n", "\n", "# Paths\n", "model_save_dir = \"/content/drive/MyDrive/dataset/trained_model\"\n", "test_file_path = \"/content/drive/MyDrive/dataset/test.json\" # Adjust if your test file has a different name\n", "\n", "# First, let's check if the model files exist\n", "print(\"Checking model directory contents:\")\n", "for root, dirs, files in os.walk(model_save_dir):\n", " for file in files:\n", " print(os.path.join(root, file))\n", "\n", "# Load the model and tokenizer\n", "try:\n", " model = T5ForConditionalGeneration.from_pretrained(model_save_dir)\n", " tokenizer = T5Tokenizer.from_pretrained(model_save_dir)\n", " print(\"Model and tokenizer loaded successfully!\")\n", "except Exception as e:\n", " print(f\"Error loading model: {e}\")\n", " # If model loading fails, load the original pretrained model\n", " print(\"Loading the base model instead...\")\n", " model = T5ForConditionalGeneration.from_pretrained(\"google/flan-t5-large\")\n", " tokenizer = T5Tokenizer.from_pretrained(\"google/flan-t5-large\", legacy=False)\n", "\n", "# Load test data\n", "with open(test_file_path, \"r\", encoding=\"utf-8\") as f:\n", " test_data = json.load(f)\n", "\n", "print(f\"Loaded {len(test_data)} test examples\")\n", "\n", "# Set device\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n", "model.eval()\n", "\n", "# Testing function\n", "def evaluate_model(model, tokenizer, test_data, device, correct, total):\n", "\n", " for item in tqdm(test_data):\n", " prompt = f\"Question: {item['question']} Choices: {', '.join(item['choices'])} Answer:\"\n", " correct_idx = ord(item[\"answer\"][0]) - ord(\"A\")\n", " correct_answer = item[\"choices\"][correct_idx]\n", "\n", " # Tokenize input\n", " input_ids = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=128).input_ids.to(device)\n", "\n", " # Generate output\n", " with torch.no_grad():\n", " outputs = model.generate(\n", " input_ids=input_ids,\n", " max_length=128,\n", " num_beams=4,\n", " early_stopping=True\n", " )\n", "\n", " # Decode output\n", " predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", "\n", " # Simple string match for evaluation\n", " if predicted_text.strip() == correct_answer.strip():\n", " correct += 1\n", " else:\n", " # Print some examples of wrong predictions for debugging\n", " if total < 5: # Limit to just a few examples\n", " print(f\"\\nQuestion: {item['question']}\")\n", " print(f\"Choices: {', '.join(item['choices'])}\")\n", " print(f\"Correct Answer: {correct_answer}\")\n", " print(f\"Predicted: {predicted_text}\")\n", "\n", " total += 1\n", "\n", " accuracy = correct / total if total > 0 else 0\n", " return accuracy\n", "\n", "# Evaluate model\n", "print(\"Evaluating model on test set...\")\n", "correct = 0\n", "total = 0\n", "accuracy = evaluate_model(model, tokenizer, test_data, device, correct, total)\n", "print(f\"Test Accuracy: {accuracy:.4f} ({correct}/{total})\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0spUSTpta968", "outputId": "e7c4fa5d-f880-4ae8-8800-6ece11b5f53e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "26.666666666666668\n" ] } ], "source": [ "print(accuracy*100)" ] }, { "cell_type": "markdown", "metadata": { "id": "Gbnhsd5HTxZ9" }, "source": [ "### Zero-Shot evaluation of Phi-3.5-Mini on testing dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 101, "referenced_widgets": [ "61b4572efe1b403ca93894c7a50d85fe", "a707d40b58ad4ab1bfc6a509f86ae91b", "37e63812f3d343378e8d09c8c0acc92f", "6d9c362cb4584e04a7267e9758071a45", "944bbe53f6a74f7b90d8943e716127eb", "cbb861476450441d872740fca632c8e8", "e74c2514f368496f994ec6d9b49cc06d", "1d9e3b878aa649868b1ee924a7acccc3", "4964d3f4582a43bb845b2ca040eecc01", "60fb80449f9849b4b5b700f08ee2aed9", "f5c798148893406e9cdcf6b758cc4e46" ] }, "id": "ZjVsSxyZVLkU", "outputId": "bbd00f08-b272-417e-8b00-95bafb261474" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n", "Loading model: microsoft/phi-3.5-mini-instruct\n", "Using device: cpu\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "61b4572efe1b403ca93894c7a50d85fe", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00