{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": [],
   "authorship_tag": "ABX9TyPX0KJCddQKdgG/BZ7JHsh1"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CLIP image-text retrieval evaluation\n",
    "\n",
    "Loads a CLIP checkpoint from Google Drive and scores it on a random sample of image/caption pairs read from a `|`-separated `results.csv`.\n",
    "\n",
    "Reported metrics:\n",
    "- **Contrastive loss**: symmetric cross-entropy over in-batch similarities, scaled by the model's learned temperature and averaged per sample.\n",
    "- **In-batch accuracy**: how often the matching caption/image wins against the other items of the same batch.\n",
    "- **Recall@1**: how often the matching caption/image wins against *every* item of the evaluation sample.\n",
    "\n",
    "Run top to bottom on a fresh runtime (Runtime > Restart and run all)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "PL0XMNx7C-t3"
   },
   "outputs": [],
   "source": [
    "# transformers and pillow are imported below, so install them explicitly as well.\n",
    "%pip install -q torch torchvision pandas ftfy transformers pillow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "kSPwCA9aC_F3"
   },
   "outputs": [],
   "source": [
    "# === Load Libraries ===\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "from google.colab import drive\n",
    "from PIL import Image\n",
    "from torch.nn import CrossEntropyLoss\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from transformers import CLIPProcessor, CLIPModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2S6ibSnMCtZs"
   },
   "outputs": [],
   "source": [
    "drive.mount('/content/drive')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_HlCWG2zlhjl"
   },
   "outputs": [],
   "source": [
    "# === Paths ===\n",
    "CSV_PATH = '/content/drive/My Drive/CLIP_Project/Data/results.csv'\n",
    "IMG_DIR = '/content/drive/My Drive/CLIP_Project/Data/Testing_Images'\n",
    "MODEL_PATH = '/content/drive/My Drive/CLIP_Project/Model_Files'\n",
    "\n",
    "# === Evaluation settings ===\n",
    "N_SAMPLES = 500   # image/caption pairs to evaluate; None evaluates every usable row\n",
    "BATCH_SIZE = 32\n",
    "SEED = 42         # makes the evaluation sample reproducible"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ua0Po_aYDF4a"
   },
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DEqqGaTXDUHT"
   },
   "outputs": [],
   "source": [
    "# === Load Model & Processor ===\n",
    "model = CLIPModel.from_pretrained(MODEL_PATH).to(device)\n",
    "processor = CLIPProcessor.from_pretrained(MODEL_PATH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9Z-vMxDcl5FN"
   },
   "outputs": [],
   "source": [
    "# === Load and Filter Data ===\n",
    "captions_raw = pd.read_csv(CSV_PATH, sep='|', engine='python')\n",
    "captions_raw.columns = [col.strip() for col in captions_raw.columns]\n",
    "\n",
    "# Drop rows with a missing name or caption first: astype(str) would otherwise\n",
    "# turn NaN into the literal text 'nan' and feed it to the model as a caption.\n",
    "captions = captions_raw.dropna(subset=['image_name', 'comment']).copy()\n",
    "captions['image_name'] = captions['image_name'].astype(str).str.strip()\n",
    "captions['comment'] = captions['comment'].astype(str).str.strip()\n",
    "captions['comment_number'] = pd.to_numeric(captions['comment_number'], errors='coerce')\n",
    "\n",
    "# Keep only rows whose comment_number is 0; rows with an unparsable number are\n",
    "# NaN after the coercion above and are dropped by the same comparison.\n",
    "captions = captions[captions['comment_number'] == 0].reset_index(drop=True)\n",
    "captions['filepath'] = captions['image_name'].apply(lambda name: os.path.join(IMG_DIR, name))\n",
    "captions = captions[captions['filepath'].apply(os.path.exists)].reset_index(drop=True)\n",
    "\n",
    "if captions.empty:\n",
    "    raise ValueError(f\"No usable rows: no caption in {CSV_PATH} has an existing image under {IMG_DIR}\")\n",
    "\n",
    "# Never ask for more rows than exist; DataFrame.sample raises in that case.\n",
    "n_eval = len(captions) if N_SAMPLES is None else min(N_SAMPLES, len(captions))\n",
    "df = captions.sample(n=n_eval, random_state=SEED).reset_index(drop=True)\n",
    "print(f\"Evaluating {len(df)} of {len(captions)} usable image/caption pairs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7yN-pjq_FkYJ"
   },
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "AIh1UcCvl6wH"
   },
   "outputs": [],
   "source": [
    "# === Dataset and DataLoader ===\n",
    "class FlickrTestDataset(Dataset):\n",
    "    \"\"\"Yield one {'image': PIL.Image, 'text': str} item per dataframe row.\n",
    "\n",
    "    The dataframe must provide a 'filepath' column (image location) and a\n",
    "    'comment' column (caption text).\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, dataframe):\n",
    "        self.data = dataframe\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.data)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        row = self.data.iloc[idx]\n",
    "        # Release the file handle as soon as the pixels have been decoded.\n",
    "        with Image.open(row['filepath']) as img:\n",
    "            image = img.convert(\"RGB\")\n",
    "        return {'image': image, 'text': row['comment']}\n",
    "\n",
    "\n",
    "def collate_fn(batch):\n",
    "    \"\"\"Turn a list of dataset items into one padded CLIPProcessor batch.\"\"\"\n",
    "    texts = [item['text'] for item in batch]\n",
    "    images = [item['image'] for item in batch]\n",
    "    return processor(text=texts, images=images, return_tensors=\"pt\", padding=True, truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_dataset = FlickrTestDataset(df)\n",
    "# shuffle=False keeps batch composition, and therefore the in-batch metrics, reproducible.\n",
    "test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dOa-zao7l-ya"
   },
   "outputs": [],
   "source": [
    "# === Evaluation ===\n",
    "model.eval()\n",
    "# reduction=\"sum\" lets us average per sample at the end, so the shorter final\n",
    "# batch is not given the same weight as a full one.\n",
    "loss_fn = CrossEntropyLoss(reduction=\"sum\")\n",
    "\n",
    "all_image_embeds = []\n",
    "all_text_embeds = []\n",
    "total_loss = 0.0\n",
    "total_samples = 0   # number of image/caption pairs seen (not number of batches)\n",
    "correct_image_to_text = 0\n",
    "correct_text_to_image = 0\n",
    "\n",
    "with torch.no_grad():\n",
    "    # CLIP's contrastive loss is defined on cosine similarities multiplied by the\n",
    "    # learned temperature; raw cosines in [-1, 1] give a loss that can barely move.\n",
    "    logit_scale = model.logit_scale.exp()\n",
    "\n",
    "    for batch in test_loader:\n",
    "        batch = {k: v.to(device) for k, v in batch.items()}\n",
    "        outputs = model(**batch)\n",
    "\n",
    "        # Normalize embeddings\n",
    "        image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)\n",
    "        text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)\n",
    "\n",
    "        # Store for global similarity matrix (on CPU to keep GPU memory flat)\n",
    "        all_image_embeds.append(image_embeds.cpu())\n",
    "        all_text_embeds.append(text_embeds.cpu())\n",
    "\n",
    "        # In-batch similarity: row i / column i belong to the same pair\n",
    "        logits_per_image = logit_scale * (image_embeds @ text_embeds.T)\n",
    "        logits_per_text = logits_per_image.T\n",
    "\n",
    "        batch_size = image_embeds.size(0)\n",
    "        labels = torch.arange(batch_size, device=device)\n",
    "        loss_i = loss_fn(logits_per_image, labels)\n",
    "        loss_t = loss_fn(logits_per_text, labels)\n",
    "\n",
    "        total_loss += ((loss_i + loss_t) / 2).item()\n",
    "        total_samples += batch_size\n",
    "\n",
    "        # In-batch accuracy: the true match only competes with this batch's items\n",
    "        correct_image_to_text += (logits_per_image.argmax(dim=1) == labels).sum().item()\n",
    "        correct_text_to_image += (logits_per_text.argmax(dim=1) == labels).sum().item()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# === Global Recall@1 ===\n",
    "if total_samples == 0:\n",
    "    raise RuntimeError(\"No samples were evaluated; check the data-loading cell.\")\n",
    "\n",
    "# Rows are images and columns are captions, both in dataset order, so the\n",
    "# correct match for index i is index i.\n",
    "similarity = torch.cat(all_image_embeds) @ torch.cat(all_text_embeds).T\n",
    "targets = torch.arange(similarity.size(0))\n",
    "recall_i2t = (similarity.argmax(dim=1) == targets).float().mean().item()\n",
    "recall_t2i = (similarity.argmax(dim=0) == targets).float().mean().item()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "M2p-NzAbEzoa"
   },
   "outputs": [],
   "source": [
    "# === Final Metrics ===\n",
    "avg_loss = total_loss / total_samples\n",
    "accuracy_i2t = correct_image_to_text / total_samples\n",
    "accuracy_t2i = correct_text_to_image / total_samples\n",
    "\n",
    "print(\"\\n✅ Testing Completed\")\n",
    "print(f\"📉 Average Contrastive Loss: {avg_loss:.4f}\")\n",
    "print(f\"🎯 In-batch Accuracy (Image -> Text): {accuracy_i2t*100:.2f}%\")\n",
    "print(f\"🎯 In-batch Accuracy (Text -> Image): {accuracy_t2i*100:.2f}%\")\n",
    "print(f\"🎯 Recall@1 over {total_samples} pairs (Image -> Text): {recall_i2t*100:.2f}%\")\n",
    "print(f\"🎯 Recall@1 over {total_samples} pairs (Text -> Image): {recall_t2i*100:.2f}%\")"
   ]
  }
 ]
}