{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"h7zlX1qoEmDA"},"outputs":[],"source":["!pip install gradio transformers datasets torchvision"]},{"cell_type":"code","source":["import os\n","import pandas as pd\n","import torch\n","from PIL import Image\n","from transformers import CLIPProcessor, CLIPModel\n","import gradio as gr\n","from tqdm import tqdm"],"metadata":{"id":"jRqPayqIEvYS"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')\n","\n","# -------------------- Paths --------------------\n","MODEL_PATH = \"/content/drive/My Drive/CLIP_Project/Model_Files\"\n","CSV_PATH = \"/content/drive/My Drive/CLIP_Project/Data/results.csv\"\n","IMG_DIR = \"/content/drive/My Drive/CLIP_Project/Data/Images\"\n","NUM_IMAGES = 1000"],"metadata":{"id":"BVabhv7aE4fU","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748779541133,"user_tz":-300,"elapsed":47192,"user":{"displayName":"Ramzan0001","userId":"17484931948124609414"}},"outputId":"93768d9c-75f8-45f3-fe82-67f6099c0f07"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","source":["device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")"],"metadata":{"id":"pf2bfspKEyNv"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# -------------------- Load Model --------------------\n","model = CLIPModel.from_pretrained(MODEL_PATH).to(device)\n","processor = CLIPProcessor.from_pretrained(MODEL_PATH)\n","model.eval()"],"metadata":{"id":"5ijAU5m3E_US","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748779579329,"user_tz":-300,"elapsed":17237,"user":{"displayName":"Ramzan0001","userId":"17484931948124609414"}},"outputId":"b8cb675f-c784-4508-e269-88752e9b507d"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"]},{"output_type":"execute_result","data":{"text/plain":["CLIPModel(\n","  (text_model): CLIPTextTransformer(\n","    (embeddings): CLIPTextEmbeddings(\n","      (token_embedding): Embedding(49408, 512)\n","      (position_embedding): Embedding(77, 512)\n","    )\n","    (encoder): CLIPEncoder(\n","      (layers): ModuleList(\n","        (0-11): 12 x CLIPEncoderLayer(\n","          (self_attn): CLIPAttention(\n","            (k_proj): Linear(in_features=512, out_features=512, bias=True)\n","            (v_proj): Linear(in_features=512, out_features=512, bias=True)\n","            (q_proj): Linear(in_features=512, out_features=512, bias=True)\n","            (out_proj): Linear(in_features=512, out_features=512, bias=True)\n","          )\n","          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n","          (mlp): CLIPMLP(\n","            (activation_fn): QuickGELUActivation()\n","            (fc1): Linear(in_features=512, out_features=2048, bias=True)\n","            (fc2): Linear(in_features=2048, out_features=512, bias=True)\n","          )\n","          (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n","        )\n","      )\n","    )\n","    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n","  )\n","  (vision_model): CLIPVisionTransformer(\n","    (embeddings): CLIPVisionEmbeddings(\n","      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)\n","      (position_embedding): Embedding(50, 768)\n","    )\n","    (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n","    (encoder): CLIPEncoder(\n","      (layers): ModuleList(\n","        (0-11): 12 x CLIPEncoderLayer(\n","          (self_attn): CLIPAttention(\n","            (k_proj): Linear(in_features=768, out_features=768, bias=True)\n","            (v_proj): Linear(in_features=768, out_features=768, bias=True)\n","            (q_proj): Linear(in_features=768, out_features=768, bias=True)\n","            (out_proj): Linear(in_features=768, out_features=768, bias=True)\n","          )\n","          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n","          (mlp): CLIPMLP(\n","            (activation_fn): QuickGELUActivation()\n","            (fc1): Linear(in_features=768, out_features=3072, bias=True)\n","            (fc2): Linear(in_features=3072, out_features=768, bias=True)\n","          )\n","          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n","        )\n","      )\n","    )\n","    (post_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n","  )\n","  (visual_projection): Linear(in_features=768, out_features=512, bias=False)\n","  (text_projection): Linear(in_features=512, out_features=512, bias=False)\n",")"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","source":["# -------------------- Load and Prepare Dataset --------------------\n","# Load CSV with pipe separator and strip columns\n","df = pd.read_csv(CSV_PATH, sep='|', engine='python', skip_blank_lines=True)\n","\n","# Clean column names and strip spaces from text columns\n","df.columns = [col.strip() for col in df.columns]\n","df['image_name'] = df['image_name'].astype(str).str.strip()\n","df['comment'] = df['comment'].astype(str).str.strip()\n","df['comment_number'] = df['comment_number'].astype(str).str.strip()\n","\n","# Function to safely parse comment_number integer from messy strings\n","def parse_comment_number(x):\n","    try:\n","        # Just take the first token and convert to int\n","        return int(x.split()[0])\n","    except:\n","        return -1  # invalid values become -1\n","\n","df['comment_number'] = df['comment_number'].apply(parse_comment_number)\n","\n","# Keep only rows where comment_number == 0 (first comment per image)\n","df = df[df['comment_number'] == 0]\n","\n","# Add full image file path\n","df['filepath'] = df['image_name'].apply(lambda x: os.path.join(IMG_DIR, x))\n","\n","# Filter only rows where image file exists\n","df = df[df['filepath'].apply(os.path.exists)]\n","\n","# Drop duplicate images just in case\n","df = df.drop_duplicates(subset='image_name')\n","\n","# Sample N images or all if less than N\n","df = df.sample(n=min(NUM_IMAGES, len(df)), random_state=42).reset_index(drop=True)\n","\n","# Final lists\n","image_paths = df['filepath'].tolist()\n","captions = df['comment'].tolist()\n","\n","print(f\"Prepared {len(image_paths)} images and captions.\")"],"metadata":{"id":"cxU0DZpNFD4d","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748779630650,"user_tz":-300,"elapsed":12316,"user":{"displayName":"Ramzan0001","userId":"17484931948124609414"}},"outputId":"e3a3afb7-8249-499e-bb85-ca5685b77c3f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["<ipython-input-6-1d4be4547aa4>:25: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] = value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n","  df['filepath'] = df['image_name'].apply(lambda x: os.path.join(IMG_DIR, x))\n"]},{"output_type":"stream","name":"stdout","text":["Prepared 1000 images and captions.\n"]}]},{"cell_type":"code","source":["# -------------------- Precompute Embeddings --------------------\n","image_embeddings = []\n","text_embeddings = []\n","\n","print(\"Computing embeddings...\")\n","\n","with torch.no_grad():\n","    for i in tqdm(range(0, len(image_paths), 16)):\n","        batch_imgs = [Image.open(p).convert(\"RGB\") for p in image_paths[i:i+16]]\n","        batch_texts = captions[i:i+16]\n","        inputs = processor(text=batch_texts, images=batch_imgs, return_tensors=\"pt\", padding=True, truncation=True).to(device)\n","        outputs = model(**inputs)\n","        img_embed = torch.nn.functional.normalize(outputs.image_embeds, p=2, dim=1)\n","        txt_embed = torch.nn.functional.normalize(outputs.text_embeds, p=2, dim=1)\n","        image_embeddings.append(img_embed.cpu())\n","        text_embeddings.append(txt_embed.cpu())\n","\n","image_embeddings = torch.cat(image_embeddings)\n","text_embeddings = torch.cat(text_embeddings)\n","\n","print(\"Embeddings ready!\")"],"metadata":{"id":"I8t5_dEoFG_s","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1748711935283,"user_tz":-300,"elapsed":490414,"user":{"displayName":"Ramzan0001","userId":"17484931948124609414"}},"outputId":"e71a8277-a7ec-420f-c185-cb801307cb7f"},"execution_count":null,"outputs":[{"metadata":{"tags":null},"name":"stdout","output_type":"stream","text":["Computing embeddings...\n"]},{"output_type":"stream","name":"stderr","text":["100%|██████████| 63/63 [09:07<00:00,  8.70s/it]"]},{"output_type":"stream","name":"stdout","text":["Embeddings ready!\n"]},{"output_type":"stream","name":"stderr","text":["\n"]}]},{"cell_type":"code","source":["# Saving Embedding\n","torch.save(image_embeddings, '/content/drive/My Drive/CLIP_Project/Data/Image_Embeddings.pt')\n","torch.save(text_embeddings, '/content/drive/My Drive/CLIP_Project/Data/Text_Embeddings.pt')"],"metadata":{"id":"4SDhqWeTVMrk"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Load embeddings without recomputing:\n","image_embeddings = torch.load('/content/drive/My Drive/CLIP_Project/Data/Image_Embeddings.pt')\n","text_embeddings = torch.load('/content/drive/My Drive/CLIP_Project/Data/Text_Embeddings.pt')"],"metadata":{"id":"dpqQWBU4WG81"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# -------------------- Retrieval Functions --------------------\n","\n","def image_to_text(img):\n","    with torch.no_grad():\n","        inputs = processor(images=img, return_tensors=\"pt\").to(device)\n","        query_embed = model.get_image_features(**inputs)\n","        query_embed = torch.nn.functional.normalize(query_embed, p=2, dim=1).cpu()\n","        similarities = (query_embed @ text_embeddings.T).squeeze()\n","        best_idx = similarities.argmax().item()\n","        return captions[best_idx]\n","\n","def text_to_image(text):\n","    with torch.no_grad():\n","        inputs = processor(text=text, return_tensors=\"pt\", padding=True, truncation=True).to(device)\n","        query_embed = model.get_text_features(**inputs)\n","        query_embed = torch.nn.functional.normalize(query_embed, p=2, dim=1).cpu()\n","        similarities = (query_embed @ image_embeddings.T).squeeze()\n","        best_idx = similarities.argmax().item()\n","        return image_paths[best_idx]"],"metadata":{"id":"2khkfWuTFNnd"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# -------------------- Gradio Interface --------------------\n","\n","from PIL import Image\n","\n","image_input = gr.Image(type=\"pil\", label=\"Upload an Image\")\n","text_input = gr.Textbox(label=\"Enter Caption\")\n","\n","# Interface for Image → Text\n","iface_img2txt = gr.Interface(\n","    fn=image_to_text,\n","    inputs=image_input,\n","    outputs=gr.Textbox(label=\"Matched Caption\"),\n","    title=\"Image to Text Retrieval\"\n",")\n","\n","# Interface for Text → Image\n","def text_to_image_output(text):\n","    img_path = text_to_image(text)\n","    return Image.open(img_path)\n","\n","iface_txt2img = gr.Interface(\n","    fn=text_to_image_output,\n","    inputs=text_input,\n","    outputs=gr.Image(label=\"Matched Image\"),\n","    title=\"Text to Image Retrieval\"\n",")\n","\n","# Combine both interfaces in tabs\n","interface = gr.TabbedInterface(\n","    [iface_img2txt, iface_txt2img],\n","    [\"Image to Text\", \"Text to Image\"]\n",")"],"metadata":{"id":"If-a-ItzFTuU"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["interface.launch()"],"metadata":{"id":"q65zsryoFVP_","colab":{"base_uri":"https://localhost:8080/","height":650},"executionInfo":{"status":"ok","timestamp":1748779651545,"user_tz":-300,"elapsed":3428,"user":{"displayName":"Ramzan0001","userId":"17484931948124609414"}},"outputId":"85330b6f-7c68-4314-ee10-1a802334c47e"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n","\n","Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n","* Running on public URL: https://aa1ce5e21a61c1ae5f.gradio.live\n","\n","This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"]},{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["<div><iframe src=\"https://aa1ce5e21a61c1ae5f.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"]},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":[]},"metadata":{},"execution_count":10}]},{"cell_type":"code","source":[],"metadata":{"id":"saaXb_VKVU_1"},"execution_count":null,"outputs":[]}]}