JanadaSroor committed
Commit 0b27d75 · verified · 1 Parent(s): 903d71d

Upload AI_Models_Demo.ipynb with huggingface_hub

Files changed (1)
  1. AI_Models_Demo.ipynb +63 -54
AI_Models_Demo.ipynb CHANGED
@@ -4,13 +4,15 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
- "# AI Kit Vision Models Demo\n",
+ "# AI Kit Gallery - Vision Models Demo\n",
   "\n",
- "This notebook demonstrates how to use the optimized ONNX models from the [JanadaSroor/vision_models](https://huggingface.co/JanadaSroor) repository. These models are designed for high-performance inference on mobile devices and edge applications.\n",
+ "This notebook demonstrates how to use the optimized ONNX models from the [JanadaSroor/vision-models](https://huggingface.co/JanadaSroor/vision-models) repository. These models are designed for high-performance inference on mobile devices.\n",
   "\n",
   "## Models Included:\n",
- "- **CLIP (OpenAI)**: For text-to-image and image-to-image similarity search.\n",
- "- **ViT (Google)**: For high-quality image feature extraction."
+ "- **CLIP (OpenAI)**: Text-to-Image & Image-to-Image similarity.\n",
+ "- **ViT (Google)**: High-quality image feature extraction.\n",
+ "\n",
+ "All models are quantized (INT8) or optimized for mobile use."
   ]
  },
  {
@@ -20,7 +22,7 @@
   "outputs": [],
   "source": [
   "# 1. Install Dependencies\n",
- "!pip install onnxruntime transformers pillow numpy huggingface_hub"
+ "!pip install onnxruntime transformers pillow numpy huggingface_hub requests"
   ]
  },
  {
@@ -33,10 +35,9 @@
   "import os\n",
   "import time\n",
   "import numpy as np\n",
- "import torch\n",
- "from PIL import Image\n",
   "import requests\n",
   "from io import BytesIO\n",
+ "from PIL import Image\n",
   "import onnxruntime as ort\n",
   "from transformers import CLIPProcessor, ViTFeatureExtractor\n",
   "from huggingface_hub import hf_hub_download"
@@ -48,7 +49,7 @@
   "source": [
   "## 3. Download Models from Hugging Face\n",
   "\n",
- "We'll download the quantized versions of the models for efficient CPU inference."
+ "We download the models directly from the `JanadaSroor/vision-models` repository."
   ]
  },
  {
@@ -57,27 +58,32 @@
   "metadata": {},
   "outputs": [],
   "source": [
- "def download_model(repo_id, filename):\n",
- " print(f\"Downloading {filename} from {repo_id}...\")\n",
- " return hf_hub_download(repo_id=repo_id, filename=filename)\n",
- "\n",
- "# CLIP Models\n",
- "REPO_CLIP = \"JanadaSroor/clip-vit-base-patch32-onnx\"\n",
- "clip_text_path = download_model(REPO_CLIP, \"clip_text_quantized.onnx\")\n",
- "clip_vision_path = download_model(REPO_CLIP, \"clip_vision_quantized.onnx\")\n",
- "\n",
- "# ViT Model\n",
- "REPO_VIT = \"JanadaSroor/vit-base-patch16-224-onnx\"\n",
- "vit_path = download_model(REPO_VIT, \"vit_base_quantized.onnx\")"
+ "# Configuration\n",
+ "REPO_ID = \"JanadaSroor/vision-models\"\n",
+ "MODELS_DIR = \"models\"\n",
+ "\n",
+ "def download_onnx_model(filename):\n",
+ " print(f\"Downloading {filename}...\")\n",
+ " # Files are stored in the 'models/' subdirectory in the repo\n",
+ " return hf_hub_download(repo_id=REPO_ID, filename=f\"models/{filename}\")\n",
+ "\n",
+ "# Download CLIP Models\n",
+ "clip_text_path = download_onnx_model(\"clip_text_quantized.onnx\")\n",
+ "clip_vision_path = download_onnx_model(\"clip_vision_quantized.onnx\")\n",
+ "\n",
+ "# Download ViT Model\n",
+ "vit_path = download_onnx_model(\"vit_base_quantized.onnx\")\n",
+ "\n",
+ "print(\"\\n✅ All models downloaded successfully!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
- "## 4. Initialization\n",
+ "## 4. Initialize Inference Sessions\n",
   "\n",
- "Load the ONNX sessions and the processors."
+ "We create ONNX Runtime sessions for hardware-accelerated inference."
   ]
  },
  {
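If the repository keeps all of its ONNX files under `models/`, as the new download cell assumes, a single `snapshot_download` call is an alternative to one `hf_hub_download` per file. This is a minimal sketch, not part of the committed notebook; the `allow_patterns` glob assumes the `models/*.onnx` layout described in the cell above.

```python
from huggingface_hub import snapshot_download

# Sketch only: fetch every ONNX file under models/ in one call.
# Assumes the repo id and models/ layout used by the notebook above.
local_dir = snapshot_download(
    repo_id="JanadaSroor/vision-models",
    allow_patterns=["models/*.onnx"],
)
print("ONNX models downloaded to:", local_dir)
```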
@@ -86,23 +92,25 @@
   "metadata": {},
   "outputs": [],
   "source": [
- "print(\"Initializing ONNX sessions...\")\n",
+ "# Initialize ONNX Sessions\n",
   "text_sess = ort.InferenceSession(clip_text_path)\n",
   "vision_sess = ort.InferenceSession(clip_vision_path)\n",
   "vit_sess = ort.InferenceSession(vit_path)\n",
   "\n",
- "print(\"Loading processors...\")\n",
+ "# Initialize Processors (for tokenizing text and preprocessing images)\n",
   "clip_processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
- "vit_extractor = ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224\")"
+ "vit_extractor = ViTFeatureExtractor.from_pretrained(\"google/vit-base-patch16-224\")\n",
+ "\n",
+ "print(\"✅ Inference sessions ready.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
- "## 5. CLIP Demo: Text-to-Image Similarity\n",
+ "## 5. CLIP Demo: Search Images with Text\n",
   "\n",
- "We'll take a test image and several text descriptions to see which description matches best."
+ "We will compare a query text against a test image to see the similarity score."
   ]
  },
  {
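The new markdown cell mentions hardware-accelerated inference, but `ort.InferenceSession(path)` with the plain `onnxruntime` package runs on the default CPU provider. Below is a minimal sketch of requesting a specific execution provider; the provider names are standard ONNX Runtime identifiers, and which ones are actually available depends on the installed build.

```python
import onnxruntime as ort

# Sketch only (not in the committed notebook): prefer an accelerated provider
# when the installed onnxruntime build offers one, falling back to CPU.
available = ort.get_available_providers()
preferred = [p for p in ("CUDAExecutionProvider", "CoreMLExecutionProvider") if p in available]
providers = preferred + ["CPUExecutionProvider"]
print("Requesting providers:", providers)

# In the notebook, the session would then be created with, e.g.:
# text_sess = ort.InferenceSession(clip_text_path, providers=providers)
```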
@@ -111,40 +119,41 @@
   "metadata": {},
   "outputs": [],
   "source": [
- "def get_image(url):\n",
- " response = requests.get(url)\n",
- " return Image.open(BytesIO(response.content)).convert(\"RGB\")\n",
+ "# Load a test image\n",
+ "url = \"https://images.unsplash.com/photo-1543466835-00a7907e9de1?ixlib=rb-4.0.3&auto=format&fit=crop&w=500&q=80\"\n",
+ "response = requests.get(url)\n",
+ "image = Image.open(BytesIO(response.content)).convert(\"RGB\")\n",
+ "display(image.resize((300, 300)))\n",
   "\n",
- "# Sample image: A cat\n",
- "img_url = \"https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?ixlib=rb-1.2.1&auto=format&fit=crop&w=500&q=80\"\n",
- "image = get_image(img_url)\n",
- "image.thumbnail((300, 300))\n",
- "display(image)\n",
+ "# Define queries\n",
+ "queries = [\"a cute dog\", \"a running dog\", \"a cat\", \"a car\", \"food\"]\n",
   "\n",
- "queries = [\"a photo of a cat\", \"a photo of a dog\", \"a photo of a car\", \"a sunset\"]\n",
+ "# 1. Encode Image (CLIP Vision)\n",
+ "inputs = clip_processor(images=image, return_tensors=\"np\")\n",
+ "image_embeds = vision_sess.run(None, dict(inputs))[0][0]\n",
   "\n",
- "# Encode Image\n",
- "image_inputs = clip_processor(images=image, return_tensors=\"np\")\n",
- "image_embeds = vision_sess.run(None, dict(image_inputs))[0][0]\n",
+ "# 2. Encode Text & Compare\n",
+ "print(f\"\\n{'Query':<20} | {'Score':<10}\")\n",
+ "print(\"-\" * 35)\n",
   "\n",
- "# Encode Texts and Calculate Similarity\n",
- "print(\"\\nSimilarity Scores:\")\n",
   "for query in queries:\n",
+ " # Tokenize and encode text\n",
   " text_inputs = clip_processor(text=[query], return_tensors=\"np\", padding=True)\n",
   " text_embeds = text_sess.run(None, dict(text_inputs))[0][0]\n",
   " \n",
- " # Cosine Similarity\n",
+ " # Calculate Cosine Similarity\n",
   " similarity = np.dot(text_embeds, image_embeds) / (np.linalg.norm(text_embeds) * np.linalg.norm(image_embeds))\n",
- " print(f\"- Query: '{query}' -> Score: {similarity:.4f}\")"
+ " \n",
+ " print(f\"{query:<20} | {similarity:.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
- "## 6. ViT Demo: Image Embedding\n",
+ "## 6. ViT Demo: Feature Extraction\n",
   "\n",
- "Extract high-dimensional features (768D) from an image using the ViT model."
+ "Generate a 768-dimensional embedding vector for the image using the ViT model."
   ]
  },
  {
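The loop in the new cell prints raw cosine similarities, which tend to sit in a narrow band for CLIP. For a probability-style ranking, the original CLIP pipeline scales the similarities by a learned logit scale (roughly 100 for the released `openai/clip-vit-base-patch32` weights) and applies a softmax. Below is a self-contained sketch with made-up scores; whether the exported ONNX graphs already include CLIP's projection and normalization layers depends on how they were exported, so treat the absolute values as illustrative only.

```python
import numpy as np

def softmax(scores):
    # Numerically stable softmax over a 1-D array of scores.
    z = scores - np.max(scores)
    e = np.exp(z)
    return e / e.sum()

# Hypothetical cosine similarities for the five queries above (made-up values).
cosines = np.array([0.31, 0.29, 0.18, 0.11, 0.09])

# Scaling by CLIP's logit scale (~100) before the softmax sharpens the ranking.
probs = softmax(100.0 * cosines)
for c, p in zip(cosines, probs):
    print(f"cosine={c:.2f} -> prob={p:.4f}")
```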
@@ -153,15 +162,15 @@
   "metadata": {},
   "outputs": [],
   "source": [
- "# Encode image with ViT\n",
- "vit_inputs = vit_extractor(images=image, return_tensors=\"np\")\n",
- "vit_outputs = vit_sess.run(None, dict(vit_inputs))\n",
- "\n",
- "# The output for vit-base is usually [batch, sequence_length, hidden_size]\n",
- "# For image similarity, we typically use the CLS token (index 0)\n",
- "vit_embeds = vit_outputs[0][0][0]\n",
- "print(f\"ViT Embedding Shape: {vit_embeds.shape}\")\n",
- "print(f\"First 5 values: {vit_embeds[:5]}\")"
+ "inputs = vit_extractor(images=image, return_tensors=\"np\")\n",
+ "outputs = vit_sess.run(None, dict(inputs))\n",
+ "\n",
+ "# For ViT, the first output [0] is the last_hidden_state.\n",
+ "# We typically use the first token (CLS token) as the image representation.\n",
+ "cls_embedding = outputs[0][0][0]\n",
+ "\n",
+ "print(f\"ViT Embedding Shape: {cls_embedding.shape}\")\n",
+ "print(f\"First 10 values: {cls_embedding[:10]}\")"
   ]
  },
 ],
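The final cell takes the CLS token as the image representation. For image-to-image search, two such embeddings are usually compared with cosine similarity, and mean-pooling the patch tokens is a common alternative to the CLS token. Below is a self-contained sketch using random stand-ins shaped like the ViT output (`[1, 197, 768]` for `vit-base-patch16-224`); in the notebook the arrays would come from `vit_sess.run(...)` for two different images.

```python
import numpy as np

def cosine_similarity(a, b):
    # Cosine similarity between two 1-D embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Random stand-ins for two ViT last_hidden_state outputs: [batch, tokens, hidden].
rng = np.random.default_rng(0)
hidden_a = rng.normal(size=(1, 197, 768))
hidden_b = rng.normal(size=(1, 197, 768))

cls_a, cls_b = hidden_a[0, 0], hidden_b[0, 0]    # CLS token, as in the notebook
mean_a = hidden_a[0].mean(axis=0)                # mean-pooled alternative
mean_b = hidden_b[0].mean(axis=0)

print(f"CLS  similarity: {cosine_similarity(cls_a, cls_b):.4f}")
print(f"Mean similarity: {cosine_similarity(mean_a, mean_b):.4f}")
```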
 