# AI Kit Gallery - Vision Models Demo

This notebook demonstrates how to use the optimized ONNX models from the [JanadaSroor/vision-models](https://huggingface.co/JanadaSroor/vision-models) repository. These models are designed for high-performance inference on mobile devices.

## Models Included:
- **CLIP (OpenAI)**: Text-to-Image & Image-to-Image similarity.
- **ViT (Google)**: High-quality image feature extraction.

All models are quantized (INT8) or optimized for mobile use.

In [None]:
# 1. Install Dependencies
!pip install onnxruntime transformers pillow numpy huggingface_hub requests

In [None]:
# 2. Import Libraries
import os
import time
from io import BytesIO

import numpy as np
import onnxruntime as ort
import requests
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import CLIPProcessor, ViTFeatureExtractor, ViTImageProcessor

## 3. Download Models from Hugging Face

We download the models directly from the `JanadaSroor/vision-models` repository.

In [None]:
# Configuration
REPO_ID = "JanadaSroor/vision-models"  # Hugging Face repository that hosts the ONNX files
MODELS_DIR = "models"                  # subdirectory of the repository that contains them


def download_onnx_model(filename):
    """Download one ONNX file from the model repository.

    Args:
        filename: Name of the file inside the repository's MODELS_DIR
            subdirectory, e.g. "clip_text_quantized.onnx".

    Returns:
        Whatever `hf_hub_download` returns for the file (the path of the
        downloaded copy, which is later passed to `ort.InferenceSession`).
    """
    print(f"Downloading {filename}...")
    # Files are stored in the MODELS_DIR subdirectory of the repo, so build the
    # repository-relative path from the constant instead of hard-coding it.
    return hf_hub_download(repo_id=REPO_ID, filename=f"{MODELS_DIR}/{filename}")

# Download CLIP Models (the text and vision encoders are separate ONNX files)
clip_text_path = download_onnx_model("clip_text_quantized.onnx")
clip_vision_path = download_onnx_model("clip_vision_quantized.onnx")

# Download ViT Model
vit_path = download_onnx_model("vit_base_quantized.onnx")

# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("\n✅ All models downloaded successfully!")

## 4. Initialize Inference Sessions

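We create one ONNX Runtime inference session per model and load the Hugging Face processors that tokenize text and preprocess images. No execution provider is requested explicitly, so ONNX Runtime uses the default for the installed package (the CPU provider for the plain `onnxruntime` package).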

In [None]:
# Initialize ONNX Sessions (one per downloaded model file)
text_sess = ort.InferenceSession(clip_text_path)
vision_sess = ort.InferenceSession(clip_vision_path)
vit_sess = ort.InferenceSession(vit_path)

# Initialize Processors (for tokenizing text and preprocessing images)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# ViTImageProcessor is the replacement for the deprecated ViTFeatureExtractor.
# The variable keeps its original name so the cells that use it work unchanged.
vit_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")

# The check mark was previously stored as mis-decoded bytes and printed as garbage.
print("✅ Inference sessions ready.")

## 5. CLIP Demo: Search Images with Text

We compare several text queries against a single test image and show, for each query, its similarity score and its softmax probability relative to the other queries.

In [None]:
import numpy as np
import requests
from PIL import Image
from io import BytesIO

# Load a test image
url = "https://images.unsplash.com/photo-1543466835-00a7907e9de1?ixlib=rb-4.0.3&auto=format&fit=crop&w=500&q=80"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of handing an error page to PIL,
# which would fail later with a confusing "cannot identify image file".
response.raise_for_status()
image = Image.open(BytesIO(response.content)).convert("RGB")
display(image.resize((300, 300)))

# Text prompts that will be scored against the test image
queries = ["a cute dog", "a dog looking", "a cat", "a car", "food"]


def _l2_normalize(vector):
    """Return `vector` divided by its L2 (Euclidean) norm."""
    return vector / np.linalg.norm(vector)


# ---------- 1. Encode Image ----------
image_inputs = clip_processor(images=image, return_tensors="np")
vision_outputs = vision_sess.run(None, dict(image_inputs))
# First output, first batch item.
# NOTE(review): presumably the CLIP image embedding — confirm against the exported model.
image_embed = _l2_normalize(vision_outputs[0][0])

# ---------- 2. Encode each query and score it against the image ----------
scores = []
for query in queries:
    text_inputs = clip_processor(text=[query], return_tensors="np", padding=True)
    text_outputs = text_sess.run(None, dict(text_inputs))
    text_embed = _l2_normalize(text_outputs[0][0])

    # Both vectors have unit length, so the dot product is their cosine
    # similarity; it is multiplied by 100.0 before the softmax that follows.
    score = 100.0 * np.dot(text_embed, image_embed)
    scores.append(score)

# One logit per query, in the same order as `queries`
scores = np.array(scores)

# Softmax over the queries: converts the scaled similarities into a probability
# distribution across the candidate texts. Subtracting the maximum first leaves
# the result mathematically unchanged but keeps np.exp from overflowing when
# the logits are large.
exp_scores = np.exp(scores - np.max(scores))
probs = exp_scores / exp_scores.sum()

print(f"\n{'Query':<20} | {'Logit':<10} | {'Prob'}")
print("-" * 50)

# The logit column uses the same width (10) as its header so the separators line up.
for q, s, p in zip(queries, scores, probs):
    print(f"{q:<20} | {s:<10.2f} | {100*p:.3f}%")



## 6. ViT Demo: Feature Extraction

Generate a 768-dimensional embedding vector for the image using the ViT model.

In [None]:
# Preprocess the test image and run it through the ViT session.
inputs = vit_extractor(images=image, return_tensors="np")
outputs = vit_sess.run(None, dict(inputs))

# NOTE(review): the first output is assumed to be last_hidden_state with a
# (batch, tokens, hidden) layout — confirm against the exported ONNX model.
last_hidden_state = outputs[0]
first_image_tokens = last_hidden_state[0]
# We typically use the first token (CLS token) as the image representation.
cls_embedding = first_image_tokens[0]

print(f"ViT Embedding Shape: {cls_embedding.shape}")
print(f"First 10 values: {cls_embedding[:10]}")