akash4552 commited on
Commit
9a5ac5b
·
verified ·
1 Parent(s): 7e4d943

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +349 -82
app.py CHANGED
@@ -1,95 +1,362 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import torch
3
- import clip
4
- import faiss
5
  import numpy as np
6
- from PIL import Image
7
- import os
8
 
9
- # Load CLIP model
10
- device = "cuda" if torch.cuda.is_available() else "cpu"
11
- model, preprocess = clip.load("ViT-B/32", device=device)
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Global storage
14
- image_paths = []
15
- image_embeddings = None
16
- faiss_index = None
17
 
18
- def build_faiss_index(images):
19
- """Build FAISS index from uploaded images"""
20
- global image_paths, image_embeddings, faiss_index
21
- image_paths = []
22
- embeddings = []
23
 
24
- for img in images:
25
- image_paths.append(img.name)
26
- pil_img = Image.open(img.name).convert("RGB")
27
- tensor_img = preprocess(pil_img).unsqueeze(0).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- with torch.no_grad():
30
- emb = model.encode_image(tensor_img)
31
- emb /= emb.norm(dim=-1, keepdim=True)
32
- embeddings.append(emb.cpu().numpy())
33
 
34
- image_embeddings = np.vstack(embeddings).astype("float32")
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- # Build FAISS index
37
- d = image_embeddings.shape[1] # embedding dimension
38
- faiss_index = faiss.IndexFlatIP(d) # cosine similarity (inner product)
39
- faiss_index.add(image_embeddings)
40
 
41
- return f"Indexed {len(image_paths)} images."
 
 
 
 
42
 
43
- def search(query, top_k=5):
44
- """Search top-k most similar images given a text query"""
45
- global image_paths, faiss_index, image_embeddings
46
- if faiss_index is None:
47
- return "Please upload and index images first.", []
48
 
49
- # Encode query
50
- text = clip.tokenize([query]).to(device)
 
 
 
51
  with torch.no_grad():
52
- text_emb = model.encode_text(text)
53
- text_emb /= text_emb.norm(dim=-1, keepdim=True)
54
-
55
- text_emb = text_emb.cpu().numpy().astype("float32")
56
-
57
- # Search FAISS
58
- scores, indices = faiss_index.search(text_emb, top_k)
59
- results = []
60
- for idx, score in zip(indices[0], scores[0]):
61
- img = image_paths[idx]
62
- results.append((img, float(score)))
63
-
64
- return f"Top {top_k} results for '{query}'", results
65
-
66
- def display_results(query, top_k=5):
67
- message, results = search(query, top_k)
68
- images, scores = [], []
69
- for img, score in results:
70
- images.append(img)
71
- scores.append(f"{score:.3f}")
72
- return message, images, scores
73
-
74
- with gr.Blocks() as demo:
75
- gr.Markdown("## Image Search with CLIP + FAISS 🚀")
76
-
77
- with gr.Row():
78
- img_upload = gr.File(file_types=[".png", ".jpg", ".jpeg"], file_count="multiple")
79
- build_btn = gr.Button("Build Index")
80
-
81
- status = gr.Textbox(label="Status")
82
-
83
- with gr.Row():
84
- query = gr.Textbox(label="Search Query")
85
- top_k = gr.Slider(1, 20, value=5, step=1, label="Top K Results")
86
- search_btn = gr.Button("Search")
87
-
88
- output_text = gr.Textbox(label="Results")
89
- output_gallery = gr.Gallery(label="Ranked Images").style(grid=[5], height="auto")
90
- output_scores = gr.Textbox(label="Similarity Scores")
91
-
92
- build_btn.click(fn=build_faiss_index, inputs=[img_upload], outputs=[status])
93
- search_btn.click(fn=display_results, inputs=[query, top_k], outputs=[output_text, output_gallery, output_scores])
94
-
95
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio app: Text-to-Image ranking using OpenCLIP (open-source)
3
+ Features:
4
+ - Accepts a text query and multiple images (100+).
5
+ - Encodes text and images with OpenCLIP (ViT-B-32 by default).
6
+ - Computes cosine similarity, normalizes scores to 0-100.
7
+ - Returns a ranked CSV and a visual grid image annotated with scores.
8
+ - GPU optional (will use CUDA if available).
9
+ """
10
+
11
+ import os
12
+ import io
13
+ import math
14
+ import time
15
+ from typing import List, Tuple, Optional
16
+
17
  import torch
18
+ import open_clip
19
+ from PIL import Image, ImageDraw, ImageFont
20
  import numpy as np
21
+ import pandas as pd
22
+ import gradio as gr
23
 
24
# -------------------------
# Configuration / Globals
# -------------------------
MODEL_NAME = "ViT-B-32"  # OpenCLIP model backbone
# MODEL_PRETRAIN = "laion2b_s32b_b79k"
MODEL_PRETRAIN = "openai"  # pretraining dataset variant (open weights)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # preferred device, probed once at import
BATCH_SIZE = 64  # image encoding batch size (tune by your GPU/CPU memory)
TOP_K_DEFAULT = 20  # how many top results to show visually
THUMB_SIZE = (256, 256)  # thumbnail size for visual grid
FONT_PATH = None  # if you want a custom TTF, set path, else default PIL font used
NORMALIZE_SCORE_TO = 100  # final scores in 0..NORMALIZE_SCORE_TO
# -------------------------

# Load model once at startup (lazy load wrapped in function).
# `_model_data` is the process-wide cache filled by load_model(); keys after
# loading: "loaded", "model", "preprocess", "tokenizer", "dim".
_model_data = {"loaded": False}
 
 
40
 
 
 
 
 
 
41
 
42
def load_model(device: str = DEVICE):
    """
    Load the OpenCLIP model and transforms, caching them in `_model_data`.

    Args:
        device: torch device string ("cuda" or "cpu") the model should run on.

    Returns:
        (model, preprocess, tokenizer, dim) where `dim` is the embedding size.
    """
    if _model_data.get("loaded", False):
        model = _model_data["model"]
        # Bug fix: honor the requested device on cache hits too. Previously a
        # model first loaded on CPU stayed on CPU when the caller later asked
        # for CUDA (and vice versa), causing device-mismatch errors during
        # encoding. `.to()` is a no-op when the model is already there.
        model.to(device)
        return model, _model_data["preprocess"], _model_data["tokenizer"], _model_data["dim"]

    print(f"Loading OpenCLIP {MODEL_NAME} ({MODEL_PRETRAIN}) to {device} ...")
    model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, MODEL_PRETRAIN)
    tokenizer = open_clip.get_tokenizer(MODEL_NAME)
    model.to(device)
    model.eval()
    # Embedding dimension: the text projection parameter is (width, embed_dim),
    # so its last axis is the embedding size. Fall back to the visual tower's
    # declared output size (open_clip exposes `visual.output_dim`) instead of
    # the previous `model.projection`, which open_clip models do not define.
    if getattr(model, "text_projection", None) is not None:
        dim = model.text_projection.shape[-1]
    else:
        dim = model.visual.output_dim
    _model_data.update({
        "loaded": True,
        "model": model,
        "preprocess": preprocess,
        "tokenizer": tokenizer,
        "dim": dim,
    })
    print("Model loaded.")
    return model, preprocess, tokenizer, dim
66
 
 
 
 
 
67
 
68
+ # -------------------------
69
+ # Utilities
70
+ # -------------------------
71
def load_pil_image(file_obj) -> Image.Image:
    """
    Open an uploaded file as a PIL image in RGB mode.

    Accepts either a filesystem path (str) or a file-like object (as produced
    by Gradio uploads); file-like objects are rewound and read fully into
    memory before decoding.
    """
    if not isinstance(file_obj, str):
        # File-like upload: rewind first, then decode from an in-memory copy.
        file_obj.seek(0)
        raw = file_obj.read()
        opened = Image.open(io.BytesIO(raw))
    else:
        # Plain filesystem path.
        opened = Image.open(file_obj)
    return opened.convert("RGB")
81
 
 
 
 
 
82
 
83
def batchify(iterable, batch_size):
    """Yield the items of *iterable* as consecutive lists of at most *batch_size*."""
    # Materialize once so arbitrary iterables (generators included) can be sliced.
    buffered = list(iterable)
    start = 0
    while start < len(buffered):
        yield buffered[start:start + batch_size]
        start += batch_size
88
 
 
 
 
 
 
89
 
90
def encode_text(text: str, model, tokenizer, device: str = DEVICE) -> torch.Tensor:
    """
    Encode a single text query into an L2-normalized embedding.

    Returns a (1, dim) tensor on `device`; unit-normalized so a plain inner
    product against image embeddings yields cosine similarity.
    """
    tokens = tokenizer([text])
    with torch.no_grad():
        feats = model.encode_text(tokens.to(device))  # (1, dim)
        feats = feats / feats.norm(dim=-1, keepdim=True)  # unit-normalize
    return feats
100
+
101
+
102
def encode_images(images: List[Image.Image], model, preprocess, device: str = DEVICE, batch_size: int = BATCH_SIZE) -> torch.Tensor:
    """
    Encode a list of PIL images into L2-normalized embeddings (N x dim).

    Processes images in batches of `batch_size` to bound peak memory and
    always returns the stacked embeddings on CPU so callers can mix devices.

    Args:
        images: PIL images to embed (assumed RGB — see load_pil_image).
        model: OpenCLIP model exposing `encode_image`.
        preprocess: transform mapping a PIL image to a model-ready tensor.
        device: device to run the encoder on; the model is assumed to already
            live there (see load_model).
        batch_size: number of images per forward pass.
    """
    all_feats = []
    # Fix: dropped the unused `next(model.parameters()).device` lookup — it
    # served no purpose (placement is driven by `device`) and raised
    # StopIteration for parameter-free model stubs.
    for batch in batchify(images, batch_size):
        # Preprocess each image and stack into one (B, C, H, W) batch tensor.
        batch_tensors = torch.stack([preprocess(img) for img in batch]).to(device)
        with torch.no_grad():
            feats = model.encode_image(batch_tensors)
            feats = feats / feats.norm(dim=-1, keepdim=True)  # unit vectors for cosine
        all_feats.append(feats.cpu())
    all_feats = torch.cat(all_feats, dim=0)
    return all_feats  # on CPU
118
+
119
+
120
def cosine_similarity_matrix(text_feat: torch.Tensor, image_feats: torch.Tensor) -> np.ndarray:
    """
    Compute cosine similarities between one text embedding and N image embeddings.

    Args:
        text_feat: (1, dim) L2-normalized text embedding (any device).
        image_feats: (N, dim) L2-normalized image embeddings on CPU.

    Returns:
        1-D ndarray of shape (N,), values clipped to [-1, 1].
    """
    # Move text features to CPU to match image_feats (which encode_images
    # returns on CPU).
    if isinstance(text_feat, torch.Tensor):
        text_feat = text_feat.cpu()
    # Inner product of unit vectors == cosine similarity; (N, dim) @ (dim,) -> (N,).
    sims = (image_feats @ text_feat.squeeze(0).T).numpy()
    # Bug fix: the previous bare .squeeze() collapsed the N == 1 case to a
    # 0-d scalar, breaking the documented (N,) shape and downstream
    # argsort/indexing; reshape(-1) always yields a 1-D array.
    sims = sims.reshape(-1)
    # Clamp tiny numerical excursions outside the valid cosine range.
    return np.clip(sims, -1.0, 1.0)
132
+
133
+
134
def normalize_scores_to_range(scores: np.ndarray, low=0.0, high=NORMALIZE_SCORE_TO) -> np.ndarray:
    """
    Linearly map cosine scores from [-1, 1] onto [low, high] (e.g. 0..100).

    A degenerate input (every score identical) maps to the midpoint of the
    target range, so callers never hit a divide-by-zero.
    """
    lo_s = float(scores.min())
    hi_s = float(scores.max())
    if math.isclose(lo_s, hi_s):
        # All scores equal — there is no spread to map, so use the midpoint.
        midpoint = (low + high) / 2.0
        return np.full_like(scores, fill_value=midpoint, dtype=float)
    # Clamp to the valid cosine range, then map linearly: -1 -> low, +1 -> high.
    clipped = np.clip(scores, -1.0, 1.0)
    fraction = (clipped + 1.0) / 2.0  # position within [-1, 1] expressed as 0..1
    return low + fraction * (high - low)
151
+
152
+
153
def make_visual_grid(images: List[Image.Image], scores: List[float], top_k: int = 12,
                     thumb_size: Tuple[int, int] = THUMB_SIZE, columns: int = 4,
                     font_path: Optional[str] = FONT_PATH) -> Image.Image:
    """
    Create a single PIL image that arranges top_k thumbnails in a grid with score captions.

    Args:
        images: candidate images, already sorted best-first by the caller.
        scores: one score per image; rendered as a one-decimal caption.
        top_k: how many thumbnails to draw (capped at len(images)).
        thumb_size: (width, height) each thumbnail is resized to.
        columns: thumbnails per row.
        font_path: optional TTF path; falls back to PIL's built-in font.

    Returns:
        One composited RGB image: `columns` wide, rows of thumbnail + caption strip.
    """
    top_k = min(top_k, len(images))
    rows = math.ceil(top_k / columns)
    w, h = thumb_size
    caption_height = 28  # pixel height of the white strip under each thumbnail
    grid_w = columns * w
    grid_h = rows * (h + caption_height)

    # White canvas; thumbnails and captions are pasted/drawn onto it below.
    grid_img = Image.new("RGB", (grid_w, grid_h), color=(255, 255, 255))
    draw = ImageDraw.Draw(grid_img)
    try:
        if font_path and os.path.exists(font_path):
            font = ImageFont.truetype(font_path, 16)
        else:
            font = ImageFont.load_default()
    except Exception:
        # Any font-loading failure degrades gracefully to the default font.
        font = ImageFont.load_default()

    for idx in range(top_k):
        # copy() so the caller's image is not mutated by resize.
        img = images[idx].copy().resize(thumb_size, Image.Resampling.LANCZOS)
        col = idx % columns
        row = idx // columns
        x = col * w
        y = row * (h + caption_height)  # each row also reserves caption space
        grid_img.paste(img, (x, y))
        # caption with background rectangle for readability
        caption = f"{scores[idx]:.1f}"
        # text_w, text_h = draw.textsize(caption, font=font)
        # For Pillow >=10 (textsize was removed; textbbox is the replacement)
        bbox = draw.textbbox((0, 0), caption, font=font)
        text_w, text_h = bbox[2] - bbox[0], bbox[3] - bbox[1]

        # Caption strip sits directly below the thumbnail, full cell width.
        rect_x0 = x
        rect_y0 = y + h
        rect_x1 = x + w
        rect_y1 = rect_y0 + caption_height
        draw.rectangle([rect_x0, rect_y0, rect_x1, rect_y1], fill=(255, 255, 255))
        text_x = x + 6  # small fixed left margin; text_w is computed but unused
        text_y = rect_y0 + (caption_height - text_h) // 2  # vertically centered
        draw.text((text_x, text_y), caption, fill=(0, 0, 0), font=font)

    return grid_img
200
+
201
+
202
+ # -------------------------
203
+ # Core pipeline
204
+ # -------------------------
205
def rank_images_by_text(query: str, files: List[gr.File], top_k: int = TOP_K_DEFAULT,
                        use_gpu: bool = (DEVICE == "cuda")) -> Tuple[pd.DataFrame, Image.Image]:
    """
    Main pipeline:
    - load model (if not)
    - read images from files
    - encode text and images
    - compute cosine similarity
    - produce ranked DataFrame and visual grid image

    Args:
        query: free-text search query.
        files: uploaded image files (Gradio file objects or paths).
        top_k: how many top results to render into the visual grid.
        use_gpu: when True (and CUDA is available), encode on DEVICE; else CPU.

    Returns:
        (pandas.DataFrame with columns ['filename','score_cosine','score_<N>'], PIL.Image grid)

    Raises:
        ValueError: if query/files are missing or no upload could be decoded.
    """
    start_time = time.time()
    if not query or (not files):
        raise ValueError("Please provide both a text query and at least one image file.")

    # `dim` is returned by load_model but not needed in this pipeline.
    model, preprocess, tokenizer, dim = load_model(DEVICE if use_gpu else "cpu")
    device = DEVICE if use_gpu else "cpu"

    # Load images and remember filenames
    images = []
    filenames = []
    for f in files:
        # f is a tempfile-like object from gradio
        try:
            pil = load_pil_image(f)
            images.append(pil)
            # get filename attribute gracefully
            name = getattr(f, "name", None)
            if name:
                fname = os.path.basename(name)
            else:
                # try to get filename from object dict
                fname = getattr(f, "filename", "uploaded_image")
            filenames.append(fname)
        except Exception as e:
            # Best-effort: a single unreadable upload is skipped, not fatal.
            print(f"Skipping a file due to load error: {e}")

    if len(images) == 0:
        raise ValueError("No valid images could be loaded from uploads.")

    # Encode text
    text_feat = encode_text(query, model, tokenizer, device=device)

    # Encode images (batched)
    image_feats = encode_images(images, model, preprocess, device=device, batch_size=BATCH_SIZE)

    # Compute cosine similarities
    sims = cosine_similarity_matrix(text_feat, image_feats)  # range [-1,1]
    scores_norm = normalize_scores_to_range(sims, low=0.0, high=float(NORMALIZE_SCORE_TO))

    # Rank results
    order = np.argsort(-sims)  # descending by raw cosine
    sims_sorted = sims[order]
    scores_sorted = scores_norm[order]
    filenames_sorted = [filenames[i] for i in order]
    images_sorted = [images[i] for i in order]

    # Build DataFrame (normalized-score column name embeds the scale, e.g. "score_100")
    df = pd.DataFrame({
        "filename": filenames_sorted,
        "score_cosine": sims_sorted,
        f"score_{int(NORMALIZE_SCORE_TO)}": scores_sorted
    })

    # Create visual grid of top_k results
    top_k = min(top_k, len(images_sorted))
    top_images = images_sorted[:top_k]
    top_scores = scores_sorted[:top_k].tolist()
    grid_img = make_visual_grid(top_images, top_scores, top_k=top_k, thumb_size=THUMB_SIZE, columns=4)

    elapsed = time.time() - start_time
    print(f"Query processed in {elapsed:.2f}s. Images: {len(images)}. Top-K: {top_k}")
    return df, grid_img
278
+
279
+
280
+ # -------------------------
281
+ # Gradio app UI
282
+ # -------------------------
283
def gradio_rank_fn(query: str, image_files: List[gr.File], top_k: int = TOP_K_DEFAULT, use_gpu: bool = (DEVICE == "cuda")):
    """
    Wrapper for Gradio. Returns (ranked table as CSV string / DataFrame, grid image as PIL, optionally downloadable CSV).

    On success returns (summary_text, grid_image, (filename, csv_bytes, mime));
    on any failure returns (error_message, None, None) instead of raising, so
    the UI always gets a displayable result.
    """
    if not image_files:
        return "No images uploaded.", None, None
    try:
        df, grid_img = rank_images_by_text(query, image_files, top_k=top_k, use_gpu=use_gpu)
    except Exception as e:
        # Surface the error as the status text rather than crashing the app.
        return f"Error: {e}", None, None

    # Save CSV to buffer so user can download
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)
    csv_bytes = csv_buffer.getvalue().encode("utf-8")
    csv_buffer.close()

    # Return textual summary, grid image, and CSV bytes for download component
    summary = f"Ranked {len(df)} images for query: '{query}'. Top score: {df['score_cosine'].max():.4f}"
    return summary, grid_img, ("rankings.csv", csv_bytes, "text/csv")
303
+
304
+
305
def build_interface():
    """
    Construct and return the Gradio Blocks UI wiring the ranking pipeline
    to text/file inputs and image/CSV outputs.
    """
    title = "Text → Image Ranking (OpenCLIP) — Free & Open-source"
    description = """
    Enter any text query (e.g., "red chinos") and upload multiple product images (100+ supported).
    The app uses an OpenCLIP model (open-source) to compute embeddings for text and images, then ranks images by cosine similarity.
    You will get a visual grid of the top results annotated with normalized similarity scores (0–100) and a downloadable CSV of all rankings.
    """
    with gr.Blocks(title=title) as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)
        with gr.Row():
            with gr.Column(scale=3):
                query = gr.Textbox(label="Text query", placeholder="e.g. 'red chinos' or 'floral kurta with pockets'", lines=1)
                image_files = gr.File(label="Upload product images (multiple)", file_count="multiple",
                                      file_types=["image"], interactive=True)
                top_k = gr.Slider(minimum=1, maximum=64, value=TOP_K_DEFAULT, step=1, label="Top-K to visualize")
                use_gpu = gr.Checkbox(label=f"Use GPU (detected device: {DEVICE}). Uncheck to force CPU.", value=(DEVICE == "cuda"))
                run_btn = gr.Button("Rank images")
                # NOTE(review): status_output is created but never written to by
                # any callback below — confirm whether it can be removed.
                status_output = gr.Textbox(label="Status", interactive=False)
            with gr.Column(scale=2):
                gallery = gr.Image(type="pil", label="Top results grid (annotated)")
                download = gr.File(label="Download CSV rankings")
                summary = gr.Textbox(label="Summary", interactive=False)

        # Hook up
        def wrapped_run(q, files, topk, use_gpu_flag):
            # Adapts gradio_rank_fn's (summary, image, csv-bytes-tuple) result
            # to what the output components expect: gr.File needs a filesystem
            # path, so the CSV bytes are written to disk first.
            status = "Processing..."
            # Gradio won't show intermediate states in this simple wrapper, so return at the end
            try:
                summary_text, grid_img, csv_tuple = gradio_rank_fn(q, files, topk, use_gpu_flag)
                # for gr.File returning bytes tuple: (filename, bytes, mime)
                # Save csv bytes to temp file for gr.File returning
                if csv_tuple:
                    fname, content_bytes, mime = csv_tuple
                    # save to a BytesIO that gr.File can serve via memory? Gradio expects a path or a file-like?
                    # We'll save to disk in a temp file to make it simple:
                    # NOTE(review): writes "rankings.csv" into the process CWD —
                    # concurrent users overwrite each other's file; consider
                    # tempfile.NamedTemporaryFile instead.
                    tmp_path = os.path.join(os.getcwd(), fname)
                    with open(tmp_path, "wb") as f:
                        f.write(content_bytes)
                    csv_path = tmp_path
                else:
                    csv_path = None
                return summary_text, grid_img, csv_path
            except Exception as e:
                return f"Error: {e}", None, None

        run_btn.click(fn=wrapped_run, inputs=[query, image_files, top_k, use_gpu], outputs=[summary, gallery, download])
        gr.Markdown("## Notes")
        gr.Markdown("- This uses an **open-source** OpenCLIP model. No paid API calls.")
        gr.Markdown("- For best performance on large batches, run on a machine with a CUDA GPU. If you don't have a GPU, leave 'Use GPU' unchecked.")
        gr.Markdown("- If you want to scale beyond thousands of images in a production setting, index the image embeddings with FAISS/Annoy and perform ANN search rather than computing full cosine in-memory.")
    return demo
357
+
358
+
359
if __name__ == "__main__":
    # Build the UI and start the Gradio server.
    build_interface().launch()