Spaces:

akash4552
/

assignment

Sleeping

App Files Community

akash4552 committed on Aug 28, 2025

Commit

61df0b2

verified ·

1 Parent(s): 9a5ac5b

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -45

app.py CHANGED Viewed

@@ -24,18 +24,18 @@ import gradio as gr
 # -------------------------
 # Configuration / Globals
 # -------------------------
-MODEL_NAME = "ViT-B-32"      # OpenCLIP model backbone
 # MODEL_PRETRAIN = "laion2b_s32b_b79k"
-MODEL_PRETRAIN = "openai"  # pretraining dataset variant (open weights)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 64              # image encoding batch size (tune by your GPU/CPU memory)
-TOP_K_DEFAULT = 20           # how many top results to show visually
-THUMB_SIZE = (256, 256)      # thumbnail size for visual grid
-FONT_PATH = None             # if you want a custom TTF, set path, else default PIL font used
-NORMALIZE_SCORE_TO = 100     # final scores in 0..NORMALIZE_SCORE_TO
 # -------------------------
-# Load model once at startup (lazy load wrapped in function)
 _model_data = {"loaded": False}
@@ -52,7 +52,7 @@ def load_model(device: str = DEVICE):
     tokenizer = open_clip.get_tokenizer(MODEL_NAME)
     model.to(device)
     model.eval()
-    # store
     dim = model.text_projection.shape[1] if hasattr(model, "text_projection") else model.projection.shape[1]
     _model_data.update({
         "loaded": True,
@@ -107,14 +107,14 @@ def encode_images(images: List[Image.Image], model, preprocess, device: str = DE
     all_feats = []
     model_device = next(model.parameters()).device
     for batch in batchify(images, batch_size):
-        # preprocess and stack
         batch_tensors = torch.stack([preprocess(img) for img in batch]).to(device)
         with torch.no_grad():
             feats = model.encode_image(batch_tensors)
             feats = feats / feats.norm(dim=-1, keepdim=True)
             all_feats.append(feats.cpu())
     all_feats = torch.cat(all_feats, dim=0)
-    return all_feats  # on CPU
 def cosine_similarity_matrix(text_feat: torch.Tensor, image_feats: torch.Tensor) -> np.ndarray:
@@ -122,11 +122,11 @@ def cosine_similarity_matrix(text_feat: torch.Tensor, image_feats: torch.Tensor)
     Given text_feat (1 x dim) and image_feats (N x dim), compute cosine similarities in numpy.
     Returns ndarray shape (N,)
     """
-    # text_feat on CPU?
     if isinstance(text_feat, torch.Tensor):
         text_feat = text_feat.cpu()
     sims = (image_feats @ text_feat.squeeze(0).cpu().T).numpy().squeeze()
-    # clamp tiny numerical issues
     sims = np.clip(sims, -1.0, 1.0)
     return sims
@@ -136,15 +136,15 @@ def normalize_scores_to_range(scores: np.ndarray, low=0.0, high=NORMALIZE_SCORE_
     Maps scores from [-1,1] (cosine) to [low,high] (e.g., 0..100).
     If all scores equal, map to mid-range to avoid divide-by-zero.
     """
-    # if scores are already in [-1,1], map linearly
     min_s, max_s = float(scores.min()), float(scores.max())
     if math.isclose(min_s, max_s):
-        # degenerate case: all scores same — map all to midpoint
         mid = (low + high) / 2.0
         return np.full_like(scores, fill_value=mid, dtype=float)
-    # first ensure range is within [-1,1] - cosine outputs
     scores_clipped = np.clip(scores, -1.0, 1.0)
-    # normalize to 0..1
     norm01 = (scores_clipped - (-1.0)) / (2.0)
     mapped = low + norm01 * (high - low)
     return mapped
@@ -180,10 +180,9 @@ def make_visual_grid(images: List[Image.Image], scores: List[float], top_k: int
         x = col * w
         y = row * (h + caption_height)
         grid_img.paste(img, (x, y))
-        # caption with background rectangle for readability
         caption = f"{scores[idx]:.1f}"
-        # text_w, text_h = draw.textsize(caption, font=font)
-        # For Pillow >=10
         bbox = draw.textbbox((0, 0), caption, font=font)
         text_w, text_h = bbox[2] - bbox[0], bbox[3] - bbox[1]
@@ -199,9 +198,7 @@ def make_visual_grid(images: List[Image.Image], scores: List[float], top_k: int
     return grid_img
-# -------------------------
-# Core pipeline
-# -------------------------
 def rank_images_by_text(query: str, files: List[gr.File], top_k: int = TOP_K_DEFAULT,
                         use_gpu: bool = (DEVICE == "cuda")) -> Tuple[pd.DataFrame, Image.Image]:
     """
@@ -220,20 +217,20 @@ def rank_images_by_text(query: str, files: List[gr.File], top_k: int = TOP_K_DEF
     model, preprocess, tokenizer, dim = load_model(DEVICE if use_gpu else "cpu")
     device = DEVICE if use_gpu else "cpu"
-    # Load images and remember filenames
     images = []
     filenames = []
     for f in files:
-        # f is a tempfile-like object from gradio
         try:
             pil = load_pil_image(f)
             images.append(pil)
-            # get filename attribute gracefully
             name = getattr(f, "name", None)
             if name:
                 fname = os.path.basename(name)
             else:
-                # try to get filename from object dict
                 fname = getattr(f, "filename", "uploaded_image")
             filenames.append(fname)
         except Exception as e:
@@ -242,31 +239,30 @@ def rank_images_by_text(query: str, files: List[gr.File], top_k: int = TOP_K_DEF
     if len(images) == 0:
         raise ValueError("No valid images could be loaded from uploads.")
-    # Encode text
     text_feat = encode_text(query, model, tokenizer, device=device)
-    # Encode images (batched)
     image_feats = encode_images(images, model, preprocess, device=device, batch_size=BATCH_SIZE)
-    # Compute cosine similarities
     sims = cosine_similarity_matrix(text_feat, image_feats)  # range [-1,1]
     scores_norm = normalize_scores_to_range(sims, low=0.0, high=float(NORMALIZE_SCORE_TO))
     # Rank results
-    order = np.argsort(-sims)  # descending by raw cosine
     sims_sorted = sims[order]
     scores_sorted = scores_norm[order]
     filenames_sorted = [filenames[i] for i in order]
     images_sorted = [images[i] for i in order]
-    # Build DataFrame
     df = pd.DataFrame({
         "filename": filenames_sorted,
         "score_cosine": sims_sorted,
         f"score_{int(NORMALIZE_SCORE_TO)}": scores_sorted
     })
-    # Create visual grid of top_k results
     top_k = min(top_k, len(images_sorted))
     top_images = images_sorted[:top_k]
     top_scores = scores_sorted[:top_k].tolist()
@@ -291,23 +287,20 @@ def gradio_rank_fn(query: str, image_files: List[gr.File], top_k: int = TOP_K_DE
     except Exception as e:
         return f"Error: {e}", None, None
-    # Save CSV to buffer so user can download
     csv_buffer = io.StringIO()
     df.to_csv(csv_buffer, index=False)
     csv_bytes = csv_buffer.getvalue().encode("utf-8")
     csv_buffer.close()
-    # Return textual summary, grid image, and CSV bytes for download component
     summary = f"Ranked {len(df)} images for query: '{query}'. Top score: {df['score_cosine'].max():.4f}"
     return summary, grid_img, ("rankings.csv", csv_bytes, "text/csv")
 def build_interface():
-    title = "Text → Image Ranking (OpenCLIP) — Free & Open-source"
     description = """
     Enter any text query (e.g., "red chinos") and upload multiple product images (100+ supported).
     The app uses an OpenCLIP model (open-source) to compute embeddings for text and images, then ranks images by cosine similarity.
-    You will get a visual grid of the top results annotated with normalized similarity scores (0–100) and a downloadable CSV of all rankings.
     """
     with gr.Blocks(title=title) as demo:
         gr.Markdown(f"# {title}")
@@ -326,18 +319,16 @@ def build_interface():
                 download = gr.File(label="Download CSV rankings")
                 summary = gr.Textbox(label="Summary", interactive=False)
-        # Hook up
         def wrapped_run(q, files, topk, use_gpu_flag):
             status = "Processing..."
-            # Gradio won't show intermediate states in this simple wrapper, so return at the end
             try:
                 summary_text, grid_img, csv_tuple = gradio_rank_fn(q, files, topk, use_gpu_flag)
-                # for gr.File returning bytes tuple: (filename, bytes, mime)
-                # Save csv bytes to temp file for gr.File returning
                 if csv_tuple:
                     fname, content_bytes, mime = csv_tuple
-                    # save to a BytesIO that gr.File can serve via memory? Gradio expects a path or a file-like?
-                    # We'll save to disk in a temp file to make it simple:
                     tmp_path = os.path.join(os.getcwd(), fname)
                     with open(tmp_path, "wb") as f:
                         f.write(content_bytes)
@@ -351,8 +342,9 @@ def build_interface():
         run_btn.click(fn=wrapped_run, inputs=[query, image_files, top_k, use_gpu], outputs=[summary, gallery, download])
         gr.Markdown("## Notes")
         gr.Markdown("- This uses an **open-source** OpenCLIP model. No paid API calls.")
-        gr.Markdown("- For best performance on large batches, run on a machine with a CUDA GPU. If you don't have a GPU, leave 'Use GPU' unchecked.")
-        gr.Markdown("- If you want to scale beyond thousands of images in a production setting, index the image embeddings with FAISS/Annoy and perform ANN search rather than computing full cosine in-memory.")
     return demo

 # -------------------------
 # Configuration / Globals
 # -------------------------
+MODEL_NAME = "ViT-B-32"
 # MODEL_PRETRAIN = "laion2b_s32b_b79k"
+MODEL_PRETRAIN = "openai"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+BATCH_SIZE = 64
+TOP_K_DEFAULT = 20
+THUMB_SIZE = (256, 256)
+FONT_PATH = None
+NORMALIZE_SCORE_TO = 100
 # -------------------------
 _model_data = {"loaded": False}
     tokenizer = open_clip.get_tokenizer(MODEL_NAME)
     model.to(device)
     model.eval()
     dim = model.text_projection.shape[1] if hasattr(model, "text_projection") else model.projection.shape[1]
     _model_data.update({
         "loaded": True,
     all_feats = []
     model_device = next(model.parameters()).device
     for batch in batchify(images, batch_size):
         batch_tensors = torch.stack([preprocess(img) for img in batch]).to(device)
         with torch.no_grad():
             feats = model.encode_image(batch_tensors)
             feats = feats / feats.norm(dim=-1, keepdim=True)
             all_feats.append(feats.cpu())
     all_feats = torch.cat(all_feats, dim=0)
+    return all_feats
 def cosine_similarity_matrix(text_feat: torch.Tensor, image_feats: torch.Tensor) -> np.ndarray:
     Given text_feat (1 x dim) and image_feats (N x dim), compute cosine similarities in numpy.
     Returns ndarray shape (N,)
     """
     if isinstance(text_feat, torch.Tensor):
         text_feat = text_feat.cpu()
     sims = (image_feats @ text_feat.squeeze(0).cpu().T).numpy().squeeze()
     sims = np.clip(sims, -1.0, 1.0)
     return sims
     Maps scores from [-1,1] (cosine) to [low,high] (e.g., 0..100).
     If all scores equal, map to mid-range to avoid divide-by-zero.
     """
     min_s, max_s = float(scores.min()), float(scores.max())
     if math.isclose(min_s, max_s):
         mid = (low + high) / 2.0
         return np.full_like(scores, fill_value=mid, dtype=float)
     scores_clipped = np.clip(scores, -1.0, 1.0)
     norm01 = (scores_clipped - (-1.0)) / (2.0)
     mapped = low + norm01 * (high - low)
     return mapped
         x = col * w
         y = row * (h + caption_height)
         grid_img.paste(img, (x, y))
         caption = f"{scores[idx]:.1f}"
         bbox = draw.textbbox((0, 0), caption, font=font)
         text_w, text_h = bbox[2] - bbox[0], bbox[3] - bbox[1]
     return grid_img
 def rank_images_by_text(query: str, files: List[gr.File], top_k: int = TOP_K_DEFAULT,
                         use_gpu: bool = (DEVICE == "cuda")) -> Tuple[pd.DataFrame, Image.Image]:
     """
     model, preprocess, tokenizer, dim = load_model(DEVICE if use_gpu else "cpu")
     device = DEVICE if use_gpu else "cpu"
     images = []
     filenames = []
     for f in files:
         try:
             pil = load_pil_image(f)
             images.append(pil)
             name = getattr(f, "name", None)
             if name:
                 fname = os.path.basename(name)
             else:
                 fname = getattr(f, "filename", "uploaded_image")
             filenames.append(fname)
         except Exception as e:
     if len(images) == 0:
         raise ValueError("No valid images could be loaded from uploads.")
     text_feat = encode_text(query, model, tokenizer, device=device)
     image_feats = encode_images(images, model, preprocess, device=device, batch_size=BATCH_SIZE)
     sims = cosine_similarity_matrix(text_feat, image_feats)  # range [-1,1]
     scores_norm = normalize_scores_to_range(sims, low=0.0, high=float(NORMALIZE_SCORE_TO))
     # Rank results
+    order = np.argsort(-sims)
     sims_sorted = sims[order]
     scores_sorted = scores_norm[order]
     filenames_sorted = [filenames[i] for i in order]
     images_sorted = [images[i] for i in order]
     df = pd.DataFrame({
         "filename": filenames_sorted,
         "score_cosine": sims_sorted,
         f"score_{int(NORMALIZE_SCORE_TO)}": scores_sorted
     })
     top_k = min(top_k, len(images_sorted))
     top_images = images_sorted[:top_k]
     top_scores = scores_sorted[:top_k].tolist()
     except Exception as e:
         return f"Error: {e}", None, None
     csv_buffer = io.StringIO()
     df.to_csv(csv_buffer, index=False)
     csv_bytes = csv_buffer.getvalue().encode("utf-8")
     csv_buffer.close()
     summary = f"Ranked {len(df)} images for query: '{query}'. Top score: {df['score_cosine'].max():.4f}"
     return summary, grid_img, ("rankings.csv", csv_bytes, "text/csv")
 def build_interface():
+    title = "Text → Image Ranking"
     description = """
     Enter any text query (e.g., "red chinos") and upload multiple product images (100+ supported).
     The app uses an OpenCLIP model (open-source) to compute embeddings for text and images, then ranks images by cosine similarity.
     """
     with gr.Blocks(title=title) as demo:
         gr.Markdown(f"# {title}")
                 download = gr.File(label="Download CSV rankings")
                 summary = gr.Textbox(label="Summary", interactive=False)
         def wrapped_run(q, files, topk, use_gpu_flag):
             status = "Processing..."
             try:
                 summary_text, grid_img, csv_tuple = gradio_rank_fn(q, files, topk, use_gpu_flag)
                 if csv_tuple:
                     fname, content_bytes, mime = csv_tuple
                     tmp_path = os.path.join(os.getcwd(), fname)
                     with open(tmp_path, "wb") as f:
                         f.write(content_bytes)
         run_btn.click(fn=wrapped_run, inputs=[query, image_files, top_k, use_gpu], outputs=[summary, gallery, download])
         gr.Markdown("## Notes")
         gr.Markdown("- This uses an **open-source** OpenCLIP model. No paid API calls.")
+        gr.Markdown("- The app is slow because every time it runs it creates embeddings of the text and the images . The speed of the app can be increased if we use already stored images so we don't have to create embeddings everytime.")
+        gr.Markdown("The accuracy of this app can be increased if we used different models of open clip , but for computational efficiency i have utilized one of the efficient models . Also if we finetune this model , the accuracy of the model can be hugely increased, But since this is just a asssignment , i have created a demo prototype only.")
     return demo