Spaces:

fast-stager
/

clipick

Running

Nightfury16 Claude Opus 4.6 committed 3 days ago

Commit

3aa023a

1 Parent(s): ada4422

Fix mobileclip2_l14 checkpoint loading and add pyvips acceleration

- Switch model from mobileclip_b to mobileclip2_l14 matching checkpoint 2602
- Fix head architecture: nn.Linear -> 2-layer MLP (Linear/GELU/Dropout/Linear)
matching training code's RankingHead with head.net.{0,3} key layout
- Use GELU activation (not ReLU) to match training exactly
- Infer head_hidden_dim from checkpoint at load time
- Remove reparameterize_model (MobileOne-specific, not applicable to ViT-L/14)
- Replace PIL with pyvips (shrink-on-load thumbnail_buffer for fast JPEG decode)
- Replace sequential requests with urllib3 PoolManager + ThreadPoolExecutor(16)
- Add torch.inference_mode, fp16 autocast on CUDA, torch.compile
- Add packages.txt (libvips-dev) for HF Spaces

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (5) hide show

app.py +180 -135
config.yml +1 -1
model.py +51 -46
packages.txt +1 -0
requirements.txt +3 -4

app.py CHANGED Viewed

@@ -1,53 +1,45 @@
 import torch
 import gradio as gr
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from typing import List
 import os
-import yaml
-import requests
 import json
 import random
-from PIL import Image, ImageOps
-from io import BytesIO
-from types import SimpleNamespace
-from torchvision import transforms
-from huggingface_hub import hf_hub_download
-import mobileclip
-from mobileclip.modules.common.mobileone import reparameterize_model
 from model import MobileCLIPRanker
 HF_USER_REPO = "Nightfury16/clipick"
-HF_FILENAME = "best_model_2602.pth"
-CONFIG_PATH = "config.yml"
 JSON_DATA_PATH = "combined_unique.json"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-def load_config(path="config.yml"):
-    if not os.path.exists(path):
-        return SimpleNamespace(**{
-            "data": SimpleNamespace(img_size=224),
-            "model": SimpleNamespace(name="mobileclip2_l14")
-        })
-    with open(path, "r") as f:
-        cfg_dict = yaml.safe_load(f)
-    def recursive_namespace(d):
-        if isinstance(d, dict):
-            for k, v in d.items():
-                d[k] = recursive_namespace(v)
-            return SimpleNamespace(**d)
-        return d
-    return recursive_namespace(cfg_dict)
 groups_data = []
 try:
     if os.path.exists(JSON_DATA_PATH):
         with open(JSON_DATA_PATH, "r") as f:
-            data = json.load(f)
-            for group in data.get("groups", []):
                 urls = group.get("images", [])
                 if urls:
                     groups_data.append("\n".join(urls))
@@ -55,151 +47,204 @@ try:
 except Exception as e:
     print(f"Error loading JSON data: {e}")
 print("--- Loading Ranker Server ---")
 print(f"Device: {DEVICE}")
-cfg = load_config(CONFIG_PATH)
-model = MobileCLIPRanker(cfg)
-try:
-    print(f"Downloading Fine-Tuned weights ({HF_FILENAME}) from {HF_USER_REPO}...")
-    local_weight_path = hf_hub_download(repo_id=HF_USER_REPO, filename=HF_FILENAME)
-    checkpoint = torch.load(local_weight_path, map_location=DEVICE)
-    if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
-        raw_state_dict = checkpoint["model_state_dict"]
-    else:
-        raw_state_dict = checkpoint
-    state_dict = {k.replace("module.", ""): v for k, v in raw_state_dict.items()}
-    model.load_state_dict(state_dict, strict=True)
-    print("✅ Weights loaded successfully!")
-except Exception as e:
-    print(f"❌ CRITICAL: Load failed. {e}")
-    raise e
-print("⚡ Reparameterizing MobileCLIP-B for inference speed...")
-if hasattr(model, 'backbone'):
-    model.backbone = reparameterize_model(model.backbone)
-model.to(DEVICE)
-model.eval()
-norm_transform = transforms.Compose([
-    transforms.ToTensor(),
-    transforms.Normalize(mean=(0.481, 0.457, 0.408), std=(0.268, 0.261, 0.275))
-])
-def letterbox_image(img, size):
-    '''Pad image to square to preserve aspect ratio (No distortion)'''
-    img.thumbnail((size, size), Image.Resampling.BICUBIC)
-    delta_w = size - img.size[0]
-    delta_h = size - img.size[1]
-    padding = (delta_w//2, delta_h//2, delta_w-(delta_w//2), delta_h-(delta_h//2))
-    return ImageOps.expand(img, padding, fill=(128, 128, 128))
 def get_best_image(url_list):
-    valid_tensors = []
-    valid_indices = []
-    clean_urls = []
-    for u in url_list:
-        if isinstance(u, str) and u.strip():
-            clean_urls.append(u.strip())
-    print(f"Processing {len(clean_urls)} images...")
-    for i, src in enumerate(clean_urls):
-        try:
-            if src.startswith("http"):
-                resp = requests.get(src, timeout=3)
-                img = Image.open(BytesIO(resp.content)).convert("RGB")
-            else:
-                img = Image.open(src).convert("RGB")
-            img_padded = letterbox_image(img, cfg.data.img_size)
-            tensor = norm_transform(img_padded)
-            valid_tensors.append(tensor)
-            valid_indices.append(i)
-        except Exception as e:
-            print(f"Error loading {src}: {e}")
-    if not valid_tensors:
         return None, []
-    batch = torch.stack(valid_tensors).unsqueeze(0).to(DEVICE)
-    valid_len = torch.tensor([len(valid_tensors)]).to(DEVICE)
-    with torch.no_grad():
-        scores = model(batch, valid_lens=valid_len).view(-1).cpu().numpy()
-    results = []
-    for idx, score in zip(valid_indices, scores):
-        results.append({"url": clean_urls[idx], "score": float(score)})
-    results.sort(key=lambda x: x["score"], reverse=True)
     return results[0]["url"], results
 app = FastAPI()
 class RankRequest(BaseModel):
     urls: List[str]
 @app.post("/api/rank")
 async def rank_endpoint(req: RankRequest):
     if not req.urls:
-        raise HTTPException(status_code=400, detail="List of URLs cannot be empty")
-    best_url, results = get_best_image(req.urls)
-    if best_url is None:
-        raise HTTPException(status_code=400, detail="Could not load any images")
-    return {"best_image": best_url, "ranking": results}
 def load_group_by_index(index):
     idx = int(index) - 1
-    if 0 <= idx < len(groups_data): return groups_data[idx]
-    return "Invalid Index"
 def load_random_group():
-    if not groups_data: return 1, "No data."
-    rand_idx = random.randint(0, len(groups_data) - 1)
-    return rand_idx + 1, groups_data[rand_idx]
 def gradio_wrapper(text_input):
-    urls = text_input.split("\n")
-    best_url, results = get_best_image(urls)
-    if best_url is None: return None, "Error loading images"
-    try:
-        if best_url.startswith("http"):
-            resp = requests.get(best_url, timeout=3)
-            best_img_pil = Image.open(BytesIO(resp.content)).convert("RGB")
-        else:
-            best_img_pil = Image.open(best_url).convert("RGB")
-    except: best_img_pil = None
-    return best_img_pil, results
 with gr.Blocks() as demo:
-    gr.Markdown(f"# 🏠 Real Estate Ranker (Student Model)")
-    gr.Markdown("Using **MobileCLIP-B** (Distilled) with smart resizing.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Select Data")
             with gr.Row():
-                index_input = gr.Number(value=1, label="Group #", minimum=1, precision=0)
-                random_btn = gr.Button("🎲 Random", variant="secondary")
             load_btn = gr.Button("Load Group", size="sm")
             gr.Markdown("### 2. URLs")
             input_text = gr.Textbox(label="Image URLs", lines=6)
-            rank_btn = gr.Button("🚀 Rank", variant="primary")
         with gr.Column(scale=1):
-            output_image = gr.Image(label="🏆 Best Image", type="pil")
             output_json = gr.JSON(label="Scores")
-    random_btn.click(fn=load_random_group, inputs=None, outputs=[index_input, input_text])
     load_btn.click(fn=load_group_by_index, inputs=index_input, outputs=input_text)
-    rank_btn.click(fn=gradio_wrapper, inputs=input_text, outputs=[output_image, output_json])
-app = gr.mount_gradio_app(app, demo, path="/")

 import torch
+import numpy as np
+import pyvips
 import gradio as gr
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from typing import List
+from contextlib import nullcontext
+from concurrent.futures import ThreadPoolExecutor
+from huggingface_hub import hf_hub_download
+import urllib3
 import os
 import json
 import random
 from model import MobileCLIPRanker
+# ── Config ──────────────────────────────────────────────────────────────
+# Hugging Face repo that hosts the fine-tuned ranker checkpoint.
 HF_USER_REPO = "Nightfury16/clipick"
+# Checkpoint file downloaded from HF_USER_REPO at start-up.
+HF_FILENAME = "best_model_2602.pth"
+# Local JSON file with sample image groups; only read if it exists.
 JSON_DATA_PATH = "combined_unique.json"
+# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Side length (pixels) of the square that images are letterboxed to.
+IMG_SIZE = 224
+# Normalisation constants (pre-shaped for numpy broadcast)
+# Both are float32 arrays of shape (1, 1, 3) so they broadcast over
+# H x W x 3 pixel arrays; the std is stored inverted so that normalising
+# is a subtract followed by a multiply.
+MEAN = np.float32([0.481, 0.457, 0.408]).reshape(1, 1, 3)
+INV_STD = (1.0 / np.float32([0.268, 0.261, 0.275])).reshape(1, 1, 3)
+# ── Connection & thread pools ───────────────────────────────────────────
+# Shared urllib3 pool used for every image download: a retry budget of 1
+# with no back-off, 2 s connect timeout and 3 s read timeout.
+http_pool = urllib3.PoolManager(
+    maxsize=32,
+    retries=urllib3.Retry(total=1, backoff_factor=0),
+    timeout=urllib3.Timeout(connect=2.0, read=3.0),
+)
+# Worker threads that fetch and preprocess images in parallel.
+fetch_pool = ThreadPoolExecutor(max_workers=16)
+# ── Load group data ─────────────────────────────────────────────────────
+# One entry per group: that group's image URLs joined with newlines.
+# Stays empty when the JSON file is missing or cannot be parsed.
 groups_data = []
 try:
     if os.path.exists(JSON_DATA_PATH):
         with open(JSON_DATA_PATH, "r") as f:
+            # Layout read here: {"groups": [{"images": [url, ...]}, ...]}
+            for group in json.load(f).get("groups", []):
                 urls = group.get("images", [])
                 if urls:
                     groups_data.append("\n".join(urls))
 except Exception as e:
     print(f"Error loading JSON data: {e}")
+# ── Load model ──────────────────────────────────────────────────────────
 print("--- Loading Ranker Server ---")
 print(f"Device: {DEVICE}")
+# 1. Download fine-tuned checkpoint first to infer head dimensions
+print(f"Downloading fine-tuned weights ({HF_FILENAME})...")
+local_weight_path = hf_hub_download(repo_id=HF_USER_REPO, filename=HF_FILENAME)
+checkpoint = torch.load(local_weight_path, map_location=DEVICE)
+raw_sd = (
+    checkpoint.get("model_state_dict", checkpoint)
+    if isinstance(checkpoint, dict)
+    else checkpoint
+)
+state_dict = {k.replace("module.", ""): v for k, v in raw_sd.items()}
+# Infer hidden dim from checkpoint so architecture matches exactly
+head_hidden = state_dict["head.net.0.weight"].shape[0]
+print(f"Head hidden dim inferred from checkpoint: {head_hidden}")
+# 2. Build model with matching architecture, load weights
+model = MobileCLIPRanker(backbone_dim=768, head_hidden_dim=head_hidden)
+model.load_state_dict(state_dict, strict=True)
+print("Weights loaded successfully.")
+model.to(DEVICE).eval()
+# 3. Compile for faster inference on CUDA
+if DEVICE == "cuda" and hasattr(torch, "compile"):
+    try:
+        model = torch.compile(model, mode="reduce-overhead")
+        print("Model compiled with torch.compile (reduce-overhead)")
+    except Exception:
+        pass
+# ── Image processing (pyvips) ──────────────────────────────────────────
+def _fetch_and_preprocess(url: str):
+    """Fetch one image, letterbox-resize, normalise -> CHW float32 numpy."""
+    try:
+        if url.startswith("http"):
+            resp = http_pool.request("GET", url, preload_content=True)
+            if resp.status != 200:
+                return None
+            # thumbnail_buffer uses shrink-on-load (fast JPEG decode)
+            img = pyvips.Image.thumbnail_buffer(
+                resp.data, IMG_SIZE, height=IMG_SIZE
+            )
+        else:
+            img = pyvips.Image.thumbnail(url, IMG_SIZE, height=IMG_SIZE)
+        # Ensure 3-band sRGB
+        if img.bands == 4:
+            img = img.flatten(background=[128, 128, 128])
+        elif img.bands == 1:
+            img = img.colourspace("srgb")
+        # Letterbox pad to exact IMG_SIZE x IMG_SIZE
+        if img.width != IMG_SIZE or img.height != IMG_SIZE:
+            img = img.gravity(
+                "centre", IMG_SIZE, IMG_SIZE,
+                extend="background", background=[128, 128, 128],
+            )
+        # -> float32 CHW normalised numpy
+        arr = np.ndarray(
+            buffer=img.write_to_memory(),
+            dtype=np.uint8,
+            shape=(IMG_SIZE, IMG_SIZE, 3),
+        )
+        arr = (arr.astype(np.float32) * (1.0 / 255.0) - MEAN) * INV_STD
+        return arr.transpose(2, 0, 1)  # HWC -> CHW
+    except Exception:
+        return None
+def _fetch_display(url: str):
+    """Fetch image for Gradio display -> numpy uint8 HWC."""
+    try:
+        if url.startswith("http"):
+            resp = http_pool.request("GET", url, preload_content=True)
+            img = pyvips.Image.new_from_buffer(resp.data, "")
+        else:
+            img = pyvips.Image.new_from_file(url, access="sequential")
+        if img.bands == 4:
+            img = img.flatten(background=[255, 255, 255])
+        elif img.bands == 1:
+            img = img.colourspace("srgb")
+        return np.ndarray(
+            buffer=img.write_to_memory(),
+            dtype=np.uint8,
+            shape=(img.height, img.width, 3),
+        )
+    except Exception:
+        return None
+# ── Core ranking logic ──────────────────────────────────────────────────
 def get_best_image(url_list):
+    clean = [u.strip() for u in url_list if isinstance(u, str) and u.strip()]
+    if not clean:
         return None, []
+    # Parallel fetch + preprocess across thread pool
+    futures = {
+        fetch_pool.submit(_fetch_and_preprocess, u): i
+        for i, u in enumerate(clean)
+    }
+    arrays, indices = [], []
+    for fut in futures:
+        arr = fut.result()
+        if arr is not None:
+            arrays.append(arr)
+            indices.append(futures[fut])
+    if not arrays:
+        return None, []
+    batch = torch.from_numpy(np.stack(arrays)).unsqueeze(0).to(DEVICE)
+    vlens = torch.tensor([len(arrays)], device=DEVICE)
+    amp_ctx = (
+        torch.autocast(device_type="cuda", dtype=torch.float16)
+        if DEVICE == "cuda"
+        else nullcontext()
+    )
+    with torch.inference_mode(), amp_ctx:
+        scores = model(batch, valid_lens=vlens).view(-1).cpu().numpy()
+    results = sorted(
+        [{"url": clean[i], "score": float(s)} for i, s in zip(indices, scores)],
+        key=lambda r: r["score"],
+        reverse=True,
+    )
     return results[0]["url"], results
+# ── FastAPI ─────────────────────────────────────────────────────────────
 app = FastAPI()
 class RankRequest(BaseModel):
     urls: List[str]
 @app.post("/api/rank")
 async def rank_endpoint(req: RankRequest):
     if not req.urls:
+        raise HTTPException(400, "URL list is empty")
+    best, results = get_best_image(req.urls)
+    if best is None:
+        raise HTTPException(400, "No images could be loaded")
+    return {"best_image": best, "ranking": results}
+# ── Gradio UI ───────────────────────────────────────────────────────────
 def load_group_by_index(index):
     idx = int(index) - 1
+    return groups_data[idx] if 0 <= idx < len(groups_data) else "Invalid index"
 def load_random_group():
+    if not groups_data:
+        return 1, "No data."
+    i = random.randint(0, len(groups_data) - 1)
+    return i + 1, groups_data[i]
 def gradio_wrapper(text_input):
+    best_url, results = get_best_image(text_input.split("\n"))
+    if best_url is None:
+        return None, "Error loading images"
+    return _fetch_display(best_url), results
+# Two-column demo UI: group selection and URL input on the left, the
+# winning image and the score list on the right.
 with gr.Blocks() as demo:
+    gr.Markdown("# Real Estate Image Ranker")
+    gr.Markdown("**MobileCLIP2-L14** fine-tuned ranker with pyvips acceleration.")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Select Data")
             with gr.Row():
+                index_input = gr.Number(
+                    value=1, label="Group #", minimum=1, precision=0
+                )
+                random_btn = gr.Button("Random", variant="secondary")
             load_btn = gr.Button("Load Group", size="sm")
             gr.Markdown("### 2. URLs")
             input_text = gr.Textbox(label="Image URLs", lines=6)
+            rank_btn = gr.Button("Rank", variant="primary")
         with gr.Column(scale=1):
+            # type="numpy" matches the uint8 array returned by _fetch_display.
+            output_image = gr.Image(label="Best Image", type="numpy")
             output_json = gr.JSON(label="Scores")
+    # "Random" fills both the group number and the URL box.
+    random_btn.click(fn=load_random_group, outputs=[index_input, input_text])
     load_btn.click(fn=load_group_by_index, inputs=index_input, outputs=input_text)
+    rank_btn.click(
+        fn=gradio_wrapper, inputs=input_text, outputs=[output_image, output_json]
+    )
+# Serve the Gradio UI at "/" on the same FastAPI app that exposes /api/rank.
+app = gr.mount_gradio_app(app, demo, path="/")

config.yml CHANGED Viewed

@@ -1,4 +1,4 @@
 data:
   img_size: 224
 model:
-  name: "mobileclip_b"

 data:
   img_size: 224
 model:
+  name: "mobileclip2_l14"

model.py CHANGED Viewed

@@ -1,56 +1,61 @@
 import torch
 import torch.nn as nn
-import mobileclip
 import open_clip
 from huggingface_hub import hf_hub_download
 class MobileCLIPRanker(nn.Module):
-    def __init__(self, cfg):
         super().__init__()
-        model_name = cfg.model.name.lower()
-        self.model_type = "mobileclip"
-        if "l14" in model_name or "l-14" in model_name:
-            self.model_type = "open_clip"
-            repo_id = "apple/MobileCLIP2-L-14"
-            filename = "mobileclip2_l14.pt"
-            self.backbone_dim = 768
-            print(f"Initializing Teacher (L14)...")
-            ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
-            model, _, _ = open_clip.create_model_and_transforms('MobileCLIP2-L-14', pretrained=ckpt_path)
-            self.backbone = model.visual
-        else:
-            repo_id = "apple/MobileCLIP2-B"
-            filename = "mobileclip2_b.pt"
-            arch = "mobileclip_b"
-            self.backbone_dim = 512
-            print(f"Initializing Student ({arch})...")
-            ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
-            model, _, _ = mobileclip.create_model_and_transforms(arch, pretrained=ckpt_path)
-            self.backbone = model.image_encoder
-        for param in self.backbone.parameters():
-            param.requires_grad = False
-        self.head = nn.Linear(self.backbone_dim, 1)
     def forward(self, x, valid_lens=None):
-        b, g, c, h, w = x.shape
-        x_flat = x.view(b * g, c, h, w)
-        if self.model_type == "open_clip":
-            features = self.backbone(x_flat)
         else:
-            features = self.backbone(x_flat)
-        if isinstance(features, tuple):
-            features = features[0]
-        features = features.view(b, g, -1)
-        scores = self.head(features)
-        return scores

 import torch
 import torch.nn as nn
 import open_clip
 from huggingface_hub import hf_hub_download
+class RankingHead(nn.Module):
+    """2-layer MLP head with dropout — matches training checkpoint layout:
+    head.net.0  Linear(in_dim, hidden_dim)
+    head.net.1  GELU
+    head.net.2  Dropout
+    head.net.3  Linear(hidden_dim, 1)
+    """
+    def __init__(self, in_dim, hidden_dim=256, dropout=0.1):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(in_dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, 1),
+        )
+    def forward(self, x):
+        return self.net(x)
 class MobileCLIPRanker(nn.Module):
+    def __init__(self, backbone_dim=768, head_hidden_dim=256, head_dropout=0.1):
         super().__init__()
+        self.backbone_dim = backbone_dim
+        print("Initializing MobileCLIP2-L14 backbone...")
+        ckpt_path = hf_hub_download(
+            repo_id="apple/MobileCLIP2-L-14",
+            filename="mobileclip2_l14.pt",
+        )
+        model, _, _ = open_clip.create_model_and_transforms(
+            "MobileCLIP2-L-14", pretrained=ckpt_path
+        )
+        self.backbone = model.visual
+        self.backbone.eval()
+        for p in self.backbone.parameters():
+            p.requires_grad = False
+        self.head = RankingHead(backbone_dim, head_hidden_dim, head_dropout)
+    def train(self, mode=True):
+        super().train(mode)
+        self.backbone.eval()
+        return self
     def forward(self, x, valid_lens=None):
+        if x.dim() == 5:
+            b, g, c, h, w = x.shape
+            features = self.backbone(x.view(b * g, c, h, w))
+            features = features.view(b, g, -1)
         else:
+            features = x
+        return self.head(features)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libvips-dev

requirements.txt CHANGED Viewed

@@ -4,10 +4,9 @@ fastapi
 uvicorn
 gradio
 pydantic
-requests
 pyyaml
-pillow
 huggingface_hub
 timm
-git+https://github.com/apple/ml-mobileclip.git
-open_clip_torch

 uvicorn
 gradio
 pydantic
 pyyaml
 huggingface_hub
 timm
+open_clip_torch
+pyvips
+urllib3