MeshMax committed on
Commit
a5705f1
·
verified ·
1 Parent(s): fa94305

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -154
app.py CHANGED
@@ -1,42 +1,45 @@
1
  # app.py
2
  import os
3
- import io
4
- import time
5
- import json
6
  import torch
7
  import torch.nn as nn
8
- import numpy as np
9
- from PIL import Image
10
- from io import BytesIO
11
  import requests
12
-
 
 
 
13
  from fastapi import FastAPI, Request
14
  from fastapi.responses import JSONResponse
15
- import gradio as gr
16
-
17
  from transformers import AutoTokenizer, AutoModel
18
- import timm
19
  from torchvision import transforms
20
 
21
- # -----------------------
22
- # Config — mirror your notebook
23
- # -----------------------
24
- MODEL_FILENAME = "finetuned_multimodal.pt" # upload this to your Space
25
  TEXT_MODEL = "sentence-transformers/LaBSE"
26
  IMG_MODEL = "vit_base_patch16_224"
27
  IMG_SIZE = 224
28
  MAX_LENGTH = 512
29
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
30
 
31
- # -----------------------
32
- # Model class (exact from your notebook)
33
- # -----------------------
 
 
 
 
 
 
 
 
 
 
 
34
  class MultimodalRegressor(nn.Module):
35
- def __init__(self, text_dim=768, img_dim=768, proj_dim=768): # keep dims consistent with training
36
  super().__init__()
37
  self.text_proj = nn.Linear(text_dim, proj_dim)
38
  self.img_proj = nn.Linear(img_dim, proj_dim)
39
- # batch_first=True per your notebook
40
  self.fusion_layer = nn.MultiheadAttention(embed_dim=proj_dim, num_heads=8, batch_first=True)
41
  self.dropout = nn.Dropout(0.1)
42
  self.regressor = nn.Sequential(
@@ -54,162 +57,69 @@ class MultimodalRegressor(nn.Module):
54
  fused = self.dropout(fused)
55
  return self.regressor(fused).squeeze(1)
56
 
57
- # -----------------------
58
- # Utilities: image transform & helpers
59
- # -----------------------
60
- img_transform = transforms.Compose([
61
- transforms.Resize((IMG_SIZE, IMG_SIZE)),
62
- transforms.ToTensor(),
63
- transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
64
- ])
65
-
66
- def load_image_from_url(url):
67
- try:
68
- resp = requests.get(url, timeout=6)
69
- resp.raise_for_status()
70
- img = Image.open(BytesIO(resp.content)).convert("RGB")
71
- return img
72
- except Exception:
73
- # Return a gray image fallback if thumbnail fetch fails
74
- return Image.new("RGB", (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))
75
-
76
- def text_to_embedding(tokenizer, text_model, texts):
77
- # texts: list[str] (batch)
78
- # Return tensor shape (batch, text_dim)
79
- text_model.eval()
80
- with torch.no_grad():
81
- toks = tokenizer(texts, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
82
- toks = {k: v.to(DEVICE) for k, v in toks.items()}
83
- out = text_model(**toks)
84
- # prefer pooler_output if available, else mean of last_hidden_state
85
- if hasattr(out, "pooler_output") and out.pooler_output is not None:
86
- emb = out.pooler_output
87
- else:
88
- last = out.last_hidden_state # (batch, seq, dim)
89
- emb = last.mean(dim=1)
90
- return emb # already on DEVICE
91
-
92
- # -----------------------
93
- # Load pretrained backbone models + head; load checkpoint
94
- # -----------------------
95
- print("Device:", DEVICE)
96
- print("Loading tokenizer and text model:", TEXT_MODEL)
97
  tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
98
  text_model = AutoModel.from_pretrained(TEXT_MODEL).to(DEVICE)
99
-
100
- print("Creating image model:", IMG_MODEL)
101
- # create_model(..., num_classes=0) returns features vector for many timm models
102
  img_model = timm.create_model(IMG_MODEL, pretrained=False, num_classes=0).to(DEVICE)
 
103
 
104
- multimodal_head = MultimodalRegressor().to(DEVICE)
105
-
106
- # Load checkpoint (robust to different key names)
107
- if not os.path.exists(MODEL_FILENAME):
108
- print(f"WARNING: {MODEL_FILENAME} not found in the Space. Place your checkpoint at the repository root.")
109
- else:
110
- print("Loading checkpoint:", MODEL_FILENAME)
111
- ckpt = torch.load(MODEL_FILENAME, map_location=DEVICE)
112
- # expected keys from notebook: 'text_model_state', 'img_model_state', 'head_state'
113
- if "text_model_state" in ckpt:
114
- text_model.load_state_dict(ckpt["text_model_state"])
115
- elif "text_state_dict" in ckpt:
116
- text_model.load_state_dict(ckpt["text_state_dict"])
117
- else:
118
- print("No text_model_state found in checkpoint (skipping).")
119
-
120
- if "img_model_state" in ckpt:
121
- img_model.load_state_dict(ckpt["img_model_state"])
122
- elif "img_state_dict" in ckpt:
123
- img_model.load_state_dict(ckpt["img_state_dict"])
124
- else:
125
- print("No img_model_state found in checkpoint (skipping).")
126
-
127
- if "head_state" in ckpt:
128
- multimodal_head.load_state_dict(ckpt["head_state"])
129
- elif "head_state_dict" in ckpt:
130
- multimodal_head.load_state_dict(ckpt["head_state_dict"])
131
- else:
132
- print("No head_state found in checkpoint (skipping).")
133
 
134
  text_model.eval()
135
  img_model.eval()
136
- multimodal_head.eval()
137
- print("Models ready.")
138
 
139
- # -----------------------
140
- # Inference: create fused embedding (same pipeline notebook used)
141
- # -----------------------
142
- def compute_fused_embedding(title: str, description: str, tags: str, thumbnail_url: str):
143
- # Build text and image inputs
144
- text = " ".join([str(title or ""), str(description or ""), str(tags or "")]).strip()
145
- texts = [text]
146
-
147
- # Text embedding (batch of 1)
148
- t_emb = text_to_embedding(tokenizer, text_model, texts) # shape (1, text_dim)
149
 
150
- # Image embedding: preprocess and forward
151
- img = load_image_from_url(thumbnail_url)
152
- img_tensor = img_transform(img).unsqueeze(0).to(DEVICE) # (1,3,H,W)
153
  with torch.no_grad():
154
- i_emb = img_model(img_tensor) # expected shape (1, img_dim)
155
-
156
- # Project, fuse via head's fusion layer (exactly as in notebook)
157
- t_proj = multimodal_head.text_proj(t_emb) # (1, proj_dim)
158
- i_proj = multimodal_head.img_proj(i_emb) # (1, proj_dim)
159
-
160
- # MultiheadAttention expects (batch, seq, dim) because batch_first=True
161
- attn_out, _ = multimodal_head.fusion_layer(
162
- query=t_proj.unsqueeze(1), # (1, 1, proj_dim)
163
- key=i_proj.unsqueeze(1), # (1, 1, proj_dim)
164
- value=i_proj.unsqueeze(1) # (1, 1, proj_dim)
165
- )
166
- fused = attn_out.squeeze(1) # (1, proj_dim) -> (proj_dim,)
167
- fused_np = fused.squeeze(0).cpu().numpy().tolist()
168
- return fused_np
169
 
170
- # -----------------------
171
- # FastAPI + Gradio integration
172
- # -----------------------
173
  app = FastAPI()
174
 
175
  @app.post("/api/get_embedding")
176
  async def api_get_embedding(request: Request):
177
- payload = await request.json()
178
- title = payload.get("title", "")
179
- description = payload.get("description", "")
180
- tags = payload.get("tags", "")
181
- thumbnail_url = payload.get("thumbnail_url", "")
182
-
183
- try:
184
- emb = compute_fused_embedding(title, description, tags, thumbnail_url)
185
- except Exception as e:
186
- return JSONResponse({"error": str(e)}, status_code=500)
187
-
188
  return JSONResponse({"embedding": emb})
189
 
190
- # Gradio UI for quick testing (truncated embedding shown)
191
  def gradio_fn(title, description, tags, thumbnail_url):
192
- try:
193
- emb = compute_fused_embedding(title, description, tags, thumbnail_url)
194
- return f"embedding (len={len(emb)}): {emb[:10]} ... (truncated)"
195
- except Exception as e:
196
- return f"Error: {e}"
197
 
198
  gr_interface = gr.Interface(
199
  fn=gradio_fn,
200
- inputs=[
201
- gr.Textbox(label="Title", lines=1),
202
- gr.Textbox(label="Description", lines=3),
203
- gr.Textbox(label="Tags", lines=1),
204
- gr.Textbox(label="Thumbnail URL", lines=1),
205
- ],
206
- outputs=gr.Textbox(label="Embedding (truncated)"),
207
- title="Multimodal Embedding (Notebook -> Space)",
208
- description="Provide title, description, tags and thumbnail URL. Returns fused multimodal embedding (vector).",
209
- examples=[
210
- ["Cute cat", "A cat doing flips", "cat,funny", "https://example.com/sample.jpg"]
211
- ]
212
  )
213
 
214
- # Mount Gradio app at root
215
  app = gr.mount_gradio_app(app, gr_interface, path="/")
 
1
  # app.py
2
  import os
 
 
 
3
  import torch
4
  import torch.nn as nn
 
 
 
5
  import requests
6
+ from io import BytesIO
7
+ from PIL import Image
8
+ import timm
9
+ import gradio as gr
10
  from fastapi import FastAPI, Request
11
  from fastapi.responses import JSONResponse
 
 
12
  from transformers import AutoTokenizer, AutoModel
 
13
  from torchvision import transforms
14
 
15
# --- Config ---
# The fine-tuned checkpoint is fetched from Google Drive on first start-up.
MODEL_URL = "https://drive.google.com/uc?export=download&id=10Y_HLjflL54H7iwP1oz1ZG1SV4SsK6Qw"
MODEL_FILENAME = "finetuned_multimodal.pt"

TEXT_MODEL = "sentence-transformers/LaBSE"  # multilingual text encoder (HF hub id)
IMG_MODEL = "vit_base_patch16_224"          # timm ViT backbone
IMG_SIZE = 224                              # input resolution expected by the ViT
MAX_LENGTH = 512                            # tokenizer truncation length
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
23
 
24
# --- Download model from Google Drive ---
if not os.path.exists(MODEL_FILENAME):
    print(f"Downloading checkpoint from {MODEL_URL} ...")
    # Stream to disk in chunks so the whole checkpoint never sits in RAM.
    # The timeout keeps a dead link from hanging start-up forever, and the
    # context manager releases the HTTP connection when done.
    # NOTE(review): Google Drive returns an HTML confirm page for very large
    # files via this URL form — verify the downloaded file is a valid
    # checkpoint, or switch to the `gdown` flow if downloads start failing.
    with requests.get(MODEL_URL, stream=True, timeout=(10, 60)) as r:
        r.raise_for_status()
        with open(MODEL_FILENAME, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
    print("Download complete.")
else:
    print("Checkpoint already exists locally.")
36
+
37
+ # --- Define model ---
38
  class MultimodalRegressor(nn.Module):
39
+ def __init__(self, text_dim=768, img_dim=768, proj_dim=768):
40
  super().__init__()
41
  self.text_proj = nn.Linear(text_dim, proj_dim)
42
  self.img_proj = nn.Linear(img_dim, proj_dim)
 
43
  self.fusion_layer = nn.MultiheadAttention(embed_dim=proj_dim, num_heads=8, batch_first=True)
44
  self.dropout = nn.Dropout(0.1)
45
  self.regressor = nn.Sequential(
 
57
  fused = self.dropout(fused)
58
  return self.regressor(fused).squeeze(1)
59
 
60
# --- Load models ---
# Text side: LaBSE tokenizer + encoder.
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
text_model = AutoModel.from_pretrained(TEXT_MODEL).to(DEVICE)

# Image side: timm ViT with num_classes=0 so the forward pass returns the
# pooled feature vector instead of classification logits.
img_model = timm.create_model(IMG_MODEL, pretrained=False, num_classes=0).to(DEVICE)
head = MultimodalRegressor().to(DEVICE)

# Restore the fine-tuned weights. Each state dict is optional so a partial
# checkpoint still loads what it has.
# NOTE(review): torch.load unpickles arbitrary objects from a file that was
# just downloaded — consider weights_only=True (torch >= 2.0) if the
# checkpoint is a plain state-dict bundle.
ckpt = torch.load(MODEL_FILENAME, map_location=DEVICE)
for state_key, module in (
    ("text_model_state", text_model),
    ("img_model_state", img_model),
    ("head_state", head),
):
    if state_key in ckpt:
        module.load_state_dict(ckpt[state_key])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
# Inference only: switch every sub-model out of training mode (disables
# dropout in the fusion head, among other things).
for _module in (text_model, img_model, head):
    _module.eval()

# Thumbnail preprocessing: resize to the ViT input size, scale to [0, 1],
# then normalize each channel to [-1, 1] — presumably matching the
# transform used at training time (TODO confirm against the notebook).
transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])
 
 
 
 
 
83
 
84
def compute_embedding(title, description, tags, thumbnail_url):
    """Return the fused multimodal embedding for one item as list[float].

    Concatenates title/description/tags into one text, embeds it with the
    text encoder, embeds the thumbnail with the ViT, then fuses the two
    projections through the trained attention head.

    Any failure to fetch or decode the thumbnail falls back to a neutral
    gray image, so text-only inputs still produce an embedding.
    """
    # Coerce None to "" so the literal string "None" never reaches the model.
    text = f"{title or ''} {description or ''} {tags or ''}"
    toks = tokenizer(
        text, return_tensors="pt", truncation=True,
        padding=True, max_length=MAX_LENGTH,
    ).to(DEVICE)

    # Fetch the thumbnail. The timeout keeps a dead URL from hanging the
    # request; any failure (network, bad image bytes) uses the gray fallback.
    try:
        resp = requests.get(thumbnail_url, timeout=6)
        resp.raise_for_status()
        img = Image.open(BytesIO(resp.content)).convert("RGB")
    except Exception:
        img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))
    img_tensor = transform(img).unsqueeze(0).to(DEVICE)

    # Run the whole pipeline under no_grad — the projections and fusion were
    # previously outside it and built a needless autograd graph per request.
    with torch.no_grad():
        out = text_model(**toks)
        if getattr(out, "pooler_output", None) is not None:
            text_emb = out.pooler_output
        else:
            # Encoders without a pooler: mean-pool the last hidden states.
            text_emb = out.last_hidden_state.mean(dim=1)
        img_emb = img_model(img_tensor)  # (1, img_dim)

        # Text attends to the image; singleton sequence dim for MHA
        # (batch_first=True expects (batch, seq, dim)).
        t_proj = head.text_proj(text_emb)
        i_proj = head.img_proj(img_emb)
        attn_out, _ = head.fusion_layer(
            query=t_proj.unsqueeze(1),
            key=i_proj.unsqueeze(1),
            value=i_proj.unsqueeze(1),
        )
        fused = attn_out.squeeze(1)  # (1, proj_dim)
    return fused.squeeze(0).cpu().numpy().tolist()
 
 
101
 
102
# --- FastAPI + Gradio ---
app = FastAPI()

@app.post("/api/get_embedding")
async def api_get_embedding(request: Request):
    """POST JSON {title, description, tags, thumbnail_url} -> {"embedding": [...]}.

    Missing fields default to "". Inference failures return a JSON error
    with HTTP 500 instead of an unhandled server traceback.
    """
    data = await request.json()
    try:
        # NOTE(review): compute_embedding is synchronous model inference and
        # blocks the event loop; consider run_in_threadpool under load.
        emb = compute_embedding(
            data.get("title", ""),
            data.get("description", ""),
            data.get("tags", ""),
            data.get("thumbnail_url", ""),
        )
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    return JSONResponse({"embedding": emb})
111
 
 
112
def gradio_fn(title, description, tags, thumbnail_url):
    """Gradio wrapper: return a short human-readable summary of the embedding.

    Errors are rendered as text in the UI rather than raising, so a bad
    thumbnail URL or model failure doesn't surface as a stack trace.
    """
    try:
        emb = compute_embedding(title, description, tags, thumbnail_url)
    except Exception as e:
        return f"Error: {e}"
    return f"Embedding length {len(emb)}; first 10: {emb[:10]}"
 
 
 
115
 
116
# Simple Gradio UI for manual testing; mounted at "/" on the same FastAPI
# app that serves the JSON endpoint.
gr_interface = gr.Interface(
    fn=gradio_fn,
    inputs=[
        gr.Textbox(label="Title"),
        gr.Textbox(label="Description"),
        gr.Textbox(label="Tags"),
        gr.Textbox(label="Thumbnail URL"),
    ],
    outputs="text",
    title="Video Embedding Generator",
    description="Generates fused multimodal embeddings from video metadata and thumbnail.",
)

app = gr.mount_gradio_app(app, gr_interface, path="/")