Update app.py
app.py (CHANGED)
@@ -13,7 +13,7 @@ from transformers import AutoTokenizer, AutoModel
 from torchvision import transforms
 
 # --- Config ---
-MODEL_URL = "https://
+MODEL_URL = "https://huggingface.co/MeshMax/video_tower/resolve/main/finetuned_multimodal.pt"
 MODEL_FILENAME = "finetuned_multimodal.pt"
 TEXT_MODEL = "sentence-transformers/LaBSE"
 IMG_MODEL = "vit_base_patch16_224"
@@ -21,20 +21,20 @@ IMG_SIZE = 224
 MAX_LENGTH = 512
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# --- Download
+# --- Download checkpoint if not already present ---
 if not os.path.exists(MODEL_FILENAME):
-    print(f"Downloading
-
-
+    print(f"Downloading model from {MODEL_URL} ...")
+    response = requests.get(MODEL_URL, stream=True)
+    response.raise_for_status()
     with open(MODEL_FILENAME, "wb") as f:
-        for chunk in
+        for chunk in response.iter_content(chunk_size=8192):
             if chunk:
                 f.write(chunk)
-    print("Download
+    print("Download done.")
 else:
-    print("
+    print("Model file already exists.")
 
-# ---
+# --- Model definition (same as before) ---
 class MultimodalRegressor(nn.Module):
     def __init__(self, text_dim=768, img_dim=768, proj_dim=768):
         super().__init__()
@@ -57,7 +57,7 @@ class MultimodalRegressor(nn.Module):
         fused = self.dropout(fused)
         return self.regressor(fused).squeeze(1)
 
-# --- Load models ---
+# --- Load backbone models + head ---
 tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)
 text_model = AutoModel.from_pretrained(TEXT_MODEL).to(DEVICE)
 img_model = timm.create_model(IMG_MODEL, pretrained=False, num_classes=0).to(DEVICE)
@@ -85,33 +85,45 @@ def compute_embedding(title, description, tags, thumbnail_url):
     text = f"{title} {description} {tags}"
     toks = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
     with torch.no_grad():
-
+        # Using pooler_output or fallback
+        out = text_model(**toks)
+        if hasattr(out, "pooler_output") and out.pooler_output is not None:
+            text_emb = out.pooler_output
+        else:
+            text_emb = out.last_hidden_state.mean(dim=1)
+
     try:
-
+        img_resp = requests.get(thumbnail_url, timeout=5)
+        img = Image.open(BytesIO(img_resp.content)).convert("RGB")
     except Exception:
         img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))
+
     img_tensor = transform(img).unsqueeze(0).to(DEVICE)
     with torch.no_grad():
         img_emb = img_model(img_tensor)
     t_proj = head.text_proj(text_emb)
     i_proj = head.img_proj(img_emb)
-    attn_out, _ = head.fusion_layer(
+    attn_out, _ = head.fusion_layer(
+        query=t_proj.unsqueeze(1), key=i_proj.unsqueeze(1), value=i_proj.unsqueeze(1)
+    )
     fused = attn_out.squeeze(1)
     return fused.squeeze(0).cpu().numpy().tolist()
 
-# --- FastAPI + Gradio ---
+# --- FastAPI + Gradio integration ---
 app = FastAPI()
 
 @app.post("/api/get_embedding")
 async def api_get_embedding(request: Request):
     data = await request.json()
-    emb = compute_embedding(
-
+    emb = compute_embedding(
+        data.get("title", ""), data.get("description", ""),
+        data.get("tags", ""), data.get("thumbnail_url", "")
+    )
     return JSONResponse({"embedding": emb})
 
 def gradio_fn(title, description, tags, thumbnail_url):
     emb = compute_embedding(title, description, tags, thumbnail_url)
-    return f"Embedding length
+    return f"Embedding length={len(emb)}; first 10: {emb[:10]}"
 
 gr_interface = gr.Interface(
     fn=gradio_fn,
@@ -119,7 +131,7 @@ gr_interface = gr.Interface(
     gr.Textbox(label="Tags"), gr.Textbox(label="Thumbnail URL")],
     outputs="text",
     title="Video Embedding Generator",
-    description="Generates fused multimodal embeddings from video metadata
+    description="Generates fused multimodal embeddings from video metadata",
 )
 
 app = gr.mount_gradio_app(app, gr_interface, path="/")
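The new fusion call treats the projected text embedding as a single-token query attending over a single-token image key/value. The definition of head.fusion_layer lies outside the changed hunks, so the following is only a minimal shape sketch under the assumption that it is an nn.MultiheadAttention with batch_first=True; the num_heads value is illustrative, not taken from the checkpoint.

import torch
import torch.nn as nn

# Assumed stand-in for head.fusion_layer (its real definition is not in this diff).
proj_dim = 768
fusion_layer = nn.MultiheadAttention(embed_dim=proj_dim, num_heads=8, batch_first=True)

t_proj = torch.randn(1, proj_dim)   # projected text embedding, (batch, dim)
i_proj = torch.randn(1, proj_dim)   # projected image embedding, (batch, dim)

# unsqueeze(1) turns each (batch, dim) vector into a length-1 sequence: (batch, 1, dim)
attn_out, attn_weights = fusion_layer(
    query=t_proj.unsqueeze(1), key=i_proj.unsqueeze(1), value=i_proj.unsqueeze(1)
)
fused = attn_out.squeeze(1)         # back to (batch, dim)
print(fused.shape, attn_weights.shape)  # torch.Size([1, 768]) torch.Size([1, 1, 1])

Note that with a single key/value token the softmax attention weight is identically 1.0, so under this assumption attn_out is a learned transformation of the image branch alone.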
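For completeness, a hypothetical client for the new /api/get_embedding route. The path and JSON keys match the data.get(...) calls in api_get_embedding; the base URL assumes a local run such as uvicorn app:app --port 7860, which the diff does not specify.

import requests

payload = {
    "title": "Sample video",
    "description": "Short description of the clip",
    "tags": "demo, multimodal",
    "thumbnail_url": "https://example.com/thumb.jpg",  # placeholder URL
}

# POST to the FastAPI route mounted alongside the Gradio UI.
resp = requests.post("http://localhost:7860/api/get_embedding", json=payload, timeout=30)
resp.raise_for_status()
embedding = resp.json()["embedding"]
print(len(embedding), embedding[:5])

If the thumbnail URL is unreachable, compute_embedding falls back to a grey placeholder image, so the request still returns an embedding rather than an error.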