Spaces:

MeshMax
/

VidTower

Sleeping

App Files Files Community

MeshMax committed on Oct 4, 2025

Commit

d36dff5

verified ·

1 Parent(s): b8fc529

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -22

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# app.py
 import os
 import torch
 import torch.nn as nn
@@ -11,30 +10,16 @@ from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from transformers import AutoTokenizer, AutoModel
 from torchvision import transforms
 # --- Config ---
-MODEL_URL = "https://huggingface.co/MeshMax/video_tower/resolve/main/finetuned_multimodal.pt?download=true"
-MODEL_FILENAME = "finetuned_multimodal.pt"
 TEXT_MODEL = "sentence-transformers/LaBSE"
 IMG_MODEL = "vit_base_patch16_224"
 IMG_SIZE = 224
 MAX_LENGTH = 512
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# --- Download checkpoint if not already present ---
-if not os.path.exists(MODEL_FILENAME):
-    print(f"Downloading model from {MODEL_URL} ...")
-    response = requests.get(MODEL_URL, stream=True)
-    response.raise_for_status()
-    with open(MODEL_FILENAME, "wb") as f:
-        for chunk in response.iter_content(chunk_size=8192):
-            if chunk:
-                f.write(chunk)
-    print("Download done.")
-else:
-    print("Model file already exists.")
-# --- Model definition (same as before) ---
 class MultimodalRegressor(nn.Module):
     def __init__(self, text_dim=768, img_dim=768, proj_dim=768):
         super().__init__()
@@ -63,7 +48,20 @@ text_model = AutoModel.from_pretrained(TEXT_MODEL).to(DEVICE)
 img_model = timm.create_model(IMG_MODEL, pretrained=False, num_classes=0).to(DEVICE)
 head = MultimodalRegressor().to(DEVICE)
-ckpt = torch.load(MODEL_FILENAME, map_location=DEVICE, weights_only=False)
 if "text_model_state" in ckpt:
     text_model.load_state_dict(ckpt["text_model_state"])
 if "img_model_state" in ckpt:
@@ -85,7 +83,6 @@ def compute_embedding(title, description, tags, thumbnail_url):
     text = f"{title} {description} {tags}"
     toks = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
     with torch.no_grad():
-        # Using pooler_output or fallback
         out = text_model(**toks)
         if hasattr(out, "pooler_output") and out.pooler_output is not None:
             text_emb = out.pooler_output
@@ -94,6 +91,7 @@ def compute_embedding(title, description, tags, thumbnail_url):
     try:
         img_resp = requests.get(thumbnail_url, timeout=5)
         img = Image.open(BytesIO(img_resp.content)).convert("RGB")
     except Exception:
         img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))
@@ -107,9 +105,9 @@ def compute_embedding(title, description, tags, thumbnail_url):
             query=t_proj.unsqueeze(1), key=i_proj.unsqueeze(1), value=i_proj.unsqueeze(1)
         )
         fused = attn_out.squeeze(1)
-        return fused.squeeze(0).cpu().numpy().tolist()
-# --- FastAPI + Gradio integration ---
 app = FastAPI()
 @app.post("/api/get_embedding")
@@ -134,4 +132,4 @@ gr_interface = gr.Interface(
     description="Generates fused multimodal embeddings from video metadata",
 )
-app = gr.mount_gradio_app(app, gr_interface, path="/")

 import os
 import torch
 import torch.nn as nn
 from fastapi.responses import JSONResponse
 from transformers import AutoTokenizer, AutoModel
 from torchvision import transforms
+from huggingface_hub import hf_hub_download  # NEW
 # --- Config ---
 TEXT_MODEL = "sentence-transformers/LaBSE"
 IMG_MODEL = "vit_base_patch16_224"
 IMG_SIZE = 224
 MAX_LENGTH = 512
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# --- Model definition (unchanged) ---
 class MultimodalRegressor(nn.Module):
     def __init__(self, text_dim=768, img_dim=768, proj_dim=768):
         super().__init__()
 img_model = timm.create_model(IMG_MODEL, pretrained=False, num_classes=0).to(DEVICE)
 head = MultimodalRegressor().to(DEVICE)
+# NEW: Dynamic load with cache
+def load_model_if_needed():
+    model_path = hf_hub_download(
+        repo_id="MeshMax/video_tower",
+        filename="finetuned_multimodal.pt",
+        local_dir="/tmp",
+        local_dir_use_symlinks=False,
+        cache_dir=None
+    )
+    print(f"Model loaded from: {model_path}")
+    return model_path
+model_path = load_model_if_needed()
+ckpt = torch.load(model_path, map_location=DEVICE, weights_only=False)
 if "text_model_state" in ckpt:
     text_model.load_state_dict(ckpt["text_model_state"])
 if "img_model_state" in ckpt:
     text = f"{title} {description} {tags}"
     toks = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=MAX_LENGTH).to(DEVICE)
     with torch.no_grad():
         out = text_model(**toks)
         if hasattr(out, "pooler_output") and out.pooler_output is not None:
             text_emb = out.pooler_output
     try:
         img_resp = requests.get(thumbnail_url, timeout=5)
+        img_resp.raise_for_status()  # IMPROVED: Raise on HTTP errors
         img = Image.open(BytesIO(img_resp.content)).convert("RGB")
     except Exception:
         img = Image.new("RGB", (IMG_SIZE, IMG_SIZE), color=(128, 128, 128))
             query=t_proj.unsqueeze(1), key=i_proj.unsqueeze(1), value=i_proj.unsqueeze(1)
         )
         fused = attn_out.squeeze(1)
+        return fused.squeeze(0).cpu().numpy().tolist()  # Note: This is proj_dim=768, not 1—adjust if regression output
+# --- FastAPI + Gradio (unchanged) ---
 app = FastAPI()
 @app.post("/api/get_embedding")
     description="Generates fused multimodal embeddings from video metadata",
 )
+app = gr.mount_gradio_app(app, gr_interface, path="/")