Spaces:

BiasLab2025
/

TaskCLIP

Sleeping

App Files Files Community

HanningChen commited on Jan 18

Commit

6feb3b2

1 Parent(s): eb9b67e

Download weights from HF model repo and use cached paths

Browse files

Files changed (4) hide show

requirements.txt +3 -0
webui/app.py +38 -36
webui/runner.py +76 -341
webui/weights.py +14 -0

requirements.txt CHANGED Viewed

@@ -23,3 +23,6 @@ opencv-python-headless==4.10.0.84
 # --- Models / inference ---
 ultralytics==8.4.3
 open-clip-torch==2.24.0

 # --- Models / inference ---
 ultralytics==8.4.3
 open-clip-torch==2.24.0
+huggingface_hub>=0.24.0
+pytorchvideo==0.1.5

webui/app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import uuid
 from pathlib import Path
-from typing import Optional
 from fastapi import FastAPI, Request, UploadFile, File, Form
 from fastapi.responses import HTMLResponse, JSONResponse
@@ -8,8 +8,9 @@ from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
 from webui.runner import ModelRunner
-PROJECT_ROOT = Path(__file__).resolve().parents[1]  # project/
 WEBUI_DIR = Path(__file__).resolve().parent
 UPLOAD_DIR = WEBUI_DIR / "uploads"
 RESULT_DIR = WEBUI_DIR / "results"
@@ -22,6 +23,13 @@ templates = Jinja2Templates(directory=str(WEBUI_DIR / "templates"))
 app.mount("/static", StaticFiles(directory=str(WEBUI_DIR / "static")), name="static")
 app.mount("/results", StaticFiles(directory=str(RESULT_DIR)), name="results")
 VLM_CHOICES = [
     {"label": "imagebind", "value": "imagebind", "folder": "imagebind"},
     {"label": "ViT-B",     "value": "vit-b",     "folder": "ViT-B"},
@@ -34,24 +42,29 @@ HDV_DIMS = [128, 256, 512, 1024]
 DEFAULT_VLM = "imagebind"
 DEFAULT_HDV = 256
-DEFAULT_TASKCLIP_CKPT = "./test_model/default/decoder.pt"
 DEFAULT_SCORE_FUNC = "default"
 OD_CHOICES = [
-    {"label": "nano",   "value": "nano",   "ckpt": "./.checkpoints/yolo12n.pt"},
-    {"label": "small",  "value": "small",  "ckpt": "./.checkpoints/yolo12s.pt"},
-    {"label": "median", "value": "median", "ckpt": "./.checkpoints/yolo12m.pt"},
-    {"label": "large",  "value": "large",  "ckpt": "./.checkpoints/yolo12l.pt"},
-    {"label": "xlarge", "value": "xlarge", "ckpt": "./.checkpoints/yolo12x.pt"},
 ]
 OD_VALUE_TO_CKPT = {x["value"]: x["ckpt"] for x in OD_CHOICES}
 DEFAULT_OD = "xlarge"
-# Load models ONCE at startup
 runner = ModelRunner(
     project_root=str(PROJECT_ROOT),
-    device="cuda:0",  # change if needed
-    yolo_ckpt="./.checkpoints/yolo12x.pt",
     id2task_name_file="./id2task_name.json",
     task2prompt_file="./task20.json",
     threshold=0.01,
@@ -78,7 +91,6 @@ def index(request: Request):
         },
     )
 @app.post("/api/run")
 async def api_run(
     vlm_model: str = Form(DEFAULT_VLM),
@@ -89,58 +101,48 @@ async def api_run(
     viz_mode: str = Form("bbox"),
     upload: UploadFile = File(...),
 ):
-    # compute taskclip checkpoint
     if score_function not in SCORE_FUNCS:
         return JSONResponse({"ok": False, "error": f"Unknown score_function: {score_function}"}, status_code=400)
     if score_function == "HDC":
         if hdv_dim not in HDV_DIMS:
             return JSONResponse({"ok": False, "error": f"Unsupported hdv_dim: {hdv_dim}"}, status_code=400)
         vlm_folder = VLM_VALUE_TO_FOLDER.get(vlm_model)
         if not vlm_folder:
             return JSONResponse({"ok": False, "error": f"Unknown vlm_model: {vlm_model}"}, status_code=400)
-        taskclip_ckpt = f"./test_model/{vlm_folder}/8Layer_4Head_HDV_{hdv_dim}/decoder.pt"
     else:
         taskclip_ckpt = DEFAULT_TASKCLIP_CKPT
-    if score_function == "default" and vlm_model != "imagebind":
-        return JSONResponse(
-            {"ok": False, "error": "score_function=default only supports vlm_model=imagebind. Use HDC for vit-b/vit-l."},
-            status_code=400
-        )
-    # get yolo checkpoint
     yolo_ckpt = OD_VALUE_TO_CKPT.get(od_model)
     if not yolo_ckpt:
         return JSONResponse({"ok": False, "error": f"Unknown od_model size: {od_model}"}, status_code=400)
-    # Save upload
     suffix = Path(upload.filename).suffix or ".jpg"
     job_id = uuid.uuid4().hex
     upload_path = UPLOAD_DIR / f"{job_id}{suffix}"
     upload_path.write_bytes(await upload.read())
-    # Run inference
     try:
-        # print("[API] vlm_model", vlm_model, "score_function", score_function, "hdv_dim", hdv_dim, "taskclip_ckpt", taskclip_ckpt)
         out = runner.run(
-            image_path=str(upload_path),
-            task_id=int(task_id),
-            vlm_model=vlm_model,
-            od_model='yolo',
             yolo_ckpt=yolo_ckpt,
             score_function=score_function,
-            hdv_dim=hdv_dim,
-            taskclip_ckpt=taskclip_ckpt,
             viz_mode=viz_mode,
         )
     except Exception as e:
         return JSONResponse({"ok": False, "error": repr(e)}, status_code=500)
-    # Save 3 images to results/<job_id>/
     job_dir = RESULT_DIR / job_id
     job_dir.mkdir(parents=True, exist_ok=True)
@@ -163,4 +165,4 @@ async def api_run(
             "yolo": f"/results/{job_id}/yolo.jpg",
             "selected": f"/results/{job_id}/selected.jpg",
         },
-    }

+import os
 import uuid
 from pathlib import Path
 from fastapi import FastAPI, Request, UploadFile, File, Form
 from fastapi.responses import HTMLResponse, JSONResponse
 from fastapi.templating import Jinja2Templates
 from webui.runner import ModelRunner
+from webui.weights import get_weights_dir
+PROJECT_ROOT = Path(__file__).resolve().parents[1]  # repo root
 WEBUI_DIR = Path(__file__).resolve().parent
 UPLOAD_DIR = WEBUI_DIR / "uploads"
 RESULT_DIR = WEBUI_DIR / "results"
 app.mount("/static", StaticFiles(directory=str(WEBUI_DIR / "static")), name="static")
 app.mount("/results", StaticFiles(directory=str(RESULT_DIR)), name="results")
+# ---- weights repo ----
+WEIGHTS_REPO = os.getenv("TASKCLIP_WEIGHTS_REPO", "BiasLab2025/YOUR-WEIGHTS-REPO")  # <-- change default
+WEIGHTS_DIR = get_weights_dir(WEIGHTS_REPO)
+CKPT_DIR = WEIGHTS_DIR / "checkpoints"
+DECODER_DIR = WEIGHTS_DIR / "test_model"
 VLM_CHOICES = [
     {"label": "imagebind", "value": "imagebind", "folder": "imagebind"},
     {"label": "ViT-B",     "value": "vit-b",     "folder": "ViT-B"},
 DEFAULT_VLM = "imagebind"
 DEFAULT_HDV = 256
 DEFAULT_SCORE_FUNC = "default"
+DEFAULT_TASKCLIP_CKPT = str(DECODER_DIR / "default" / "decoder.pt")
 OD_CHOICES = [
+    {"label": "nano",   "value": "nano",   "ckpt": str(CKPT_DIR / "yolo12n.pt")},
+    {"label": "small",  "value": "small",  "ckpt": str(CKPT_DIR / "yolo12s.pt")},
+    {"label": "median", "value": "median", "ckpt": str(CKPT_DIR / "yolo12m.pt")},
+    {"label": "large",  "value": "large",  "ckpt": str(CKPT_DIR / "yolo12l.pt")},
+    {"label": "xlarge", "value": "xlarge", "ckpt": str(CKPT_DIR / "yolo12x.pt")},
 ]
 OD_VALUE_TO_CKPT = {x["value"]: x["ckpt"] for x in OD_CHOICES}
 DEFAULT_OD = "xlarge"
+DEFAULT_SAM_CKPT = str(CKPT_DIR / "sam2.1_l.pt")
+DEFAULT_IMAGEBIND_CKPT = str(CKPT_DIR / "imagebind_huge.pth")  # optional but recommended
+# ---- Load runner ONCE at startup ----
 runner = ModelRunner(
     project_root=str(PROJECT_ROOT),
+    device=os.getenv("DEVICE", "cuda:0"),
+    yolo_ckpt=OD_VALUE_TO_CKPT[DEFAULT_OD],
+    sam_ckpt=DEFAULT_SAM_CKPT,
+    imagebind_ckpt=DEFAULT_IMAGEBIND_CKPT,  # if missing, runner can fall back to pretrained=True
     id2task_name_file="./id2task_name.json",
     task2prompt_file="./task20.json",
     threshold=0.01,
         },
     )
 @app.post("/api/run")
 async def api_run(
     vlm_model: str = Form(DEFAULT_VLM),
     viz_mode: str = Form("bbox"),
     upload: UploadFile = File(...),
 ):
+    # validate + pick decoder
     if score_function not in SCORE_FUNCS:
         return JSONResponse({"ok": False, "error": f"Unknown score_function: {score_function}"}, status_code=400)
     if score_function == "HDC":
         if hdv_dim not in HDV_DIMS:
             return JSONResponse({"ok": False, "error": f"Unsupported hdv_dim: {hdv_dim}"}, status_code=400)
         vlm_folder = VLM_VALUE_TO_FOLDER.get(vlm_model)
         if not vlm_folder:
             return JSONResponse({"ok": False, "error": f"Unknown vlm_model: {vlm_model}"}, status_code=400)
+        taskclip_ckpt = str(DECODER_DIR / vlm_folder / f"8Layer_4Head_HDV_{hdv_dim}" / "decoder.pt")
     else:
         taskclip_ckpt = DEFAULT_TASKCLIP_CKPT
+    # pick yolo ckpt
     yolo_ckpt = OD_VALUE_TO_CKPT.get(od_model)
     if not yolo_ckpt:
         return JSONResponse({"ok": False, "error": f"Unknown od_model size: {od_model}"}, status_code=400)
+    # save upload
     suffix = Path(upload.filename).suffix or ".jpg"
     job_id = uuid.uuid4().hex
     upload_path = UPLOAD_DIR / f"{job_id}{suffix}"
     upload_path.write_bytes(await upload.read())
+    # run
     try:
         out = runner.run(
+            image_path=str(upload_path),
+            task_id=int(task_id),
+            vlm_model=vlm_model,
+            od_model="yolo",
             yolo_ckpt=yolo_ckpt,
             score_function=score_function,
+            hdv_dim=int(hdv_dim),
+            taskclip_ckpt=taskclip_ckpt,
             viz_mode=viz_mode,
         )
     except Exception as e:
         return JSONResponse({"ok": False, "error": repr(e)}, status_code=500)
+    # save results
     job_dir = RESULT_DIR / job_id
     job_dir.mkdir(parents=True, exist_ok=True)
             "yolo": f"/results/{job_id}/yolo.jpg",
             "selected": f"/results/{job_id}/selected.jpg",
         },
+    }

webui/runner.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Dict, Any, List, Tuple
 import numpy as np
 import torch
@@ -8,91 +8,22 @@ from PIL import Image, ImageDraw
 from ultralytics import YOLO, SAM
-# from ImageBind.imagebind import data
-# from ImageBind.imagebind.models import imagebind_model
-# from ImageBind.imagebind.models.imagebind_model import ModalityType
-import sys
-from pathlib import Path
-REPO_ROOT = Path(__file__).resolve().parents[1]  # repo/
-sys.path.insert(0, str(REPO_ROOT / "ImageBind"))  # so "import imagebind" works
-from imagebind import data
-from imagebind.models import imagebind_model
-from imagebind.models.imagebind_model import ModalityType
-import open_clip
 from models.TaskCLIP import TaskCLIP
-def _draw_boxes_pil(
-    img: Image.Image,
-    boxes_xyxy: np.ndarray,
-    color: Tuple[int, int, int],
-    width: int = 3,
-) -> Image.Image:
-    out = img.copy()
-    draw = ImageDraw.Draw(out)
-    if boxes_xyxy is None or len(boxes_xyxy) == 0:
-        return out
-    for (x0, y0, x1, y1) in boxes_xyxy.tolist():
-        draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
-    return out
-def _crop_pil(img: Image.Image, bbox_list: List[List[float]]) -> Tuple[List[Image.Image], List[int]]:
-    """Return list of cropped PIL images + indices mapping back to bbox_list."""
-    W, H = img.size
-    crops = []
-    idxs = []
-    for i, (x0, y0, x1, y1) in enumerate(bbox_list):
-        x0 = max(0, min(W, int(x0)))
-        y0 = max(0, min(H, int(y0)))
-        x1 = max(0, min(W, int(x1)))
-        y1 = max(0, min(H, int(y1)))
-        if x1 <= x0 or y1 <= y0:
-            continue
-        crops.append(img.crop((x0, y0, x1, y1)))
-        idxs.append(i)
-    return crops, idxs
-def overlay_masks(
-    img: Image.Image,
-    masks: np.ndarray,
-    alpha: float = 0.40,
-    color: Tuple[int, int, int] = (255, 0, 0),
-) -> Image.Image:
-    if masks is None or len(masks) == 0:
-        return img
-    base = np.array(img).astype(np.float32)
-    union = np.any(masks.astype(bool), axis=0)  # (H, W)
-    if not np.any(union):
-        return img
-    overlay = base.copy()
-    overlay[union] = overlay[union] * 0.2 + np.array(color, dtype=np.float32) * 0.8
-    out = base * (1 - alpha) + overlay * alpha
-    return Image.fromarray(np.clip(out, 0, 255).astype(np.uint8))
 class ModelRunner:
-    """
-    WebUI runner:
-      - YOLO detects bboxes
-      - VLM (ImageBind or OpenCLIP) embeds text prompts and crops (+ global image)
-      - TaskCLIP scores and selects bboxes
-      - optionally visualize bbox or SAM masks
-    """
     def __init__(
         self,
         project_root: str,
         device: str = "cuda:0",
         yolo_ckpt: str = "./.checkpoints/yolo12x.pt",
         sam_ckpt: str = "./.checkpoints/sam2.1_l.pt",
         id2task_name_file: str = "./id2task_name.json",
         task2prompt_file: str = "./task20.json",
         threshold: float = 0.01,
@@ -107,101 +38,51 @@ class ModelRunner:
         self.cluster = bool(cluster)
         self.forward_thre = float(forward_thre)
-        # files
         self.id2task_name_path = (self.root / id2task_name_file).resolve()
         self.task2prompt_path = (self.root / task2prompt_file).resolve()
-        self.yolo_ckpt_path = (self.root / yolo_ckpt).resolve()
-        # load task metadata
         self.id2task_name = json.loads(self.id2task_name_path.read_text())
         self.task2prompt = json.loads(self.task2prompt_path.read_text())
         # caches
-        self._vlm_cache = {}
         self._yolo_cache = {}
         self._taskclip_cache = {}
         sam_ckpt_path = (self.root / sam_ckpt).resolve() if str(sam_ckpt).startswith(".") else Path(sam_ckpt)
         self.sam = SAM(str(sam_ckpt_path))
-        # lock for single GPU servers
         self._lock = torch.multiprocessing.RLock()
     def _get_yolo(self, ckpt_path: str):
-        ckpt_abs = str((self.root / ckpt_path).resolve()) if ckpt_path.startswith(".") else ckpt_path
         if ckpt_abs not in self._yolo_cache:
             self._yolo_cache[ckpt_abs] = YOLO(ckpt_abs)
         return self._yolo_cache[ckpt_abs]
-    def _get_vlm(self, vlm_model: str):
-        if vlm_model in self._vlm_cache:
-            return self._vlm_cache[vlm_model]
-        if vlm_model == "imagebind":
-            m = imagebind_model.imagebind_huge(pretrained=True).to(self.device).eval()
-            pack = {"kind": "imagebind", "model": m}
-        elif vlm_model == "vit-b":
-            m, _, preprocess = open_clip.create_model_and_transforms(
-                "ViT-B-32", pretrained="laion2b_s34b_b79k"
-            )
-            m = m.to(self.device).eval()
-            tokenizer = open_clip.get_tokenizer("ViT-B-32")
-            pack = {"kind": "openclip", "model": m, "preprocess": preprocess, "tokenizer": tokenizer}
-        elif vlm_model == "vit-l":
-            m, _, preprocess = open_clip.create_model_and_transforms(
-                "ViT-L-14", pretrained="laion2b_s32b_b82k"
-            )
-            m = m.to(self.device).eval()
-            tokenizer = open_clip.get_tokenizer("ViT-L-14")
-            pack = {"kind": "openclip", "model": m, "preprocess": preprocess, "tokenizer": tokenizer}
-        else:
-            raise ValueError(f"Unknown vlm_model: {vlm_model}")
-        self._vlm_cache[vlm_model] = pack
-        return pack
-    def _encode_vlm(self, vlm_model: str, prompt_use, seg_list, full_img_pil):
-        pack = self._get_vlm(vlm_model)
-        with torch.inference_mode():
-            if pack["kind"] == "imagebind":
-                input_pack = {
-                    ModalityType.TEXT: data.load_and_transform_text(prompt_use, self.device),
-                    ModalityType.VISION: data.read_and_transform_vision_data(seg_list, self.device),
-                }
-                emb = pack["model"](input_pack)
-                text_embeddings = emb[ModalityType.TEXT]
-                bbox_embeddings = emb[ModalityType.VISION]
-                input_pack2 = {ModalityType.VISION: data.read_and_transform_vision_data([full_img_pil], self.device)}
-                emb2 = pack["model"](input_pack2)
-                image_embedding = emb2[ModalityType.VISION].squeeze(0)
-                return text_embeddings, bbox_embeddings, image_embedding
-            # openclip branch
-            m = pack["model"]
-            preprocess = pack["preprocess"]
-            tokenizer = pack["tokenizer"]
-            # text
-            text = tokenizer(prompt_use).to(self.device)
-            text_embeddings = m.encode_text(text).float()
-            text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)
-            # bbox crops
-            crop_tensors = [preprocess(im) for im in seg_list]
-            crop_batch = torch.stack(crop_tensors, dim=0).to(self.device)
-            bbox_embeddings = m.encode_image(crop_batch).float()
-            bbox_embeddings = bbox_embeddings / bbox_embeddings.norm(dim=-1, keepdim=True)
-            # global image
-            img_tensor = preprocess(full_img_pil).unsqueeze(0).to(self.device)
-            image_embedding = m.encode_image(img_tensor).float().squeeze(0)
-            image_embedding = image_embedding / image_embedding.norm(dim=-1, keepdim=True)
-            return text_embeddings, bbox_embeddings, image_embedding
     def list_task_ids(self) -> List[int]:
         ids = []
         for k in self.id2task_name.keys():
@@ -211,73 +92,16 @@ class ModelRunner:
                 pass
         return sorted(ids)
-    @staticmethod
-    def _unwrap_state_dict(obj: Any) -> Dict[str, torch.Tensor]:
-        # supports {"state_dict": ...} style checkpoints
-        if isinstance(obj, dict) and "state_dict" in obj and isinstance(obj["state_dict"], dict):
-            return obj["state_dict"]
-        if isinstance(obj, dict):
-            return obj
-        raise TypeError(f"Unsupported checkpoint format: {type(obj)}")
-    def _infer_ckpt_flags(self, state: Dict[str, torch.Tensor]) -> Tuple[bool, bool, int]:
-        # infer (is_hdc, has_cross_attention, ckpt_d_model)
-        keys = list(state.keys())
-        is_hdc = any(k.startswith("ScoreFunction.HDReason.") for k in keys)
-        has_cross = any("cross_attn_text" in k for k in keys)
-        if "decoder_norm.weight" in state:
-            ckpt_d_model = int(state["decoder_norm.weight"].shape[0])
-        elif "ScoreFunction.norm.weight" in state:
-            ckpt_d_model = int(state["ScoreFunction.norm.weight"].shape[0])
-        else:
-            ckpt_d_model = -1
-        return is_hdc, has_cross, ckpt_d_model
-    def _get_taskclip(
-        self,
-        ckpt_path: str,
-        d_model: int,
-        n_words: int,
-        score_function: str,
-        hdv_dim: int,
-        cross_attention: bool,
-    ):
-        ckpt_abs = str((self.root / ckpt_path).resolve()) if ckpt_path.startswith(".") else ckpt_path
         if not Path(ckpt_abs).exists():
             raise FileNotFoundError(f"TaskCLIP checkpoint not found: {ckpt_abs}")
         eff_hdv_dim = int(hdv_dim) if score_function == "HDC" else 0
-        # IMPORTANT: cache key must include cross_attention + score_function
-        key = (ckpt_abs, int(d_model), int(n_words), str(score_function), int(eff_hdv_dim), bool(cross_attention))
         if key in self._taskclip_cache:
             return self._taskclip_cache[key]
-        state_raw = torch.load(ckpt_abs, map_location="cpu")
-        state = self._unwrap_state_dict(state_raw)
-        ckpt_is_hdc, ckpt_has_cross, ckpt_d_model = self._infer_ckpt_flags(state)
-        # Validate score_function against checkpoint
-        if score_function == "HDC" and not ckpt_is_hdc:
-            raise RuntimeError(f"Checkpoint is NOT HDC but score_function=HDC was selected. ckpt={ckpt_abs}")
-        if score_function != "HDC" and ckpt_is_hdc:
-            raise RuntimeError(f"Checkpoint IS HDC but score_function=default was selected. ckpt={ckpt_abs}")
-        # Validate cross_attention against checkpoint (your training differs by family)
-        if bool(cross_attention) != bool(ckpt_has_cross):
-            raise RuntimeError(
-                f"cross_attention mismatch: runtime={cross_attention} but checkpoint has_cross_attention={ckpt_has_cross}. ckpt={ckpt_abs}"
-            )
-        # Validate d_model against checkpoint
-        if ckpt_d_model != -1 and int(d_model) != int(ckpt_d_model):
-            raise RuntimeError(
-                f"d_model mismatch: VLM produced d_model={int(d_model)} but checkpoint expects d_model={int(ckpt_d_model)}. ckpt={ckpt_abs}"
-            )
         model_config = {
             "num_layers": 8,
             "norm": None,
@@ -297,52 +121,31 @@ class ModelRunner:
             "norm_after": False,
             "MIN_VAL": 10.0,
             "MAX_VAL": 30.0,
-            "cross_attention": bool(cross_attention),
             "score_function": "HDC" if score_function == "HDC" else "default",
             "HDV_D": int(eff_hdv_dim),
         }
         m = TaskCLIP(model_config, normalize_before=model_config["normalize_before"], device=model_config["device"])
         m.load_state_dict(state, strict=True)
         m = m.to(self.device).eval()
         self._taskclip_cache[key] = m
         return m
-    def _find_same_class(self, predict_res, score, visited, i, classes, confs, forward_thre):
-        cls_i = classes[i]
-        for j in range(len(score)):
-            if visited[j] == 1:
-                continue
-            if classes[j] == cls_i and float(score[j]) > forward_thre:
-                visited[j] = 1
-                predict_res[j]["category_id"] = 1
-                predict_res[j]["score"] = float(score[j])
     def _sam_masks_from_bboxes(self, image_path: str, bbox_list: List[List[float]], img_h: int, img_w: int) -> np.ndarray:
         if not bbox_list:
             return np.zeros((0, img_h, img_w), dtype=bool)
         bboxes = [[float(x0), float(y0), float(x1), float(y1)] for x0, y0, x1, y1 in bbox_list]
-        try:
-            res = self.sam(image_path, bboxes=bboxes)
-            r0 = res[0]
-            if r0.masks is None:
-                return np.zeros((0, img_h, img_w), dtype=bool)
-            masks = r0.masks.data.detach().cpu().numpy().astype(bool)
-            return masks
-        except Exception:
-            masks_list = []
-            for bb in bboxes:
-                rr = self.sam(image_path, bboxes=bb)[0]
-                if rr.masks is None:
-                    continue
-                m = rr.masks.data.detach().cpu().numpy().astype(bool)
-                masks_list.append(m[0])
-            if len(masks_list) == 0:
-                return np.zeros((0, img_h, img_w), dtype=bool)
-            return np.stack(masks_list, axis=0)
     def run(
         self,
@@ -356,47 +159,41 @@ class ModelRunner:
         taskclip_ckpt: str = "./test_model/default/decoder.pt",
         viz_mode: str = "bbox",
     ) -> Dict[str, Any]:
-        if vlm_model not in ["imagebind", "vit-b", "vit-l"]:
-            raise ValueError(f"Unknown vlm_model: {vlm_model}")
         if od_model != "yolo":
-            raise ValueError("Currently only od_model='yolo' is supported.")
-        if viz_mode not in ["bbox", "mask"]:
-            raise ValueError(f"Unknown viz_mode={viz_mode}")
-        # training truth:
-        # - default used cross_attention=True
-        # - HDC used cross_attention=False
-        cross_attention = (score_function != "HDC")
         with self._lock:
             img = Image.open(image_path).convert("RGB")
             task_name = self.id2task_name[str(task_id)]
             prompt_words = self.task2prompt[task_name]
             prompt_use = ["The item is " + w for w in prompt_words]
-            # YOLO detect
             yolo = self._get_yolo(yolo_ckpt)
             outputs = yolo(image_path)
             bbox_list = outputs[0].boxes.xyxy.tolist()
             classes = outputs[0].boxes.cls.tolist()
             confidences = outputs[0].boxes.conf.tolist()
-            H, W = img.size[1], img.size[0]
             all_boxes = np.asarray(bbox_list, dtype=np.float32)
-            # visualize all detections
             if viz_mode == "bbox":
                 img_yolo = _draw_boxes_pil(img, all_boxes, color=(0, 255, 0), width=3)
-                all_masks = None
-            else:
                 all_masks = self._sam_masks_from_bboxes(image_path, bbox_list, img_h=H, img_w=W)
                 img_yolo = overlay_masks(img, all_masks, alpha=0.35, color=(0, 255, 0))
-            # crop bboxes
-            seg_list, seg_idxs = _crop_pil(img, bbox_list)
             if len(seg_list) == 0:
                 return {
                     "task_id": task_id,
@@ -406,95 +203,33 @@ class ModelRunner:
                     "images": {"original": img, "yolo": img_yolo, "selected": img.copy()},
                 }
-            # VLM embeddings
-            text_embeddings, bbox_embeddings, image_embedding = self._encode_vlm(
-                vlm_model=vlm_model,
-                prompt_use=prompt_use,
-                seg_list=seg_list,
-                full_img_pil=img,
-            )
-            # Ensure dims are consistent
-            if int(bbox_embeddings.shape[-1]) != int(image_embedding.shape[-1]):
-                raise RuntimeError(
-                    f"Embedding dim mismatch: bbox_embeddings dim={bbox_embeddings.shape[-1]} vs image_embedding dim={image_embedding.shape[-1]}"
-                )
-            d_model = int(image_embedding.shape[-1])
-            n_words = int(text_embeddings.shape[0])
-            # TaskCLIP (load correct arch)
             taskclip = self._get_taskclip(
                 ckpt_path=taskclip_ckpt,
-                d_model=d_model,
-                n_words=n_words,
                 score_function=score_function,
                 hdv_dim=hdv_dim,
-                cross_attention=cross_attention,
             )
-            # Score
-            with torch.inference_mode():
-                tgt = bbox_embeddings
-                memory = text_embeddings
-                image_embedding_2d = image_embedding.view(1, -1)
-                _, _, score_res, _ = taskclip(tgt, memory, image_embedding_2d)
                 score = score_res.view(-1).detach().cpu().numpy().tolist()
-            # post-process
-            predict_res = []
-            for i in range(len(bbox_list)):
-                predict_res.append({"category_id": -1, "score": -1, "class": int(classes[i])})
-            visited = [0] * len(score)
-            for i, x in enumerate(score):
-                if visited[i] == 1:
-                    continue
-                if float(x) > self.threshold:
-                    visited[i] = 1
-                    predict_res[i]["category_id"] = 1
-                    predict_res[i]["score"] = float(x)
-                    if self.forward:
-                        self._find_same_class(predict_res, score, visited, i, classes, confidences, self.forward_thre)
-                else:
-                    predict_res[i]["category_id"] = 0
-                    predict_res[i]["score"] = 1.0 - float(x)
-            # cluster optimization
-            if self.cluster and self.forward and len(seg_list) > 1:
-                cluster_scores = {}
-                for p in predict_res:
-                    if int(p["category_id"]) == 1:
-                        c = p["class"]
-                        cluster_scores.setdefault(c, []).append(p["score"])
-                if len(cluster_scores) > 1:
-                    cluster_ave = {c: float(np.mean(v)) for c, v in cluster_scores.items()}
-                    select_class = max(cluster_ave, key=lambda k: cluster_ave[k])
-                    for p in predict_res:
-                        if p["category_id"] == 1 and p["class"] != select_class:
-                            p["category_id"] = 0
-            selected_indices = [i for i, p in enumerate(predict_res) if int(p["category_id"]) == 1]
-            selected_boxes = all_boxes[selected_indices] if len(selected_indices) > 0 else np.zeros((0, 4), dtype=np.float32)
-            # visualize selected
-            if viz_mode == "bbox":
-                img_selected = _draw_boxes_pil(img, selected_boxes, color=(255, 0, 0), width=4)
-            else:
-                if all_masks is not None and all_masks.shape[0] > 0 and len(selected_indices) > 0:
-                    sel_masks = all_masks[selected_indices]
-                else:
-                    sel_masks = np.zeros((0, H, W), dtype=bool)
-                img_selected = overlay_masks(img, sel_masks, alpha=0.45, color=(255, 0, 0))
-            return {
-                "task_id": task_id,
-                "task_name": task_name,
-                "bbox_list": bbox_list,
-                "classes": classes,
-                "confidences": confidences,
-                "scores": score,
-                "selected_indices": selected_indices,
-                "images": {"original": img, "yolo": img_yolo, "selected": img_selected},
-            }

 import json
 from pathlib import Path
+from typing import Dict, Any, List, Tuple, Optional
 import numpy as np
 import torch
 from ultralytics import YOLO, SAM
+from ImageBind.imagebind import data
+from ImageBind.imagebind.models import imagebind_model
+from ImageBind.imagebind.models.imagebind_model import ModalityType
 from models.TaskCLIP import TaskCLIP
+# ... keep your helper funcs _draw_boxes_pil/_crop_pil/overlay_masks ...
 class ModelRunner:
     def __init__(
         self,
         project_root: str,
         device: str = "cuda:0",
         yolo_ckpt: str = "./.checkpoints/yolo12x.pt",
         sam_ckpt: str = "./.checkpoints/sam2.1_l.pt",
+        imagebind_ckpt: Optional[str] = None,   # NEW
         id2task_name_file: str = "./id2task_name.json",
         task2prompt_file: str = "./task20.json",
         threshold: float = 0.01,
         self.cluster = bool(cluster)
         self.forward_thre = float(forward_thre)
+        # metadata
         self.id2task_name_path = (self.root / id2task_name_file).resolve()
         self.task2prompt_path = (self.root / task2prompt_file).resolve()
         self.id2task_name = json.loads(self.id2task_name_path.read_text())
         self.task2prompt = json.loads(self.task2prompt_path.read_text())
         # caches
         self._yolo_cache = {}
         self._taskclip_cache = {}
+        # YOLO path (kept for reference; actual YOLO models are cached per ckpt in _get_yolo)
+        self.yolo_ckpt_path = (self.root / yolo_ckpt).resolve() if str(yolo_ckpt).startswith(".") else Path(yolo_ckpt)
+        # ---- SAM load ONCE (from absolute or repo-relative path) ----
         sam_ckpt_path = (self.root / sam_ckpt).resolve() if str(sam_ckpt).startswith(".") else Path(sam_ckpt)
         self.sam = SAM(str(sam_ckpt_path))
+        # ---- ImageBind load ONCE ----
+        # If you provide imagebind_huge.pth from weights repo, use it.
+        # Otherwise fall back to pretrained=True behavior.
+        self.vlm_model = imagebind_model.imagebind_huge(pretrained=False).to(self.device).eval()
+        if imagebind_ckpt:
+            ckpt_path = (self.root / imagebind_ckpt).resolve() if str(imagebind_ckpt).startswith(".") else Path(imagebind_ckpt)
+            if ckpt_path.exists():
+                state = torch.load(str(ckpt_path), map_location="cpu")
+                # robust handling of different checkpoint formats
+                if isinstance(state, dict) and "model" in state and isinstance(state["model"], dict):
+                    state = state["model"]
+                elif isinstance(state, dict) and "state_dict" in state and isinstance(state["state_dict"], dict):
+                    state = state["state_dict"]
+                self.vlm_model.load_state_dict(state, strict=False)
+            else:
+                # fallback if file missing
+                self.vlm_model = imagebind_model.imagebind_huge(pretrained=True).to(self.device).eval()
+        else:
+            self.vlm_model = imagebind_model.imagebind_huge(pretrained=True).to(self.device).eval()
         self._lock = torch.multiprocessing.RLock()
     def _get_yolo(self, ckpt_path: str):
+        ckpt_abs = str((self.root / ckpt_path).resolve()) if str(ckpt_path).startswith(".") else str(ckpt_path)
         if ckpt_abs not in self._yolo_cache:
             self._yolo_cache[ckpt_abs] = YOLO(ckpt_abs)
         return self._yolo_cache[ckpt_abs]
     def list_task_ids(self) -> List[int]:
         ids = []
         for k in self.id2task_name.keys():
                 pass
         return sorted(ids)
+    def _get_taskclip(self, ckpt_path: str, d_model: int, n_words: int, score_function: str, hdv_dim: int):
+        ckpt_abs = str((self.root / ckpt_path).resolve()) if str(ckpt_path).startswith(".") else str(ckpt_path)
         if not Path(ckpt_abs).exists():
             raise FileNotFoundError(f"TaskCLIP checkpoint not found: {ckpt_abs}")
         eff_hdv_dim = int(hdv_dim) if score_function == "HDC" else 0
+        key = (ckpt_abs, int(d_model), int(n_words), str(score_function), eff_hdv_dim)
         if key in self._taskclip_cache:
             return self._taskclip_cache[key]
         model_config = {
             "num_layers": 8,
             "norm": None,
             "norm_after": False,
             "MIN_VAL": 10.0,
             "MAX_VAL": 30.0,
+            "cross_attention": True,  # keep consistent with how your checkpoint was trained
             "score_function": "HDC" if score_function == "HDC" else "default",
             "HDV_D": int(eff_hdv_dim),
         }
         m = TaskCLIP(model_config, normalize_before=model_config["normalize_before"], device=model_config["device"])
+        state = torch.load(ckpt_abs, map_location="cpu")
         m.load_state_dict(state, strict=True)
         m = m.to(self.device).eval()
         self._taskclip_cache[key] = m
         return m
     def _sam_masks_from_bboxes(self, image_path: str, bbox_list: List[List[float]], img_h: int, img_w: int) -> np.ndarray:
         if not bbox_list:
             return np.zeros((0, img_h, img_w), dtype=bool)
         bboxes = [[float(x0), float(y0), float(x1), float(y1)] for x0, y0, x1, y1 in bbox_list]
+        # multi-box call
+        res = self.sam(image_path, bboxes=bboxes)
+        r0 = res[0]
+        if r0.masks is None:
+            return np.zeros((0, img_h, img_w), dtype=bool)
+        return r0.masks.data.detach().cpu().numpy().astype(bool)
     def run(
         self,
         taskclip_ckpt: str = "./test_model/default/decoder.pt",
         viz_mode: str = "bbox",
     ) -> Dict[str, Any]:
+        if vlm_model != "imagebind":
+            raise ValueError("This runner.py currently implements ImageBind only (your OpenCLIP version was in the other runner).")
         if od_model != "yolo":
+            raise ValueError("Only od_model='yolo' supported.")
         with self._lock:
             img = Image.open(image_path).convert("RGB")
             task_name = self.id2task_name[str(task_id)]
             prompt_words = self.task2prompt[task_name]
             prompt_use = ["The item is " + w for w in prompt_words]
+            # YOLO
             yolo = self._get_yolo(yolo_ckpt)
             outputs = yolo(image_path)
             bbox_list = outputs[0].boxes.xyxy.tolist()
             classes = outputs[0].boxes.cls.tolist()
             confidences = outputs[0].boxes.conf.tolist()
             all_boxes = np.asarray(bbox_list, dtype=np.float32)
+            H = img.size[1]
+            W = img.size[0]
+            # IMPORTANT: only run SAM if viz_mode == mask
+            all_masks = None
             if viz_mode == "bbox":
                 img_yolo = _draw_boxes_pil(img, all_boxes, color=(0, 255, 0), width=3)
+            elif viz_mode == "mask":
                 all_masks = self._sam_masks_from_bboxes(image_path, bbox_list, img_h=H, img_w=W)
                 img_yolo = overlay_masks(img, all_masks, alpha=0.35, color=(0, 255, 0))
+            else:
+                raise ValueError(f"Unknown viz_mode={viz_mode}")
+            # crops
+            seg_list, _ = _crop_pil(img, bbox_list)
             if len(seg_list) == 0:
                 return {
                     "task_id": task_id,
                     "images": {"original": img, "yolo": img_yolo, "selected": img.copy()},
                 }
+            # ImageBind embeddings
+            with torch.no_grad():
+                input_pack = {
+                    ModalityType.TEXT: data.load_and_transform_text(prompt_use, self.device),
+                    ModalityType.VISION: data.read_and_transform_vision_data(seg_list, self.device),
+                }
+                emb = self.vlm_model(input_pack)
+                text_embeddings = emb[ModalityType.TEXT]
+                bbox_embeddings = emb[ModalityType.VISION]
+                input_pack2 = {ModalityType.VISION: data.read_and_transform_vision_data([img], self.device)}
+                emb2 = self.vlm_model(input_pack2)
+                image_embedding = emb2[ModalityType.VISION].squeeze(0)
+            # TaskCLIP
             taskclip = self._get_taskclip(
                 ckpt_path=taskclip_ckpt,
+                d_model=int(image_embedding.shape[-1]),
+                n_words=int(text_embeddings.shape[0]),
                 score_function=score_function,
                 hdv_dim=hdv_dim,
             )
+            with torch.no_grad():
+                _, _, score_res, _ = taskclip(bbox_embeddings, text_embeddings, image_embedding.view(1, -1))
                 score = score_res.view(-1).detach().cpu().numpy().tolist()
+            # ... keep your postprocess/selection logic unchanged ...
+            # (use your existing code below this point)
+            # return dict unchanged

webui/weights.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# webui/weights.py
+import os
+from pathlib import Path
+from huggingface_hub import snapshot_download
+def get_weights_dir(repo_id: str) -> Path:
+    token = os.getenv("HF_TOKEN")  # only needed if repo is private
+    p = snapshot_download(
+        repo_id=repo_id,
+        local_dir="weights_cache",
+        local_dir_use_symlinks=False,
+        token=token,
+    )
+    return Path(p).resolve()