bugfix
util.py CHANGED
@@ -1,9 +1,22 @@
-
+
+# util.py (patched cache handling for HF Spaces)
 import os
+from pathlib import Path
+
+# Put every cache under /tmp (always writable in Spaces)
+CACHE_DIR = os.getenv("HF_CACHE_DIR", "/tmp/hf-cache")
+Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
+
+# Make sure libraries don't fall back to "~/.cache" -> "/.cache"
+os.environ.setdefault("HF_HOME", CACHE_DIR)
+os.environ.setdefault("TRANSFORMERS_CACHE", CACHE_DIR)
+os.environ.setdefault("HUGGINGFACE_HUB_CACHE", CACHE_DIR)
+os.environ.setdefault("XDG_CACHE_HOME", CACHE_DIR)
+os.environ.setdefault("TORCH_HOME", CACHE_DIR)
+
 import threading
 from io import BytesIO
-from typing import List, Sequence
-
+from typing import List, Sequence
 import torch
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForVision2Seq
@@ -11,17 +24,13 @@ from transformers.image_utils import load_image as hf_load_image
 
 
 class SmolVLMRunner:
-    """
-    Thin wrapper around HuggingFaceTB/SmolVLM-Instruct for single/multi-image VQA or captioning.
-    Reuses a single model instance across calls and serializes inference with a lock (GPU friendly).
-    """
-
     def __init__(self, model_id: str | None = None, device: str | None = None):
         self.model_id = model_id or os.getenv("SMOLVLM_MODEL_ID", "HuggingFaceTB/SmolVLM-Instruct")
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
 
-        self.processor = AutoProcessor.from_pretrained(self.model_id)
+        # Use the writable cache dir explicitly
+        self.processor = AutoProcessor.from_pretrained(self.model_id, cache_dir=CACHE_DIR)
 
         attn_impl = "flash_attention_2" if self.device == "cuda" else "eager"
         try:
@@ -29,63 +38,40 @@ class SmolVLMRunner:
                 self.model_id,
                 torch_dtype=self.dtype,
                 _attn_implementation=attn_impl,
+                cache_dir=CACHE_DIR,
             ).to(self.device)
         except Exception:
-            # Fallback if flash-attn isn't available
+            # Fallback if flash-attn isn't available in the environment
             self.model = AutoModelForVision2Seq.from_pretrained(
                 self.model_id,
                 torch_dtype=self.dtype,
                 _attn_implementation="eager",
+                cache_dir=CACHE_DIR,
             ).to(self.device)
 
         self.model.eval()
         self._lock = threading.Lock()
 
-    # ---------- Image loading helpers ----------
-
     @staticmethod
     def _ensure_rgb(img: Image.Image) -> Image.Image:
         return img.convert("RGB") if img.mode != "RGB" else img
 
     @classmethod
    def load_pil_from_urls(cls, urls: Sequence[str]) -> List[Image.Image]:
-
-        images: List[Image.Image] = []
-        for u in urls:
-            img = hf_load_image(u)
-            images.append(cls._ensure_rgb(img))
-        return images
+        return [cls._ensure_rgb(hf_load_image(u)) for u in urls]
 
     @classmethod
     def load_pil_from_bytes(cls, blobs: Sequence[bytes]) -> List[Image.Image]:
-
-        images: List[Image.Image] = []
-        for b in blobs:
-            img = Image.open(BytesIO(b))
-            images.append(cls._ensure_rgb(img))
-        return images
-
-    # ---------- Core inference ----------
-
-    def generate(
-        self,
-        prompt: str,
-        images: Sequence[Image.Image],
-        max_new_tokens: int = 300,
-        temperature: float | None = None,
-        top_p: float | None = None,
-    ) -> str:
-        """
-        Run generation with 0+ images (text-only works too).
-        """
-        # Build chat template: one "image" token per provided image, then the text.
+        return [cls._ensure_rgb(Image.open(BytesIO(b))) for b in blobs]
+
+    def generate(self, prompt: str, images: Sequence[Image.Image], max_new_tokens: int = 300,
+                 temperature: float | None = None, top_p: float | None = None) -> str:
         content = [{"type": "image"} for _ in images] + [{"type": "text", "text": prompt}]
         messages = [{"role": "user", "content": content}]
-
         chat_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
 
         inputs = self.processor(text=chat_prompt, images=list(images), return_tensors="pt")
-        inputs = {k: v.to(self.device) if hasattr(v, "to") else v for k, v in inputs.items()}
+        inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
 
         gen_kwargs = dict(max_new_tokens=max_new_tokens)
         if temperature is not None:
@@ -97,19 +83,16 @@
         generated_ids = self.model.generate(**inputs, **gen_kwargs)
 
         text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        # Many chat templates prepend "Assistant: "
         if text.startswith("Assistant:"):
-            text = text[len("Assistant:")
+            text = text[len("Assistant:"):].strip()
         return text
 
 
-
-
-
-
-def get_runner() -> SmolVLMRunner:
+_runner_singleton = None
+def get_runner():
     global _runner_singleton
     if _runner_singleton is None:
         _runner_singleton = SmolVLMRunner()
     return _runner_singleton
 
+
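
For reference, a quick sanity check (hypothetical, not part of the commit) that the cache redirection takes effect; it assumes the patched file ships as util.py and is imported before transformers reads its cache settings:

# check_cache.py -- hypothetical helper, not part of this commit.
import util  # util's top-level code creates /tmp/hf-cache and sets the env defaults

import os

for var in ("HF_HOME", "TRANSFORMERS_CACHE", "HUGGINGFACE_HUB_CACHE",
            "XDG_CACHE_HOME", "TORCH_HOME"):
    # setdefault only fills unset vars, so values the Space already sets still win
    print(f"{var} -> {os.environ.get(var)}")

print("cache dir writable:", os.access(util.CACHE_DIR, os.W_OK))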
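
And a minimal usage sketch against the API this file exposes (the image URL is a placeholder):

from util import get_runner

runner = get_runner()  # module-level singleton; loads the model once
images = runner.load_pil_from_urls([
    "https://example.com/photo.jpg",  # placeholder URL
])
print(runner.generate("Describe this image.", images, max_new_tokens=128))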