reygml committed on
Commit
8b95053
·
1 Parent(s): 69f3c04

initial commit

Browse files
Files changed (4) hide show
  1. Dockerfile +19 -0
  2. app.py +98 -0
  3. requirements.txt +23 -0
  4. util.py +115 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

# FIX: the pre-built flash-attn wheel below is tagged cp310 (CPython 3.10 only),
# and util.py uses PEP 604 unions (str | None) which require Python >= 3.10.
# python:3.9 would fail both the wheel install and the app import.
FROM python:3.10

# Run as a non-root user (required by HF Spaces Docker SDK).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install the prebuilt flash-attn wheel without deps so it does not drag in a
# conflicting torch; requirements.txt pins torch==2.4.0 (matching cu12torch2.4).
RUN pip install --no-deps "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.0.post2/flash_attn-2.7.0.post2+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"

# Copy requirements first so the dependency layer is cached across code edits.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import asyncio
3
+ from typing import List, Optional
4
+
5
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
6
+ from pydantic import BaseModel, Field, HttpUrl
7
+ import uvicorn
8
+
9
+ from util import get_runner, SmolVLMRunner
10
+
11
+
12
+ app = FastAPI(title="SmolVLM Inference API", version="1.0.0")
13
+ _runner: Optional[SmolVLMRunner] = None
14
+
15
+
16
class URLRequest(BaseModel):
    """Request body for POST /generate_urls: a text prompt plus remote image URLs."""

    prompt: str = Field(..., description="Text prompt to accompany the images.")
    image_urls: List[HttpUrl] = Field(..., description="List of image URLs.")
    # Generation controls; validation bounds keep obviously bad values out
    # before they ever reach the model.
    max_new_tokens: int = Field(300, ge=1, le=1024)
    temperature: Optional[float] = Field(None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(None, gt=0.0, le=1.0)
22
+
23
+
24
@app.on_event("startup")
async def _load_model_on_startup():
    """Create the shared SmolVLM runner once, when the server boots."""
    # Loading happens here (not at import time) so the ASGI app object can be
    # created cheaply; the module-level _runner is populated for the endpoints.
    global _runner
    _runner = get_runner()
28
+
29
+
30
@app.get("/")
def health():
    """Liveness probe: reports whether the model runner has been created yet."""
    loaded_model = _runner.model_id if _runner else None
    return {"status": "ok", "model": loaded_model}
33
+
34
+
35
@app.post("/generate")
async def generate_from_files(
    prompt: str = Form(...),
    images: List[UploadFile] = File(..., description="One or more image files."),
    max_new_tokens: int = Form(300),
    temperature: Optional[float] = Form(None),
    top_p: Optional[float] = Form(None),
):
    """
    Multipart form endpoint:
      - prompt: str
      - images: one or more image files (image/*)

    Raises 503 if the model has not finished loading, 400 for no images,
    415 for non-image uploads.
    """
    if _runner is None:
        # Startup hook has not populated the runner yet (or it failed).
        raise HTTPException(status_code=503, detail="Model is not loaded yet.")
    if not images:
        raise HTTPException(status_code=400, detail="At least one image must be provided.")

    # Read all files into memory (simple & fine for moderate sizes)
    blobs = []
    for f in images:
        if not f.content_type or not f.content_type.startswith("image/"):
            raise HTTPException(status_code=415, detail=f"Unsupported file type: {f.content_type}")
        blobs.append(await f.read())

    pil_images = _runner.load_pil_from_bytes(blobs)
    # BUGFIX: model inference is synchronous and slow; calling it directly in
    # an async endpoint would block the event loop for every other request
    # (including the health check). Run it in a worker thread instead.
    text = await asyncio.to_thread(
        _runner.generate,
        prompt=prompt,
        images=pil_images,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return {"text": text}
67
+
68
+
69
@app.post("/generate_urls")
async def generate_from_urls(req: URLRequest):
    """
    JSON endpoint:
    {
      "prompt": "...",
      "image_urls": ["https://...","https://..."],
      "max_new_tokens": 300,
      "temperature": 0.2,
      "top_p": 0.95
    }

    Raises 503 if the model has not finished loading, 400 for an empty URL list.
    """
    if _runner is None:
        # Startup hook has not populated the runner yet (or it failed).
        raise HTTPException(status_code=503, detail="Model is not loaded yet.")
    if len(req.image_urls) == 0:
        raise HTTPException(status_code=400, detail="At least one image URL is required.")

    # BUGFIX: both the URL downloads and the model forward pass are blocking;
    # run them in worker threads so the event loop stays responsive.
    pil_images = await asyncio.to_thread(
        _runner.load_pil_from_urls, [str(u) for u in req.image_urls]
    )
    text = await asyncio.to_thread(
        _runner.generate,
        prompt=req.prompt,
        images=pil_images,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        top_p=req.top_p,
    )
    return {"text": text}
93
+
94
+
95
if __name__ == "__main__":
    # Local development entry point; the container launches uvicorn via CMD.
    # Equivalent CLI: uvicorn app:app --host 0.0.0.0 --port 8000
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
98
+
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu121
2
+
3
+ fastapi
4
+ uvicorn[standard]
5
+ torch==2.4.0
6
+ torchvision==0.19.0
7
+ pillow==10.4.0
8
+ imageio==2.36.1
9
+ imageio-ffmpeg==0.5.1
10
+ accelerate
11
+ diffusers
12
+ peft
13
+ sentencepiece
14
+ bitsandbytes
15
+ gguf
16
+ pypdfium2
17
+ icecream
18
+ einops
19
# Pillow  (duplicate: already pinned above as pillow==10.4.0)
20
+ gradio
21
+ xformers==0.0.27.post2
22
+ spconv-cu120==2.3.6
23
+ transformers==4.46.3
util.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # util.py
2
+ import os
3
+ import threading
4
+ from io import BytesIO
5
+ from typing import List, Sequence, Union
6
+
7
+ import torch
8
+ from PIL import Image
9
+ from transformers import AutoProcessor, AutoModelForVision2Seq
10
+ from transformers.image_utils import load_image as hf_load_image
11
+
12
+
13
class SmolVLMRunner:
    """
    Thin wrapper around HuggingFaceTB/SmolVLM-Instruct for single/multi-image VQA or captioning.

    Reuses a single model instance across calls and serializes inference with a
    lock so concurrent requests do not race on the GPU.
    """

    def __init__(self, model_id: Union[str, None] = None, device: Union[str, None] = None):
        """
        Load processor and model once.

        Args:
            model_id: HF model id; defaults to $SMOLVLM_MODEL_ID or SmolVLM-Instruct.
            device: "cuda" or "cpu"; auto-detected when omitted.
        """
        self.model_id = model_id or os.getenv("SMOLVLM_MODEL_ID", "HuggingFaceTB/SmolVLM-Instruct")
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # bf16 on GPU for memory/speed; fp32 on CPU where bf16 is slow/unsupported.
        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32

        self.processor = AutoProcessor.from_pretrained(self.model_id)

        attn_impl = "flash_attention_2" if self.device == "cuda" else "eager"
        try:
            self.model = AutoModelForVision2Seq.from_pretrained(
                self.model_id,
                torch_dtype=self.dtype,
                _attn_implementation=attn_impl,
            ).to(self.device)
        except Exception:
            # Fallback if flash-attn isn't available
            self.model = AutoModelForVision2Seq.from_pretrained(
                self.model_id,
                torch_dtype=self.dtype,
                _attn_implementation="eager",
            ).to(self.device)

        self.model.eval()
        self._lock = threading.Lock()

    # ---------- Image loading helpers ----------

    @staticmethod
    def _ensure_rgb(img: Image.Image) -> Image.Image:
        """Convert to RGB only when needed (e.g. RGBA/P/L inputs)."""
        return img.convert("RGB") if img.mode != "RGB" else img

    @classmethod
    def load_pil_from_urls(cls, urls: Sequence[str]) -> List[Image.Image]:
        """Load images from HTTP/HTTPS URLs using HF's helper."""
        return [cls._ensure_rgb(hf_load_image(u)) for u in urls]

    @classmethod
    def load_pil_from_bytes(cls, blobs: Sequence[bytes]) -> List[Image.Image]:
        """Load images from raw bytes (e.g., FastAPI uploads)."""
        return [cls._ensure_rgb(Image.open(BytesIO(b))) for b in blobs]

    # ---------- Core inference ----------

    def generate(
        self,
        prompt: str,
        images: Sequence[Image.Image],
        max_new_tokens: int = 300,
        temperature: Union[float, None] = None,
        top_p: Union[float, None] = None,
    ) -> str:
        """
        Run generation with 0+ images (text-only works too).

        Returns only the newly generated assistant text; the echoed prompt
        tokens are stripped before decoding.
        """
        # Build chat template: one "image" token per provided image, then the text.
        content = [{"type": "image"} for _ in images] + [{"type": "text", "text": prompt}]
        messages = [{"role": "user", "content": content}]

        chat_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)

        inputs = self.processor(text=chat_prompt, images=list(images), return_tensors="pt")
        inputs = {k: v.to(self.device) if hasattr(v, "to") else v for k, v in inputs.items()}

        gen_kwargs = dict(max_new_tokens=max_new_tokens)
        if temperature is not None:
            gen_kwargs["temperature"] = float(temperature)
        if top_p is not None:
            gen_kwargs["top_p"] = float(top_p)
        if temperature is not None or top_p is not None:
            # BUGFIX: temperature/top_p are ignored by HF generate() under the
            # default greedy decoding; enable sampling so they take effect.
            gen_kwargs["do_sample"] = True

        with self._lock, torch.inference_mode():
            generated_ids = self.model.generate(**inputs, **gen_kwargs)

        # BUGFIX: the model returns prompt + completion in one sequence, so
        # decoding generated_ids wholesale echoed the user's prompt back to the
        # caller. Slice off the input tokens and decode only the new ones.
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = generated_ids[:, prompt_len:]
        text = self.processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
        # Some chat templates still prepend "Assistant: " to the completion.
        if text.startswith("Assistant:"):
            text = text[len("Assistant:") :].strip()
        return text
104
+
105
+
106
# Convenience singleton (optional import path)
_runner_singleton: Union[SmolVLMRunner, None] = None


def get_runner() -> SmolVLMRunner:
    """Return the process-wide SmolVLMRunner, creating it lazily on first use."""
    global _runner_singleton
    runner = _runner_singleton
    if runner is None:
        runner = SmolVLMRunner()
        _runner_singleton = runner
    return runner
115
+