reygml commited on
Commit
b7bc425
·
1 Parent(s): 029cf9d

feat: grounding dino

Browse files
Files changed (4) hide show
  1. app.py +81 -3
  2. grounding_dino2.py +155 -0
  3. ui.py +62 -10
  4. util.py +73 -1
app.py CHANGED
@@ -1,17 +1,21 @@
1
  # app.py
2
  from time import perf_counter
3
- from typing import List, Optional
 
4
 
5
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
6
  from pydantic import BaseModel, Field, HttpUrl
 
7
  import uvicorn
8
 
9
  from util import get_runner, SmolVLMRunner
10
 
11
- app = FastAPI(title="SmolVLM Inference API", version="1.1.0")
12
  _runner: Optional[SmolVLMRunner] = None
13
 
14
 
 
 
15
  class URLRequest(BaseModel):
16
  prompt: str = Field(..., description="Text prompt to accompany the images.")
17
  image_urls: List[HttpUrl] = Field(..., description="List of image URLs.")
@@ -19,18 +23,32 @@ class URLRequest(BaseModel):
19
  temperature: Optional[float] = Field(None, ge=0.0, le=2.0)
20
  top_p: Optional[float] = Field(None, gt=0.0, le=1.0)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  @app.on_event("startup")
24
  async def _load_model_on_startup():
25
  global _runner
26
  _runner = get_runner()
27
 
28
-
29
  @app.get("/")
30
  def health():
31
  return {"status": "ok", "model": _runner.model_id if _runner else None}
32
 
33
 
 
 
34
  @app.post("/generate")
35
  async def generate_from_files(
36
  prompt: str = Form(...),
@@ -105,6 +123,66 @@ async def generate_from_urls(req: URLRequest):
105
  return {"text": text, "metrics": metrics}
106
 
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  if __name__ == "__main__":
109
  # Run with: python app.py (or: uvicorn app:app --host 0.0.0.0 --port 8000)
110
  uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
 
1
  # app.py
2
  from time import perf_counter
3
+ from io import BytesIO
4
+ from typing import List, Optional, Union
5
 
6
  from fastapi import FastAPI, UploadFile, File, Form, HTTPException
7
  from pydantic import BaseModel, Field, HttpUrl
8
+ from PIL import Image
9
  import uvicorn
10
 
11
  from util import get_runner, SmolVLMRunner
12
 
13
+ app = FastAPI(title="SmolVLM Inference API", version="1.2.0")
14
  _runner: Optional[SmolVLMRunner] = None
15
 
16
 
17
+ # ----------------------- Pydantic models -----------------------
18
+
19
  class URLRequest(BaseModel):
20
  prompt: str = Field(..., description="Text prompt to accompany the images.")
21
  image_urls: List[HttpUrl] = Field(..., description="List of image URLs.")
 
23
  temperature: Optional[float] = Field(None, ge=0.0, le=2.0)
24
  top_p: Optional[float] = Field(None, gt=0.0, le=1.0)
25
 
26
class DetectDescribeURLRequest(BaseModel):
    """Request body for /detect_describe_url: one remote image plus detector knobs."""

    # Publicly reachable image URL; fetched server-side before detection.
    image_url: HttpUrl
    # Either a comma-separated string ("a man,a dog") or an explicit list of phrases.
    labels: Union[str, List[str]]
    # Detection thresholds; bounded to (0, 1] for consistency with the
    # constrained sampling knobs on URLRequest (previously unvalidated).
    box_threshold: float = Field(0.40, gt=0.0, le=1.0)
    text_threshold: float = Field(0.30, gt=0.0, le=1.0)
    # Fractional padding applied to each detected box before cropping.
    pad_frac: float = Field(0.06, ge=0.0, le=1.0)
    max_new_tokens: int = Field(160, ge=1)
    # When true, the response includes a base64 PNG with boxes drawn on it.
    return_overlay: bool = True
    temperature: Optional[float] = Field(None, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(None, gt=0.0, le=1.0)
36
+
37
+
38
+ # ----------------------- Startup / health -----------------------
39
 
40
  @app.on_event("startup")
41
  async def _load_model_on_startup():
42
  global _runner
43
  _runner = get_runner()
44
 
 
45
  @app.get("/")
46
  def health():
47
  return {"status": "ok", "model": _runner.model_id if _runner else None}
48
 
49
 
50
+ # ----------------------- Core VLM endpoints -----------------------
51
+
52
  @app.post("/generate")
53
  async def generate_from_files(
54
  prompt: str = Form(...),
 
123
  return {"text": text, "metrics": metrics}
124
 
125
 
126
# ----------------------- Detect & Describe endpoints -----------------------

@app.post("/detect_describe")
async def detect_describe(
    image: UploadFile = File(..., description="One image file (image/*)"),
    labels: str = Form(..., description='Comma-separated phrases, e.g. "a man,a dog"'),
    box_threshold: float = Form(0.40),
    text_threshold: float = Form(0.30),
    pad_frac: float = Form(0.06),
    max_new_tokens: int = Form(160),
    temperature: Optional[float] = Form(None),
    top_p: Optional[float] = Form(None),
    return_overlay: bool = Form(True),
):
    """Detect `labels` in an uploaded image and describe each detected crop.

    Returns whatever SmolVLMRunner.detect_and_describe produces: a list of
    detections, or (with return_overlay=True) a dict with detections plus a
    base64-encoded PNG overlay.
    """
    # BUGFIX: previously a request arriving before startup finished (or after a
    # failed model load) crashed with AttributeError on None; report 503 instead.
    if _runner is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet.")
    if not image.content_type or not image.content_type.startswith("image/"):
        raise HTTPException(status_code=415, detail=f"Unsupported file type: {image.content_type}")

    try:
        raw = await image.read()
        pil = Image.open(BytesIO(raw)).convert("RGB")
    except Exception as e:
        # Chain the original decode error for easier debugging in logs.
        raise HTTPException(status_code=400, detail=f"Failed to read image: {e}") from e

    out = _runner.detect_and_describe(
        image=pil,
        labels=labels,  # comma-separated string OK
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        pad_frac=pad_frac,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        return_overlay=return_overlay,
    )
    return out
161
+
162
+
163
@app.post("/detect_describe_url")
async def detect_describe_url(req: DetectDescribeURLRequest):
    """Fetch `req.image_url`, detect `req.labels`, and describe each detection."""
    # BUGFIX: guard against requests racing model startup; previously this
    # dereferenced a None `_runner` and surfaced as an opaque 500.
    if _runner is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet.")
    try:
        # load_pil_from_urls takes a list; we pass exactly one URL and unwrap it.
        pil = _runner.load_pil_from_urls([str(req.image_url)])[0]
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Failed to fetch image: {e}") from e

    out = _runner.detect_and_describe(
        image=pil,
        labels=req.labels,
        box_threshold=req.box_threshold,
        text_threshold=req.text_threshold,
        pad_frac=req.pad_frac,
        max_new_tokens=req.max_new_tokens,
        temperature=req.temperature,
        top_p=req.top_p,
        return_overlay=req.return_overlay,
    )
    return out
182
+
183
+
184
+ # ----------------------- Entrypoint -----------------------
185
+
186
  if __name__ == "__main__":
187
  # Run with: python app.py (or: uvicorn app:app --host 0.0.0.0 --port 8000)
188
  uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=False)
grounding_dino2.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # grounding_dino_runner.py
2
+ # Lightweight Grounding DINO wrapper for box detection + cropping.
3
+ # Works on CPU or GPU; safe on T4 (no flash-attn).
4
+ from __future__ import annotations
5
+
6
+ import os
7
+ import threading
8
+ from pathlib import Path
9
+ from typing import List, Dict, Any, Tuple, Optional
10
+
11
+ import torch
12
+ from PIL import Image, ImageDraw, ImageFont
13
+ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
14
+
15
def visualize_detections(
    image: Image.Image,
    detections: list[dict],
    *,
    box_color: tuple[int, int, int] = (0, 255, 0),
    text_color: tuple[int, int, int] = (0, 0, 0),
    box_width: int = 3,
) -> Image.Image:
    """Return a copy of `image` with every detection's box and caption drawn on it.

    Each detection dict supplies 'label' (str), 'score' (float), and
    'box_xyxy' as an (x0, y0, x1, y1) tuple.
    """
    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated)
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 16)
    except Exception:
        font = None  # font file unavailable; fall back to PIL's default rendering

    for detection in detections:
        x0, y0, x1, y1 = detection["box_xyxy"]
        caption = f'{detection.get("label", "")} {detection.get("score", 0.0):.2f}'
        canvas.rectangle((x0, y0, x1, y1), outline=box_color, width=box_width)
        if font:
            caption_w = canvas.textlength(caption, font=font)
        else:
            caption_w = len(caption) * 8  # rough width estimate without a TTF font
        margin = 4
        # Filled banner just above the box, then the caption text on top of it.
        canvas.rectangle((x0, y0 - 20, x0 + int(caption_w) + margin * 2, y0), fill=box_color)
        canvas.text((x0 + margin, y0 - 18), caption, fill=text_color, font=font)
    return annotated
45
+
46
+ def _clamp_xyxy(box: List[float], w: int, h: int) -> Tuple[int, int, int, int]:
47
+ x0, y0, x1, y1 = box
48
+ x0 = max(0, min(int(round(x0)), w - 1))
49
+ y0 = max(0, min(int(round(y0)), h - 1))
50
+ x1 = max(0, min(int(round(x1)), w - 1))
51
+ y1 = max(0, min(int(round(y1)), h - 1))
52
+ if x1 < x0:
53
+ x0, x1 = x1, x0
54
+ if y1 < y0:
55
+ y0, y1 = y1, y0
56
+ return x0, y0, x1, y1
57
+
58
+ def _pad_box(box: Tuple[int, int, int, int], w: int, h: int, frac: float = 0.06) -> Tuple[int, int, int, int]:
59
+ x0, y0, x1, y1 = box
60
+ bw, bh = x1 - x0, y1 - y0
61
+ dx, dy = int(bw * frac), int(bh * frac)
62
+ return max(0, x0 - dx), max(0, y0 - dy), min(w - 1, x1 + dx), min(h - 1, y1 + dy)
63
+
64
def crop_from_box(img: Image.Image, box_xyxy: Tuple[int, int, int, int]) -> Image.Image:
    """Return the sub-image of `img` delimited by an (x0, y0, x1, y1) box."""
    x0, y0, x1, y1 = box_xyxy
    return img.crop((x0, y0, x1, y1))
66
+
67
class GroundingDINORunner:
    """
    Minimal singleton-style wrapper for Grounding DINO zero-shot detection.

    Loads the processor/model once at construction and serializes forward
    passes through an internal lock so concurrent callers are safe.
    """

    def __init__(self, model_id: Optional[str] = None, device: Optional[str] = None):
        self.model_id = model_id or os.getenv("GDINO_MODEL_ID", "IDEA-Research/grounding-dino-tiny")
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self._lock = threading.Lock()

        # BUGFIX: the original referenced an undefined module-level CACHE_DIR,
        # which raised NameError on first instantiation. Resolve the cache
        # directory from the environment instead; None lets transformers use
        # its default HF cache location.
        cache_dir = os.getenv("GDINO_CACHE_DIR") or None
        self.processor = AutoProcessor.from_pretrained(self.model_id, cache_dir=cache_dir)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(
            self.model_id, cache_dir=cache_dir
        ).to(self.device)
        self.model.eval()

    @staticmethod
    def _normalize_labels(labels: List[str] | str) -> List[List[str]]:
        """Accept a comma-separated string or list of phrases; return the nested list GDINO expects."""
        if isinstance(labels, str):
            items = [x.strip() for x in labels.split(",") if x.strip()]
        else:
            items = [x.strip() for x in labels if x and x.strip()]
        if not items:
            raise ValueError("No labels provided.")
        # Grounding DINO expects nested list of phrases: [["a cat", "a remote control"]]
        return [items]

    def detect(
        self,
        image: Image.Image,
        labels: List[str] | str,
        box_threshold: float = 0.4,
        text_threshold: float = 0.3,
        pad_frac: float = 0.06,
    ) -> List[Dict[str, Any]]:
        """
        Runs zero-shot detection and returns a list of dicts:
            { 'label': str, 'score': float, 'box_xyxy': (x0,y0,x1,y1), 'crop': PIL.Image }

        Boxes are clamped to the image, padded by `pad_frac`, and cropped.
        """
        w, h = image.size
        phrases = self._normalize_labels(labels)
        inputs = self.processor(images=image, text=phrases, return_tensors="pt").to(self.device)

        # Lock so concurrent requests do not interleave forward passes.
        with self._lock, torch.no_grad():
            outputs = self.model(**inputs)

        # transformers>=4.51 renamed box_threshold -> threshold; try new name
        # first and fall back to the legacy keyword for older versions.
        try:
            post = self.processor.post_process_grounded_object_detection(
                outputs=outputs,
                input_ids=inputs.input_ids,
                threshold=float(box_threshold),
                text_threshold=float(text_threshold),
                target_sizes=[(h, w)],
            )
        except TypeError:
            post = self.processor.post_process_grounded_object_detection(
                outputs=outputs,
                input_ids=inputs.input_ids,
                box_threshold=float(box_threshold),
                text_threshold=float(text_threshold),
                target_sizes=[(h, w)],
            )

        det = post[0]
        boxes = det.get("boxes", [])
        scores = det.get("scores", [])
        # Newer transformers returns phrase strings under 'text_labels'.
        labels_out = det.get("text_labels", det.get("labels", []))

        results: List[Dict[str, Any]] = []
        for b, s, lab in zip(boxes, scores, labels_out):
            b = b.tolist() if hasattr(b, "tolist") else list(b)
            bx = _clamp_xyxy(b, w, h)
            bx = _pad_box(bx, w, h, pad_frac)
            crop = crop_from_box(image, bx)
            score = float(s.item()) if torch.is_tensor(s) else float(s)
            results.append({"label": lab, "score": score, "box_xyxy": bx, "crop": crop})

        return results
146
+
147
# convenience singleton
# Holds the one process-wide GroundingDINORunner; None until first use.
_runner_singleton: GroundingDINORunner | None = None

def get_runner() -> GroundingDINORunner:
    """Return the shared GroundingDINORunner, constructing it lazily on first call."""
    # NOTE(review): not lock-guarded — two threads racing here could each load a
    # model (wasteful, not incorrect). Confirm callers initialize from one thread.
    global _runner_singleton
    if _runner_singleton is None:
        _runner_singleton = GroundingDINORunner()
    return _runner_singleton
155
+
ui.py CHANGED
@@ -1,13 +1,11 @@
1
  # ui.py
2
  import os
3
-
4
  import io
5
  import json
6
  import requests
7
  import streamlit as st
8
  from PIL import Image
9
 
10
-
11
  st.set_page_config(page_title="SmolVLM UI", layout="wide")
12
  st.title("SmolVLM")
13
 
@@ -22,9 +20,6 @@ with st.sidebar:
22
  top_p = st.slider("top_p", 0.05, 1.0, 0.95, step=0.05) if topp_on else None
23
  st.caption("API base: " + API_BASE)
24
 
25
- tabs = st.tabs(["Upload images", "Image URLs"])
26
- prompt = st.text_area("Prompt", "Can you describe the image(s)?", height=80)
27
-
28
  def show_metrics(metrics: dict):
29
  if not metrics:
30
  return
@@ -40,9 +35,13 @@ def show_metrics(metrics: dict):
40
  cols[3].metric("GPU reserved (MB)", f"{vram:.0f}" if vram is not None else "—")
41
  st.expander("All metrics").json(info)
42
 
43
- with tabs[0]:
 
 
 
44
  st.subheader("Upload one or more images")
45
  files = st.file_uploader("Images", type=["png", "jpg", "jpeg", "webp"], accept_multiple_files=True)
 
46
  run = st.button("Generate from uploads", type="primary", use_container_width=True, key="run_files")
47
 
48
  if run:
@@ -87,19 +86,22 @@ with tabs[0]:
87
  except Exception:
88
  st.write(e.response.text)
89
 
90
- with tabs[1]:
 
91
  st.subheader("Use remote image URLs")
92
- urls_raw = st.text_area("One URL per line", "", height=120, placeholder="https://example.com/a.jpg\nhttps://example.com/b.png")
 
 
93
  run2 = st.button("Generate from URLs", type="primary", use_container_width=True, key="run_urls")
94
 
95
  if run2:
96
  urls = [u.strip() for u in urls_raw.splitlines() if u.strip()]
97
- if not urls or not prompt.strip():
98
  st.error("Please add at least one URL and a prompt.")
99
  else:
100
  with st.spinner("Calling FastAPI…"):
101
  body = {
102
- "prompt": prompt,
103
  "image_urls": urls,
104
  "max_new_tokens": max_new_tokens,
105
  "temperature": temperature, # FastAPI model allows null
@@ -123,3 +125,53 @@ with tabs[1]:
123
  except Exception:
124
  st.write(e.response.text)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # ui.py
2
  import os
 
3
  import io
4
  import json
5
  import requests
6
  import streamlit as st
7
  from PIL import Image
8
 
 
9
  st.set_page_config(page_title="SmolVLM UI", layout="wide")
10
  st.title("SmolVLM")
11
 
 
20
  top_p = st.slider("top_p", 0.05, 1.0, 0.95, step=0.05) if topp_on else None
21
  st.caption("API base: " + API_BASE)
22
 
 
 
 
23
  def show_metrics(metrics: dict):
24
  if not metrics:
25
  return
 
35
  cols[3].metric("GPU reserved (MB)", f"{vram:.0f}" if vram is not None else "—")
36
  st.expander("All metrics").json(info)
37
 
38
+ tab_upload, tab_urls, tab_detect = st.tabs(["Upload images", "Image URLs", "Detect & Describe"])
39
+
40
+ # -------------------- Tab 1: uploads -> /generate --------------------
41
+ with tab_upload:
42
  st.subheader("Upload one or more images")
43
  files = st.file_uploader("Images", type=["png", "jpg", "jpeg", "webp"], accept_multiple_files=True)
44
+ prompt = st.text_area("Prompt", "Can you describe the image(s)?", height=80)
45
  run = st.button("Generate from uploads", type="primary", use_container_width=True, key="run_files")
46
 
47
  if run:
 
86
  except Exception:
87
  st.write(e.response.text)
88
 
89
+ # -------------------- Tab 2: URLs -> /generate_urls --------------------
90
+ with tab_urls:
91
  st.subheader("Use remote image URLs")
92
+ prompt2 = st.text_area("Prompt", "Can you describe the image(s)?", height=80, key="prompt_urls")
93
+ urls_raw = st.text_area("One URL per line", "", height=120,
94
+ placeholder="https://example.com/a.jpg\nhttps://example.com/b.png")
95
  run2 = st.button("Generate from URLs", type="primary", use_container_width=True, key="run_urls")
96
 
97
  if run2:
98
  urls = [u.strip() for u in urls_raw.splitlines() if u.strip()]
99
+ if not urls or not prompt2.strip():
100
  st.error("Please add at least one URL and a prompt.")
101
  else:
102
  with st.spinner("Calling FastAPI…"):
103
  body = {
104
+ "prompt": prompt2,
105
  "image_urls": urls,
106
  "max_new_tokens": max_new_tokens,
107
  "temperature": temperature, # FastAPI model allows null
 
125
  except Exception:
126
  st.write(e.response.text)
127
 
128
# -------------------- Tab 3: Detect & Describe -> /detect_describe --------------------
# Uploads one image plus label phrases, posts them to the FastAPI
# /detect_describe endpoint, then renders the returned overlay and
# per-detection descriptions.
with tab_detect:
    st.subheader("Grounding DINO + SmolVLM")
    det_image = st.file_uploader("Image", type=["jpg", "jpeg", "png", "webp"], accept_multiple_files=False)
    det_labels = st.text_input("Labels (comma-separated)", "a man,a dog")
    det_box_thr = st.slider("box_threshold", 0.05, 0.95, 0.40, 0.01)
    det_text_thr = st.slider("text_threshold", 0.05, 0.95, 0.30, 0.01)
    det_pad = st.slider("crop padding (fraction)", 0.0, 0.2, 0.06, 0.01)
    det_max_new = st.slider("max_new_tokens", 1, 512, 160, 1)

    run_det = st.button("Detect & Describe", type="primary", use_container_width=True)
    if run_det:
        if not det_image or not det_labels.strip():
            st.error("Please provide an image and at least one label.")
        else:
            with st.spinner("Calling FastAPI…"):
                # Multipart form fields are text, so numeric knobs are stringified;
                # FastAPI coerces them back server-side.
                data = {
                    "labels": det_labels,
                    "box_threshold": str(det_box_thr),
                    "text_threshold": str(det_text_thr),
                    "pad_frac": str(det_pad),
                    "max_new_tokens": str(det_max_new),
                    "return_overlay": "true",
                }
                # NOTE(review): rebinds `files` used by the uploads tab above —
                # harmless since the script runs top-to-bottom, but worth renaming.
                files = [("image", (det_image.name, det_image.read(), det_image.type or "application/octet-stream"))]
                try:
                    r = requests.post(f"{API_BASE}/detect_describe", data=data, files=files, timeout=300)
                    r.raise_for_status()
                    out = r.json()

                    # Show overlay
                    # The API returns a base64 PNG with boxes drawn when return_overlay is true.
                    b64 = out.get("overlay_png_b64")
                    if b64:
                        st.image(f"data:image/png;base64,{b64}", caption="Detections", use_column_width=True)

                    # List detections
                    dets = out.get("detections", [])
                    if not dets:
                        st.info("No detections at current thresholds.")
                    for i, d in enumerate(dets, 1):
                        st.markdown(f"**{i}. {d['label']}** (score={d['score']:.2f}, box={d['box_xyxy']})")
                        st.write(d["description"])
                except requests.RequestException as e:
                    st.error(f"Request failed: {e}")
                    # Surface the server's error body (usually JSON detail) when present.
                    if hasattr(e, "response") and e.response is not None:
                        try:
                            st.code(e.response.text, language="json")
                        except Exception:
                            st.write(e.response.text)
+
util.py CHANGED
@@ -27,7 +27,7 @@ from PIL import Image
27
  from transformers import AutoProcessor, AutoModelForVision2Seq
28
  from transformers.image_utils import load_image as hf_load_image
29
 
30
-
31
 
32
 
33
  def _has_flash_attn() -> bool:
@@ -102,6 +102,78 @@ class SmolVLMRunner:
102
  return [cls._ensure_rgb(Image.open(BytesIO(b))) for b in blobs]
103
 
104
  # ---------- Inference ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  def generate(
106
  self,
107
  prompt: str,
 
27
  from transformers import AutoProcessor, AutoModelForVision2Seq
28
  from transformers.image_utils import load_image as hf_load_image
29
 
30
+ from grounding_dino2 import get_runner as get_gdino_runner, visualize_detections
31
 
32
 
33
  def _has_flash_attn() -> bool:
 
102
  return [cls._ensure_rgb(Image.open(BytesIO(b))) for b in blobs]
103
 
104
  # ---------- Inference ----------
105
+ def detect_and_describe(
106
+ self,
107
+ image: Image.Image,
108
+ labels: list[str] | str,
109
+ *,
110
+ box_threshold: float = 0.4,
111
+ text_threshold: float = 0.3,
112
+ pad_frac: float = 0.06,
113
+ max_new_tokens: int = 160,
114
+ temperature: float | None = None,
115
+ top_p: float | None = None,
116
+ return_overlay: bool = False,
117
+ ) -> list[dict] | dict:
118
+ """
119
+ Uses Grounding DINO to detect boxes for `labels`, then asks SmolVLM to
120
+ describe each cropped box.
121
+
122
+ If return_overlay=False (default): returns a list of dicts:
123
+ [{ 'label','score','box_xyxy','description' }, ...]
124
+ If return_overlay=True: returns a dict:
125
+ { 'detections': [...], 'overlay_png_b64': '<base64 PNG>' }
126
+ """
127
+ gdino = get_gdino_runner()
128
+ detections = gdino.detect(
129
+ image=image,
130
+ labels=labels,
131
+ box_threshold=box_threshold,
132
+ text_threshold=text_threshold,
133
+ pad_frac=pad_frac,
134
+ )
135
+ if not detections:
136
+ return [] if not return_overlay else {"detections": [], "overlay_png_b64": None}
137
+
138
+ results: list[dict] = []
139
+ for det in detections:
140
+ crop = det["crop"]
141
+ prompt_txt = f"Describe the object inside this crop in detail. It was detected with the phrase: '{det['label']}'."
142
+ content = [{"type": "image"}, {"type": "text", "text": prompt_txt}]
143
+ messages = [{"role": "user", "content": content}]
144
+ chat_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
145
+
146
+ inputs = self.processor(text=chat_prompt, images=[crop], return_tensors="pt")
147
+ inputs = {k: (v.to(self.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
148
+
149
+ gen_kwargs = dict(max_new_tokens=max_new_tokens)
150
+ if temperature is not None:
151
+ gen_kwargs["temperature"] = float(temperature)
152
+ if top_p is not None:
153
+ gen_kwargs["top_p"] = float(top_p)
154
+
155
+ with self._lock, torch.inference_mode():
156
+ out_ids = self.model.generate(**inputs, **gen_kwargs)
157
+ text = self.processor.batch_decode(out_ids, skip_special_tokens=True)[0].strip()
158
+ if text.startswith("Assistant:"):
159
+ text = text[len("Assistant:"):].strip()
160
+
161
+ results.append({
162
+ "label": det["label"],
163
+ "score": det["score"],
164
+ "box_xyxy": det["box_xyxy"],
165
+ "description": text,
166
+ })
167
+
168
+ if not return_overlay:
169
+ return results
170
+
171
+ # Build overlay image (PNG -> base64 string)
172
+ overlay = visualize_detections(image, detections)
173
+ buf = io.BytesIO()
174
+ overlay.save(buf, format="PNG")
175
+ b64 = base64.b64encode(buf.getvalue()).decode("ascii")
176
+ return {"detections": results, "overlay_png_b64": b64}
177
  def generate(
178
  self,
179
  prompt: str,