khushalcodiste committed on
Commit
49f8ccd
·
1 Parent(s): afd6ed3

fix: added

Browse files
Files changed (3) hide show
  1. README.md +2 -0
  2. src/model.py +31 -3
  3. src/server.py +4 -1
README.md CHANGED
@@ -11,3 +11,5 @@ pinned: false
11
  Image captioning API using `microsoft/Florence-2-base` with a Python FastAPI backend. Open `/docs` for Swagger UI.
12
 
13
  Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`), `MAX_IMAGE_SIDE` (default `896`), `MAX_MAX_TOKENS` (default `256`), `MODEL_ID` (default `microsoft/Florence-2-base`), `MODEL_REVISION` (pin to a commit SHA, e.g. `5ca5edf5bd017b9919c05d08aebef5e4c7ac3bac`).
 
 
 
11
  Image captioning API using `microsoft/Florence-2-base` with a Python FastAPI backend. Open `/docs` for Swagger UI.
12
 
13
  Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`), `MAX_IMAGE_SIDE` (default `896`), `MAX_MAX_TOKENS` (default `256`), `MODEL_ID` (default `microsoft/Florence-2-base`), `MODEL_REVISION` (pin to a commit SHA, e.g. `5ca5edf5bd017b9919c05d08aebef5e4c7ac3bac`).
14
+
15
+ `POST /predict` form field `text` is the full Florence-2 task prompt. For standard captioning use `<CAPTION>` only (or omit `text` to use the default). Do not append extra words to `<CAPTION>`.
src/model.py CHANGED
@@ -1,6 +1,7 @@
1
  from __future__ import annotations
2
 
3
  import os
 
4
  from io import BytesIO
5
  from typing import Any
6
 
@@ -16,6 +17,7 @@ MAX_IMAGE_SIDE = int(os.getenv("MAX_IMAGE_SIDE", "896"))
16
  RESIZE_MULTIPLE = int(os.getenv("RESIZE_MULTIPLE", "32"))
17
  NUM_BEAMS = int(os.getenv("NUM_BEAMS", "3"))
18
  DEFAULT_PROMPT = os.getenv("DEFAULT_PROMPT", "<CAPTION>")
 
19
 
20
  _model = None
21
  _processor = None
@@ -66,18 +68,44 @@ def load_model() -> tuple[Any, Any]:
66
  return _model, _processor
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def generate_caption(
70
  image_bytes: bytes,
71
  text_input: str | None = None,
72
  max_tokens: int = DEFAULT_MAX_TOKENS,
73
  ) -> dict[str, Any]:
74
  model, processor = load_model()
75
- prompt = f"{DEFAULT_PROMPT} {text_input.strip()}" if text_input else DEFAULT_PROMPT
76
 
77
  safe_max_tokens = min(max(int(max_tokens), 8), MAX_MAX_TOKENS)
78
  image = _prepare_image(image_bytes)
79
 
80
- inputs = processor(text=prompt, images=image, return_tensors="pt")
 
 
 
 
 
 
81
  input_ids = inputs["input_ids"].to(_device)
82
  pixel_values = inputs["pixel_values"].to(_device, _dtype)
83
 
@@ -98,7 +126,7 @@ def generate_caption(
98
  try:
99
  parsed = post_process(
100
  generated_text,
101
- task=DEFAULT_PROMPT,
102
  image_size=(image.width, image.height),
103
  )
104
  except Exception:
 
1
  from __future__ import annotations
2
 
3
  import os
4
+ import re
5
  from io import BytesIO
6
  from typing import Any
7
 
 
17
  RESIZE_MULTIPLE = int(os.getenv("RESIZE_MULTIPLE", "32"))
18
  NUM_BEAMS = int(os.getenv("NUM_BEAMS", "3"))
19
  DEFAULT_PROMPT = os.getenv("DEFAULT_PROMPT", "<CAPTION>")
20
+ TASK_TOKEN_PATTERN = re.compile(r"^<[^>\s]+>")
21
 
22
  _model = None
23
  _processor = None
 
68
  return _model, _processor
69
 
70
 
71
+ def _build_prompt(text_input: str | None) -> str:
72
+ if text_input is None:
73
+ return DEFAULT_PROMPT
74
+
75
+ prompt = text_input.strip()
76
+ if not prompt:
77
+ return DEFAULT_PROMPT
78
+ if not prompt.startswith("<"):
79
+ raise ValueError(
80
+ "Invalid prompt in `text`: expected a Florence-2 task token like "
81
+ "'<CAPTION>' or '<CAPTION_TO_PHRASE_GROUNDING>phrase'."
82
+ )
83
+ return prompt
84
+
85
+
86
def _task_token_from_prompt(prompt: str) -> str:
    """Return the leading task token (e.g. ``'<CAPTION>'``) of *prompt*.

    Falls back to ``DEFAULT_PROMPT`` when no token is found at the start,
    so post-processing always receives a valid Florence-2 task name.
    """
    head = TASK_TOKEN_PATTERN.match(prompt)
    if head is None:
        return DEFAULT_PROMPT
    return head.group(0)
89
+
90
+
91
  def generate_caption(
92
  image_bytes: bytes,
93
  text_input: str | None = None,
94
  max_tokens: int = DEFAULT_MAX_TOKENS,
95
  ) -> dict[str, Any]:
96
  model, processor = load_model()
97
+ prompt = _build_prompt(text_input)
98
 
99
  safe_max_tokens = min(max(int(max_tokens), 8), MAX_MAX_TOKENS)
100
  image = _prepare_image(image_bytes)
101
 
102
+ try:
103
+ inputs = processor(text=prompt, images=image, return_tensors="pt")
104
+ except AssertionError as exc:
105
+ raise ValueError(
106
+ "Invalid Florence-2 task format in `text`. For plain captioning, use only "
107
+ "'<CAPTION>' with no extra words."
108
+ ) from exc
109
  input_ids = inputs["input_ids"].to(_device)
110
  pixel_values = inputs["pixel_values"].to(_device, _dtype)
111
 
 
126
  try:
127
  parsed = post_process(
128
  generated_text,
129
+ task=_task_token_from_prompt(prompt),
130
  image_size=(image.width, image.height),
131
  )
132
  except Exception:
src/server.py CHANGED
@@ -45,7 +45,10 @@ async def predict(
45
  if not image_bytes:
46
  raise HTTPException(status_code=400, detail="Empty file uploaded")
47
 
48
- result = generate_caption(image_bytes, text, max_tokens)
 
 
 
49
  return {"result": result}
50
 
51
 
 
45
  if not image_bytes:
46
  raise HTTPException(status_code=400, detail="Empty file uploaded")
47
 
48
+ try:
49
+ result = generate_caption(image_bytes, text, max_tokens)
50
+ except ValueError as exc:
51
+ raise HTTPException(status_code=400, detail=str(exc)) from exc
52
  return {"result": result}
53
 
54