khushalcodiste committed on
Commit
210def2
·
1 Parent(s): 641b32e

fix: added

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. docker-compose.yml +2 -0
  3. requirements.txt +2 -0
  4. src/model.py +34 -9
README.md CHANGED
@@ -10,4 +10,4 @@ pinned: false
10
 
11
  Image captioning API using `microsoft/Florence-2-base` with a Python FastAPI backend. Open `/docs` for Swagger UI.
12
 
13
- Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`), `MAX_IMAGE_SIDE` (default `896`), `MAX_MAX_TOKENS` (default `256`), `MODEL_ID` (default `microsoft/Florence-2-base`).
 
10
 
11
  Image captioning API using `microsoft/Florence-2-base` with a Python FastAPI backend. Open `/docs` for Swagger UI.
12
 
13
+ Speed tuning env vars: `DEFAULT_MAX_TOKENS` (default `64`), `MAX_IMAGE_SIDE` (default `896`), `MAX_MAX_TOKENS` (default `256`), `MODEL_ID` (default `microsoft/Florence-2-base`), `MODEL_REVISION` (optional commit SHA to pin remote model code).
docker-compose.yml CHANGED
@@ -9,4 +9,6 @@ services:
9
  - MAX_IMAGE_SIDE=896
10
  - MAX_MAX_TOKENS=256
11
  - MODEL_ID=microsoft/Florence-2-base
 
 
12
  restart: unless-stopped
 
9
  - MAX_IMAGE_SIDE=896
10
  - MAX_MAX_TOKENS=256
11
  - MODEL_ID=microsoft/Florence-2-base
12
+ # Optional: pin to a specific commit SHA from huggingface.co/microsoft/Florence-2-base
13
+ # - MODEL_REVISION=<commit_sha>
14
  restart: unless-stopped
requirements.txt CHANGED
@@ -4,3 +4,5 @@ transformers==4.55.4
4
  torch==2.8.0
5
  pillow==11.3.0
6
  python-multipart==0.0.20
 
 
 
4
  torch==2.8.0
5
  pillow==11.3.0
6
  python-multipart==0.0.20
7
+ einops==0.8.1
8
+ timm==1.0.19
src/model.py CHANGED
@@ -9,9 +9,12 @@ from PIL import Image
9
  from transformers import AutoModelForCausalLM, AutoProcessor
10
 
11
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Florence-2-base")
 
12
  DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "64"))
13
  MAX_MAX_TOKENS = int(os.getenv("MAX_MAX_TOKENS", "256"))
14
  MAX_IMAGE_SIDE = int(os.getenv("MAX_IMAGE_SIDE", "896"))
 
 
15
 
16
  TASKS = {
17
  "caption": "<CAPTION>",
@@ -26,8 +29,8 @@ TASKS = {
26
 
27
  _model = None
28
  _processor = None
29
- _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
- _dtype = torch.float16 if _device.type == "cuda" else torch.float32
31
 
32
 
33
  def _prepare_image(image_bytes: bytes) -> Image.Image:
@@ -36,19 +39,37 @@ def _prepare_image(image_bytes: bytes) -> Image.Image:
36
  if width <= MAX_IMAGE_SIDE and height <= MAX_IMAGE_SIDE:
37
  return image
38
 
39
- ratio = min(MAX_IMAGE_SIDE / width, MAX_IMAGE_SIDE / height)
40
- new_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return image.resize(new_size, Image.Resampling.LANCZOS)
42
 
43
 
44
  def load_model() -> tuple[Any, Any]:
45
  global _model, _processor
46
  if _model is None or _processor is None:
47
- _processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 
 
 
 
48
  _model = AutoModelForCausalLM.from_pretrained(
49
  MODEL_ID,
50
- trust_remote_code=True,
51
  torch_dtype=_dtype,
 
52
  ).to(_device)
53
  _model.eval()
54
  return _model, _processor
@@ -77,16 +98,20 @@ def generate_caption(
77
  pixel_values=pixel_values,
78
  do_sample=False,
79
  max_new_tokens=safe_max_tokens,
80
- num_beams=1,
81
  )
82
 
83
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
84
 
85
  parsed = None
86
  post_process = getattr(processor, "post_process_generation", None)
87
  if callable(post_process):
88
  try:
89
- parsed = post_process(generated_text, task=prompt_task, image_size=image.size)
 
 
 
 
90
  except Exception:
91
  parsed = None
92
 
 
9
  from transformers import AutoModelForCausalLM, AutoProcessor
10
 
11
  MODEL_ID = os.getenv("MODEL_ID", "microsoft/Florence-2-base")
12
+ MODEL_REVISION = os.getenv("MODEL_REVISION")
13
  DEFAULT_MAX_TOKENS = int(os.getenv("DEFAULT_MAX_TOKENS", "64"))
14
  MAX_MAX_TOKENS = int(os.getenv("MAX_MAX_TOKENS", "256"))
15
  MAX_IMAGE_SIDE = int(os.getenv("MAX_IMAGE_SIDE", "896"))
16
+ RESIZE_MULTIPLE = int(os.getenv("RESIZE_MULTIPLE", "32"))
17
+ NUM_BEAMS = int(os.getenv("NUM_BEAMS", "3"))
18
 
19
  TASKS = {
20
  "caption": "<CAPTION>",
 
29
 
30
  _model = None
31
  _processor = None
32
+ _device = torch.device("cpu")
33
+ _dtype = torch.float32
34
 
35
 
36
  def _prepare_image(image_bytes: bytes) -> Image.Image:
 
39
  if width <= MAX_IMAGE_SIDE and height <= MAX_IMAGE_SIDE:
40
  return image
41
 
42
+ if width >= height:
43
+ # Landscape: cap width, preserve aspect ratio.
44
+ ratio = MAX_IMAGE_SIDE / width
45
+ else:
46
+ # Portrait: cap height, preserve aspect ratio.
47
+ ratio = MAX_IMAGE_SIDE / height
48
+
49
+ new_w = max(1, int(width * ratio))
50
+ new_h = max(1, int(height * ratio))
51
+
52
+ # Align dimensions to multiples of RESIZE_MULTIPLE for tensor-core-friendly shapes.
53
+ if RESIZE_MULTIPLE > 1:
54
+ new_w = max(RESIZE_MULTIPLE, (new_w // RESIZE_MULTIPLE) * RESIZE_MULTIPLE)
55
+ new_h = max(RESIZE_MULTIPLE, (new_h // RESIZE_MULTIPLE) * RESIZE_MULTIPLE)
56
+
57
+ new_size = (new_w, new_h)
58
  return image.resize(new_size, Image.Resampling.LANCZOS)
59
 
60
 
61
  def load_model() -> tuple[Any, Any]:
62
  global _model, _processor
63
  if _model is None or _processor is None:
64
+ pretrained_kwargs: dict[str, Any] = {"trust_remote_code": True}
65
+ if MODEL_REVISION:
66
+ pretrained_kwargs["revision"] = MODEL_REVISION
67
+
68
+ _processor = AutoProcessor.from_pretrained(MODEL_ID, **pretrained_kwargs)
69
  _model = AutoModelForCausalLM.from_pretrained(
70
  MODEL_ID,
 
71
  torch_dtype=_dtype,
72
+ **pretrained_kwargs,
73
  ).to(_device)
74
  _model.eval()
75
  return _model, _processor
 
98
  pixel_values=pixel_values,
99
  do_sample=False,
100
  max_new_tokens=safe_max_tokens,
101
+ num_beams=max(1, NUM_BEAMS),
102
  )
103
 
104
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0].strip()
105
 
106
  parsed = None
107
  post_process = getattr(processor, "post_process_generation", None)
108
  if callable(post_process):
109
  try:
110
+ parsed = post_process(
111
+ generated_text,
112
+ task=prompt_task,
113
+ image_size=(image.width, image.height),
114
+ )
115
  except Exception:
116
  parsed = None
117