Spaces:

achase25
/

AiSolMM

Sleeping

App Files Files Community

achase25 committed on Oct 7, 2025

Commit

5251d80

verified ·

1 Parent(s): adb4cdd

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -105

app.py CHANGED Viewed

@@ -1,165 +1,112 @@
-# CPU-only Hugging Face Space: Image → Caption (Florence-2-base)
-# - No story model, only captioning.
-# - Florence runs without flash_attn via a small monkey patch.
-# - AVIF/HEIF image uploads supported via plugins.
-# - Batched Florence processor call (images=[...], padding=True).
 import os
-from typing import Dict, Any
 import torch
 import gradio as gr
 from PIL import Image
-# --- Enable AVIF/HEIF decoding for Pillow (handles .avif, .heic from phones) ---
 try:
-    import pillow_avif  # registers AVIF opener on import
 except Exception:
     pass
 try:
     from pillow_heif import register_heif_opener
     register_heif_opener()
 except Exception:
     pass
-from transformers import (
-    AutoProcessor,
-    AutoModelForCausalLM,
-)
-# -------------------- Config --------------------
 CAPTION_MODEL_ID = "microsoft/Florence-2-base"
-HF_TOKEN         = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") or None
 DEVICE = "cpu"
-DTYPE  = torch.float32
-MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "1024"))  # bump if you want bigger inputs
-# -------------------- Cache --------------------
-_caption_bundle: Dict[str, Any] = {}
-# -------------------- Utils --------------------
 def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     w, h = img.size
     if max(w, h) <= max_side:
         return img
-    if w >= h:
-        new_w = max_side
-        new_h = int(h * (max_side / w))
-    else:
-        new_h = max_side
-        new_w = int(w * (max_side / h))
-    return img.resize((new_w, new_h), Image.LANCZOS)
-def _ensure_image(img) -> Image.Image:
     if not isinstance(img, Image.Image):
-        raise gr.Error("Uploaded file is not a valid image.")
     return img.convert("RGB")
-# -------------------- Monkey patch: ignore flash_attn for Florence --------------------
-from unittest.mock import patch
-from transformers.dynamic_module_utils import get_imports as _orig_get_imports
-def _fixed_get_imports(filename):
-    """Drop flash_attn requirement only for Florence modeling file (CPU-safe)."""
-    imports = _orig_get_imports(filename)
     try:
         name = str(filename).lower()
         if "florence2" in name or "modeling_florence2.py" in name:
-            return [imp for imp in imports if imp != "flash_attn"]
     except Exception:
         pass
-    return imports
-# -------------------- Load Florence --------------------
-def get_caption_bundle() -> Dict[str, Any]:
-    """Return {'processor': ..., 'model': ...} for Florence-2-base."""
-    global _caption_bundle
-    if _caption_bundle:
-        return _caption_bundle
-    processor = AutoProcessor.from_pretrained(
-        CAPTION_MODEL_ID, trust_remote_code=True, token=HF_TOKEN
-    )
-    with patch("transformers.dynamic_module_utils.get_imports", _fixed_get_imports):
-        model = AutoModelForCausalLM.from_pretrained(
             CAPTION_MODEL_ID,
             trust_remote_code=True,
             token=HF_TOKEN,
-            attn_implementation="sdpa",   # non-flash attention path
             torch_dtype=DTYPE,
             device_map="cpu",
         ).eval()
-    _caption_bundle = {"processor": processor, "model": model}
-    return _caption_bundle
-# -------------------- Caption op --------------------
 @torch.inference_mode()
-def op_caption(image: Image.Image, max_new_tokens: int = 128, num_beams: int = 3) -> str:
-    """
-    Florence-2-base caption (CPU):
-      - Task tag: <MORE_DETAILED_CAPTION>
-      - Batched call: images=[image], padding=True
-      - post_process_generation parses the structured output
-    """
-    image = _ensure_image(image)
-    image = _resize_max(image)
-    bun = get_caption_bundle()
-    processor, model = bun["processor"], bun["model"]
-    inputs = processor(
         text="<MORE_DETAILED_CAPTION>",
-        images=[image],          # batch, even for single image
-        padding=True,            # ensure consistent tensor shapes
-        return_tensors="pt"
     )
-    # move tensors to device
-    for k in list(inputs.keys()):
-        if isinstance(inputs[k], torch.Tensor):
-            inputs[k] = inputs[k].to(DEVICE)
-    generated_ids = model.generate(
-        **inputs,
         max_new_tokens=max_new_tokens,
-        do_sample=False,
         num_beams=num_beams,
         early_stopping=False,
     )
-    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
     parsed = processor.post_process_generation(
-        generated_text,
-        task="<MORE_DETAILED_CAPTION>",
-        image_size=[(image.width, image.height)],  # list to match batched input
     )
-    data = parsed[0] if isinstance(parsed, list) and parsed else parsed
-    caption = (data.get("<MORE_DETAILED_CAPTION>", "") or "").strip()
-    return caption or "Unable to generate a caption."
-# -------------------- Gradio UI --------------------
 def run(image: Image.Image):
-    if image is None:
-        raise gr.Error("Upload an image first.")
-    text = op_caption(image)
-    return text, "Task: caption • Model: Florence-2-base (CPU)"
-with gr.Blocks(css="footer {visibility:hidden}") as demo:
-    gr.Markdown("# Image → Caption (CPU-only) — Florence-2-base")
     with gr.Row():
         with gr.Column():
-            inp_img = gr.Image(
-                type="pil",
-                label="Image",
-                sources=["upload", "clipboard", "webcam"],
-            )
-            go = gr.Button("Caption", variant="primary")
         with gr.Column():
-            out_text = gr.Textbox(label="Caption", lines=10)
-            status   = gr.Markdown()
-    go.click(run, inputs=[inp_img], outputs=[out_text, status], scroll_to_output=True)
 if __name__ == "__main__":
     demo.queue(max_size=8).launch()

+# CPU-only: Image → Caption (Florence-2-base), concise build
 import os
+from functools import lru_cache
 import torch
 import gradio as gr
 from PIL import Image
+# AVIF/HEIF support (optional, safe to ignore if unavailable)
 try:
+    import pillow_avif  # noqa: F401
 except Exception:
     pass
 try:
     from pillow_heif import register_heif_opener
     register_heif_opener()
 except Exception:
     pass
+from transformers import AutoProcessor, AutoModelForCausalLM
+from unittest.mock import patch
+from transformers.dynamic_module_utils import get_imports as _orig_get_imports
 CAPTION_MODEL_ID = "microsoft/Florence-2-base"
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
 DEVICE = "cpu"
+DTYPE = torch.float32
+MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "1024"))
 def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
     w, h = img.size
     if max(w, h) <= max_side:
         return img
+    r = max_side / max(w, h)
+    return img.resize((int(w * r), int(h * r)), Image.LANCZOS)
+def _ensure_rgb(img) -> Image.Image:
     if not isinstance(img, Image.Image):
+        raise gr.Error("Upload a valid image.")
     return img.convert("RGB")
+def _no_flash_attn_get_imports(filename):
+    imps = _orig_get_imports(filename)
     try:
         name = str(filename).lower()
         if "florence2" in name or "modeling_florence2.py" in name:
+            return [x for x in imps if x != "flash_attn"]
     except Exception:
         pass
+    return imps
+@lru_cache(maxsize=1)
+def _load_florence():
+    proc = AutoProcessor.from_pretrained(CAPTION_MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+    with patch("transformers.dynamic_module_utils.get_imports", _no_flash_attn_get_imports):
+        mdl = AutoModelForCausalLM.from_pretrained(
             CAPTION_MODEL_ID,
             trust_remote_code=True,
             token=HF_TOKEN,
+            attn_implementation="sdpa",  # CPU-safe
             torch_dtype=DTYPE,
             device_map="cpu",
         ).eval()
+    return proc, mdl
 @torch.inference_mode()
+def caption(image: Image.Image, max_new_tokens: int = 128, num_beams: int = 3) -> str:
+    image = _ensure_rgb(_resize_max(image))
+    processor, model = _load_florence()
+    batch = processor(
         text="<MORE_DETAILED_CAPTION>",
+        images=[image],          # batch even for single
+        padding=True,
+        return_tensors="pt",
     )
+    # move tensors to CPU device (BatchFeature may contain non-tensors)
+    for k, v in list(batch.items()):
+        if torch.is_tensor(v):
+            batch[k] = v.to(DEVICE)
+    out_ids = model.generate(
+        **batch,
         max_new_tokens=max_new_tokens,
         num_beams=num_beams,
+        do_sample=False,
         early_stopping=False,
     )
+    gen = processor.batch_decode(out_ids, skip_special_tokens=False)[0]
     parsed = processor.post_process_generation(
+        gen, task="<MORE_DETAILED_CAPTION>", image_size=[(image.width, image.height)]
     )
+    data = parsed[0] if isinstance(parsed, list) else parsed
+    return (data.get("<MORE_DETAILED_CAPTION>", "") or "Unable to generate a caption.").strip()
 def run(image: Image.Image):
+    txt = caption(image)
+    return txt, "Model: Florence-2-base (CPU)"
+with gr.Blocks(css="footer{visibility:hidden}") as demo:
+    gr.Markdown("# Image → Caption (CPU) — Florence-2-base")
     with gr.Row():
         with gr.Column():
+            img = gr.Image(type="pil", label="Image", sources=["upload", "clipboard", "webcam"])
+            btn = gr.Button("Caption", variant="primary")
         with gr.Column():
+            out = gr.Textbox(label="Caption", lines=10)
+            status = gr.Markdown()
+    btn.click(run, [img], [out, status], scroll_to_output=True)
 if __name__ == "__main__":
     demo.queue(max_size=8).launch()