# CPU-only: Image → Caption (Florence-2-base), concise build
"""Gradio app that captions an uploaded image with microsoft/Florence-2-base on CPU.

Pipeline: optional AVIF/HEIF decoder registration → resize/convert input →
Florence-2 generate with the "<CAPTION>" task prompt → task-aware post-processing.
The flash-attn import is stripped from the model's remote code so the CPU-only
environment can load it with the SDPA attention implementation.
"""
import os
from functools import lru_cache
from unittest.mock import patch

import torch
import gradio as gr
from PIL import Image

# AVIF/HEIF support (optional, safe to ignore if unavailable)
try:
    import pillow_avif  # noqa: F401
except Exception:
    pass
try:
    from pillow_heif import register_heif_opener

    register_heif_opener()
except Exception:
    pass

from transformers import AutoProcessor, AutoModelForCausalLM
from transformers.dynamic_module_utils import get_imports as _orig_get_imports

CAPTION_MODEL_ID = "microsoft/Florence-2-base"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
DEVICE = "cpu"
DTYPE = torch.float32
MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "1024"))
# Florence-2 routes prompting AND post-processing by task token. A bare "" is
# not a registered task, so captioning must use the documented "<CAPTION>"
# prompt and look up the result under the same key.
CAPTION_TASK = "<CAPTION>"


def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
    """Downscale *img* so its longest side is at most *max_side* (aspect kept).

    Returns the image unchanged when it already fits; never upscales.
    """
    w, h = img.size
    if max(w, h) <= max_side:
        return img
    r = max_side / max(w, h)
    return img.resize((int(w * r), int(h * r)), Image.LANCZOS)


def _ensure_rgb(img) -> Image.Image:
    """Validate the Gradio input and normalize it to RGB.

    Raises:
        gr.Error: if *img* is not a PIL image (e.g. nothing was uploaded).
    """
    if not isinstance(img, Image.Image):
        raise gr.Error("Upload a valid image.")
    return img.convert("RGB")


def _no_flash_attn_get_imports(filename):
    """Wrapper for transformers' get_imports that drops ``flash_attn``.

    Florence-2's remote modeling code declares flash_attn as an import; on a
    CPU-only box it is not installed, so we filter it out for that module only.
    """
    imps = _orig_get_imports(filename)
    try:
        # "florence2" also matches "modeling_florence2.py", so one check suffices.
        if "florence2" in str(filename).lower():
            return [x for x in imps if x != "flash_attn"]
    except Exception:
        pass
    return imps


@lru_cache(maxsize=1)
def _load_florence():
    """Load and cache the Florence-2 processor and model (one load per process)."""
    proc = AutoProcessor.from_pretrained(CAPTION_MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
    with patch("transformers.dynamic_module_utils.get_imports", _no_flash_attn_get_imports):
        mdl = AutoModelForCausalLM.from_pretrained(
            CAPTION_MODEL_ID,
            trust_remote_code=True,
            token=HF_TOKEN,
            attn_implementation="sdpa",  # CPU-safe
            torch_dtype=DTYPE,
            device_map="cpu",
        ).eval()
    return proc, mdl


@torch.inference_mode()
def caption(image: Image.Image, max_new_tokens: int = 128, num_beams: int = 3) -> str:
    """Generate a single-sentence caption for *image* using Florence-2-base.

    Args:
        image: input picture (any mode; converted to RGB and size-capped).
        max_new_tokens: generation length budget.
        num_beams: beam-search width (greedy decoding, no sampling).

    Returns:
        The stripped caption text, or a fallback message if parsing yields nothing.
    """
    image = _ensure_rgb(_resize_max(image))
    processor, model = _load_florence()
    batch = processor(
        text=CAPTION_TASK,  # Florence-2 task prompt, not free-form text
        images=[image],  # batch even for single
        padding=True,
        return_tensors="pt",
    )
    # move tensors to CPU device (BatchFeature may contain non-tensors)
    for k, v in list(batch.items()):
        if torch.is_tensor(v):
            batch[k] = v.to(DEVICE)
    out_ids = model.generate(
        **batch,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        do_sample=False,
        early_stopping=False,
    )
    # Keep special tokens: post_process_generation needs them to parse the task output.
    gen = processor.batch_decode(out_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        gen,
        task=CAPTION_TASK,
        image_size=(image.width, image.height),  # single (w, h) tuple, not a list
    )
    # post_process_generation returns {task: result}; stay defensive about shape.
    data = parsed[0] if isinstance(parsed, list) else parsed
    return (data.get(CAPTION_TASK, "") or "Unable to generate a caption.").strip()


def run(image: Image.Image):
    """Gradio click handler: return (caption text, status line)."""
    txt = caption(image)
    return txt, "Model: Florence-2-base (CPU)"


with gr.Blocks(css="footer{visibility:hidden}") as demo:
    gr.Markdown("# Image → Caption (CPU) — Florence-2-base")
    with gr.Row():
        with gr.Column():
            img = gr.Image(type="pil", label="Image", sources=["upload", "clipboard", "webcam"])
            btn = gr.Button("Caption", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="Caption", lines=10)
            status = gr.Markdown()
    btn.click(run, [img], [out, status], scroll_to_output=True)

if __name__ == "__main__":
    demo.queue(max_size=8).launch()