|
|
|
|
|
|
|
|
import os |
|
|
from functools import lru_cache |
|
|
|
|
|
import torch |
|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
try: |
|
|
import pillow_avif |
|
|
except Exception: |
|
|
pass |
|
|
try: |
|
|
from pillow_heif import register_heif_opener |
|
|
register_heif_opener() |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
from transformers import AutoProcessor, AutoModelForCausalLM |
|
|
from unittest.mock import patch |
|
|
from transformers.dynamic_module_utils import get_imports as _orig_get_imports |
|
|
|
|
|
CAPTION_MODEL_ID = "microsoft/Florence-2-base" |
|
|
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN") |
|
|
DEVICE = "cpu" |
|
|
DTYPE = torch.float32 |
|
|
MAX_IMG_SIDE = int(os.getenv("MAX_IMG_SIDE", "1024")) |
|
|
|
|
|
def _resize_max(img: Image.Image, max_side: int = MAX_IMG_SIDE) -> Image.Image:
    """Downscale *img* so its longer side is at most *max_side*, keeping aspect ratio.

    Args:
        img: Source PIL image.
        max_side: Maximum allowed length of the longer side, in pixels.

    Returns:
        The original image when it already fits, otherwise a LANCZOS-resampled copy.
    """
    w, h = img.size
    longest = max(w, h)
    if longest <= max_side:
        return img
    scale = max_side / longest
    # max(1, ...) guards against a zero dimension for extreme aspect ratios
    # (e.g. a 4000x1 strip), which would make Image.resize raise.
    return img.resize((max(1, int(w * scale)), max(1, int(h * scale))), Image.LANCZOS)
|
|
|
|
|
def _ensure_rgb(img) -> Image.Image:
    """Coerce *img* to an RGB PIL image, rejecting anything that is not an image.

    Raises:
        gr.Error: When the input is not a ``PIL.Image.Image`` instance.
    """
    if isinstance(img, Image.Image):
        return img.convert("RGB")
    raise gr.Error("Upload a valid image.")
|
|
|
|
|
def _no_flash_attn_get_imports(filename):
    """Return transformers' import list for *filename*, minus ``flash_attn``
    for Florence-2 modules.

    Florence-2's remote modeling code declares an optional flash-attn import
    that is unavailable on CPU-only hosts; stripping it lets the module load.
    Falls back to the unmodified list on any inspection failure.
    """
    imports = _orig_get_imports(filename)
    try:
        fname = str(filename).lower()
        if "florence2" not in fname and "modeling_florence2.py" not in fname:
            return imports
        return [module for module in imports if module != "flash_attn"]
    except Exception:
        # Best effort: never let name inspection break module loading.
        return imports
|
|
|
|
|
@lru_cache(maxsize=1)
def _load_florence():
    """Load and memoize the Florence-2 processor and model (singleton via lru_cache).

    Returns:
        tuple: ``(AutoProcessor, AutoModelForCausalLM)`` configured for CPU inference.
    """
    proc = AutoProcessor.from_pretrained(CAPTION_MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
    # Patch get_imports while the remote (trust_remote_code) module is resolved
    # so the optional, GPU-only flash_attn dependency is not required here.
    with patch("transformers.dynamic_module_utils.get_imports", _no_flash_attn_get_imports):
        mdl = AutoModelForCausalLM.from_pretrained(
            CAPTION_MODEL_ID,
            trust_remote_code=True,
            token=HF_TOKEN,
            attn_implementation="sdpa",  # SDPA attention works on CPU; avoids flash-attn
            torch_dtype=DTYPE,
            device_map="cpu",
        ).eval()  # inference mode: disables dropout etc.
    return proc, mdl
|
|
|
|
|
@torch.inference_mode()
def caption(image: Image.Image, max_new_tokens: int = 128, num_beams: int = 3) -> str:
    """Generate a detailed caption for *image* with Florence-2.

    Args:
        image: Input PIL image (any mode; converted to RGB and size-capped).
        max_new_tokens: Generation length budget.
        num_beams: Beam-search width (greedy decoding, no sampling).

    Returns:
        The stripped caption text, or a fallback message when parsing yields nothing.

    Raises:
        gr.Error: When *image* is not a valid PIL image.
    """
    # Validate BEFORE resizing: the original order called _resize_max first,
    # so a None/non-image upload raised an opaque AttributeError instead of
    # the friendly gr.Error from _ensure_rgb.
    image = _resize_max(_ensure_rgb(image))
    processor, model = _load_florence()
    batch = processor(
        text="<MORE_DETAILED_CAPTION>",
        images=[image],
        padding=True,
        return_tensors="pt",
    )

    # Move all tensors to the target device (CPU here; keeps code device-agnostic).
    for key, value in list(batch.items()):
        if torch.is_tensor(value):
            batch[key] = value.to(DEVICE)

    out_ids = model.generate(
        **batch,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        do_sample=False,
        early_stopping=False,
    )
    # Keep special tokens: Florence's post-processor parses the raw task markup.
    gen = processor.batch_decode(out_ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(
        gen, task="<MORE_DETAILED_CAPTION>", image_size=[(image.width, image.height)]
    )
    data = parsed[0] if isinstance(parsed, list) else parsed
    return (data.get("<MORE_DETAILED_CAPTION>", "") or "Unable to generate a caption.").strip()
|
|
|
|
|
def run(image: Image.Image):
    """Gradio click handler: caption *image* and report the model used."""
    return caption(image), "Model: Florence-2-base (CPU)"
|
|
|
|
|
# UI layout: image input + button on the left, caption output on the right.
with gr.Blocks(css="footer{visibility:hidden}") as demo:
    # Fix mojibake in the heading: the source contained encoding-damaged
    # characters ("β") where the arrow/separator glyphs belong.
    gr.Markdown("# Image → Caption (CPU) — Florence-2-base")
    with gr.Row():
        with gr.Column():
            img = gr.Image(type="pil", label="Image", sources=["upload", "clipboard", "webcam"])
            btn = gr.Button("Caption", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="Caption", lines=10)
            status = gr.Markdown()
    btn.click(run, [img], [out, status], scroll_to_output=True)
|
|
|
|
|
if __name__ == "__main__":
    # Bound the request queue so a burst of uploads cannot pile up on one CPU box.
    demo.queue(max_size=8).launch()
|
|
|