Spaces:

WolfDavid
/

blip-captioner

Sleeping

App Files Files Community

blip-captioner / app.py

WolfDavid

Initial deploy: BLIP image captioning

a388160 about 1 month ago

raw

history blame contribute delete

12.1 kB

	"""
	BLIP Image Captioner — HF Space

	Real image-to-text captioning using Salesforce's BLIP model.
	"""

	from __future__ import annotations

	import time
	from typing import Optional

	import gradio as gr
	import torch
	from PIL import Image
	from transformers import BlipForConditionalGeneration, BlipProcessor

	# ═══════════════════════════════════════════════════════════════════
	# Model loading
	# ═══════════════════════════════════════════════════════════════════

	# Checkpoint identifier handed to both `from_pretrained` calls in load_model().
	MODEL_NAME = "Salesforce/blip-image-captioning-base"

	# Lazily populated singletons: both stay None until load_model() assigns them.
	_model: Optional[BlipForConditionalGeneration] = None
	_processor: Optional[BlipProcessor] = None


	def load_model():
	    """Populate the module-level BLIP processor and model exactly once.

	    The first call instantiates both objects from ``MODEL_NAME`` and switches
	    the model to eval mode; later calls see that ``_model`` is already set
	    and do nothing.
	    """
	    global _model, _processor

	    if _model is None:
	        # The processor is assigned first so that a non-None ``_model``
	        # always implies the processor is available as well.
	        _processor = BlipProcessor.from_pretrained(MODEL_NAME)
	        _model = BlipForConditionalGeneration.from_pretrained(
	            MODEL_NAME, torch_dtype=torch.float32
	        )
	        _model.eval()


	# ═══════════════════════════════════════════════════════════════════
	# Caption generation
	# ═══════════════════════════════════════════════════════════════════

	def caption_image(
	    image: Image.Image,
	    prompt: str,
	    max_length: int,
	    num_beams: int,
	):
	    """Generate a caption for an image, optionally conditioned on a prompt.

	    Args:
	        image: The uploaded image, or ``None`` when nothing was uploaded.
	        prompt: Optional text prefix for the caption. ``None``, empty, or
	            whitespace-only values mean unconditional captioning.
	        max_length: Upper bound on newly generated tokens (forwarded as
	            ``max_new_tokens``).
	        num_beams: Beam-search width; values below 1 are treated as 1.

	    Returns:
	        A ``(caption, latency)`` tuple of strings. ``latency`` is formatted
	        as whole milliseconds (e.g. ``"123 ms"``) and covers preprocessing
	        plus generation, not decoding. When ``image`` is ``None`` a
	        placeholder message and ``"0 ms"`` are returned without loading
	        the model.
	    """
	    if image is None:
	        return "_Upload an image to get a caption._", "0 ms"

	    load_model()

	    image = image.convert("RGB")
	    prompt = (prompt or "").strip()
	    beams = max(1, int(num_beams))

	    start = time.perf_counter()

	    # Pass the text only when a prompt was supplied, so the unconditional
	    # path calls the processor with the image alone.
	    processor_args = (image, prompt) if prompt else (image,)
	    inputs = _processor(*processor_args, return_tensors="pt")

	    with torch.inference_mode():
	        output_ids = _model.generate(
	            **inputs,
	            max_new_tokens=int(max_length),
	            num_beams=beams,
	            # early_stopping only applies to beam search; requesting it
	            # with a single beam makes transformers emit a warning.
	            early_stopping=beams > 1,
	        )

	    latency_ms = (time.perf_counter() - start) * 1000
	    caption = _processor.decode(output_ids[0], skip_special_tokens=True)

	    return caption, f"{latency_ms:.0f} ms"


	# ═══════════════════════════════════════════════════════════════════
	# Multiple captions (variety sampling)
	# ═══════════════════════════════════════════════════════════════════

	def generate_multiple_captions(image: Image.Image, n: int = 3):
	    """Generate several captions for one image using different beam widths.

	    Args:
	        image: The uploaded image, or ``None`` when nothing was uploaded.
	        n: Number of captions to produce. Caption ``i`` (0-based) uses a
	            beam width of ``2 * i + 1``, so the default ``n=3`` yields
	            widths 1, 3 and 5. Values below 1 are treated as 1.

	    Returns:
	        A Markdown string: a header line with the total latency in
	        milliseconds followed by one bullet per beam width. When ``image``
	        is ``None`` a placeholder message is returned without loading the
	        model.
	    """
	    if image is None:
	        return "_Upload an image first._"

	    load_model()
	    image = image.convert("RGB")

	    # Odd widths 1, 3, 5, ...; the default n=3 matches the widths that
	    # were previously hard-coded here.
	    beam_widths = [2 * i + 1 for i in range(max(1, int(n)))]

	    start = time.perf_counter()
	    # Preprocess once; the same tensors are reused for every beam width.
	    inputs = _processor(image, return_tensors="pt")

	    captions = []
	    with torch.inference_mode():
	        for beams in beam_widths:
	            output_ids = _model.generate(
	                **inputs,
	                max_new_tokens=50,
	                num_beams=beams,
	                # early_stopping only applies to beam search; skip it for
	                # the single-beam pass to avoid a transformers warning.
	                early_stopping=beams > 1,
	            )
	            cap = _processor.decode(output_ids[0], skip_special_tokens=True)
	            captions.append((beams, cap))

	    latency_ms = (time.perf_counter() - start) * 1000

	    # The trailing "\n" on the header leaves a blank line before the list.
	    lines = [f"Generated in {latency_ms:.0f} ms:\n"]
	    lines.extend(f"- Beams={beams}: {cap}" for beams, cap in captions)
	    return "\n".join(lines)


	# ═══════════════════════════════════════════════════════════════════
	# Gradio UI
	# ═══════════════════════════════════════════════════════════════════

	# Three-tab Gradio interface: single caption, beam-width comparison, and a
	# static "About" page. The Markdown bodies are indented along with the code.
	# NOTE(review): this relies on gr.Markdown stripping common leading
	# indentation from its value; confirm for the pinned Gradio version.
	with gr.Blocks(title="BLIP Image Captioner", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(
	        """
	        # BLIP Image Captioner

	        Generate natural-language descriptions for any image using
	        Salesforce's BLIP (Bootstrapping Language-Image Pre-training).

	        Runs on HF's free CPU tier. First request loads the model (~20s),
	        subsequent captions generate in a few seconds.

	        > Try uploading a photo of a person, scene, object, or activity.
	        > You can optionally provide a prompt prefix to condition
	        > the caption (e.g., "a photograph of" or "a painting of").
	        """
	    )

	    with gr.Tabs():
	        # ─────────────────────────────────────────────────────────
	        # Tab 1 — Single Caption
	        # ─────────────────────────────────────────────────────────
	        with gr.Tab("Single Caption"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    image_input = gr.Image(
	                        type="pil",
	                        label="Upload Image",
	                        height=400,
	                    )
	                    prompt_input = gr.Textbox(
	                        label="Optional Prompt Prefix",
	                        placeholder="e.g., 'a photograph of' (leave blank for unconditional)",
	                    )
	                    with gr.Row():
	                        max_length = gr.Slider(
	                            minimum=20,
	                            maximum=100,
	                            step=5,
	                            value=50,
	                            label="Max Caption Length",
	                        )
	                        num_beams = gr.Slider(
	                            minimum=1,
	                            maximum=8,
	                            step=1,
	                            value=5,
	                            label="Beam Search Width",
	                        )
	                    caption_btn = gr.Button(
	                        "Generate Caption",
	                        variant="primary",
	                        size="lg",
	                    )

	                with gr.Column(scale=1):
	                    caption_output = gr.Textbox(
	                        label="Generated Caption",
	                        lines=3,
	                        interactive=False,
	                    )
	                    latency_output = gr.Textbox(
	                        label="Latency",
	                        interactive=False,
	                    )

	            # Input order must match caption_image's positional parameters.
	            caption_btn.click(
	                caption_image,
	                inputs=[image_input, prompt_input, max_length, num_beams],
	                outputs=[caption_output, latency_output],
	            )

	            # Each example row is [image URL, prompt prefix].
	            gr.Examples(
	                examples=[
	                    ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", ""],
	                    ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", ""],
	                    ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", "a photograph of"],
	                ],
	                inputs=[image_input, prompt_input],
	            )

	        # ─────────────────────────────────────────────────────────
	        # Tab 2 — Variety Comparison
	        # ─────────────────────────────────────────────────────────
	        with gr.Tab("Variety Comparison"):
	            gr.Markdown(
	                """
	                Generate multiple captions with different beam search
	                widths to see how the model's output varies. Higher beam
	                width tends to produce more grammatical but sometimes
	                blander captions.
	                """
	            )
	            with gr.Row():
	                with gr.Column(scale=1):
	                    image_input_var = gr.Image(
	                        type="pil",
	                        label="Upload Image",
	                        height=400,
	                    )
	                    variety_btn = gr.Button(
	                        "Generate 3 Captions",
	                        variant="primary",
	                        size="lg",
	                    )
	                with gr.Column(scale=1):
	                    variety_output = gr.Markdown()

	            # Only the image is wired in, so the handler's `n` keeps its
	            # default value.
	            variety_btn.click(
	                generate_multiple_captions,
	                inputs=[image_input_var],
	                outputs=[variety_output],
	            )

	        # ─────────────────────────────────────────────────────────
	        # Tab 3 — About
	        # ─────────────────────────────────────────────────────────
	        with gr.Tab("About"):
	            # A plain "|" is used as the footer separator: "\|" in a
	            # non-raw string literal is an invalid escape sequence.
	            gr.Markdown(
	                """
	                ## Model

	                Name: [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)

	                Paper: [BLIP: Bootstrapping Language-Image Pre-training](https://arxiv.org/abs/2201.12086)
	                (Li et al., 2022)

	                Architecture: ViT-base vision encoder + BERT-base
	                language decoder with cross-attention. Pre-trained on
	                a large corpus of image-caption pairs from the web with
	                a self-filtering approach (CapFilt) to clean noisy data.

	                Parameters: ~250M (base variant)

	                Training data: COCO, Visual Genome, SBU Captions,
	                Conceptual Captions, Conceptual 12M

	                ## Why BLIP?

	                Pre-BLIP vision-language models typically fell into two
	                camps: understanding models (CLIP) or generation
	                models (image captioning). BLIP unifies both by training
	                a single model that can do:

	                1. Image-text contrastive learning (like CLIP)
	                2. Image-text matching (binary classification)
	                3. Image-grounded text generation (captioning)

	                The "Bootstrapping" in the name refers to the CapFilt
	                training procedure — using the model itself to filter
	                and generate synthetic captions to improve the training
	                data.

	                ## Limitations

	                - Base model (not large) — favors speed over quality
	                - Trained on English-language captions only
	                - May miss nuance or details in complex scenes
	                - Can struggle with rare objects or unusual scenes

	                ## Tech Stack

	                - transformers — model loading and inference
	                - torch — tensor backend (CPU on HF free tier)
	                - Pillow — image processing
	                - Gradio — UI

	                ---
	                Source: [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection)
	                |
	                HF Profile: [@WolfDavid](https://huggingface.co/WolfDavid)
	                """
	            )


	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()