blip-captioner / app.py
WolfDavid's picture
Initial deploy: BLIP image captioning
a388160
"""
BLIP Image Captioner — HF Space
Real image-to-text captioning using Salesforce's BLIP model.
"""
from __future__ import annotations
import time
from typing import Optional
import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor
# ═══════════════════════════════════════════════════════════════════
# Model loading
# ═══════════════════════════════════════════════════════════════════
# Hugging Face Hub identifier of the BLIP checkpoint used for captioning.
MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Process-wide singletons. Both start as None and are filled in lazily by
# load_model() so that importing this module stays cheap.
_processor: BlipProcessor | None = None
_model: BlipForConditionalGeneration | None = None
def load_model():
    """Populate the shared BLIP processor and model the first time it is needed.

    On the first call the processor and model identified by ``MODEL_NAME``
    are loaded with ``from_pretrained`` and stored in the module-level
    ``_processor`` and ``_model`` globals, and the model is switched to eval
    mode. Once ``_model`` is set, further calls do nothing.
    """
    global _model, _processor

    # Only the model is checked: the processor is always assigned before the
    # model, so a non-None model implies the processor is set as well.
    if _model is None:
        _processor = BlipProcessor.from_pretrained(MODEL_NAME)
        _model = BlipForConditionalGeneration.from_pretrained(
            MODEL_NAME, torch_dtype=torch.float32
        )
        _model.eval()
# ═══════════════════════════════════════════════════════════════════
# Caption generation
# ═══════════════════════════════════════════════════════════════════
def caption_image(
    image: Image.Image,
    prompt: str,
    max_length: int,
    num_beams: int,
):
    """Generate a caption for an image, optionally conditioned on a prompt.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        prompt: Optional text prefix for the caption. ``None``, an empty
            string, or whitespace-only text selects unconditional captioning.
        max_length: Upper bound on the number of newly generated tokens
            (forwarded as ``max_new_tokens``). Values below 1 are raised to 1.
        num_beams: Beam-search width. Values below 1 are raised to 1; a width
            of 1 disables beam search.

    Returns:
        A ``(caption, latency)`` tuple of strings. ``latency`` is the time
        spent preprocessing and generating, formatted as ``"<n> ms"``; it
        excludes model loading and decoding. When ``image`` is ``None`` a
        placeholder message and ``"0 ms"`` are returned without loading the
        model.
    """
    if image is None:
        return "_Upload an image to get a caption._", "0 ms"

    load_model()
    rgb_image = image.convert("RGB")
    prefix = (prompt or "").strip()

    # Coerce to int (the values may arrive as floats) and keep them in the
    # range generate() accepts.
    new_tokens = max(1, int(max_length))
    beams = max(1, int(num_beams))

    start = time.perf_counter()
    if prefix:
        inputs = _processor(rgb_image, prefix, return_tensors="pt")
    else:
        inputs = _processor(rgb_image, return_tensors="pt")

    with torch.inference_mode():
        output_ids = _model.generate(
            **inputs,
            max_new_tokens=new_tokens,
            num_beams=beams,
            # early_stopping is a beam-search-only flag; passing True with a
            # single beam makes transformers warn about an unused setting.
            early_stopping=beams > 1,
        )
    latency_ms = (time.perf_counter() - start) * 1000

    caption = _processor.decode(output_ids[0], skip_special_tokens=True)
    return caption, f"{latency_ms:.0f} ms"
# ═══════════════════════════════════════════════════════════════════
# Multiple captions (variety sampling)
# ═══════════════════════════════════════════════════════════════════
def generate_multiple_captions(image: Image.Image, n: int = 3):
    """Generate several captions for one image using different beam widths.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        n: Number of captions to produce. Caption ``i`` (0-based) uses a beam
            width of ``1 + 2 * i``, so the default of 3 yields widths
            1, 3 and 5. Values below 1 are treated as 1.

    Returns:
        A Markdown string: a header line with the total generation time in
        milliseconds followed by one bullet per beam width. When ``image`` is
        ``None`` a placeholder message is returned without loading the model.
    """
    if image is None:
        return "_Upload an image first._"

    load_model()
    rgb_image = image.convert("RGB")

    # Odd widths starting at 1: (1, 3, 5) for the default n=3.
    count = max(1, int(n))
    beam_widths = [1 + 2 * i for i in range(count)]

    start = time.perf_counter()
    # The image is preprocessed once and the same tensors are reused per pass.
    inputs = _processor(rgb_image, return_tensors="pt")
    captions = []
    with torch.inference_mode():
        for beams in beam_widths:
            output_ids = _model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=beams,
                # early_stopping only applies to beam search; enabling it for
                # the single-beam pass makes transformers emit a warning.
                early_stopping=beams > 1,
            )
            text = _processor.decode(output_ids[0], skip_special_tokens=True)
            captions.append((beams, text))
    latency_ms = (time.perf_counter() - start) * 1000

    lines = [f"**Generated in {latency_ms:.0f} ms:**\n"]
    lines.extend(f"- **Beams={beams}:** {text}" for beams, text in captions)
    return "\n".join(lines)
# ═══════════════════════════════════════════════════════════════════
# Gradio UI
# ═══════════════════════════════════════════════════════════════════
# The Markdown literals below are indented with the code; this relies on
# gr.Markdown dedenting its value before rendering — confirm against the
# installed Gradio version.
with gr.Blocks(title="BLIP Image Captioner", theme=gr.themes.Soft()) as demo:
    # Page header shown above all tabs.
    gr.Markdown(
        """
        # BLIP Image Captioner

        Generate natural-language descriptions for any image using
        **Salesforce's BLIP** (Bootstrapping Language-Image Pre-training).

        Runs on HF's free CPU tier. First request loads the model (~20s),
        subsequent captions generate in a few seconds.

        > Try uploading a photo of a person, scene, object, or activity.
        > You can optionally provide a **prompt prefix** to condition
        > the caption (e.g., "a photograph of" or "a painting of").
        """
    )
    with gr.Tabs():
        # ─────────────────────────────────────────────────────────
        # Tab 1 — Single Caption
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Single Caption"):
            with gr.Row():
                # Left column: inputs and generation controls.
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    prompt_input = gr.Textbox(
                        label="Optional Prompt Prefix",
                        placeholder="e.g., 'a photograph of' (leave blank for unconditional)",
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=20,
                            maximum=100,
                            step=5,
                            value=50,
                            label="Max Caption Length",
                        )
                        num_beams = gr.Slider(
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=5,
                            label="Beam Search Width",
                        )
                    caption_btn = gr.Button(
                        "Generate Caption",
                        variant="primary",
                        size="lg",
                    )
                # Right column: read-only results.
                with gr.Column(scale=1):
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=3,
                        interactive=False,
                    )
                    latency_output = gr.Textbox(
                        label="Latency",
                        interactive=False,
                    )
            # caption_image returns (caption, latency), matching the two outputs.
            caption_btn.click(
                caption_image,
                inputs=[image_input, prompt_input, max_length, num_beams],
                outputs=[caption_output, latency_output],
            )
            # Each example row is [image URL, prompt prefix].
            gr.Examples(
                examples=[
                    ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", ""],
                    ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", ""],
                    ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", "a photograph of"],
                ],
                inputs=[image_input, prompt_input],
            )
        # ─────────────────────────────────────────────────────────
        # Tab 2 — Variety Comparison
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Variety Comparison"):
            gr.Markdown(
                """
                Generate **multiple captions** with different beam search
                widths to see how the model's output varies. Higher beam
                width tends to produce more grammatical but sometimes
                blander captions.
                """
            )
            with gr.Row():
                with gr.Column(scale=1):
                    image_input_var = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    variety_btn = gr.Button(
                        "Generate 3 Captions",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column(scale=1):
                    variety_output = gr.Markdown()
            # Only the image is passed, so generate_multiple_captions uses its
            # default caption count.
            variety_btn.click(
                generate_multiple_captions,
                inputs=[image_input_var],
                outputs=[variety_output],
            )
        # ─────────────────────────────────────────────────────────
        # Tab 3 — About
        # ─────────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## Model

                **Name:** [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)

                **Paper:** [BLIP: Bootstrapping Language-Image Pre-training](https://arxiv.org/abs/2201.12086)
                (Li et al., 2022)

                **Architecture:** ViT-base vision encoder + BERT-base
                language decoder with cross-attention. Pre-trained on
                a large corpus of image-caption pairs from the web with
                a self-filtering approach (CapFilt) to clean noisy data.

                **Parameters:** ~250M (base variant)

                **Training data:** COCO, Visual Genome, SBU Captions,
                Conceptual Captions, Conceptual 12M

                ## Why BLIP?

                Pre-BLIP vision-language models typically fell into two
                camps: **understanding** models (CLIP) or **generation**
                models (image captioning). BLIP unifies both by training
                a single model that can do:

                1. **Image-text contrastive learning** (like CLIP)
                2. **Image-text matching** (binary classification)
                3. **Image-grounded text generation** (captioning)

                The "Bootstrapping" in the name refers to the CapFilt
                training procedure — using the model itself to filter
                and generate synthetic captions to improve the training
                data.

                ## Limitations

                - Base model (not large) — favors speed over quality
                - Trained on English-language captions only
                - May miss nuance or details in complex scenes
                - Can struggle with rare objects or unusual scenes

                ## Tech Stack

                - **transformers** — model loading and inference
                - **torch** — tensor backend (CPU on HF free tier)
                - **Pillow** — image processing
                - **Gradio** — UI

                ---

                **Source:** [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection)
                |
                **HF Profile:** [@WolfDavid](https://huggingface.co/WolfDavid)
                """
            )

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()