"""
BLIP Image Captioner — HF Space

Real image-to-text captioning using Salesforce's BLIP model.
"""

from __future__ import annotations

import threading
import time
from typing import Optional

import gradio as gr
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# ═══════════════════════════════════════════════════════════════════
# Model loading
# ═══════════════════════════════════════════════════════════════════

MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Populated by load_model() on first use; None until then.
_model: Optional[BlipForConditionalGeneration] = None
_processor: Optional[BlipProcessor] = None

# Guards the one-time load so that two event handlers hitting a cold start
# at the same moment do not both download/instantiate the model.
_load_lock = threading.Lock()


def load_model() -> None:
    """Load the BLIP model and processor on first use.

    Subsequent calls return immediately. The module-level ``_processor`` and
    ``_model`` are assigned only after the model has been switched to eval
    mode, so a caller that observes ``_model is not None`` always gets a
    fully initialised pair.
    """
    global _model, _processor

    # Fast path: already loaded, no need to take the lock.
    if _model is not None:
        return

    with _load_lock:
        # Re-check: another thread may have finished loading while we waited.
        if _model is not None:
            return

        processor = BlipProcessor.from_pretrained(MODEL_NAME)
        model = BlipForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
        )
        model.eval()

        # Publish the processor first and the model last: `_model` is the
        # flag the fast path checks.
        _processor = processor
        _model = model


# ═══════════════════════════════════════════════════════════════════
# Caption generation
# ═══════════════════════════════════════════════════════════════════

def caption_image(
    image: Optional[Image.Image],
    prompt: str,
    max_length: int,
    num_beams: int,
) -> tuple[str, str]:
    """Generate a caption for an image, optionally conditioned on a prompt.

    Args:
        image: The image to caption, or ``None`` when nothing was uploaded.
        prompt: Optional text passed to the processor together with the
            image. ``None``, empty, or whitespace-only means unconditional
            captioning.
        max_length: Upper bound on newly generated tokens
            (forwarded as ``max_new_tokens``). Values below 1 are raised to 1.
        num_beams: Beam search width. Values below 1 are raised to 1.

    Returns:
        A ``(caption, latency)`` tuple of strings. ``latency`` is formatted
        as ``"<n> ms"`` and covers preprocessing plus generation. When
        ``image`` is ``None`` a placeholder message and ``"0 ms"`` are
        returned and the model is not loaded.
    """
    if image is None:
        return "_Upload an image to get a caption._", "0 ms"

    load_model()

    image = image.convert("RGB")
    prompt = (prompt or "").strip()

    # UI sliders may deliver floats; coerce and keep the values in a range
    # generate() accepts.
    beams = max(1, int(num_beams))
    generate_kwargs = {
        "max_new_tokens": max(1, int(max_length)),
        "num_beams": beams,
    }
    # early_stopping is a beam-search-only flag; transformers warns when it
    # is set while num_beams == 1, so only send it when it applies.
    if beams > 1:
        generate_kwargs["early_stopping"] = True

    start = time.perf_counter()

    if prompt:
        inputs = _processor(image, prompt, return_tensors="pt")
    else:
        inputs = _processor(image, return_tensors="pt")

    with torch.inference_mode():
        output_ids = _model.generate(**inputs, **generate_kwargs)

    latency_ms = (time.perf_counter() - start) * 1000

    caption = _processor.decode(output_ids[0], skip_special_tokens=True)
    return caption, f"{latency_ms:.0f} ms"


# ═══════════════════════════════════════════════════════════════════
# Multiple captions (variety sampling)
# ═══════════════════════════════════════════════════════════════════

def generate_multiple_captions(image: Optional[Image.Image], n: int = 3) -> str:
    """Generate multiple captions with different beam sizes for variety.

    Args:
        image: The image to caption, or ``None`` when nothing was uploaded.
        n: Number of captions to produce. Caption ``i`` (0-based) uses beam
            width ``2 * i + 1``, so the default ``n=3`` yields widths
            1, 3 and 5. Values below 1 are raised to 1.

    Returns:
        A Markdown string: a header line with the total elapsed time followed
        by one bullet per beam width. When ``image`` is ``None`` a
        placeholder message is returned and the model is not loaded.
    """
    if image is None:
        return "_Upload an image first._"

    load_model()
    image = image.convert("RGB")

    # First n odd beam widths: 1, 3, 5, ...
    beam_widths = [2 * i + 1 for i in range(max(1, int(n)))]

    start = time.perf_counter()
    # Preprocess once; the same inputs are reused for every beam width.
    inputs = _processor(image, return_tensors="pt")

    captions = []
    with torch.inference_mode():
        for beams in beam_widths:
            generate_kwargs = {"max_new_tokens": 50, "num_beams": beams}
            # early_stopping is a beam-search-only flag; transformers warns
            # when it is set while num_beams == 1.
            if beams > 1:
                generate_kwargs["early_stopping"] = True
            output_ids = _model.generate(**inputs, **generate_kwargs)
            cap = _processor.decode(output_ids[0], skip_special_tokens=True)
            captions.append((beams, cap))

    latency_ms = (time.perf_counter() - start) * 1000

    lines = [f"**Generated in {latency_ms:.0f} ms:**\n"]
    lines.extend(f"- **Beams={beams}:** {cap}" for beams, cap in captions)
    return "\n".join(lines)


# ═══════════════════════════════════════════════════════════════════
# Gradio UI
# ═══════════════════════════════════════════════════════════════════

with gr.Blocks(title="BLIP Image Captioner", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # BLIP Image Captioner

        Generate natural-language descriptions for any image using
        **Salesforce's BLIP** (Bootstrapping Language-Image Pre-training).

        Runs on HF's free CPU tier. First request loads the model (~20s),
        subsequent captions generate in a few seconds.

        > Try uploading a photo of a person, scene, object, or activity.
        > You can optionally provide a **prompt prefix** to condition
        > the caption (e.g., "a photograph of" or "a painting of").
        """
    )

    with gr.Tabs():
        # ─────────────────────────────────────────────────────────
        # Tab 1 — Single Caption
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Single Caption"):
            with gr.Row():
                # Left column: inputs and generation controls.
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    prompt_input = gr.Textbox(
                        label="Optional Prompt Prefix",
                        placeholder="e.g., 'a photograph of' (leave blank for unconditional)",
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=20,
                            maximum=100,
                            step=5,
                            value=50,
                            label="Max Caption Length",
                        )
                        num_beams = gr.Slider(
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=5,
                            label="Beam Search Width",
                        )
                    caption_btn = gr.Button(
                        "Generate Caption",
                        variant="primary",
                        size="lg",
                    )

                # Right column: caption text and measured latency.
                with gr.Column(scale=1):
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=3,
                        interactive=False,
                    )
                    latency_output = gr.Textbox(
                        label="Latency",
                        interactive=False,
                    )

            caption_btn.click(
                caption_image,
                inputs=[image_input, prompt_input, max_length, num_beams],
                outputs=[caption_output, latency_output],
            )

            # Each example fills the image and prompt inputs only.
            gr.Examples(
                examples=[
                    ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", ""],
                    ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", ""],
                    ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", "a photograph of"],
                ],
                inputs=[image_input, prompt_input],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 2 — Variety Comparison
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Variety Comparison"):
            gr.Markdown(
                """
                Generate **multiple captions** with different beam search widths
                to see how the model's output varies. Higher beam width tends to
                produce more grammatical but sometimes blander captions.
                """
            )

            with gr.Row():
                with gr.Column(scale=1):
                    image_input_var = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    variety_btn = gr.Button(
                        "Generate 3 Captions",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column(scale=1):
                    variety_output = gr.Markdown()

            # Only the image is wired in, so `n` keeps its default of 3,
            # matching the button label.
            variety_btn.click(
                generate_multiple_captions,
                inputs=[image_input_var],
                outputs=[variety_output],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 3 — About
        # ─────────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## Model

                **Name:** [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)

                **Paper:** [BLIP: Bootstrapping Language-Image Pre-training](https://arxiv.org/abs/2201.12086) (Li et al., 2022)

                **Architecture:** ViT-base vision encoder + BERT-base language decoder
                with cross-attention. Pre-trained on a large corpus of image-caption pairs
                from the web with a self-filtering approach (CapFilt) to clean noisy data.

                **Parameters:** ~250M (base variant)

                **Training data:** COCO, Visual Genome, SBU Captions, Conceptual Captions,
                Conceptual 12M

                ## Why BLIP?

                Pre-BLIP vision-language models typically fell into two camps:
                **understanding** models (CLIP) or **generation** models (image captioning).
                BLIP unifies both by training a single model that can do:

                1. **Image-text contrastive learning** (like CLIP)
                2. **Image-text matching** (binary classification)
                3. **Image-grounded text generation** (captioning)

                The "Bootstrapping" in the name refers to the CapFilt training procedure —
                using the model itself to filter and generate synthetic captions to improve
                the training data.

                ## Limitations

                - Base model (not large) — favors speed over quality
                - Trained on English-language captions only
                - May miss nuance or details in complex scenes
                - Can struggle with rare objects or unusual scenes

                ## Tech Stack

                - **transformers** — model loading and inference
                - **torch** — tensor backend (CPU on HF free tier)
                - **Pillow** — image processing
                - **Gradio** — UI

                ---

                **Source:** [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection)  |  **HF Profile:** [@WolfDavid](https://huggingface.co/WolfDavid)
                """
            )


if __name__ == "__main__":
    demo.launch()