Spaces:
Sleeping
Sleeping
| """ | |
| BLIP Image Captioner — HF Space | |
| Real image-to-text captioning using Salesforce's BLIP model. | |
| """ | |
| from __future__ import annotations | |
| import time | |
| from typing import Optional | |
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import BlipForConditionalGeneration, BlipProcessor | |
| # ─────────────────────────────────────────────────────────────────── | |
| # Model loading | |
| # ─────────────────────────────────────────────────────────────────── | |
# Identifier handed to both ``from_pretrained`` calls in load_model().
MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Module-level singletons; both stay ``None`` until load_model() fills them.
_processor: Optional[BlipProcessor] = None
_model: Optional[BlipForConditionalGeneration] = None


def load_model():
    """Populate the module-level BLIP processor and model on first use.

    The function is a no-op once ``_model`` has been set, so callers can
    invoke it unconditionally before every inference. The processor is
    assigned before the model, and the model is switched to eval mode.
    """
    global _model, _processor

    if _model is None:
        _processor = BlipProcessor.from_pretrained(MODEL_NAME)
        _model = BlipForConditionalGeneration.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,
        )
        _model.eval()
| # ─────────────────────────────────────────────────────────────────── | |
| # Caption generation | |
| # ─────────────────────────────────────────────────────────────────── | |
def caption_image(
    image: Image.Image,
    prompt: str,
    max_length: int,
    num_beams: int,
) -> tuple[str, str]:
    """Generate a caption for an image, optionally conditioned on a prompt.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        prompt: Optional text prefix. ``None``, empty, or whitespace-only
            values select unconditional captioning.
        max_length: Upper bound on generated tokens; forwarded to
            ``generate`` as ``max_new_tokens``. Values below 1 are raised
            to 1.
        num_beams: Beam-search width forwarded to ``generate``. Values
            below 1 are raised to 1.

    Returns:
        A ``(caption, latency)`` pair of strings. ``latency`` is the time
        spent in preprocessing plus generation, formatted as whole
        milliseconds (e.g. ``"1234 ms"``). When ``image`` is ``None`` a
        placeholder message and ``"0 ms"`` are returned and the model is
        not loaded.
    """
    if image is None:
        return "_Upload an image to get a caption._", "0 ms"

    load_model()
    rgb_image = image.convert("RGB")
    # A blank prompt becomes None so a single processor call covers both the
    # conditional and the unconditional case.
    text = (prompt or "").strip() or None
    # The UI sliders deliver numbers that may be floats; coerce and clamp so
    # direct callers cannot pass a non-positive value through to generate().
    beams = max(1, int(num_beams))
    new_tokens = max(1, int(max_length))

    start = time.perf_counter()
    inputs = _processor(images=rgb_image, text=text, return_tensors="pt")
    with torch.inference_mode():
        output_ids = _model.generate(
            **inputs,
            max_new_tokens=new_tokens,
            num_beams=beams,
            # early_stopping is a beam-search option; only request it when
            # more than one beam is actually in use.
            early_stopping=beams > 1,
        )
    latency_ms = (time.perf_counter() - start) * 1000

    caption = _processor.decode(output_ids[0], skip_special_tokens=True)
    return caption, f"{latency_ms:.0f} ms"
| # ─────────────────────────────────────────────────────────────────── | |
| # Multiple captions (variety sampling) | |
| # ─────────────────────────────────────────────────────────────────── | |
def generate_multiple_captions(image: Image.Image, n: int = 3) -> str:
    """Generate several captions for one image using increasing beam widths.

    Args:
        image: PIL image to caption, or ``None`` when nothing was uploaded.
        n: Number of captions to produce. Caption ``i`` (0-based) uses a
            beam width of ``2 * i + 1``, so the default of 3 yields widths
            1, 3 and 5. Values below 1 are treated as 1.

    Returns:
        A Markdown string: a bold header line with the total latency in
        whole milliseconds, followed by one bullet per beam width. When
        ``image`` is ``None`` a placeholder message is returned and the
        model is not loaded.
    """
    if image is None:
        return "_Upload an image first._"

    load_model()
    rgb_image = image.convert("RGB")
    # Odd widths 1, 3, 5, ...; honours ``n`` instead of a fixed tuple.
    beam_widths = range(1, 2 * max(1, int(n)), 2)

    start = time.perf_counter()
    # The image is preprocessed once and reused for every beam width.
    inputs = _processor(images=rgb_image, return_tensors="pt")
    captions = []
    with torch.inference_mode():
        for beams in beam_widths:
            output_ids = _model.generate(
                **inputs,
                max_new_tokens=50,
                num_beams=beams,
                # early_stopping is a beam-search option; skip it for the
                # single-beam run.
                early_stopping=beams > 1,
            )
            cap = _processor.decode(output_ids[0], skip_special_tokens=True)
            captions.append((beams, cap))
    latency_ms = (time.perf_counter() - start) * 1000

    lines = [f"**Generated in {latency_ms:.0f} ms:**\n"]
    lines.extend(f"- **Beams={beams}:** {cap}" for beams, cap in captions)
    return "\n".join(lines)
| # ─────────────────────────────────────────────────────────────────── | |
| # Gradio UI | |
| # ─────────────────────────────────────────────────────────────────── | |
# Markdown copy for the UI. Kept as flush-left module-level constants so the
# text handed to gr.Markdown does not depend on the indentation of the layout
# code below.
_INTRO_MD = """
# BLIP Image Captioner
Generate natural-language descriptions for any image using
**Salesforce's BLIP** (Bootstrapping Language-Image Pre-training).
Runs on HF's free CPU tier. First request loads the model (~20s),
subsequent captions generate in a few seconds.
> Try uploading a photo of a person, scene, object, or activity.
> You can optionally provide a **prompt prefix** to condition
> the caption (e.g., "a photograph of" or "a painting of").
"""

_VARIETY_MD = """
Generate **multiple captions** with different beam search
widths to see how the model's output varies. Higher beam
width tends to produce more grammatical but sometimes
blander captions.
"""

# Blank lines separate the metadata fields (and the paragraph after the
# numbered list) so Markdown renders them as distinct paragraphs instead of
# one run-on block.
_ABOUT_MD = """
## Model
**Name:** [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)

**Paper:** [BLIP: Bootstrapping Language-Image Pre-training](https://arxiv.org/abs/2201.12086)
(Li et al., 2022)

**Architecture:** ViT-base vision encoder + BERT-base
language decoder with cross-attention. Pre-trained on
a large corpus of image-caption pairs from the web with
a self-filtering approach (CapFilt) to clean noisy data.

**Parameters:** ~250M (base variant)

**Training data:** COCO, Visual Genome, SBU Captions,
Conceptual Captions, Conceptual 12M
## Why BLIP?
Pre-BLIP vision-language models typically fell into two
camps: **understanding** models (CLIP) or **generation**
models (image captioning). BLIP unifies both by training
a single model that can do:
1. **Image-text contrastive learning** (like CLIP)
2. **Image-text matching** (binary classification)
3. **Image-grounded text generation** (captioning)

The "Bootstrapping" in the name refers to the CapFilt
training procedure — using the model itself to filter
and generate synthetic captions to improve the training
data.
## Limitations
- Base model (not large) — favors speed over quality
- Trained on English-language captions only
- May miss nuance or details in complex scenes
- Can struggle with rare objects or unusual scenes
## Tech Stack
- **transformers** — model loading and inference
- **torch** — tensor backend (CPU on HF free tier)
- **Pillow** — image processing
- **Gradio** — UI
---
**Source:** [github.com/wolfwdavid/ai-tools-collection](https://github.com/wolfwdavid/ai-tools-collection)
|
**HF Profile:** [@WolfDavid](https://huggingface.co/WolfDavid)
"""

# Each example row is (image URL, prompt prefix), matching the order of the
# `inputs` list given to gr.Examples.
_EXAMPLES = [
    ["https://images.unsplash.com/photo-1574158622682-e40e69881006?w=640", ""],
    ["https://images.unsplash.com/photo-1552053831-71594a27632d?w=640", ""],
    ["https://images.unsplash.com/photo-1502920917128-1aa500764cbd?w=640", "a photograph of"],
]

with gr.Blocks(title="BLIP Image Captioner", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Tabs():
        # ─────────────────────────────────────────────────────────
        # Tab 1 — Single Caption
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Single Caption"):
            with gr.Row():
                # Left column: inputs and generation controls.
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    prompt_input = gr.Textbox(
                        label="Optional Prompt Prefix",
                        placeholder="e.g., 'a photograph of' (leave blank for unconditional)",
                    )
                    with gr.Row():
                        max_length = gr.Slider(
                            minimum=20,
                            maximum=100,
                            step=5,
                            value=50,
                            label="Max Caption Length",
                        )
                        num_beams = gr.Slider(
                            minimum=1,
                            maximum=8,
                            step=1,
                            value=5,
                            label="Beam Search Width",
                        )
                    caption_btn = gr.Button(
                        "Generate Caption",
                        variant="primary",
                        size="lg",
                    )
                # Right column: read-only results.
                with gr.Column(scale=1):
                    caption_output = gr.Textbox(
                        label="Generated Caption",
                        lines=3,
                        interactive=False,
                    )
                    latency_output = gr.Textbox(
                        label="Latency",
                        interactive=False,
                    )

            # Input order must match caption_image's positional parameters.
            caption_btn.click(
                caption_image,
                inputs=[image_input, prompt_input, max_length, num_beams],
                outputs=[caption_output, latency_output],
            )
            gr.Examples(
                examples=_EXAMPLES,
                inputs=[image_input, prompt_input],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 2 — Variety Comparison
        # ─────────────────────────────────────────────────────────
        with gr.Tab("Variety Comparison"):
            gr.Markdown(_VARIETY_MD)
            with gr.Row():
                with gr.Column(scale=1):
                    image_input_var = gr.Image(
                        type="pil",
                        label="Upload Image",
                        height=400,
                    )
                    variety_btn = gr.Button(
                        "Generate 3 Captions",
                        variant="primary",
                        size="lg",
                    )
                with gr.Column(scale=1):
                    variety_output = gr.Markdown()

            # Only the image is wired in, so generate_multiple_captions runs
            # with its default caption count.
            variety_btn.click(
                generate_multiple_captions,
                inputs=[image_input_var],
                outputs=[variety_output],
            )

        # ─────────────────────────────────────────────────────────
        # Tab 3 — About
        # ─────────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown(_ABOUT_MD)


if __name__ == "__main__":
    demo.launch()