# app.py — Gradio image-captioning demo for Hugging Face Spaces
"""
Hugging Face Spaces App - Image Captioning
Deploy this to HF Spaces for free hosting
"""
import time

import gradio as gr
import torch
from PIL import Image
def load_models():
    """Load the captioning models, recording (not raising) load failures.

    Returns:
        dict: Contains ``blip_processor``/``blip_model`` and
        ``git_processor``/``git_model`` for each model that loaded; if a
        model failed, the corresponding ``blip_error``/``git_error`` key
        holds the error message instead.
    """
    models = {}

    # BLIP (Salesforce) — best-effort: a failure is logged and recorded so
    # the app can still run with whichever models did load.
    try:
        from transformers import BlipProcessor, BlipForConditionalGeneration
        print("Loading BLIP model...")
        models['blip_processor'] = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        models['blip_model'] = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        print("✅ BLIP loaded successfully")
    except Exception as e:  # broad on purpose: any import/download error is non-fatal
        print(f"❌ BLIP failed to load: {e}")
        models['blip_error'] = str(e)

    # GIT (Microsoft) — same best-effort pattern.
    try:
        from transformers import AutoProcessor, AutoModelForCausalLM
        print("Loading GIT model...")
        models['git_processor'] = AutoProcessor.from_pretrained("microsoft/git-base")
        models['git_model'] = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
        print("✅ GIT loaded successfully")
    except Exception as e:
        print(f"❌ GIT failed to load: {e}")
        models['git_error'] = str(e)

    return models
# Load models once at import time so individual requests don't pay the
# (slow) download/initialization cost.
print("🚀 Loading AI models...")
models = load_models()
print("📦 Model loading completed")
def generate_captions(image, true_caption=""):
    """Run every available captioning model on *image* and format the results.

    Args:
        image: A ``PIL.Image.Image``, or ``None`` (returns an error message).
        true_caption: Optional ground-truth caption echoed back so the user
            can compare it against the model outputs.

    Returns:
        str: Markdown text with one section per model plus total timing.
    """
    if image is None:
        return "❌ Please upload an image first."

    # The captioning models expect 3-channel RGB input.
    if image.mode != 'RGB':
        image = image.convert('RGB')

    results = []
    start_time = time.time()

    # Echo the reference caption, if one was provided.
    if true_caption.strip():
        results.append("**🎯 True Caption:**")
        results.append(true_caption.strip())
        results.append("")

    # BLIP model (a startup load failure is reported, not raised).
    if 'blip_model' in models:
        try:
            blip_start = time.time()
            inputs = models['blip_processor'](image, return_tensors="pt")
            # Inference only: no_grad skips autograd bookkeeping and saves memory.
            with torch.no_grad():
                out = models['blip_model'].generate(**inputs, max_length=50, num_beams=5)
            blip_caption = models['blip_processor'].decode(out[0], skip_special_tokens=True)
            blip_time = time.time() - blip_start
            results.append(f"**🤖 BLIP Model:** ({blip_time:.2f}s)")
            results.append(blip_caption)
            results.append("")
        except Exception as e:  # keep the UI responsive if one model errors
            results.append(f"**🤖 BLIP Model:** Error - {str(e)}")
            results.append("")
    elif 'blip_error' in models:
        results.append(f"**🤖 BLIP Model:** Not available - {models['blip_error']}")
        results.append("")

    # GIT model (same best-effort handling as BLIP).
    if 'git_model' in models:
        try:
            git_start = time.time()
            inputs = models['git_processor'](images=image, return_tensors="pt")
            with torch.no_grad():
                generated_ids = models['git_model'].generate(
                    pixel_values=inputs.pixel_values,
                    max_length=50,
                    num_beams=5
                )
            git_caption = models['git_processor'].batch_decode(generated_ids, skip_special_tokens=True)[0]
            git_time = time.time() - git_start
            results.append(f"**🧠 GIT Model:** ({git_time:.2f}s)")
            results.append(git_caption)
            results.append("")
        except Exception as e:
            results.append(f"**🧠 GIT Model:** Error - {str(e)}")
            results.append("")
    elif 'git_error' in models:
        results.append(f"**🧠 GIT Model:** Not available - {models['git_error']}")
        results.append("")

    # Footer: overall timing and model attribution.
    total_time = time.time() - start_time
    results.append("---")
    results.append(f"**⏱️ Total Processing Time:** {total_time:.2f} seconds")
    results.append("")
    results.append("**📚 About the Models:**")
    results.append("• **BLIP**: Salesforce's Bootstrapping Language-Image Pre-training")
    results.append("• **GIT**: Microsoft's Generative Image-to-text Transformer")
    return "\n".join(results)
# --- Gradio interface --------------------------------------------------
with gr.Blocks(
    title="AI Image Captioning",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    """
) as demo:
    gr.Markdown("""
    # 🤖 AI Image Captioning

    Upload an image and get captions from multiple state-of-the-art AI models!

    **Available Models:**
    - 🤖 **BLIP** (Salesforce): Fast and accurate image captioning
    - 🧠 **GIT** (Microsoft): Advanced generative image-to-text model

    *Simply upload an image or try one of the examples below!*
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # Inputs: image to caption plus an optional reference caption.
            image_input = gr.Image(
                type="pil",  # hand the callback a PIL.Image, not a numpy array
                label="📸 Upload Your Image",
                height=400
            )
            true_caption_input = gr.Textbox(
                label="🎯 True Caption (Optional)",
                placeholder="Enter the correct caption to compare with AI predictions...",
                lines=2
            )
            generate_btn = gr.Button(
                "✨ Generate Captions",
                variant="primary",
                size="lg"
            )
        with gr.Column(scale=1):
            output = gr.Textbox(
                label="🤖 AI Generated Captions",
                lines=20,
                max_lines=25,
                show_copy_button=True
            )

    # Clickable example images paired with reference captions.
    gr.Markdown("### 📋 Try These Examples:")
    example_images = [
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat.jpg", "A cat sitting on a surface"],
        ["https://huggingface.co/datasets/mishig/sample_images/resolve/main/dog.jpg", "A dog in a field"],
        ["https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=500", "A mountain landscape with snow"],
        ["https://images.unsplash.com/photo-1549298916-b41d501d3772?w=500", "A red sports car"],
        ["https://images.unsplash.com/photo-1551963831-b3b1ca40c98e?w=500", "A breakfast with coffee and pastries"],
    ]
    gr.Examples(
        examples=example_images,
        inputs=[image_input, true_caption_input],
        outputs=output,
        fn=generate_captions,
        cache_examples=False  # don't pre-run the models at build time
    )

    # Explicit button click...
    generate_btn.click(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )
    # ...and auto-generate whenever the uploaded image changes.
    image_input.change(
        fn=generate_captions,
        inputs=[image_input, true_caption_input],
        outputs=output
    )

    gr.Markdown("""
    ---
    **🔧 Technical Details:**
    - Models run on Hugging Face's infrastructure
    - Processing time varies based on image size and complexity
    - All models are open-source and publicly available

    **💡 Tips:**
    - Try different types of images (people, objects, landscapes, etc.)
    - Compare the AI captions with your own description
    - Larger images may take longer to process
    """)
# Launch the app
if __name__ == "__main__":
    # Start the Gradio web server with default settings.
    demo.launch()