"""Gradio app: image captioning / visual question answering with LLaVA."""

from functools import lru_cache
from io import BytesIO

import gradio as gr
import requests
from PIL import Image
from transformers import pipeline

# Fallback prompt used when the user supplies no question.
DEFAULT_PROMPT = "Describe this image in detail."


@lru_cache(maxsize=1)
def load_model():
    """Build the LLaVA image-text-to-text pipeline once and reuse it.

    Caching matters: without it, the 7B checkpoint is reloaded on every
    caption request, which is prohibitively slow.
    """
    return pipeline("image-text-to-text", model="llava-hf/llava-1.5-7b-hf")


def caption_image(image, question=None):
    """Generate a caption for *image*, or answer *question* about it.

    Args:
        image: PIL image coming from the Gradio upload widget.
        question: Optional question about the image; blank or None falls
            back to a generic description prompt.

    Returns:
        The model's generated text for the single input message.
    """
    pipe = load_model()
    # Single prompt selection instead of duplicating the whole message
    # structure in two branches.
    prompt = question.strip() if question and question.strip() else DEFAULT_PROMPT
    messages = [
        {
            "role": "user",
            "content": [
                # The transformers chat format expects in-memory PIL images
                # under the "image" key; "url" is reserved for remote URLs,
                # so passing a PIL object there fails.
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    result = pipe(text=messages, max_new_tokens=150)
    return result[0]["generated_text"]


def process_example_url(url):
    """Download an example image from *url* and return it as a PIL image.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the download exceeds the timeout.
    """
    # Timeout + status check so a dead URL fails fast instead of hanging
    # or handing a partial/error body to PIL.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


# --- UI definition -------------------------------------------------------
with gr.Blocks(title="Image Captioning App") as demo:
    gr.Markdown("# Image Captioning with LLaVA")
    gr.Markdown("Upload an image and optionally ask a specific question about it.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            question_input = gr.Textbox(
                label="Question (optional)",
                placeholder="Ask a specific question about the image or leave blank for general description",
            )
            caption_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(label="Generated Caption", lines=7)

    # Example rows: [image URL, question]. Gradio downloads URL strings
    # into the image component when an example is selected.
    example_images = [
        [
            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
            "What does the label 15 represent? \n(1) lava (2) core (3) tunnel (4) ash cloud",
        ],
        [
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1920px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
            "",
        ],
    ]
    # NOTE(fix): the original passed fn=process_example_url with
    # cache_examples=True, but that function takes a single URL while the
    # example rows feed two inputs, and no outputs were declared — caching
    # crashed at app startup. Examples now simply populate the inputs; the
    # user clicks "Generate Caption" to run the model.
    gr.Examples(
        examples=example_images,
        inputs=[image_input, question_input],
    )

    caption_button.click(
        fn=caption_image,
        inputs=[image_input, question_input],
        outputs=caption_output,
    )

    gr.Markdown("### How to use:")
    gr.Markdown("1. Upload an image by clicking the upload box or drag-and-drop")
    gr.Markdown("2. Optionally type a specific question about the image")
    gr.Markdown("3. Click 'Generate Caption' to get the result")
    gr.Markdown("4. Try the examples below to see how it works")


# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()