"""Gradio app: image captioning / visual question answering with LLaVA."""

from functools import lru_cache
from io import BytesIO

import gradio as gr
import requests
from PIL import Image
from transformers import pipeline

# Fallback prompt used when the user supplies no question.
DEFAULT_PROMPT = "Describe this image in detail."


@lru_cache(maxsize=1)
def load_model():
    """Build the LLaVA image-text-to-text pipeline once and reuse it.

    Caching matters: without it, the 7B checkpoint is reloaded on every
    caption request, which is prohibitively slow.
    """
    return pipeline("image-text-to-text", model="llava-hf/llava-1.5-7b-hf")


def caption_image(image, question=None):
    """Generate a caption for *image*, or answer *question* about it.

    Args:
        image: PIL image coming from the Gradio upload widget.
        question: Optional question about the image; blank or None falls
            back to a generic description prompt.

    Returns:
        The model's generated text for the single input message.
    """
    pipe = load_model()
    # Single prompt selection instead of duplicating the whole message
    # structure in two branches.
    prompt = question.strip() if question and question.strip() else DEFAULT_PROMPT
    messages = [
        {
            "role": "user",
            "content": [
                # The transformers chat format expects in-memory PIL images
                # under the "image" key; "url" is reserved for remote URLs,
                # so passing a PIL object there fails.
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    result = pipe(text=messages, max_new_tokens=150)
    return result[0]["generated_text"]


def process_example_url(url):
    """Download an example image from *url* and return it as a PIL image.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the download exceeds the timeout.
    """
    # Timeout + status check so a dead URL fails fast instead of hanging
    # or handing a partial/error body to PIL.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return Image.open(BytesIO(response.content))


# --- UI definition -------------------------------------------------------
with gr.Blocks(title="Image Captioning App") as demo:
    gr.Markdown("# Image Captioning with LLaVA")
    gr.Markdown("Upload an image and optionally ask a specific question about it.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            question_input = gr.Textbox(
                label="Question (optional)",
                placeholder="Ask a specific question about the image or leave blank for general description",
            )
            caption_button = gr.Button("Generate Caption")
        with gr.Column():
            caption_output = gr.Textbox(label="Generated Caption", lines=7)

    # Example rows: [image URL, question]. Gradio downloads URL strings
    # into the image component when an example is selected.
    example_images = [
        [
            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
            "What does the label 15 represent? \n(1) lava (2) core (3) tunnel (4) ash cloud",
        ],
        [
            "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/1920px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
            "",
        ],
    ]
    # NOTE(fix): the original passed fn=process_example_url with
    # cache_examples=True, but that function takes a single URL while the
    # example rows feed two inputs, and no outputs were declared — caching
    # crashed at app startup. Examples now simply populate the inputs; the
    # user clicks "Generate Caption" to run the model.
    gr.Examples(
        examples=example_images,
        inputs=[image_input, question_input],
    )

    caption_button.click(
        fn=caption_image,
        inputs=[image_input, question_input],
        outputs=caption_output,
    )

    gr.Markdown("### How to use:")
    gr.Markdown("1. Upload an image by clicking the upload box or drag-and-drop")
    gr.Markdown("2. Optionally type a specific question about the image")
    gr.Markdown("3. Click 'Generate Caption' to get the result")
    gr.Markdown("4. Try the examples below to see how it works")


# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()