"""Gradio demo that captions an uploaded image with the BLIP base model."""

import base64
import io
import os

import gradio as gr
import IPython.display
import requests
from PIL import Image
from transformers import pipeline, AutoTokenizer

# Image-to-text pipeline, created once at import time and reused per request.
get_completion = pipeline(
    "image-to-text", model="Salesforce/blip-image-captioning-base"
)


def generate_caption(base64_image: str) -> dict:
    """Caption a base64-encoded image with the BLIP pipeline.

    Args:
        base64_image: Base64 text of an encoded image file (e.g. PNG bytes).

    Returns:
        A dictionary with a ``'generated_text'`` entry. If the pipeline
        returns a plain string it is wrapped in such a dictionary; if it
        returns a non-empty list, the first element is returned as-is;
        otherwise ``{'generated_text': None}`` is returned.
    """
    # Decode base64 string to PIL image
    image_data = base64.b64decode(base64_image)
    image = Image.open(io.BytesIO(image_data))

    # Get caption using the BLIP model
    caption_result = get_completion(image)

    # Ensure a consistent format by always returning a dictionary
    if isinstance(caption_result, str):
        return {'generated_text': caption_result}
    if caption_result and isinstance(caption_result, list):
        return caption_result[0]
    return {'generated_text': None}


def image_to_base64_str(pil_image: Image.Image) -> str:
    """Serialize a PIL image to PNG and return it as base64 text."""
    byte_arr = io.BytesIO()
    pil_image.save(byte_arr, format='PNG')
    # b64encode yields ASCII-only bytes; decode() already returns a str.
    return base64.b64encode(byte_arr.getvalue()).decode('utf-8')


def captioner(image):
    """Gradio callback: return the caption text for an uploaded PIL image.

    Args:
        image: The PIL image supplied by the ``gr.Image`` input, or ``None``
            when the form is submitted without an image.

    Returns:
        The ``'generated_text'`` value produced by ``generate_caption``.

    Raises:
        gr.Error: If no image was provided.
    """
    # Without this guard an empty submission fails inside
    # image_to_base64_str with an opaque AttributeError on None.
    if image is None:
        raise gr.Error("Please upload an image before requesting a caption.")

    base64_image = image_to_base64_str(image)
    result = generate_caption(base64_image)
    print(result)  # Debugging print statement to see the structure of the result

    # Access the 'generated_text' field from the result dictionary
    caption_text = result['generated_text']
    print(caption_text)
    return caption_text


demo = gr.Interface(
    fn=captioner,
    inputs=[gr.Image(label="Upload image", type="pil")],
    outputs=[gr.Textbox(label="Caption")],
    title="Image Captioning with BLIP",
    description="Caption any image using the BLIP model",
    allow_flagging="never",
    examples=["christmas_dog.jpeg", "bird_flight.jpeg", "cow.jpeg"],
)

# share=True and server_port are left out on purpose for Hugging Face Spaces.
demo.launch()