# app.py
"""Gradio demo: caption an uploaded image with BLIP and read the caption aloud."""

import io
import tempfile

import gradio as gr
import pyttsx3
from gtts import gTTS
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# -------------------------------
# Load BLIP-base model (lighter version)
# -------------------------------
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)


# -------------------------------
# Generate caption function
# -------------------------------
def generate_caption_from_image(model, processor, image):
    """Return a text caption for *image* produced by the given BLIP model.

    Args:
        model: A loaded ``BlipForConditionalGeneration`` instance.
        processor: The ``BlipProcessor`` matching ``model``.
        image: Either a ``PIL.Image.Image`` or an array accepted by
            ``PIL.Image.fromarray`` (the Gradio input below is configured
            with ``type="numpy"``).

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    if isinstance(image, Image.Image):
        pil_image = image
    else:
        pil_image = Image.fromarray(image)
    # Normalise to 3-channel RGB so grayscale / RGBA uploads are handled
    # the same way as ordinary colour images.
    pil_image = pil_image.convert("RGB")

    inputs = processor(images=pil_image, return_tensors="pt")
    output_ids = model.generate(**inputs)
    return processor.decode(output_ids[0], skip_special_tokens=True).strip()


# -------------------------------
# Convert text to speech using pyttsx3
# -------------------------------
def text_to_audio_file(text):
    """Synthesize *text* with pyttsx3 and return the path of the audio file.

    The file is created with ``delete=False`` so it still exists when the
    path is handed to Gradio; nothing in this module removes it afterwards.

    Args:
        text: The sentence to speak.

    Returns:
        Filesystem path of the temporary file the speech was saved to.
    """
    # Only the unique name is needed; the handle is closed on leaving the
    # ``with`` block so the speech engine can write to that path itself.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
        tmp_path = tmp_file.name

    engine = pyttsx3.init()
    engine.save_to_file(text, tmp_path)
    engine.runAndWait()  # blocks until the queued save command has run
    return tmp_path


# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
def generate_caption_tts(image):
    """Gradio callback: caption *image* and produce spoken audio for it.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(caption, audio_path)`` tuple. ``audio_path`` is ``None`` when
        there is no image or the model produced an empty caption.
    """
    if image is None:
        return "No image provided. Please upload an image.", None

    # Uses the module-level model/processor loaded above.
    caption = generate_caption_from_image(model, processor, image)
    if not caption:
        # Nothing to speak; skip TTS rather than synthesising empty text.
        return caption, None

    audio_file = text_to_audio_file(caption)
    return caption, audio_file


interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(type="filepath", label="TTS Audio"),
    ],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and audio description.",
)

if __name__ == "__main__":
    interface.launch()