# app.py
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import tempfile
from PIL import Image

# -------------------------------
# Load BLIP-base model (lighter version)
# -------------------------------
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# -------------------------------
# Generate caption function
# -------------------------------
def generate_caption_fn(image):
    # Convert the uploaded NumPy array to a PIL image if needed
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    # BLIP preprocessing
    inputs = processor(images=image, return_tensors="pt")

    # Generate caption
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

# -------------------------------
# Convert text to speech using gTTS
# -------------------------------
def text_to_speech(caption):
    tts = gTTS(text=caption, lang="en")
    # Write the MP3 to a temporary file and return its path; Gradio's
    # Audio component accepts a filepath, whereas a raw BytesIO object
    # (with the removed type="file" option) is not supported.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
        tts.write_to_fp(fp)
        return fp.name

# -------------------------------
# Gradio interface: Caption + Audio
# -------------------------------
def generate_caption_tts(image):
    caption = generate_caption_fn(image)
    audio_path = text_to_speech(caption)
    return caption, audio_path

interface = gr.Interface(
    fn=generate_caption_tts,
    inputs=gr.Image(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(type="filepath", label="TTS Audio"),
    ],
    title="Blind Assistant: Image Captioning",
    description="Upload an image and get a descriptive caption + speech.",
)

interface.launch()
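
# -------------------------------
# Optional local smoke test -- a minimal sketch, not part of the app.
# It assumes a sample image exists at "example.jpg" (a hypothetical
# path). interface.launch() above blocks, so to run this, comment out
# the launch() call first, then uncomment the lines below.
# -------------------------------
# caption, audio_path = generate_caption_tts(Image.open("example.jpg"))
# print("Caption:", caption)
# print("Audio written to:", audio_path)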