|
|
|
|
|
import gradio as gr |
|
|
from transformers import BlipProcessor, BlipForConditionalGeneration |
|
|
from gtts import gTTS |
|
|
import io |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the pretrained BLIP captioning pipeline once at import time so every
# request reuses the same weights instead of reloading them per call.
# The processor handles image preprocessing and token decoding.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")


# The conditional-generation model produces caption token ids from pixel inputs.
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import tempfile |
|
|
import pyttsx3 |
|
|
|
|
|
def text_to_audio_file(text):
    """Synthesize *text* to speech and return the path of the audio file.

    Uses the offline pyttsx3 engine. The file is created with
    ``delete=False`` so it outlives this function; the caller (or the OS
    temp-dir cleanup) is responsible for removing it.
    """
    # Reserve a temp path; the with-block closes the handle immediately so
    # pyttsx3 can write to the path on all platforms.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as handle:
        audio_path = handle.name

    # NOTE(review): pyttsx3 backends typically emit WAV/AIFF data regardless
    # of the .mp3 suffix — confirm downstream players accept the file.
    tts_engine = pyttsx3.init()
    tts_engine.save_to_file(text, audio_path)
    tts_engine.runAndWait()

    return audio_path
|
|
|
|
|
def generate_caption_from_image(model, processor, image):
    """Run the BLIP captioning model on *image* and return the caption string.

    Args:
        model: a loaded ``BlipForConditionalGeneration`` instance.
        processor: the matching ``BlipProcessor`` for pre/post-processing.
        image: the input image (any format the processor accepts).
    """
    # Preprocess the image into model-ready PyTorch tensors.
    encoded = processor(images=image, return_tensors="pt")
    # Generate caption token ids, then decode the first sequence to text.
    token_ids = model.generate(**encoded)
    return processor.decode(token_ids[0], skip_special_tokens=True)
|
|
|
|
|
|
|
|
|
|
|
def generate_caption_tts(image):
    """Caption *image* using the module-level BLIP model and return the text."""
    return generate_caption_from_image(model, processor, image)
|
|
|
|
|
|
|
|
|
|
|
def _caption_and_narrate(image):
    """Gradio handler: caption the image and synthesize the caption to speech.

    Returns a (caption_text, audio_file_path) pair matching the two output
    components declared below.
    """
    caption = generate_caption_tts(image)
    return caption, text_to_audio_file(caption)


# Fix: the description promised "a caption and audio description" but the UI
# only emitted text — text_to_audio_file was never called. Wire the existing
# TTS helper in and expose a matching Audio output component.
interface = gr.Interface(
    fn=_caption_and_narrate,
    inputs=gr.Image(type="numpy"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Audio Description"),
    ],
    title="Image Captioning for Visually Impaired",
    description="Upload an image, get a caption and audio description.",
)


# Start the local Gradio server (blocks until shut down).
interface.launch()
|
|
|
|
|
|
|
|
|