# torch: used here only to detect whether a CUDA GPU is available
import torch
# gradio: builds the web interface
import gradio as gr
# PIL.Image: type hint for the image input
from PIL import Image
# numpy: to flatten the raw audio array before saving
import numpy as np
# scipy wavfile: writes the generated speech to a WAV file
import scipy.io.wavfile as wavfile
# transformers pipeline: high-level helper for pretrained models
from transformers import pipeline

# device index for the pipelines: 0 selects the first GPU, -1 the CPU
device = 0 if torch.cuda.is_available() else -1

# Text-to-speech model (English)
narrator = pipeline(
    "text-to-speech",
    model="facebook/mms-tts-eng",
    device=device,
)
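
# Output format of the narrator (hedged: exact shapes can vary across
# transformers versions). For facebook/mms-tts-eng it typically looks like:
#   out = narrator("Hello world")
#   out["audio"]          # float32 numpy array, often shape (1, num_samples)
#   out["sampling_rate"]  # 16000 for this model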

# Load the pretrained image captioning model
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=device,
)
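
# The captioning pipeline normally returns a list with one dict per image,
# e.g. (illustrative output; the actual text depends on the image):
#   caption_image(img)  # -> [{"generated_text": "a dog sitting on the grass"}]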

# Generate a narrated WAV file from text
def generate_audio(text):
    # Run the text-to-speech pipeline; it returns a dict with
    # "audio" (a float32 numpy array) and "sampling_rate"
    narrated_text = narrator(text)
    audio = narrated_text["audio"]
    # The audio may come back as a list of arrays, or as a 2-D array of
    # shape (1, num_samples); flatten it to 1-D so scipy writes a mono WAV
    if isinstance(audio, list):
        audio = audio[0]
    audio = np.squeeze(np.asarray(audio))
    # Save the audio to a WAV file
    output_path = "output.wav"
    wavfile.write(output_path, rate=narrated_text["sampling_rate"], data=audio)
    # Return the path so Gradio's Audio component can play it
    return output_path
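
# Example usage (a sketch; the text is arbitrary):
#   generate_audio("a dog sitting on the grass")  # -> writes and returns "output.wav"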

# Caption an image, then narrate the caption
def caption_my_image(pil_image: Image.Image):
    # The pipeline accepts a PIL image directly
    result = caption_image(pil_image)
    # The result is usually a list of dicts keyed by "generated_text"
    if isinstance(result, list):
        semantics = result[0]["generated_text"]
    else:
        semantics = result["generated_text"]
    audio = generate_audio(semantics)
    # Return both the caption text and the audio file path
    return semantics, audio
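
# A minimal local smoke test, assuming an image file exists at this
# hypothetical path (not part of the app):
#   text, wav = caption_my_image(Image.open("example.jpg"))
#   print(text, wav)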

# Build the Gradio interface: image in, caption text and audio out
demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Select Image", type="pil")],
    outputs=[
        gr.Textbox(label="Image Caption"),
        gr.Audio(label="Image Caption Audio"),
    ],
    title="Image Captioning with Audio Output",
    description="Upload an image: the app generates a caption with AI and reads it aloud.",
)

demo.launch()
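
# demo.launch() serves the app locally by default. On a remote machine or in
# a notebook you could pass share=True for a temporary public link -- an
# optional Gradio flag, not required here:
#   demo.launch(share=True)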