# ITVVideo / app.py
# Uploaded by jonloporto — commit 8f4154f (verified)
"""
Image to Voice - Hugging Face Space app
This script turns an input image into a spoken audio description by:
1. Using a Hugging Face `image-to-text` pipeline to describe the image.
2. Converting that description to speech with Supertonic TTS.
To use this on Hugging Face Spaces:
- Name this file `app.py`, or set it as the main file in your Space settings.
- Add a `requirements.txt` listing the Python packages.
"""
import gradio as gr
from supertonic import TTS  # Supertonic text-to-speech engine
from transformers import pipeline  # Hugging Face pipeline factory

# Set up the image captioning model from Hugging Face.
# NOTE(review): no model id is pinned, so transformers falls back to its
# default image-to-text checkpoint — pin an explicit model for reproducibility.
image_to_text = pipeline("image-to-text")  # callable: PIL image -> caption dicts
# Set up the text-to-speech model; auto_download fetches model weights
# on first run (e.g. on a fresh Space container).
tts = TTS(auto_download=True)
# Define some example voice styles.
# You can adjust this list based on the voices available in your Supertonic install.
# Voice styles offered in the dropdown: five male (M1-M5) and three
# female (F1-F3) presets. Adjust to match the voices available in your
# Supertonic install.
VOICE_STYLES = [f"M{i}" for i in range(1, 6)] + [f"F{i}" for i in range(1, 4)]
def image_to_voice(image, voice_name):
    """
    Take an image, generate a caption, and then synthesize that caption as speech.

    Parameters
    ----------
    image : PIL.Image.Image
        Image provided by the user (Gradio passes a PIL image, or None if
        nothing was uploaded).
    voice_name : str
        Name of the voice style to use (selected from the dropdown).

    Returns
    -------
    audio_path : str
        File path to the generated WAV audio.
    caption : str
        Text description generated from the image.

    Raises
    ------
    gr.Error
        If no image was provided.
    """
    import tempfile

    # Gradio passes None when the user submits without an image; fail with
    # a clear UI message instead of an opaque pipeline traceback.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Get text description from the image
    result = image_to_text(image)
    caption = result[0]["generated_text"]

    # Generate audio from the caption
    style = tts.get_voice_style(voice_name=voice_name)
    wav, duration = tts.synthesize(caption, voice_style=style)

    # Save to a unique temp file rather than a fixed "output.wav": on a
    # shared Space, concurrent requests would otherwise overwrite each
    # other's audio before Gradio serves it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    tts.save_audio(wav, output_path)
    return output_path, caption
# Define the Gradio interface
demo = gr.Interface(
fn=image_to_voice,
inputs=[
gr.Image(type="pil", label="Upload an image"),
gr.Dropdown(
choices=VOICE_STYLES,
value="M5",
label="Voice style",
),
],
outputs=[
gr.Audio(type="filepath", label="Generated speech"),
gr.Textbox(label="Generated description"),
],
title="Image to Voice",
description="Upload an image to generate a text description and spoken audio.",
)
if __name__ == "__main__":
    # For local testing; on Hugging Face Spaces this will be handled automatically.
    # Launches the Gradio server on the default host/port.
    demo.launch()