""" Image to Voice - Hugging Face Space app This script turns an input image into a spoken audio description by: 1. Using a Hugging Face `image-to-text` pipeline to describe the image. 2. Converting that description to speech with Supertonic TTS. To use this on Hugging Face Spaces: - Name this file `app.py`, or set it as the main file in your Space settings. - Add a `requirements.txt` listing the Python packages. """ import gradio as gr from supertonic import TTS # Getting Text to Speech from Supertonic from transformers import pipeline # Getting Pipeline which lets us use AI easily # Set up the image captioning model from Hugging Face image_to_text = pipeline("image-to-text") # Model for changing images to text # Set up the text-to-speech model tts = TTS(auto_download=True) # Gets the Text to Speech Model and assigns it to tts # Define some example voice styles. # You can adjust this list based on the voices available in your Supertonic install. VOICE_STYLES = [ "M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", ] def image_to_voice(image, voice_name): """ Take an image, generate a caption, and then synthesize that caption as speech. Parameters ---------- image : PIL.Image.Image Image provided by the user (Gradio passes a PIL image). voice_name : str Name of the voice style to use (selected from the dropdown). Returns ------- audio_path : str File path to the generated WAV audio. caption : str Text description generated from the image. """ # Get text description from the image result = image_to_text(image) caption = result[0]["generated_text"] # Generate audio from the caption style = tts.get_voice_style(voice_name=voice_name) wav, duration = tts.synthesize(caption, voice_style=style) # Save the audio to a file that Gradio can serve output_path = "output.wav" tts.save_audio(wav, output_path) return output_path, caption # Define the Gradio interface demo = gr.Interface( fn=image_to_voice, inputs=[ gr.Image(type="pil", label="Upload an image"), gr.Dropdown( choices=VOICE_STYLES, value="M5", label="Voice style", ), ], outputs=[ gr.Audio(type="filepath", label="Generated speech"), gr.Textbox(label="Generated description"), ], title="Image to Voice", description="Upload an image to generate a text description and spoken audio.", ) if __name__ == "__main__": # For local testing; on Hugging Face Spaces this will be handled automatically. demo.launch()