# Hugging Face Spaces page header (scrape artifact; Space status: Sleeping).
"""
Image to Voice - Hugging Face Space app.

This script turns an input image into a spoken audio description by:

1. Using a Hugging Face `image-to-text` pipeline to describe the image.
2. Converting that description to speech with Supertonic TTS.

To use this on Hugging Face Spaces:

- Name this file `app.py`, or set it as the main file in your Space settings.
- Add a `requirements.txt` listing the Python packages.
"""
import tempfile

import gradio as gr
from supertonic import TTS  # Text-to-speech engine from Supertonic
from transformers import pipeline  # Hugging Face pipeline helper
| # Set up the image captioning model from Hugging Face | |
| image_to_text = pipeline("image-to-text") # Model for changing images to text | |
| # Set up the text-to-speech model | |
| tts = TTS(auto_download=True) # Gets the Text to Speech Model and assigns it to tts | |
| # Define some example voice styles. | |
| # You can adjust this list based on the voices available in your Supertonic install. | |
| VOICE_STYLES = [ | |
| "M1", | |
| "M2", | |
| "M3", | |
| "M4", | |
| "M5", | |
| "F1", | |
| "F2", | |
| "F3", | |
| ] | |
| def image_to_voice(image, voice_name): | |
| """ | |
| Take an image, generate a caption, and then synthesize that caption as speech. | |
| Parameters | |
| ---------- | |
| image : PIL.Image.Image | |
| Image provided by the user (Gradio passes a PIL image). | |
| voice_name : str | |
| Name of the voice style to use (selected from the dropdown). | |
| Returns | |
| ------- | |
| audio_path : str | |
| File path to the generated WAV audio. | |
| caption : str | |
| Text description generated from the image. | |
| """ | |
| # Get text description from the image | |
| result = image_to_text(image) | |
| caption = result[0]["generated_text"] | |
| # Generate audio from the caption | |
| style = tts.get_voice_style(voice_name=voice_name) | |
| wav, duration = tts.synthesize(caption, voice_style=style) | |
| # Save the audio to a file that Gradio can serve | |
| output_path = "output.wav" | |
| tts.save_audio(wav, output_path) | |
| return output_path, caption | |
| # Define the Gradio interface | |
| demo = gr.Interface( | |
| fn=image_to_voice, | |
| inputs=[ | |
| gr.Image(type="pil", label="Upload an image"), | |
| gr.Dropdown( | |
| choices=VOICE_STYLES, | |
| value="M5", | |
| label="Voice style", | |
| ), | |
| ], | |
| outputs=[ | |
| gr.Audio(type="filepath", label="Generated speech"), | |
| gr.Textbox(label="Generated description"), | |
| ], | |
| title="Image to Voice", | |
| description="Upload an image to generate a text description and spoken audio.", | |
| ) | |
| if __name__ == "__main__": | |
| # For local testing; on Hugging Face Spaces this will be handled automatically. | |
| demo.launch() | |