File size: 2,668 Bytes
8f4154f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Image to Voice - Hugging Face Space app

This script turns an input image into a spoken audio description by:
1. Using a Hugging Face `image-to-text` pipeline to describe the image.
2. Converting that description to speech with Supertonic TTS.

To use this on Hugging Face Spaces:
- Name this file `app.py`, or set it as the main file in your Space settings.
- Add a `requirements.txt` listing the Python packages.
"""

import tempfile

import gradio as gr
from supertonic import TTS  # Getting Text to Speech from Supertonic
from transformers import pipeline  # Getting Pipeline which lets us use AI easily


# Set up the image captioning model from Hugging Face.
# Pin the checkpoint explicitly: `pipeline("image-to-text")` with no `model=`
# falls back to a library-chosen default that logs a warning and can change
# between transformers releases. This checkpoint is the task's current
# default, so behavior is unchanged today but stable going forward.
image_to_text = pipeline(
    "image-to-text",
    model="ydshieh/vit-gpt2-coco-en",
)

# Set up the text-to-speech model (weights are downloaded on first run).
tts = TTS(auto_download=True)

# Voice styles exposed in the UI dropdown.
# Adjust this list based on the voices available in your Supertonic install.
VOICE_STYLES = [
    "M1",
    "M2",
    "M3",
    "M4",
    "M5",
    "F1",
    "F2",
    "F3",
]


def image_to_voice(image, voice_name):
    """
    Take an image, generate a caption, and then synthesize that caption as speech.

    Parameters
    ----------
    image : PIL.Image.Image
        Image provided by the user (Gradio passes a PIL image).
    voice_name : str
        Name of the voice style to use (selected from the dropdown).

    Returns
    -------
    audio_path : str
        File path to the generated WAV audio.
    caption : str
        Text description generated from the image.
    """
    # Get text description from the image; the pipeline returns a list of
    # dicts with a "generated_text" key.
    result = image_to_text(image)
    caption = result[0]["generated_text"]

    # Generate audio from the caption (duration is not needed here).
    style = tts.get_voice_style(voice_name=voice_name)
    wav, _ = tts.synthesize(caption, voice_style=style)

    # Write to a unique temp file rather than a fixed "output.wav":
    # concurrent requests on a shared Space would otherwise overwrite each
    # other's audio before Gradio finishes serving it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    tts.save_audio(wav, output_path)

    return output_path, caption


# Build the Gradio UI: an image plus a voice choice in, audio plus the
# generated caption out.
_image_input = gr.Image(type="pil", label="Upload an image")
_voice_input = gr.Dropdown(
    choices=VOICE_STYLES,
    value="M5",
    label="Voice style",
)
_audio_output = gr.Audio(type="filepath", label="Generated speech")
_caption_output = gr.Textbox(label="Generated description")

demo = gr.Interface(
    fn=image_to_voice,
    inputs=[_image_input, _voice_input],
    outputs=[_audio_output, _caption_output],
    title="Image to Voice",
    description="Upload an image to generate a text description and spoken audio.",
)


if __name__ == "__main__":
    # For local testing; on Hugging Face Spaces the app is launched
    # automatically, so this guard is only hit when run directly.
    demo.launch()