File size: 2,344 Bytes
3c5d69c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
"""

Image to Voice - Hugging Face Spaces

Converts images to text and then to speech

"""

import tempfile

import gradio as gr
from supertonic import TTS
from transformers import pipeline

# Image-captioning pipeline, loaded eagerly at import time.
# No checkpoint is pinned, so transformers selects its default
# image-to-text model — pin one explicitly for reproducible captions.
image_to_text = pipeline("image-to-text")

# Shared TTS instance; stays None until get_tts() lazily constructs it,
# keeping app startup fast.
tts = None

def get_tts():
    """Return the shared TTS instance, constructing it on first call.

    Lazy initialization avoids paying the model download/load cost at
    startup; subsequent calls reuse the cached module-level instance.
    """
    global tts
    if tts is not None:
        return tts
    tts = TTS(auto_download=True)
    return tts

def image_to_voice(image):
    """Convert an image to a spoken description.

    Args:
        image: PIL Image (or numpy array) supplied by Gradio, or None
            when the user has not uploaded anything.

    Returns:
        tuple: (audio_file_path, text_description). On failure or missing
        input, the first element is None and the second is a message
        suitable for display in the UI.
    """
    if image is None:
        return None, "Please upload an image."

    try:
        # Caption the image with the image-to-text pipeline.
        result = image_to_text(image)
        text = result[0]['generated_text']

        # Synthesize speech for the caption.
        tts_model = get_tts()
        style = tts_model.get_voice_style(voice_name="M5")
        wav, duration = tts_model.synthesize(text, voice_style=style)

        # Write to a unique temporary file instead of a fixed "output.wav":
        # a shared filename gets clobbered when multiple users generate
        # audio concurrently in a hosted Space.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        tts_model.save_audio(wav, output_path)

        return output_path, text

    except Exception as e:
        # Boundary handler: surface the failure in the UI text box
        # rather than crashing the request.
        return None, f"Error: {str(e)}"

# Create Gradio interface
# Two-column layout: image input + trigger button on the left,
# generated audio + caption text on the right.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# 🖼️ Image to Voice Converter")
    gr.Markdown("Upload an image and get an audio description of it!")
    
    with gr.Row():
        with gr.Column():
            # type="pil" so image_to_voice receives a PIL.Image object.
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Audio", variant="primary")
        
        with gr.Column():
            # type="filepath" matches the file path image_to_voice returns.
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            text_output = gr.Textbox(label="Image Description", lines=5)
    
    # Wire the button: one image in, (audio path, caption text) out.
    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output]
    )
    
    # Example gallery is currently empty — a placeholder for sample images.
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images (add your own examples)"
    )

# Launch only when executed as a script (Spaces runs this module directly).
if __name__ == "__main__":
    demo.launch()