# Source: Hugging Face Spaces file view (uploader: jonloporto, commit 3c5d69c, "Upload 3 files")
# -*- coding: utf-8 -*-
"""
Image to Voice - Hugging Face Spaces
Converts images to text and then to speech
"""
import tempfile

import gradio as gr
from supertonic import TTS
from transformers import pipeline
# Initialize the image-to-text (captioning) pipeline once at import time;
# the default model is whatever `transformers` selects for this task.
image_to_text = pipeline("image-to-text")
# Module-level TTS singleton; stays None until get_tts() lazily creates it,
# so the Space starts without paying the TTS download/load cost.
tts = None
def get_tts():
    """Return the shared TTS engine, creating it on the first call.

    Lazily instantiating the engine keeps app startup fast; subsequent
    calls reuse the cached module-level instance.
    """
    global tts
    if tts is not None:
        return tts
    tts = TTS(auto_download=True)
    return tts
def image_to_voice(image):
    """
    Convert an image to a spoken description.

    Args:
        image: PIL Image (or None) supplied by the Gradio input component.

    Returns:
        tuple: (audio_file_path, text_description) on success, or
        (None, error_message) on failure / missing input.
    """
    if image is None:
        return None, "Please upload an image."
    try:
        # Caption the image with the image-to-text pipeline
        result = image_to_text(image)
        text = result[0]['generated_text']
        # Synthesize speech from the caption
        tts_model = get_tts()
        style = tts_model.get_voice_style(voice_name="M5")
        wav, duration = tts_model.synthesize(text, voice_style=style)
        # Write to a unique temp file: a fixed "output.wav" would be
        # clobbered by concurrent requests on a shared Space.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        tts_model.save_audio(wav, output_path)
        return output_path, text
    except Exception as e:
        # App boundary: surface the error in the UI instead of crashing.
        return None, f"Error: {str(e)}"
# Create the Gradio interface: image input on the left, generated audio
# and the caption text on the right.
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# 🖼️ Image to Voice Converter")
    gr.Markdown("Upload an image and get an audio description of it!")
    with gr.Row():
        with gr.Column():
            # Input column: uploaded image plus the trigger button
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            # Output column: synthesized speech (as a file path) and the
            # caption text used to generate it
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            text_output = gr.Textbox(label="Image Description", lines=5)
    # Wire the button to the conversion function
    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output]
    )
    # Examples gallery — currently empty; populate `examples` with image paths
    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images (add your own examples)"
    )

# Launch the web app when run as a script (HF Spaces invokes this)
if __name__ == "__main__":
    demo.launch()