# -*- coding: utf-8 -*-
"""
Image to Voice - Hugging Face Spaces
Converts images to text and then to speech
"""
import tempfile

import gradio as gr
from supertonic import TTS
from transformers import pipeline

# Initialize the image-to-text captioning pipeline once at startup.
image_to_text = pipeline("image-to-text")

# TTS engine is loaded lazily on first request (see get_tts()).
tts = None


def get_tts():
    """Lazily load and cache the TTS engine to avoid loading on startup."""
    global tts
    if tts is None:
        tts = TTS(auto_download=True)
    return tts


def image_to_voice(image):
    """
    Convert an image to a text caption and then synthesize it to speech.

    Args:
        image: PIL Image (or None) from the Gradio image component.

    Returns:
        tuple: (audio_file_path or None, text description or error message)
    """
    if image is None:
        return None, "Please upload an image."

    try:
        # Caption the image; the pipeline returns a list of result dicts.
        result = image_to_text(image)
        if not result:
            # Guard: an empty result would raise IndexError below.
            return None, "Error: the model produced no description."
        text = result[0]["generated_text"]

        # Synthesize the caption to speech (duration is unused here).
        tts_model = get_tts()
        style = tts_model.get_voice_style(voice_name="M5")
        wav, _duration = tts_model.synthesize(text, voice_style=style)

        # Write to a unique temp file so concurrent requests don't
        # clobber each other's output (a fixed "output.wav" would).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        tts_model.save_audio(wav, output_path)

        return output_path, text
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return None, f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# 🖼️ Image to Voice Converter")
    gr.Markdown("Upload an image and get an audio description of it!")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            text_output = gr.Textbox(label="Image Description", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output],
    )

    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images (add your own examples)",
    )

if __name__ == "__main__":
    demo.launch()