# -*- coding: utf-8 -*-
"""
Image to Voice - Hugging Face Spaces
Converts images to text and then to speech
"""
import tempfile

import gradio as gr
from supertonic import TTS
from transformers import pipeline

# Initialize the image-to-text captioning pipeline once at startup.
image_to_text = pipeline("image-to-text")

# TTS engine is loaded lazily on first request (see get_tts()).
tts = None


def get_tts():
    """Lazily load and cache the TTS engine to avoid loading on startup."""
    global tts
    if tts is None:
        tts = TTS(auto_download=True)
    return tts


def image_to_voice(image):
    """
    Convert an image to a text caption and then synthesize it to speech.

    Args:
        image: PIL Image (or None) from the Gradio image component.

    Returns:
        tuple: (audio_file_path or None, text description or error message)
    """
    if image is None:
        return None, "Please upload an image."

    try:
        # Caption the image; the pipeline returns a list of result dicts.
        result = image_to_text(image)
        if not result:
            # Guard: an empty result would raise IndexError below.
            return None, "Error: the model produced no description."
        text = result[0]["generated_text"]

        # Synthesize the caption to speech (duration is unused here).
        tts_model = get_tts()
        style = tts_model.get_voice_style(voice_name="M5")
        wav, _duration = tts_model.synthesize(text, voice_style=style)

        # Write to a unique temp file so concurrent requests don't
        # clobber each other's output (a fixed "output.wav" would).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            output_path = f.name
        tts_model.save_audio(wav, output_path)

        return output_path, text
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return None, f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Image to Voice") as demo:
    gr.Markdown("# 🖼️ Image to Voice Converter")
    gr.Markdown("Upload an image and get an audio description of it!")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload Image")
            generate_btn = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            text_output = gr.Textbox(label="Image Description", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=image_input,
        outputs=[audio_output, text_output],
    )

    gr.Examples(
        examples=[],
        inputs=image_input,
        label="Example Images (add your own examples)",
    )

if __name__ == "__main__":
    demo.launch()