import os
import tempfile

import gradio as gr
import torch
from sopro import SoproTTS

# Load the model a single time at import so every request reuses the same
# instance instead of paying the load cost again.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading Sopro TTS on device: {device}")
tts = SoproTTS.from_pretrained(
    "samuel-vitorino/sopro",
    device=device,
)

def generate_speech(text, ref_audio_path, temperature, top_p, style_strength):
    """Synthesize speech for *text* using the voice in *ref_audio_path*.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        ref_audio_path: Filesystem path of the reference audio clip.
        temperature: Forwarded unchanged to ``tts.synthesize``.
        top_p: Forwarded unchanged to ``tts.synthesize``.
        style_strength: Forwarded unchanged to ``tts.synthesize``.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio.

    Raises:
        gr.Error: If an input is missing, or if synthesis or saving fails
            (the underlying exception is chained as ``__cause__``).
    """
    # Reject empty and whitespace-only text alike: neither has anything to say.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    out_path = None
    try:
        # Generate the audio wave
        wav = tts.synthesize(
            text=text,
            ref_audio_path=ref_audio_path,
            temperature=temperature,
            top_p=top_p,
            style_strength=style_strength,
        )

        # Reserve a unique output path, then close the handle right away so
        # no file descriptor leaks per request and save_wav can reopen the
        # path on platforms that forbid opening an already-open file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
            out_path = temp_out.name
        tts.save_wav(out_path, wav)

        return out_path

    except Exception as e:
        # Best-effort cleanup: do not leave a partial/empty file behind.
        if out_path is not None:
            try:
                os.remove(out_path)
            except OSError:
                pass
        raise gr.Error(f"Error during synthesis: {e}") from e

# Assemble the web UI: a two-column row (inputs, then output) plus notes
with gr.Blocks(title="Sopro TTS - Voice Cloning") as demo:
    gr.Markdown("# 🌬️ Sopro TTS - Zero-Shot Voice Cloning")
    gr.Markdown(
        "A lightweight (135M parameter) text-to-speech model with zero-shot "
        "voice cloning by "
        "[Samuel Vitorino](https://github.com/samuel-vitorino/sopro). "
        "Upload a 3-12 second audio clip to clone a voice!"
    )

    with gr.Row():
        # First column: everything the user provides
        with gr.Column():
            text_input = gr.Textbox(
                lines=4,
                label="Text to Synthesize",
                placeholder=(
                    "Enter text here... (Prefer words over abbreviations/symbols, "
                    "e.g., '1 plus 2' instead of '1 + 2')"
                ),
            )

            # type="filepath" hands the callback a path rather than raw samples
            ref_audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio (3 to 12 seconds recommended)",
            )

            # Sampling knobs are tucked away in a collapsed accordion
            with gr.Accordion("Advanced Parameters", open=False):
                temp_slider = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=2.0,
                    step=0.1,
                    value=0.8,
                )
                top_p_slider = gr.Slider(
                    label="Top P",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                )
                style_slider = gr.Slider(
                    label="Style Strength",
                    minimum=0.0,
                    maximum=3.0,
                    step=0.1,
                    value=1.2,
                    info=(
                        "Controls the FiLM strength; increasing it can improve "
                        "or reduce voice similarity."
                    ),
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Second column: the synthesized result
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", autoplay=False)

    # Wire the button: input order must match generate_speech's parameters
    generate_btn.click(
        inputs=[
            text_input,
            ref_audio_input,
            temp_slider,
            top_p_slider,
            style_slider,
        ],
        outputs=[audio_output],
        fn=generate_speech,
    )

    gr.Markdown(
        "### ⚠️ Disclaimers\n"
        "- Sopro can be inconsistent. If the output sounds glitchy, try tweaking the Temperature and Style Strength.\n"
        "- Voice cloning quality is highly dependent on the microphone quality and ambient noise of the reference audio.\n"
        "- Generation length is currently capped at ~32 seconds to prevent hallucinations."
    )

# Launch the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch()