import os
import subprocess
import sys

# Force the OpenMP thread count to 4. This must run before the heavy imports
# further down (numpy, torch) -- presumably because they read the variable
# when they are first loaded (TODO confirm).
os.environ.update(OMP_NUM_THREADS="4")

# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """
    Install the development build of transformers once per container.

    A marker file under /tmp records a successful install; when it is already
    present the function does nothing. Otherwise pip force-reinstalls
    transformers from its GitHub repository and the marker is written
    afterwards, so a failed install is retried on the next start.

    This is best-effort: any exception is printed and swallowed so that the
    application can still try to start. Always returns None.
    """
    marker_path = '/tmp/deps_installed'
    try:
        # Only do the (slow) install when no earlier run has completed it
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            pip_command = [
                sys.executable, "-m", "pip", "install",
                "--force-reinstall", "--no-cache-dir",
                "git+https://github.com/huggingface/transformers.git",
            ]
            subprocess.check_call(pip_command)

            # Reached only when pip exited successfully
            with open(marker_path, 'w') as marker_file:
                marker_file.write('done')
    except Exception as e:
        print(f"Dependencies setup error: {e}")

# Run the dependency setup at import time. The call is deliberately placed
# before the third-party imports below -- presumably so that they pick up the
# freshly installed transformers build (TODO confirm).
setup_dependencies()

import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch

# Hugging Face access token; None when HF_TOKEN is not set in the environment
token_ = os.getenv('HF_TOKEN')

# Voice name -> model configuration ('base' uses the Config defaults)
models_configs = dict(
    base=Config(),
    female=Config(model_name='nineninesix/kani-tts-450m-0.2-ft'),
    male=Config(model_name='nineninesix/kani-tts-450m-0.1-ft'),
)

# A single audio player is shared by all models; every model is instantiated
# once here, at import time, and kept in the module-level `models` dict.
player = NemoAudioPlayer(Config())
models = {}
for model_name in models_configs:
    config = models_configs[model_name]
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")


@spaces.GPU
def generate_speech_gpu(text, model_choice, t, top_p, rp, max_tok):
    """
    Generate speech from text using the selected model on GPU

    Args:
        text: Text to synthesise. None, empty or whitespace-only input is
            rejected with a user-facing message.
        model_choice: Key into the module-level ``models`` dict.
        t, top_p, rp, max_tok: Forwarded unchanged to the model's
            ``run_model``; the UI wires them to the Temp, Top P,
            Repetition Penalty and Max Tokens sliders.

    Returns:
        A ``(audio_value, message)`` pair. On success ``audio_value`` is
        ``(sample_rate, audio)`` and ``message`` is the time report returned
        by ``run_model``. On invalid input or on any error during generation
        ``audio_value`` is None and ``message`` describes the problem.
    """
    # Gradio's Textbox can hand over None (e.g. for API calls), so check for
    # it before calling .strip() -- otherwise an AttributeError would escape
    # instead of the friendly message below.
    if text is None or not text.strip():
        return None, "Please enter text for speech generation."

    if not model_choice:
        return None, "Please select a model."

    try:
        # Logged for diagnostics only; nothing below depends on `device`
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # An unknown key raises KeyError, which is reported by the handler below
        selected_model = models[model_choice]

        print(f"Generating speech with {model_choice}...")
        # run_model returns three values; the middle one is not used here
        audio, _unused, time_report = selected_model.run_model(text, t, top_p, rp, max_tok)

        # NOTE(review): hard-coded; presumably matches the player's output
        # sample rate -- confirm against the Config used by the player.
        sample_rate = 22050
        print("Speech generation completed!")

        return (sample_rate, audio), time_report

    except Exception as e:
        # Top-level UI boundary: report the failure to the user, don't crash
        print(f"Error during generation: {e}")
        return None, f"❌ Error during generation: {e}"

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Default(), title="😻 KaniTTS - Text to Speech") as demo:
    # Page header
    gr.Markdown("# 😻 KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate emotional speech")

    with gr.Row():
        # Left column: model choice, prompt, sampling settings and run button
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                label="Selected Model",
                info="Base generates random voices",
                choices=list(models_configs),
                value=list(models_configs)[0],
            )

            text_input = gr.Textbox(
                label="Text", placeholder="Enter your text ...", lines=3, max_lines=10
            )

            # Sampling settings, collapsed by default
            with gr.Accordion("Settings", open=False):
                temp = gr.Slider(label="Temp", minimum=0.1, maximum=1.5, value=0.6, step=0.05)
                top_p = gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.95, step=0.05)
                rp = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
                max_tok = gr.Slider(label="Max Tokens", minimum=100, maximum=2000, value=1200, step=100)

            generate_btn = gr.Button("Run", variant="primary", size="lg")

        # Right column: generated audio and the time report
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

            time_report_output = gr.Textbox(
                label="Time Report",
                value="Ready to generate speech",
                lines=3,
                interactive=False,
            )

    # Wire the run button to the generation function
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
        outputs=[audio_output, time_report_output],
    )

    with gr.Row():
        # Every example row uses the same sampling settings as the slider
        # defaults; only the text and the voice differ.
        examples = [
            [prompt, voice, 0.6, 0.95, 1.1, 1200]
            for prompt, voice in (
                ("Anyway, um, so, um, tell me, tell me all about her. I mean, what's she like? Is she really, you know, pretty?", "male"),
                ("No, that does not make you a failure. No, sweetie, no. It just, uh, it just means that you're having a tough time...", "male"),
                ("I-- Oh, I am such an idiot sometimes. I'm so sorry. Um, I-I don't know where my head's at.", "male"),
                ("Got it. $300,000. I can definitely help you get a very good price for your property by selecting a realtor.", "female"),
                ("Holy fu- Oh my God! Don't you understand how dangerous it is, huh?", "male"),
                ("You make my days brighter, and my wildest dreams feel like reality. How do you do that?", "female"),
                ("Great, and just a couple quick questions so we can match you with the right buyer. Is your home address still 330 East Charleston Road?", "female"),
                ("Oh, yeah. I mean did you want to get a quick snack together or maybe something before you go?", "female"),
            )
        ]

        gr.Examples(
            examples=examples,
            inputs=[text_input, model_dropdown, temp, top_p, rp, max_tok],
            fn=generate_speech_gpu,
            outputs=[audio_output, time_report_output],
            cache_examples=True,
        )

if __name__ == "__main__":
    # Script entry point: serve the UI on all network interfaces, port 7860,
    # with Gradio's error reporting switched on.
    demo.launch(show_error=True, server_port=7860, server_name="0.0.0.0")