import os
import sys
import tempfile
import torch
import gradio as gr
from datetime import datetime
import numpy as np

# Pick a WAV-writing backend: scipy is preferred, soundfile is the fallback.
# Both flags are bound up front so that every later check can read either one
# without risking a NameError, whichever import succeeds.
USE_SCIPY = False
USE_SOUNDFILE = False
try:
    import scipy.io.wavfile as wavfile
    USE_SCIPY = True
except ImportError:
    try:
        import soundfile as sf
        USE_SOUNDFILE = True
    except ImportError:
        # Neither backend is installed. Startup continues; the missing
        # backend is reported when audio actually has to be written.
        pass

# Configuration
# Path of the model package file opened with torch.package.PackageImporter at startup.
MODEL_PATH = "v4_indic.pt"
# Voice used when no speaker is passed, and the fallback when an unknown speaker is requested.
DEFAULT_SPEAKER = "hindi_female"
# Default sample rate in Hz; also the preselected value of the sample-rate dropdown.
DEFAULT_SAMPLE_RATE = 48000

# Startup banner with the current timestamp from datetime.now().
print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

# Load the model
# The model is loaded once, at import time, into the module-level name `m`,
# which _call_apply_tts uses for every synthesis request.
# NOTE(review): load_pickle unpickles objects stored in the package file, so
# MODEL_PATH should only ever point at a model file from a trusted source.
print(f"Loading model from {MODEL_PATH}")
m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Inspect apply_tts signature
# Printed for diagnostics only; in the code visible here `sig` is not used to
# decide how apply_tts is called.
import inspect
sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")

# Available speakers, named "<language>_<gender>" and listed language by
# language with the female voice first. Manipuri is listed with a female
# voice only.
AVAILABLE_SPEAKERS = [
    f"{language}_{gender}"
    for language, genders in (
        ("bengali", ("female", "male")),
        ("gujarati", ("female", "male")),
        ("hindi", ("female", "male")),
        ("kannada", ("female", "male")),
        ("malayalam", ("female", "male")),
        ("manipuri", ("female",)),
        ("rajasthani", ("female", "male")),
        ("tamil", ("female", "male")),
        ("telugu", ("female", "male")),
    )
    for gender in genders
]

def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Call the model's apply_tts, falling back across argument styles.

    The call is attempted up to three times, stopping at the first success:
    with ``ssml_text``, then with ``text``, then with ``text`` and no
    ``sample_rate``.

    Args:
        text: Text to synthesize. Zero-width joiner / non-joiner characters
            and surrounding whitespace are removed before synthesis.
        speaker: Speaker voice. A value not in AVAILABLE_SPEAKERS is replaced
            by DEFAULT_SPEAKER after printing a warning.
        sample_rate: Sample rate passed to the model (omitted in the last
            fallback attempt).

    Returns:
        The value returned by apply_tts, or its first element when the model
        returns a tuple.

    Raises:
        ValueError: If the text is empty after cleaning, or if every attempt
            fails (chained to the last underlying error).
    """
    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Remove zero-width characters BEFORE the emptiness check: input made up
    # only of them (plus whitespace) must be rejected here instead of being
    # handed to the model as an empty string.
    if text is None:
        text = ""
    text = text.replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # (success message, failure message prefix, keyword arguments), in the
    # order the attempts are made.
    # NOTE(review): the last attempt omits sample_rate, so the model presumably
    # uses its own default rate there, which may differ from the requested
    # one. Confirm against the model's apply_tts signature.
    attempts = (
        (
            "Success with ssml_text parameter",
            "ssml_text attempt failed",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with text parameter",
            "text attempt failed",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "Success with minimal parameters",
            "All attempts failed. Last error",
            {"text": text, "speaker": speaker},
        ),
    )

    last_error = None
    for success_msg, failure_msg, kwargs in attempts:
        try:
            res = m.apply_tts(**kwargs)
        except Exception as err:
            # Any failure, whatever its type, moves on to the next fallback.
            print(f"{failure_msg}: {err}")
            last_error = err
        else:
            print(success_msg)
            break
    else:
        raise ValueError(
            f"Text processing failed. The model may not support this text. Error: {last_error}"
        ) from last_error

    # Handle different return types: a tuple carries the audio first.
    return res[0] if isinstance(res, tuple) else res


def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save it to a temporary WAV file.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate, used both for synthesis and as the
            rate written to the WAV file

    Returns:
        Path to generated WAV file. The file is not deleted here; cleaning it
        up is left to the caller.

    Raises:
        RuntimeError: If neither scipy nor soundfile is available.
        ValueError: If the text cannot be synthesized or the model returns
            no audio samples.
    """
    # Fail fast: without a writer the result could not be saved, so neither
    # run the model nor create a temp file.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    audio = _call_apply_tts(text, speaker, sample_rate)

    # Convert to a numpy array; detach() keeps this working even if the
    # tensor happens to be attached to an autograd graph.
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    else:
        audio = np.asarray(audio)

    if audio.size == 0:
        raise ValueError("Model returned empty audio")

    # Ensure audio is 16-bit PCM
    if audio.dtype != np.int16:
        # Scale down by the peak only when samples fall outside [-1, 1];
        # audio already inside that range keeps its original level.
        peak = np.abs(audio).max()
        if peak > 1.0:
            audio = audio / peak
        audio = (audio * 32767).astype(np.int16)

    # Create temporary file; the descriptor is closed immediately because the
    # audio library opens the path itself.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, audio)
        else:
            # Guaranteed by the check at the top: soundfile is available.
            sf.write(path, audio, sample_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    return path


# Longest input, measured after trimming surrounding whitespace, that the
# Gradio handler accepts.
_MAX_TEXT_CHARS = 200


def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function.

    Validates the input text, runs synthesis and converts every failure into
    a gr.Error so it is shown to the user.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate

    Returns:
        Path to generated audio file

    Raises:
        gr.Error: If the text is empty, longer than _MAX_TEXT_CHARS
            characters, or synthesis fails.
    """
    # Surrounding whitespace is dropped before synthesis, so it must not
    # count toward either the emptiness or the length check.
    stripped = text.strip() if text else ""
    if not stripped:
        raise gr.Error("Please enter some text to synthesize")

    # Reject over-long text; exactly _MAX_TEXT_CHARS characters is accepted.
    if len(stripped) > _MAX_TEXT_CHARS:
        raise gr.Error(
            f"Text is too long. Please use shorter text ({_MAX_TEXT_CHARS} characters or fewer)"
        )

    try:
        return synthesize_text_to_wavfile(text, speaker, sample_rate)
    except ValueError as e:
        raise gr.Error(f"Text processing failed: {str(e)}. Try simpler text or a different language.") from e
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {str(e)}") from e


# Create Gradio interface
# Layout: one row with two columns. The left column holds the inputs and the
# submit button, the right column holds the audio output. An examples table
# and the button's click handler are wired up below the row.
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        # Left column: inputs
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results"
            )

            # Choices mirror the speaker list validated in _call_apply_tts.
            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice"
            )

            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)"
            )

            submit_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: output. type="filepath" matches tts_gradio_fn, which
        # returns the path of the generated WAV file.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

    # Examples
    # Each row is [text, speaker, sample rate], in the same order as `inputs`.
    # cache_examples=False: example outputs are not cached.
    gr.Examples(
        examples=[
            ["नमस्ते", "hindi_female", 48000],
            ["आप कैसे हैं", "hindi_male", 48000],
            ["হ্যালো", "bengali_female", 48000],
            ["வணக்கம்", "tamil_female", 48000],
            ["హలో", "telugu_female", 48000],
            ["ಹಲೋ", "kannada_female", 48000],
            ["હેલો", "gujarati_female", 48000],
        ],
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False
    )

    # Clicking the button runs tts_gradio_fn on the three inputs and sends
    # the returned file path to the audio component.
    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output
    )

# Start the Gradio server, with the API documentation enabled, only when this
# file is executed as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_api": True,  # This enables the API documentation
    }
    demo.launch(**launch_options)