import gradio as gr
import numpy as np
import soundfile as sf
from vieneutts import VieNeuTTS
import tempfile
import os
import torch
import spaces  # Import cho GPU Zero

# Instantiate the TTS model once at import time, with both the backbone and
# the codec placed on the CPU; synthesize_speech moves them to the GPU per
# request when CUDA is available.
print("📦 Đang tải model...")
_TTS_INIT_KWARGS = dict(
    backbone_repo="pnnbao-ump/VieNeu-TTS",
    backbone_device="cpu",  # start on CPU
    codec_repo="neuphonic/neucodec",
    codec_device="cpu",
)
tts = VieNeuTTS(**_TTS_INIT_KWARGS)
print("✅ Model đã tải xong!")

# Preset reference voices, keyed by the (Vietnamese) label shown in the UI.
# Each entry points at a reference recording ("audio") and the file holding
# its transcript ("text"); both share the same sample id under ./sample/.
VOICE_SAMPLES = {
    label: {
        "audio": f"./sample/{sample_id}.wav",
        "text": f"./sample/{sample_id}.txt",
    }
    for label, sample_id in (
        ("Nam miền Nam", "id_0001"),
        ("Nữ miền Nam", "id_0002"),
    )
}

# Maximum number of characters accepted per request (guards against timeouts).
_MAX_TEXT_CHARS = 500
# Sample rate used when writing the synthesized waveform to disk.
# NOTE(review): presumably the codec's native output rate — confirm.
_OUTPUT_SAMPLE_RATE = 24000


def _resolve_reference(voice_choice, custom_audio, custom_text):
    """Return ``(ref_audio_path, ref_text)`` for the request, or ``None``.

    A custom recording is used only when both the audio path and a
    non-blank transcript are supplied; otherwise the preset named by
    ``voice_choice`` is looked up in ``VOICE_SAMPLES``.
    """
    if custom_audio is not None and custom_text and custom_text.strip():
        return custom_audio, custom_text

    preset = VOICE_SAMPLES.get(voice_choice)
    if preset is None:
        return None

    with open(preset["text"], "r", encoding="utf-8") as f:
        return preset["audio"], f.read()


def _move_tts_to(device):
    """Move both sub-models of the global ``tts`` object to ``device``."""
    tts.backbone = tts.backbone.to(device)
    tts.codec = tts.codec.to(device)


def _release_gpu():
    """Best-effort: put the model back on the CPU and drop cached CUDA memory.

    Failures are logged rather than raised so that cleanup can never mask
    the result (or the original error) of the synthesis itself.
    """
    try:
        _move_tts_to("cpu")
        torch.cuda.empty_cache()
    except Exception as exc:
        print(f"⚠️ Không thể giải phóng GPU: {exc}")


@spaces.GPU(duration=120)  # Request a GPU for up to 120 seconds per call
def synthesize_speech(text, voice_choice, custom_audio=None, custom_text=None):
    """Synthesize speech for ``text`` using a reference voice.

    Args:
        text: Text to synthesize; must be non-blank and at most
            ``_MAX_TEXT_CHARS`` characters.
        voice_choice: Key of a preset voice in ``VOICE_SAMPLES``.
        custom_audio: Optional path to a user-supplied reference recording.
        custom_text: Transcript of ``custom_audio``. The custom voice is used
            only when both are provided and the transcript is not blank.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        path of a temporary ``.wav`` file on success and ``None`` on any
        validation failure or error; this function never raises.
    """
    try:
        # --- Input validation -------------------------------------------
        if not text or not text.strip():
            return None, "❌ Vui lòng nhập văn bản cần tổng hợp"

        if len(text) > _MAX_TEXT_CHARS:
            return None, f"❌ Văn bản quá dài! Vui lòng nhập tối đa {_MAX_TEXT_CHARS} ký tự"

        reference = _resolve_reference(voice_choice, custom_audio, custom_text)
        if reference is None:
            return None, "❌ Vui lòng chọn giọng hoặc tải lên audio tùy chỉnh"
        ref_audio_path, ref_text = reference

        # --- Inference --------------------------------------------------
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            if device == "cuda":
                print("🚀 Đang chuyển model lên GPU...")
                _move_tts_to("cuda")

            print(f"📝 Đang xử lý: {text[:50]}...")
            ref_codes = tts.encode_reference(ref_audio_path)

            print(f"🎵 Đang tổng hợp giọng nói trên {device.upper()}...")
            wav = tts.infer(text, ref_codes, ref_text)
        finally:
            # Runs on success and on failure (including a half-finished move
            # to the GPU), so the model always ends up back on the CPU.
            if device == "cuda":
                print("💾 Đang giải phóng GPU...")
                _release_gpu()

        # --- Persist the result -----------------------------------------
        # Reserve a unique path, then close the handle before writing so the
        # file is not opened twice at once (which fails on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        try:
            sf.write(output_path, wav, _OUTPUT_SAMPLE_RATE)
        except Exception:
            # Do not leave an empty/partial temp file behind.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        print("✅ Hoàn thành!")
        return output_path, f"✅ Tổng hợp thành công trên {device.upper()}!"

    except Exception as e:
        # Top-level boundary for the UI callback: log the full traceback and
        # report the error in the status box instead of raising.
        print(f"❌ Lỗi: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Lỗi: {str(e)}"

# Short sample prompts for the "examples" widget. Each row is
# [text to synthesize, preset voice label].
examples = [
    [sentence, voice_label]
    for sentence, voice_label in (
        ("Xin chào, tôi là trợ lý giọng nói tiếng Việt.", "Nam miền Nam"),
        ("Chúc bạn một ngày tốt lành!", "Nữ miền Nam"),
        ("Hôm nay thời tiết đẹp quá.", "Nam miền Nam"),
    )
]

# Custom CSS handed to gr.Blocks(css=...): caps the container width at 900px
# and defines boxed styles for elements with id "warning" (yellow) and
# id "info" (light blue).
custom_css = """
.gradio-container {
    max-width: 900px !important;
}
#warning {
    background-color: #fff3cd;
    border: 1px solid #ffc107;
    border-radius: 5px;
    padding: 10px;
    margin: 10px 0;
}
#info {
    background-color: #d1ecf1;
    border: 1px solid #17a2b8;
    border-radius: 5px;
    padding: 10px;
    margin: 10px 0;
}
"""

# Build the Gradio interface. Components are laid out in the order they are
# created inside the context managers below, so statement order matters.
with gr.Blocks(title="VieNeu-TTS", css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ VieNeu-TTS: Vietnamese Text-to-Speech
    
    Hệ thống tổng hợp tiếng nói tiếng Việt sử dụng Large Language Model và Neural Codec.
    """)
    
    # ZeroGPU notice banner (styled by the "#info" rule in custom_css)
    gr.Markdown("""
    <div id="info">
    ⚡ <strong>GPU Zero Mode:</strong> Space này sử dụng GPU miễn phí với tốc độ nhanh (3-5 giây/câu).
    <br>
    ⏳ Lần đầu sử dụng có thể phải chờ 5-30 phút trong hàng đợi GPU. Sau đó sẽ rất nhanh!
    </div>
    """)
    
    with gr.Row():
        # Left column: all inputs
        with gr.Column():
            # Text to synthesize
            text_input = gr.Textbox(
                label="📝 Văn bản đầu vào (tối đa 500 ký tự)",
                placeholder="Nhập văn bản tiếng Việt...",
                lines=4,
                max_lines=6
            )
            
            # Live character counter, refreshed by update_char_count below
            char_count = gr.Markdown("0 / 500 ký tự")
            
            # Preset voice selection; choices are the keys of VOICE_SAMPLES
            voice_select = gr.Radio(
                choices=list(VOICE_SAMPLES.keys()),
                label="🎤 Chọn giọng",
                value="Nam miền Nam"
            )
            
            # Optional custom reference voice (audio file plus its transcript)
            with gr.Accordion("🎨 Hoặc sử dụng giọng tùy chỉnh", open=False):
                gr.Markdown("*Upload file audio (.wav) và nội dung text tương ứng*")
                custom_audio = gr.Audio(
                    label="File audio mẫu",
                    type="filepath"
                )
                custom_text = gr.Textbox(
                    label="Nội dung của audio mẫu",
                    placeholder="Nhập chính xác nội dung...",
                    lines=2
                )
            
            # Submit button
            submit_btn = gr.Button("🎵 Tổng hợp giọng nói", variant="primary", size="lg")
        
        # Right column: synthesized audio and a status message
        with gr.Column():
            # Output
            audio_output = gr.Audio(label="🔊 Kết quả")
            status_output = gr.Textbox(label="📊 Trạng thái", interactive=False)
    
    # Examples: each row supplies values for text_input and voice_select
    gr.Markdown("### 💡 Ví dụ nhanh")
    gr.Examples(
        examples=examples,
        inputs=[text_input, voice_select],
        outputs=[audio_output, status_output],
        fn=synthesize_speech,
        cache_examples=False  # Do not cache, so the Space builds faster
    )
    
    # Update character count
    def update_char_count(text):
        """Return an HTML snippet showing ``len(text)`` out of 500.

        The count is rendered in red when it exceeds 500 and in green
        otherwise; a falsy ``text`` (``None`` or empty) counts as 0.
        """
        count = len(text) if text else 0
        color = "red" if count > 500 else "green"
        return f"<span style='color: {color}'>{count} / 500 ký tự</span>"
    
    # Refresh the counter whenever the input text changes
    text_input.change(
        fn=update_char_count,
        inputs=[text_input],
        outputs=[char_count]
    )
    
    # Event handler: run synthesis when the submit button is clicked
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text],
        outputs=[audio_output, status_output]
    )
    
    # Footer: notes and links
    gr.Markdown("""
    ---
    ### 📌 Thông tin
    
    **Ưu điểm GPU Zero:**
    - ⚡ Tốc độ: 3-5 giây/câu (nhanh hơn CPU 10-20 lần)
    - 💰 Hoàn toàn miễn phí
    - 🎯 Chất lượng âm thanh cao
    
    **Lưu ý:**
    - ⏳ Lần đầu sử dụng phải chờ GPU khả dụng (5-30 phút)
    - 🔄 Sau đó các request tiếp theo sẽ rất nhanh
    - ⏱️ GPU timeout sau 120 giây nếu không dùng
    - 📏 Khuyến nghị: Văn bản dưới 500 ký tự
    
    **Liên kết:**
    - [GitHub Repository](https://github.com/pnnbao97/VieNeu-TTS)
    - [Model Card](https://huggingface.co/pnnbao-ump/VieNeu-TTS)
    
    <sub>Powered by VieNeu-TTS | Built with ❤️ for Vietnamese TTS</sub>
    """)

# Script entry point: enable request queuing, then start the web server.
if __name__ == "__main__":
    # Allow up to 20 pending requests in the queue (sized for ZeroGPU waits).
    demo.queue(max_size=20)

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)