import gradio as gr
import numpy as np
import soundfile as sf
from vieneutts import VieNeuTTS
import tempfile
import os

# Initialize the model
def _detect_device():
    """Return "cuda" when a usable CUDA device is present, otherwise "cpu".

    The mere presence of ``CUDA_VISIBLE_DEVICES`` is not a reliable signal:
    the variable is normally unset on machines that do have a GPU, and it can
    be set on machines that have none (or set to ``-1`` to hide every GPU).
    PyTorch is asked directly when it can be imported; the environment
    variable is only consulted as a fallback.
    """
    try:
        # Imported lazily so this script still starts if torch is missing.
        import torch
    except ImportError:
        visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").strip()
        # "-1" is the conventional value for hiding all GPUs.
        return "cuda" if visible and visible != "-1" else "cpu"
    return "cuda" if torch.cuda.is_available() else "cpu"


# Resolve the device once so the backbone and the codec always agree.
_DEVICE = _detect_device()

print("Đang tải model...")
tts = VieNeuTTS(
    backbone_repo="pnnbao-ump/VieNeu-TTS",
    backbone_device=_DEVICE,
    codec_repo="neuphonic/neucodec",
    codec_device=_DEVICE,
)

# Preset reference voices: display label -> paths of the reference recording
# ("audio") and of its transcript ("text").
VOICE_SAMPLES = {
    label: {
        "audio": f"./sample/{stem}.wav",
        "text": f"./sample/{stem}.txt",
    }
    for label, stem in (
        ("Nam miền Nam", "id_0001"),
        ("Nữ miền Nam", "id_0002"),
    )
}

def synthesize_speech(text, voice_choice, custom_audio=None, custom_text=None):
    """Synthesize speech for ``text`` using a reference voice.

    The uploaded ``custom_audio`` is used as the reference voice when it is
    given together with a non-blank ``custom_text``; otherwise the preset
    named by ``voice_choice`` in ``VOICE_SAMPLES`` is used.

    Args:
        text: Text to convert to speech.
        voice_choice: Key of a preset voice in ``VOICE_SAMPLES``.
        custom_audio: Path to a custom reference audio file (optional).
        custom_text: Transcript of the custom reference audio (optional).

    Returns:
        A ``(audio_path, status)`` tuple. ``audio_path`` is the path of a
        temporary WAV file on success and ``None`` on failure; ``status`` is
        the user-facing status message. Exceptions are not propagated: they
        are printed and reported through ``status``.
    """
    try:
        # Validate the prompt first: it is the cheapest check and makes the
        # file read and model calls below unnecessary when it fails.
        if not text or not text.strip():
            return None, "❌ Vui lòng nhập văn bản cần tổng hợp"

        # Resolve the reference audio and its transcript. A whitespace-only
        # transcript is treated as missing, so the preset voice is used.
        if custom_audio is not None and custom_text and custom_text.strip():
            ref_audio_path = custom_audio
            ref_text = custom_text.strip()
        elif voice_choice in VOICE_SAMPLES:
            preset = VOICE_SAMPLES[voice_choice]
            ref_audio_path = preset["audio"]
            with open(preset["text"], "r", encoding="utf-8") as f:
                # Drop the trailing newline text files usually end with.
                ref_text = f.read().strip()
        else:
            return None, "❌ Vui lòng chọn giọng hoặc tải lên audio tùy chỉnh"

        # Encode the reference audio
        print(f"Đang encode audio tham chiếu: {ref_audio_path}")
        ref_codes = tts.encode_reference(ref_audio_path)

        # Run synthesis
        print(f"Đang tổng hợp giọng nói cho: {text[:50]}...")
        wav = tts.infer(text, ref_codes, ref_text)

        # Reserve a temp file name and close the handle before writing to it.
        # delete=False keeps the file on disk so the caller can read it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        try:
            # 24000 Hz is kept from the original code; presumably the model's
            # output sample rate -- TODO confirm against the codec.
            sf.write(output_path, wav, 24000)
        except Exception:
            # Do not leave an orphaned temp file behind when writing fails.
            try:
                os.remove(output_path)
            except OSError:
                pass
            raise

        return output_path, "✅ Tổng hợp thành công!"

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        print(f"Lỗi: {str(e)}")
        return None, f"❌ Lỗi: {str(e)}"

# Sample prompts for the UI; each row is [text, preset voice label]
examples = [
    [sentence, voice]
    for voice, sentence in (
        (
            "Nam miền Nam",
            "Các khóa học trực tuyến đang giúp học sinh tiếp cận kiến thức mọi lúc mọi nơi.",
        ),
        (
            "Nữ miền Nam",
            "Các nghiên cứu về bệnh Alzheimer cho thấy tác dụng tích cực của các bài tập trí não.",
        ),
        (
            "Nam miền Nam",
            "Một tiểu thuyết trinh thám hiện đại dẫn dắt độc giả qua những tình tiết phức tạp.",
        ),
    )
]

# Build the Gradio interface
with gr.Blocks(title="VieNeu-TTS: Vietnamese Text-to-Speech") as demo:
    # Page header and usage instructions.
    gr.Markdown("""
    # 🎙️ VieNeu-TTS: Vietnamese Text-to-Speech
    
    Hệ thống tổng hợp tiếng nói tiếng Việt sử dụng mô hình ngôn ngữ lớn và neural codec.
    
    **Hướng dẫn sử dụng:**
    1. Nhập văn bản tiếng Việt cần chuyển thành giọng nói
    2. Chọn giọng có sẵn hoặc tải lên audio tùy chỉnh
    3. Nhấn "Tổng hợp giọng nói"
    """)

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            # Text to synthesize
            text_input = gr.Textbox(
                lines=5,
                label="Văn bản đầu vào",
                placeholder="Nhập văn bản tiếng Việt...",
            )

            # Preset voice picker, one option per VOICE_SAMPLES entry
            voice_select = gr.Radio(
                label="Chọn giọng có sẵn",
                choices=list(VOICE_SAMPLES),
                value="Nam miền Nam",
            )

            # Optional custom reference voice (audio plus its transcript)
            with gr.Accordion("Hoặc sử dụng giọng tùy chỉnh", open=False):
                custom_audio = gr.Audio(
                    type="filepath",
                    label="Tải lên file audio mẫu (.wav)",
                )
                custom_text = gr.Textbox(
                    lines=3,
                    label="Nội dung của audio mẫu",
                    placeholder="Nhập chính xác nội dung trong audio...",
                )

            # Button that triggers synthesis
            submit_btn = gr.Button("🎵 Tổng hợp giọng nói", variant="primary")

        # Right column: synthesized audio and the status message.
        with gr.Column():
            audio_output = gr.Audio(label="Kết quả")
            status_output = gr.Textbox(label="Trạng thái")

    # Both the examples and the button fill the same two output components.
    result_components = [audio_output, status_output]

    # Clickable examples; they supply only the text and the preset voice, so
    # synthesize_speech runs with its default custom_audio/custom_text.
    gr.Examples(
        examples=examples,
        fn=synthesize_speech,
        inputs=[text_input, voice_select],
        outputs=result_components,
        cache_examples=True,
    )

    # Wire the button to the synthesis function
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text],
        outputs=result_components,
    )

    # Footer with usage notes and project links.
    gr.Markdown("""
    ---
    **Lưu ý:**
    - Model hỗ trợ tiếng Việt với các giọng miền Nam
    - Chất lượng giọng nói phụ thuộc vào audio tham chiếu
    - Để có kết quả tốt nhất, audio tham chiếu nên rõ ràng, không nhiễu
    
    **Liên kết:**
    - [GitHub Repository](https://github.com/pnnbao97/VieNeu-TTS)
    - [Model on Hugging Face](https://huggingface.co/pnnbao-ump/VieNeu-TTS)
    """)

# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()