# kt002
#
# Running
#
# File size: 7,732 Bytes

import spaces
import os
import gradio as gr
import soundfile as sf
import tempfile
import torch
import librosa
import time
from tts_engine import VoiceEngine 

# --- 1. INITIALIZATION (model-loading logic kept as-is because it is known to work) ---
# NOTE(review): this assignment runs after `import spaces` has already executed at
# the top of the file; confirm that the `spaces` package reads SPACES_ZERO_GPU
# lazily rather than at import time, otherwise the flag has no effect.
os.environ['SPACES_ZERO_GPU'] = '1'
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔄 Đang khởi động trên: {device}")

# Build the TTS engine once, at import time. Any failure is printed and leaves
# `tts` set to None instead of aborting the import, so the rest of the module
# (including the UI definition) still loads; `tts_process` checks for None.
try:
    tts = VoiceEngine(
        backbone_repo="ktvoice/Backbone", 
        backbone_device=device, 
        codec_repo="ktvoice/Codec", 
        codec_device=device
    )
    print("✅ Đã nạp thành công Model!")
except Exception as e:
    print(f"❌ Lỗi nạp mô hình: {e}")
    tts = None
# Built-in voices as (display label, reference audio path, transcript path) rows.
# Row order is the order in which the voices are listed in the UI dropdown.
_VOICE_SAMPLE_ROWS = (
    ("Tuyên (nam miền Bắc)", "./sample/Tuyên (nam miền Bắc).wav", "./sample/Tuyên (nam miền Bắc).txt"),
    ("Thiện Tâm", "./sample/ThienTam.mp3", "./sample/ThienTam.txt"),
    ("Ngọc Huyền", "./sample/NgocHuyen.mp3", "./sample/NgocHuyen.txt"),
    ("Minh Quân", "./sample/MinhQuan.mp3", "./sample/MinhQuan.txt"),
    ("Vĩnh (nam miền Nam)", "./sample/Vĩnh (nam miền Nam).wav", "./sample/Vĩnh (nam miền Nam).txt"),
    ("Bình (nam miền Bắc)", "./sample/Bình (nam miền Bắc).wav", "./sample/Bình (nam miền Bắc).txt"),
    ("Nguyên (nam miền Nam)", "./sample/Nguyên (nam miền Nam).wav", "./sample/Nguyên (nam miền Nam).txt"),
    ("Sơn (nam miền Nam)", "./sample/Sơn (nam miền Nam).wav", "./sample/Sơn (nam miền Nam).txt"),
    ("Đoan (nữ miền Nam)", "./sample/Đoan (nữ miền Nam).wav", "./sample/Đoan (nữ miền Nam).txt"),
    ("Ngọc (nữ miền Bắc)", "./sample/Ngọc (nữ miền Bắc).wav", "./sample/Ngọc (nữ miền Bắc).txt"),
    ("Ly (nữ miền Bắc)", "./sample/Ly (nữ miền Bắc).wav", "./sample/Ly (nữ miền Bắc).txt"),
    ("Dung (nữ miền Nam)", "./sample/Dung (nữ miền Nam).wav", "./sample/Dung (nữ miền Nam).txt"),
)

# Voice catalogue: display label -> {"audio": reference clip, "text": its transcript file}.
VOICE_SAMPLES = {
    label: {"audio": audio_path, "text": text_path}
    for label, audio_path, text_path in _VOICE_SAMPLE_ROWS
}

# Maximum number of characters handed to the model per request (same value as
# the original inline slice). NOTE(review): presumably chosen so a request fits
# inside the 60 s GPU window requested below — confirm.
_MAX_INFER_CHARS = 500

# Sample rate used when writing the result WAV.
# NOTE(review): assumed to match the codec's output rate — confirm against tts_engine.
_OUTPUT_SAMPLE_RATE = 24000


def _expand_pauses(text, pause_level):
    """Return *text* with commas and full stops repeated to lengthen pauses.

    "Trung bình" and "Dài" select the medium and long expansions; any other
    value returns the text unchanged.
    """
    if pause_level == "Trung bình":
        return text.replace(",", ", , ").replace(".", ". . ")
    if pause_level == "Dài":
        return text.replace(",", ", , , ").replace(".", ". . . . ")
    return text


def _resolve_reference(voice_choice, custom_audio, custom_text, mode_tab):
    """Select the reference audio path and transcript for voice cloning.

    Returns a ``(ref_path, ref_text, error)`` triple. ``error`` is ``None`` on
    success; otherwise it is the user-facing message and the other two items
    are ``None``.
    """
    if mode_tab == "custom":
        if not custom_audio:
            return None, None, "Thiếu Audio mẫu."
        # Reject whitespace-only transcripts too, consistent with the check on
        # the main input text.
        if not custom_text or not custom_text.strip():
            return None, None, "Thiếu lời thoại mẫu."
        return custom_audio, custom_text, None

    sample = VOICE_SAMPLES.get(voice_choice)
    if not sample:
        return None, None, f"Lỗi: Không tìm thấy giọng '{voice_choice}'"
    try:
        with open(sample["text"], "r", encoding="utf-8") as transcript_file:
            ref_text = transcript_file.read()
    except (OSError, UnicodeError) as e_txt:
        return None, None, f"Lỗi đọc file text mẫu: {e_txt}"
    return sample["audio"], ref_text, None


@spaces.GPU(duration=60)
def tts_process(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
    """Synthesize *text* in the selected voice and return ``(wav_path, status)``.

    Args:
        text: Text to read aloud; empty or whitespace-only input is rejected.
        voice_choice: Key into ``VOICE_SAMPLES``; used unless ``mode_tab`` is "custom".
        custom_audio: File path of the user-supplied reference clip (custom mode).
        custom_text: Transcript of ``custom_audio`` (custom mode).
        mode_tab: "custom" to clone from the uploaded clip; anything else uses a preset.
        pause_level: "Trung bình" or "Dài" lengthen pauses; other values leave the text as is.
        speed_value: Playback-rate multiplier; 1.0 skips time stretching.

    Returns:
        ``(path, message)``. On success ``path`` is a temporary ``.wav`` file and
        ``message`` reports the elapsed time. On any failure ``path`` is ``None``
        and ``message`` describes the problem; exceptions are never propagated.
    """
    # Function-scope import: `traceback` is not imported at module level and is
    # only needed on the error path.
    import traceback

    if tts is None:
        return None, "Hệ thống chưa sẵn sàng."

    if not text or not text.strip():
        return None, "Vui lòng nhập văn bản."

    # Name of the engine call currently in flight; used to label error messages.
    stage = ""
    try:
        ref_path, ref_txt_val, error = _resolve_reference(
            voice_choice, custom_audio, custom_text, mode_tab
        )
        if error is not None:
            return None, error

        if not os.path.exists(ref_path):
            return None, f"Lỗi: File audio không tồn tại: {ref_path}"

        file_size = os.path.getsize(ref_path)
        # In custom mode the dropdown value is irrelevant, so don't log it as the voice.
        voice_label = "custom" if mode_tab == "custom" else voice_choice
        print(f"[DEBUG] Voice: {voice_label} | File: {ref_path} | Size: {file_size} bytes | Text mẫu: {ref_txt_val[:50]}...")

        processed_text = _expand_pauses(text, pause_level)
        if len(processed_text) > _MAX_INFER_CHARS:
            print(f"[DEBUG] Text truncated from {len(processed_text)} to {_MAX_INFER_CHARS} characters")

        # Coerce once, before the expensive model calls, so a bad value fails fast
        # and the comparison below is always float-to-float.
        rate = float(speed_value)

        start_time = time.time()

        stage = "encode_reference"
        print(f"[DEBUG] Bắt đầu encode_reference: {ref_path}")
        ref_codes = tts.encode_reference(ref_path)
        print(f"[DEBUG] encode_reference thành công! Type: {type(ref_codes)}")

        stage = "infer"
        print(f"[DEBUG] Bắt đầu infer. Text length: {len(processed_text)}")
        wav = tts.infer(processed_text[:_MAX_INFER_CHARS], ref_codes, ref_txt_val)
        print(f"[DEBUG] infer thành công! Wav shape: {wav.shape if hasattr(wav, 'shape') else len(wav)}")

        stage = ""
        if rate != 1.0:
            wav = librosa.effects.time_stretch(wav, rate=rate)

        # Reserve a unique path, then close the handle before soundfile reopens
        # the file by name (reopening an open NamedTemporaryFile is platform-dependent).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            out_path = tmp.name
        try:
            sf.write(out_path, wav, _OUTPUT_SAMPLE_RATE)
        except Exception:
            # Don't leave an empty orphan file behind when the write fails.
            os.remove(out_path)
            raise

        elapsed = time.time() - start_time
        print(f"[DEBUG] Hoàn tất TTS: {elapsed:.2f}s | Output: {out_path}")
        return out_path, f"Hoàn tất: {elapsed:.2f}s"

    except Exception as exc:
        # Top-level boundary for the UI/API: log the traceback and report the
        # failure as a status message, labelled with the failing stage if any.
        traceback.print_exc()
        prefix = f"Lỗi {stage}" if stage else "Lỗi"
        return None, f"{prefix}: {type(exc).__name__}: {exc}"

# --- 2. BASIC INTERFACE (native Gradio components) ---
with gr.Blocks(title="AI Voice") as demo:
    gr.Markdown("# 🎙️ AI Voice Studio")

    with gr.Row():
        # Left column: text input, voice source and synthesis controls.
        with gr.Column():
            input_text = gr.Textbox(
                label="Văn bản cần đọc",
                lines=5,
                placeholder="Nhập tiếng Việt vào đây...",
            )

            # Two voice sources: a preset from VOICE_SAMPLES or an uploaded clip.
            with gr.Tabs() as tabs:
                with gr.Tab("Chọn giọng có sẵn", id="preset") as preset_tab:
                    voice_dropdown = gr.Dropdown(
                        choices=list(VOICE_SAMPLES),
                        value="Tuyên (nam miền Bắc)",
                        label="Danh sách Nghệ sĩ",
                    )
                with gr.Tab("Tự nhân bản (Clone)", id="custom") as custom_tab:
                    c_audio = gr.Audio(label="Audio mẫu", type="filepath")
                    c_text = gr.Textbox(label="Lời thoại mẫu", lines=2)

            with gr.Row():
                pause_radio = gr.Radio(
                    ["Mặc định", "Trung bình", "Dài"],
                    value="Mặc định",
                    label="Ngắt nghỉ",
                )
                speed_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Tốc độ")

            # Hidden field mirroring the selected tab; passed to tts_process as mode_tab.
            active_tab = gr.Textbox(value="preset", visible=False, label="Mode")
            gen_btn = gr.Button("ĐỌC NGAY", variant="primary")

        # Right column: synthesized audio and status message.
        with gr.Column():
            output_audio = gr.Audio(label="Kết quả", interactive=False)
            output_status = gr.Textbox(label="Trạng thái", interactive=False)

    # Selecting a tab writes that tab's mode name into the hidden field.
    preset_tab.select(fn=lambda: "preset", inputs=None, outputs=active_tab)
    custom_tab.select(fn=lambda: "custom", inputs=None, outputs=active_tab)

    # Inputs are listed in the same order as tts_process's parameters.
    tts_inputs = [
        input_text,
        voice_dropdown,
        c_audio,
        c_text,
        active_tab,
        pause_radio,
        speed_slider,
    ]
    gen_btn.click(
        fn=tts_process,
        inputs=tts_inputs,
        outputs=[output_audio, output_status],
        api_name="tts",  # IMPORTANT: the endpoint is exposed under the API name "tts"
    )

if __name__ == "__main__":
    # Standard launch configuration, intended to avoid warnings and errors.
    queued_demo = demo.queue()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
        "theme": gr.themes.Default(),
    }
    queued_demo.launch(**launch_options)