# tts
#
# Sleeping
#
# File size: 10,120 Bytes

import spaces 
import os
os.environ['SPACES_ZERO_GPU'] = '1'

import gradio as gr
import soundfile as sf
import tempfile
import torch
import librosa # Thêm thư viện xử lý âm thanh
from vieneu_tts import VieNeuTTS
import time

# --- 1. SETUP MODEL ---
# Place both the backbone and the codec on the GPU when torch reports one.
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    tts = VieNeuTTS(
        backbone_repo="pnnbao-ump/VieNeu-TTS",
        backbone_device=device,
        codec_repo="neuphonic/neucodec",
        codec_device=device,
    )
except Exception as e:
    # Best-effort fallback so the UI can still start when the real engine
    # cannot be constructed. The cause is reported rather than swallowed:
    # the stand-in below returns random noise, which would otherwise be
    # indistinguishable from a broken model.
    print(f"[WARN] VieNeuTTS could not be loaded ({type(e).__name__}: {e}); using MockTTS.")

    class MockTTS:
        """Stand-in exposing the two methods this app calls on the TTS engine."""

        def encode_reference(self, path):
            """Ignore the reference path and return None."""
            return None

        def infer(self, text, ref, ref_text):
            """Sleep 1.2 s, then return 48,000 uniform random samples in [-0.1, 0.1)."""
            import numpy as np  # deferred: only needed on this fallback path

            time.sleep(1.2)
            return np.random.uniform(-0.1, 0.1, 24000 * 2)

    tts = MockTTS()

# --- 2. DATA (preset voice list, kept as-is) ---
# Maps the label shown in the UI to the reference audio file and its transcript.
# Each transcript lives next to the audio under ./sample and shares its stem.
VOICE_SAMPLES = {
    label: {
        "audio": f"./sample/{audio_file}",
        "text": f"./sample/{audio_file.rsplit('.', 1)[0]}.txt",
    }
    for label, audio_file in (
        ("Tuyên (nam miền Bắc)", "Tuyên (nam miền Bắc).wav"),
        ("Thiện Tâm", "thientam.mp3"),
        ("Vĩnh (nam miền Nam)", "Vĩnh (nam miền Nam).wav"),
        ("Bình (nam miền Bắc)", "Bình (nam miền Bắc).wav"),
        ("Nguyên (nam miền Nam)", "Nguyên (nam miền Nam).wav"),
        ("Sơn (nam miền Nam)", "Sơn (nam miền Nam).wav"),
        ("Đoan (nữ miền Nam)", "Đoan (nữ miền Nam).wav"),
        ("Ngọc (nữ miền Bắc)", "Ngọc (nữ miền Bắc).wav"),
        ("Ly (nữ miền Bắc)", "Ly (nữ miền Bắc).wav"),
        ("Dung (nữ miền Nam)", "Dung (nữ miền Nam).wav"),
    )
}

# --- 3. HELPER FUNCTIONS ---
def load_reference_info(voice_choice):
    """Look up a preset voice and return ``(audio_path, transcript)``.

    Returns ``(None, "")`` when ``voice_choice`` is not a key of
    ``VOICE_SAMPLES`` or when the voice's transcript file does not exist.
    """
    if voice_choice not in VOICE_SAMPLES:
        return None, ""

    sample = VOICE_SAMPLES[voice_choice]
    audio_path, transcript_path = sample["audio"], sample["text"]
    if not os.path.exists(transcript_path):
        return None, ""

    with open(transcript_path, "r", encoding="utf-8") as handle:
        transcript = handle.read()
    return audio_path, transcript

@spaces.GPU(duration=120)
def synthesize_speech(text, voice_choice, custom_audio, custom_text, mode_tab, pause_level, speed_value):
    """Synthesize ``text`` with a preset or user-supplied voice and save it as a WAV file.

    Args:
        text: Text to speak.
        voice_choice: Preset voice label; used unless ``mode_tab`` is "custom_mode".
        custom_audio: Path to the user's reference audio (custom mode only).
        custom_text: Transcript of ``custom_audio`` (custom mode only).
        mode_tab: "custom_mode" selects the user-supplied reference; any other
            value selects the preset voice.
        pause_level: "Trung bình" or "Dài" pad commas and full stops with extra
            punctuation; any other value leaves the text untouched.
        speed_value: Time-stretch rate; anything ``float()`` accepts.

    Returns:
        ``(output_path, status)`` on success, or ``(None, message)`` when
        validation fails or any step raises.
    """
    try:
        if not text or not text.strip():
            return None, "⚠️ Vui lòng nhập nội dung!"

        # 3.1. Pause level: repeat the punctuation marks.
        # NOTE(review): presumably the model renders repeated punctuation as a
        # longer pause — confirm against the engine.
        processed_text = text
        if pause_level == "Trung bình":
            processed_text = processed_text.replace(",", ", , ").replace(".", ". . ")
        elif pause_level == "Dài":
            processed_text = processed_text.replace(",", ", , , ").replace(".", ". . . . ")

        # Hard cap applied after padding; anything beyond 400 characters is cut.
        if len(processed_text) > 400:
            processed_text = processed_text[:400]

        # 3.2. Resolve the reference audio and its transcript.
        if mode_tab == "custom_mode":
            # A whitespace-only transcript is as unusable as a missing one.
            if custom_audio is None or not custom_text or not custom_text.strip():
                return None, "⚠️ Thiếu Audio mẫu hoặc Text mẫu."
            ref_audio_path = custom_audio
            ref_text_raw = custom_text
        else:
            # Shared loader: yields (None, "") for an unknown voice or a
            # missing transcript instead of raising KeyError/FileNotFoundError.
            ref_audio_path, ref_text_raw = load_reference_info(voice_choice)
            if ref_audio_path is None:
                return None, "⚠️ Không tìm thấy giọng mẫu đã chọn."

        # Coerce once, before inference, so a bad value fails without spending
        # GPU time and a "1.0" string is not mistaken for a speed change.
        rate = float(speed_value)

        # 3.3. Inference
        start_time = time.time()
        ref_codes = tts.encode_reference(ref_audio_path)
        wav = tts.infer(processed_text, ref_codes, ref_text_raw)

        # 3.4. Speed adjustment with librosa (time stretch keeps the pitch).
        if rate != 1.0:
            wav = librosa.effects.time_stretch(wav, rate=rate)

        process_time = time.time() - start_time

        # Reserve a path and close the handle before writing: reopening a
        # NamedTemporaryFile by name while it is still open is not portable.
        # delete=False keeps the file on disk so the returned path stays valid.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        # Written at 24000 Hz — presumably the engine's output rate; TODO confirm.
        sf.write(output_path, wav, 24000)

        return output_path, f"⚡ Xử lý: {process_time:.2f}s | Tốc độ: {speed_value}x"
    except Exception as e:
        # UI boundary: report the failure in the status area instead of raising.
        return None, f"❌ Lỗi: {str(e)}"

# --- 4. THEME & CSS ---
# Gradio Default theme with indigo/blue/slate hues and the Inter font,
# followed by explicit fill and border overrides.
_theme_hues = {
    "primary_hue": "indigo",
    "secondary_hue": "blue",
    "neutral_hue": "slate",
}
_theme_overrides = {
    "body_background_fill": "#020617",
    "block_background_fill": "#0f172a",
    "block_border_width": "1px",
    "input_background_fill": "#1e293b",
    "input_border_color": "#334155",
    "button_primary_background_fill": "linear-gradient(135deg, #4f46e5 0%, #7c3aed 100%)",
}
theme = gr.themes.Default(
    font=[gr.themes.GoogleFont('Inter'), 'sans-serif'],
    **_theme_hues,
).set(**_theme_overrides)

# Custom stylesheet for the layout classes used by the UI (main-wrap, st-card,
# result-card, footer) and for the audio player. Built from one literal per
# CSS line so that the whitespace inside the sheet stays explicit.
css = "\n" + "".join(
    css_line + "\n"
    for css_line in (
        ".main-wrap { max-width: 1200px !important; margin: auto !important; padding: 20px !important; }",
        ".st-card { ",
        "    border-radius: 16px !important; ",
        "    border: 1px solid rgba(255,255,255,0.1) !important; ",
        "    box-shadow: 0 4px 20px rgba(0,0,0,0.5) !important;",
        "    padding: 15px;",
        "}",
        ".result-card {",
        "    background: linear-gradient(180deg, rgba(15, 23, 42, 0.8) 0%, rgba(30, 41, 59, 0.8) 100%) !important;",
        "    border: 1px solid rgba(99, 102, 241, 0.2) !important;",
        "    margin-top: 15px;",
        "}",
        "audio { filter: invert(90%) hue-rotate(180deg) brightness(1.5); width: 100%; border-radius: 8px; }",
        ".footer { text-align: center; margin-top: 40px; color: #475569; font-size: 0.8rem; font-weight: 500; }",
    )
)

# --- 5. UI CONSTRUCTION ---
with gr.Blocks(title="AI Voice Studio") as demo:

    with gr.Column(elem_classes="main-wrap"):
        with gr.Row(equal_height=True):
            # LEFT COLUMN: text to synthesize and its character counter.
            with gr.Column(scale=1):
                with gr.Group(elem_classes="st-card"):
                    text_input = gr.Textbox(
                        label="VĂN BẢN CẦN CHUYỂN ĐỔI",
                        placeholder="Nhập nội dung vào đây...",
                        lines=20,  # enlarged to balance the newer controls on the right
                        show_label=True,
                    )
                    char_count = gr.HTML("<div style='text-align: right; color: #6366f1; font-size: 0.85rem; font-weight: bold; padding: 5px;'>0 / 250</div>")

            # RIGHT COLUMN: voice source, audio controls, trigger and result.
            with gr.Column(scale=1):
                with gr.Tabs() as tabs:
                    # Each tab is bound to a name so its select event can be
                    # wired directly rather than by position in tabs.children.
                    with gr.TabItem("👤 Nghệ sĩ đọc", id="preset_mode") as preset_tab:
                        voice_select = gr.Dropdown(
                            choices=list(VOICE_SAMPLES),
                            value="Tuyên (nam miền Bắc)",
                            label="Lựa chọn giọng đọc mẫu",
                        )
                        with gr.Accordion("Nghe thử giọng mẫu", open=False):
                            ref_audio_preview = gr.Audio(interactive=False, show_label=False)
                            ref_text_preview = gr.Markdown("...")

                    with gr.TabItem("🎙️ Nhân bản (Clone)", id="custom_mode") as custom_tab:
                        custom_audio = gr.Audio(label="Audio gốc", type="filepath")
                        custom_text = gr.Textbox(
                            label="NỘI DUNG AUDIO MẪU",
                            placeholder="Nhập lời thoại của audio mẫu...",
                            lines=4,
                            show_label=True,
                        )

                # --- AUDIO ADJUSTMENT CONTROLS ---
                with gr.Row():
                    pause_level = gr.Radio(
                        choices=["Mặc định", "Trung bình", "Dài"],
                        value="Mặc định",
                        label="Độ ngắt nghỉ",
                        scale=1,
                    )
                    speed_select = gr.Dropdown(
                        choices=[0.8, 0.9, 1.0, 1.1, 1.2, 1.5],
                        value=1.0,
                        label="Tốc độ đọc",
                        scale=1,
                    )

                # Tracks which tab is active; passed to synthesize_speech as mode_tab.
                current_mode = gr.State(value="preset_mode")

                gr.Markdown("<br>")
                btn_generate = gr.Button("BẮT ĐẦU TỔNG HỢP", variant="primary", size="lg")

                with gr.Group(elem_classes="st-card result-card"):
                    audio_output = gr.Audio(label="AUDIO KẾT QUẢ", interactive=False, autoplay=True)
                    status_output = gr.Markdown("<p style='text-align: center; color: #818cf8; font-weight: 500;'>✨ Sẵn sàng thực hiện</p>")

        gr.HTML("<div class='footer'>ENGINE BY VIENEU-TTS • PROFESSIONAL AI SOLUTIONS 2025</div>")

    # --- EVENT WIRING ---
    def update_count(text):
        """Render the "<n> / 250" counter; the colour changes once n exceeds 250."""
        length = len(text or "")  # tolerate None as well as an empty string
        color = "#6366f1" if length <= 250 else "#f43f5e"
        return f"<div style='text-align: right; color: {color}; font-size: 0.85rem; font-weight: bold; padding: 5px;'>{length} / 250</div>"

    text_input.change(update_count, text_input, char_count)

    def update_ref_preview(voice):
        """Return the preset's audio path and a Markdown line quoting its transcript."""
        audio, text = load_reference_info(voice)
        return audio, f"**Nội dung mẫu:** *\"{text}\"*"

    # Refresh the preview when the selection changes and once on page load.
    voice_select.change(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])
    demo.load(update_ref_preview, voice_select, [ref_audio_preview, ref_text_preview])

    # Selecting a tab records its id in current_mode.
    preset_tab.select(fn=lambda: "preset_mode", outputs=current_mode)
    custom_tab.select(fn=lambda: "custom_mode", outputs=current_mode)

    btn_generate.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_select, custom_audio, custom_text, current_mode, pause_level, speed_select],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    # Enable the request queue, then serve on every interface at port 7860
    # with the theme and stylesheet defined in this file.
    launch_options = {
        "theme": theme,
        "css": css,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    queued_app = demo.queue()
    queued_app.launch(**launch_options)