Spaces:

nhantrungsp
/

FSub

Sleeping

App Files Community

nhantrungsp committed on Dec 11, 2025

Commit

4a8f5a5

verified ·

1 Parent(s): 4196956

Update gradio_app.py

Browse files

Files changed (1) hide show

gradio_app.py +56 -124

gradio_app.py CHANGED Viewed

@@ -1,11 +1,9 @@
-import spaces # <--- BẮT BUỘC DÒNG 1
 import os
 import time
 import threading
 import pickle
 import hashlib
-import base64
-import io
 import tempfile
 import numpy as np
@@ -14,19 +12,32 @@ import torch
 import soundfile as sf
 from pydub import AudioSegment
 import gradio as gr
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
 from vieneu_tts import VieNeuTTS
-# --- KHỞI TẠO ---
-app = FastAPI()
-print("⏳ Đang khởi động Server...")
-# Biến toàn cục để lưu model (Lazy Load)
 tts_model = None
 model_lock = threading.Lock()
-# Cache
 CACHE_DIR = "./reference_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 reference_cache = {}
@@ -50,30 +61,7 @@ def save_cache_to_disk(cache_key, ref_codes):
         with open(cache_path, 'wb') as f: pickle.dump(ref_codes, f)
     except Exception: pass
-# --- HELPER: LOAD MODEL AN TOÀN ---
-def get_tts_model():
-    """Hàm này chỉ tải model khi được gọi lần đầu tiên"""
-    global tts_model
-    with model_lock:
-        if tts_model is None:
-            print("📦 Đang khởi tạo model lần đầu (Lazy Load)...")
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            print(f"   🖥️ Device: {device}")
-            try:
-                # Load model
-                tts_model = VieNeuTTS(
-                    backbone_repo="pnnbao-ump/VieNeu-TTS",
-                    backbone_device=device,
-                    codec_repo="neuphonic/neucodec",
-                    codec_device=device
-                )
-                print("   ✅ Model tải thành công!")
-            except Exception as e:
-                print(f"   ❌ Lỗi tải model: {e}")
-                raise e
-        return tts_model
-# --- DATA ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
@@ -87,14 +75,20 @@ VOICE_SAMPLES = {
     "Nhỏ Ngọt Ngào": {"audio": "./sample/Nhỏ Ngọt Ngào.wav", "text": "./sample/Nhỏ Ngọt Ngào.txt"},
 }
-# --- CORE LOGIC (DECORATED WITH @spaces.GPU) ---
-@spaces.GPU(duration=120) # Tăng thời gian timeout lên 120s cho lần đầu load model
-def core_synthesize(text, voice_choice, speed_factor):
-    # 1. Lấy model (Sẽ tải nếu chưa có)
     tts = get_tts_model()
-    # 2. Đảm bảo model ở đúng device (GPU)
     if torch.cuda.is_available():
         try:
             if next(tts.backbone.parameters()).device.type != 'cuda':
@@ -105,7 +99,9 @@ def core_synthesize(text, voice_choice, speed_factor):
     # 3. Lấy thông tin giọng
     voice_info = VOICE_SAMPLES.get(voice_choice)
     if not voice_info:
-        raise ValueError("Giọng không tồn tại")
     ref_audio_path = voice_info["audio"]
     ref_text_path = voice_info["text"]
@@ -113,7 +109,7 @@ def core_synthesize(text, voice_choice, speed_factor):
     with open(ref_text_path, "r", encoding="utf-8") as f:
         ref_text_raw = f.read()
-    # 4. Encode Reference
     cache_key = f"preset:{voice_choice}"
     with reference_cache_lock:
         if cache_key in reference_cache:
@@ -123,18 +119,18 @@ def core_synthesize(text, voice_choice, speed_factor):
         else:
             ref_codes = load_cache_from_disk(cache_key)
             if ref_codes is None:
                 ref_codes = tts.encode_reference(ref_audio_path)
-                # Cache trên CPU
                 save_cache_to_disk(cache_key, ref_codes.cpu() if isinstance(ref_codes, torch.Tensor) else ref_codes)
             if isinstance(ref_codes, torch.Tensor) and torch.cuda.is_available():
                 ref_codes = ref_codes.to("cuda")
             reference_cache[cache_key] = ref_codes
-    # 5. Infer
     wav = tts.infer(text, ref_codes, ref_text_raw)
-    # 6. Speed Control (CPU Processing)
     if speed_factor != 1.0:
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
             sf.write(tmp.name, wav, 24000)
@@ -149,100 +145,36 @@ def core_synthesize(text, voice_choice, speed_factor):
         if sound_stretched.channels == 2:
             wav = wav.reshape((-1, 2)).mean(axis=1)
         os.unlink(tmp_path)
-    return wav
-@spaces.GPU(duration=120)
-def custom_synthesize_logic(text, ref_audio_path, ref_text_raw):
-    tts = get_tts_model()
-    if torch.cuda.is_available():
-        try:
-            if next(tts.backbone.parameters()).device.type != 'cuda':
-                tts.backbone.to("cuda")
-                tts.codec.to("cuda")
-        except: pass
-    ref_codes = tts.encode_reference(ref_audio_path)
-    wav = tts.infer(text, ref_codes, ref_text_raw)
-    return wav
-# --- API ---
-class FastTTSRequest(BaseModel):
-    text: str
-    voice_choice: str
-    speed_factor: float = 1.0
-    return_base64: bool = False
-@app.get("/voices")
-async def get_voices():
-    return {"voices": list(VOICE_SAMPLES.keys())}
-@app.post("/fast-tts")
-async def fast_tts(request: FastTTSRequest):
-    try:
-        start = time.time()
-        # Gọi hàm GPU
-        wav = core_synthesize(request.text, request.voice_choice, request.speed_factor)
-        process_time = time.time() - start
-        audio_buffer = io.BytesIO()
-        sf.write(audio_buffer, wav, 24000, format='WAV')
-        audio_base64 = base64.b64encode(audio_buffer.getvalue()).decode('utf-8')
-        return {
-            "status": "success",
-            "audio_base64": audio_base64,
-            "processing_time": process_time
-        }
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-# --- GRADIO UI ---
 theme = gr.themes.Soft()
 css = ".container { max-width: 900px; margin: auto; }"
-def ui_synthesize(text, voice, custom_audio, custom_text, mode, speed):
-    try:
-        start = time.time()
-        if mode == "custom_mode":
-            wav = custom_synthesize_logic(text, custom_audio, custom_text)
-        else:
-            wav = core_synthesize(text, voice, speed)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-            sf.write(tmp.name, wav, 24000)
-            path = tmp.name
-        return path, f"✅ Xong! ({time.time()-start:.2f}s)"
-    except Exception as e:
-        return None, f"❌ Lỗi: {e}"
 with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
-    gr.Markdown("# 🎙️ VieNeu-TTS (API + UI)")
     with gr.Row():
         with gr.Column():
-            inp_text = gr.Textbox(label="Văn bản", lines=3, value="Xin chào Việt Nam")
-            with gr.Tabs() as tabs:
-                with gr.TabItem("Giọng mẫu", id="preset_mode"):
-                    inp_voice = gr.Dropdown(list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn giọng")
-                with gr.TabItem("Custom", id="custom_mode"):
-                    inp_audio = gr.Audio(type="filepath")
-                    inp_ref_text = gr.Textbox(label="Lời thoại mẫu")
             inp_speed = gr.Slider(0.5, 2.0, value=1.0, label="Tốc độ")
             btn = gr.Button("Đọc ngay", variant="primary")
         with gr.Column():
             out_audio = gr.Audio(label="Kết quả", autoplay=True)
             out_status = gr.Textbox(label="Trạng thái")
-    mode_state = gr.Textbox(visible=False, value="preset_mode")
-    tabs.children[0].select(lambda: "preset_mode", None, mode_state)
-    tabs.children[1].select(lambda: "custom_mode", None, mode_state)
-    btn.click(ui_synthesize, [inp_text, inp_voice, inp_audio, inp_ref_text, mode_state, inp_speed], [out_audio, out_status])
-# Mount Gradio vào FastAPI
-app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
-    import uvicorn
-    # Mở port 7860 để Hugging Face truy cập
-    uvicorn.run(app, host="0.0.0.0", port=7860)

+import spaces  # <--- BẮT BUỘC DÒNG 1
 import os
 import time
 import threading
 import pickle
 import hashlib
 import tempfile
 import numpy as np
 import soundfile as sf
 from pydub import AudioSegment
 import gradio as gr
 from vieneu_tts import VieNeuTTS
+print("⏳ Đang khởi động Server Gradio...")
+# --- 1. QUẢN LÝ MODEL (Lazy Loading) ---
 tts_model = None
 model_lock = threading.Lock()
+def get_tts_model():
+    """Chỉ tải model khi có người dùng gọi (Tiết kiệm tài nguyên khởi động)"""
+    global tts_model
+    with model_lock:
+        if tts_model is None:
+            print("📦 Đang khởi tạo model lần đầu (Lazy Load)...")
+            # ZeroGPU yêu cầu khởi tạo model trên CPU hoặc trong hàm @spaces.GPU
+            # Ở đây ta khởi tạo trên CPU cho an toàn
+            tts_model = VieNeuTTS(
+                backbone_repo="pnnbao-ump/VieNeu-TTS",
+                backbone_device="cpu",
+                codec_repo="neuphonic/neucodec",
+                codec_device="cpu"
+            )
+            print("✅ Model tải thành công!")
+        return tts_model
+# --- 2. XỬ LÝ CACHE ---
 CACHE_DIR = "./reference_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 reference_cache = {}
         with open(cache_path, 'wb') as f: pickle.dump(ref_codes, f)
     except Exception: pass
+# --- 3. DỮ LIỆU GIỌNG NÓI ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
     "Nhỏ Ngọt Ngào": {"audio": "./sample/Nhỏ Ngọt Ngào.wav", "text": "./sample/Nhỏ Ngọt Ngào.txt"},
 }
+# --- 4. HÀM XỬ LÝ CHÍNH (GPU) ---
+@spaces.GPU(duration=120)
+def generate_speech(text, voice_choice, speed_factor):
+    """
+    Hàm này sẽ được ZeroGPU cấp phát GPU khi chạy.
+    Nó cũng đóng vai trò là API endpoint chính.
+    """
+    start_time = time.time()
+    # 1. Lấy Model (Tải nếu chưa có)
     tts = get_tts_model()
+    # 2. Chuyển Model sang GPU (Chỉ làm trong hàm này)
     if torch.cuda.is_available():
         try:
             if next(tts.backbone.parameters()).device.type != 'cuda':
     # 3. Lấy thông tin giọng
     voice_info = VOICE_SAMPLES.get(voice_choice)
     if not voice_info:
+        # Fallback nếu không tìm thấy giọng
+        voice_choice = "Tuyên (nam miền Bắc)"
+        voice_info = VOICE_SAMPLES[voice_choice]
     ref_audio_path = voice_info["audio"]
     ref_text_path = voice_info["text"]
     with open(ref_text_path, "r", encoding="utf-8") as f:
         ref_text_raw = f.read()
+    # 4. Encode Reference (Có Cache)
     cache_key = f"preset:{voice_choice}"
     with reference_cache_lock:
         if cache_key in reference_cache:
         else:
             ref_codes = load_cache_from_disk(cache_key)
             if ref_codes is None:
+                # Encode
                 ref_codes = tts.encode_reference(ref_audio_path)
                 save_cache_to_disk(cache_key, ref_codes.cpu() if isinstance(ref_codes, torch.Tensor) else ref_codes)
             if isinstance(ref_codes, torch.Tensor) and torch.cuda.is_available():
                 ref_codes = ref_codes.to("cuda")
             reference_cache[cache_key] = ref_codes
+    # 5. Infer (Tạo giọng nói)
     wav = tts.infer(text, ref_codes, ref_text_raw)
+    # 6. Xử lý tốc độ (Speed)
     if speed_factor != 1.0:
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
             sf.write(tmp.name, wav, 24000)
         if sound_stretched.channels == 2:
             wav = wav.reshape((-1, 2)).mean(axis=1)
         os.unlink(tmp_path)
+    # 7. Lưu file kết quả
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+        sf.write(tmp_file.name, wav, 24000)
+        output_path = tmp_file.name
+    return output_path, f"✅ Hoàn tất ({time.time() - start_time:.2f}s)"
+# --- 5. GIAO DIỆN GRADIO ---
 theme = gr.themes.Soft()
 css = ".container { max-width: 900px; margin: auto; }"
 with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
+    gr.Markdown("# 🎙️ VieNeu-TTS (ZeroGPU)")
     with gr.Row():
         with gr.Column():
+            inp_text = gr.Textbox(label="Văn bản", lines=3, value="Xin chào Việt Nam, đây là thử nghiệm giọng nói.")
+            inp_voice = gr.Dropdown(list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn giọng")
             inp_speed = gr.Slider(0.5, 2.0, value=1.0, label="Tốc độ")
             btn = gr.Button("Đọc ngay", variant="primary")
         with gr.Column():
             out_audio = gr.Audio(label="Kết quả", autoplay=True)
             out_status = gr.Textbox(label="Trạng thái")
+    # Map function vào button
+    btn.click(generate_speech, [inp_text, inp_voice, inp_speed], [out_audio, out_status])
+# --- 6. KHỞI CHẠY ---
 if __name__ == "__main__":
+    # Dùng demo.launch() chuẩn để ZeroGPU nhận diện được
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", server_port=7860)