Spaces: Running on Zero
Update gradio_app.py

gradio_app.py  CHANGED  (+63 -61)
@@ -1,4 +1,4 @@
-import spaces
+import spaces  # <--- MUST be line 1
 import os
 import time
 import threading
@@ -9,7 +9,7 @@ import io
 import tempfile
 import numpy as np
 
-#
+# Other libraries
 import torch
 import soundfile as sf
 from pydub import AudioSegment
@@ -18,16 +18,13 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from vieneu_tts import VieNeuTTS
 
-# --- INITIALIZATION
+# --- INITIALIZATION ---
 app = FastAPI()
+print("⏳ Đang khởi động Server...")
 
-
-
-
-# IMPORTANT: on ZeroGPU, startup MUST use the CPU
-# The GPU is only activated inside functions decorated with @spaces.GPU
-device = "cpu"
-print(f"🖥️ Thiết bị khởi động (Global): {device.upper()} (Sẽ chuyển sang CUDA khi chạy)")
+# Global variable holding the model (lazy load)
+tts_model = None
+model_lock = threading.Lock()
 
 # Cache
 CACHE_DIR = "./reference_cache"
@@ -53,21 +50,30 @@ def save_cache_to_disk(cache_key, ref_codes):
         with open(cache_path, 'wb') as f: pickle.dump(ref_codes, f)
     except Exception: pass
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# --- HELPER: LOAD THE MODEL SAFELY ---
+def get_tts_model():
+    """Load the model only the first time this is called (lazy load)."""
+    global tts_model
+    with model_lock:
+        if tts_model is None:
+            print("📦 Đang khởi tạo model lần đầu (Lazy Load)...")
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f" 🖥️ Device: {device}")
+            try:
+                # Load the model
+                tts_model = VieNeuTTS(
+                    backbone_repo="pnnbao-ump/VieNeu-TTS",
+                    backbone_device=device,
+                    codec_repo="neuphonic/neucodec",
+                    codec_device=device
+                )
+                print(" ✅ Model tải thành công!")
+            except Exception as e:
+                print(f" ❌ Lỗi tải model: {e}")
+                raise e
+    return tts_model
+
+# --- DATA ---
 VOICE_SAMPLES = {
     "Tuyên (nam miền Bắc)": {"audio": "./sample/Tuyên (nam miền Bắc).wav", "text": "./sample/Tuyên (nam miền Bắc).txt"},
     "Vĩnh (nam miền Nam)": {"audio": "./sample/Vĩnh (nam miền Nam).wav", "text": "./sample/Vĩnh (nam miền Nam).txt"},
@@ -81,31 +87,22 @@ VOICE_SAMPLES = {
     "Nhỏ Ngọt Ngào": {"audio": "./sample/Nhỏ Ngọt Ngào.wav", "text": "./sample/Nhỏ Ngọt Ngào.txt"},
 }
 
-# ---
+# --- CORE LOGIC (DECORATED WITH @spaces.GPU) ---
 
-
-
+@spaces.GPU(duration=120)  # increase the timeout to 120s for the first model load
+def core_synthesize(text, voice_choice, speed_factor):
+    # 1. Get the model (loads it if not yet created)
+    tts = get_tts_model()
+
+    # 2. Make sure the model is on the right device (GPU)
     if torch.cuda.is_available():
-        # Check whether the model is already on the GPU to avoid a redundant move
-        # VieNeuTTS stores the model in self.backbone and self.codec
         try:
-            # Move backbone
             if next(tts.backbone.parameters()).device.type != 'cuda':
-                print(" 🚀 Moving model to GPU...")
                 tts.backbone.to("cuda")
-
-            # Move codec
-            if next(tts.codec.parameters()).device.type != 'cuda':
                 tts.codec.to("cuda")
-        except Exception as e:
-            print(f"⚠️ Lỗi khi move model sang GPU: {e}")
+        except: pass
 
-
-def core_synthesize(text, voice_choice, speed_factor):
-    # 1. Move the model to the GPU (only do this inside the @spaces.GPU function)
-    move_model_to_cuda()
-
-    # 2. Get the voice info
+    # 3. Get the voice info
     voice_info = VOICE_SAMPLES.get(voice_choice)
    if not voice_info:
         raise ValueError("Giọng không tồn tại")
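A note on the pattern introduced in the hunks above: on ZeroGPU the process starts without a GPU attached, so nothing heavy may run at import time; spaces is imported first, the model is built lazily behind a lock, and a GPU is only requested while a function decorated with @spaces.GPU is running. A minimal, self-contained sketch of that recipe (the Linear model and the run/_get_model names are placeholders, not code from this commit):

import spaces              # must be imported before anything initializes CUDA
import threading
import torch

_model = None
_lock = threading.Lock()

def _get_model():
    # Lazy, thread-safe singleton: nothing heavy runs at import time, so the
    # Space boots on CPU and pays the load cost only on the first request.
    global _model
    with _lock:
        if _model is None:
            _model = torch.nn.Linear(4, 4)   # stand-in for the real TTS model
    return _model

@spaces.GPU(duration=120)   # ZeroGPU attaches a GPU only while this function runs
def run(x):
    model = _get_model()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    with torch.no_grad():
        return model(x.to(device)).cpu()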
@@ -116,38 +113,38 @@ def core_synthesize(text, voice_choice, speed_factor):
     with open(ref_text_path, "r", encoding="utf-8") as f:
         ref_text_raw = f.read()
 
-    #
+    # 4. Encode Reference
     cache_key = f"preset:{voice_choice}"
     with reference_cache_lock:
         if cache_key in reference_cache:
             ref_codes = reference_cache[cache_key]
-
-            if isinstance(ref_codes, torch.Tensor):
+            if isinstance(ref_codes, torch.Tensor) and torch.cuda.is_available():
                 ref_codes = ref_codes.to("cuda")
         else:
             ref_codes = load_cache_from_disk(cache_key)
             if ref_codes is None:
-                ref_codes = tts.encode_reference(ref_audio_path)
-                #
+                ref_codes = tts.encode_reference(ref_audio_path)
+                # Cache on the CPU
                 save_cache_to_disk(cache_key, ref_codes.cpu() if isinstance(ref_codes, torch.Tensor) else ref_codes)
 
-
-            if isinstance(ref_codes, torch.Tensor):
+            if isinstance(ref_codes, torch.Tensor) and torch.cuda.is_available():
                 ref_codes = ref_codes.to("cuda")
             reference_cache[cache_key] = ref_codes
 
-    #
+    # 5. Infer
     wav = tts.infer(text, ref_codes, ref_text_raw)
 
-    #
+    # 6. Speed Control (CPU Processing)
     if speed_factor != 1.0:
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
             sf.write(tmp.name, wav, 24000)
             tmp_path = tmp.name
+
         sound = AudioSegment.from_wav(tmp_path)
         new_frame_rate = int(sound.frame_rate * speed_factor)
         sound_stretched = sound._spawn(sound.raw_data, overrides={'frame_rate': new_frame_rate})
         sound_stretched = sound_stretched.set_frame_rate(24000)
+
         wav = np.array(sound_stretched.get_array_of_samples()).astype(np.float32) / 32768.0
         if sound_stretched.channels == 2:
             wav = wav.reshape((-1, 2)).mean(axis=1)
@@ -155,17 +152,21 @@ def core_synthesize(text, voice_choice, speed_factor):
 
     return wav
 
-@spaces.GPU
+@spaces.GPU(duration=120)
 def custom_synthesize_logic(text, ref_audio_path, ref_text_raw):
-
-
-
-
+    tts = get_tts_model()
+    if torch.cuda.is_available():
+        try:
+            if next(tts.backbone.parameters()).device.type != 'cuda':
+                tts.backbone.to("cuda")
+                tts.codec.to("cuda")
+        except: pass
+
     ref_codes = tts.encode_reference(ref_audio_path)
     wav = tts.infer(text, ref_codes, ref_text_raw)
     return wav
 
-# ---
+# --- API ---
 class FastTTSRequest(BaseModel):
     text: str
     voice_choice: str
@@ -180,11 +181,10 @@ async def get_voices():
 async def fast_tts(request: FastTTSRequest):
     try:
         start = time.time()
-        # Call the function
+        # Call the GPU function
        wav = core_synthesize(request.text, request.voice_choice, request.speed_factor)
         process_time = time.time() - start
 
-        # Base64
         audio_buffer = io.BytesIO()
         sf.write(audio_buffer, wav, 24000, format='WAV')
         audio_base64 = base64.b64encode(audio_buffer.getvalue()).decode('utf-8')
@@ -197,7 +197,7 @@ async def fast_tts(request: FastTTSRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
-# ---
+# --- GRADIO UI ---
 theme = gr.themes.Soft()
 css = ".container { max-width: 900px; margin: auto; }"
 
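A side note on the speed control added to core_synthesize above: the raw samples are re-declared at speed_factor times the original frame rate and then resampled back to 24 kHz, which changes the duration but also shifts the pitch by the same ratio. A stand-alone sketch of the same idea (change_speed is an illustrative name, not a function from this repo):

from pydub import AudioSegment

def change_speed(wav_path, factor):
    sound = AudioSegment.from_wav(wav_path)
    # Same samples, declared at factor * frame_rate: playback runs faster or slower.
    shifted = sound._spawn(sound.raw_data,
                           overrides={"frame_rate": int(sound.frame_rate * factor)})
    # Resample back to the original rate so downstream code still sees 24 kHz audio.
    return shifted.set_frame_rate(sound.frame_rate)

A pitch-preserving change of speed would need a real time-stretch algorithm such as librosa.effects.time_stretch instead.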
@@ -239,8 +239,10 @@ with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
     tabs.children[1].select(lambda: "custom_mode", None, mode_state)
     btn.click(ui_synthesize, [inp_text, inp_voice, inp_audio, inp_ref_text, mode_state, inp_speed], [out_audio, out_status])
 
+# Mount Gradio into FastAPI
 app = gr.mount_gradio_app(app, demo, path="/")
 
 if __name__ == "__main__":
     import uvicorn
+    # Open port 7860 so Hugging Face can reach the app
     uvicorn.run(app, host="0.0.0.0", port=7860)
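For reference, a hypothetical client call against the JSON endpoint handled by fast_tts. The route path and the name of the base64 field in the response are not visible in this diff, so "/fast_tts" and "audio_base64" below are assumptions to check against the full file; text and voice_choice are declared on FastTTSRequest, and speed_factor is read from the request inside fast_tts.

import base64
import requests

resp = requests.post(
    "http://localhost:7860/fast_tts",          # assumed route; see the @app.post decorator
    json={
        "text": "Xin chào",
        "voice_choice": "Tuyên (nam miền Bắc)",
        "speed_factor": 1.0,
    },
    timeout=300,
)
resp.raise_for_status()
audio_b64 = resp.json()["audio_base64"]        # assumed response field name
with open("output.wav", "wb") as f:
    f.write(base64.b64decode(audio_b64))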
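One more detail worth keeping in mind from the caching hunk: reference codes are moved to the CPU before being pickled (ref_codes.cpu() in the call to save_cache_to_disk) and only moved back to CUDA inside the GPU function, because a plain-pickled CUDA tensor cannot be loaded in a process that has no GPU, which is exactly the situation at ZeroGPU startup. A minimal sketch of that save/load pattern under the same pickle-on-disk assumption; save_codes and load_codes are illustrative names (the file's own helpers are save_cache_to_disk and load_cache_from_disk):

import os
import pickle
import torch

CACHE_DIR = "./reference_cache"

def save_codes(key, codes):
    os.makedirs(CACHE_DIR, exist_ok=True)
    if isinstance(codes, torch.Tensor):
        codes = codes.cpu()                       # never pickle CUDA tensors
    with open(os.path.join(CACHE_DIR, f"{key}.pkl"), "wb") as f:
        pickle.dump(codes, f)

def load_codes(key, device="cpu"):
    path = os.path.join(CACHE_DIR, f"{key}.pkl")
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        codes = pickle.load(f)
    if isinstance(codes, torch.Tensor):
        codes = codes.to(device)                  # move to CUDA only at use time
    return codes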