Spaces:

nhantrungsp
/

FSub

Running on Zero

App Files Files Community

nhantrungsp committed 1 day ago

Commit

d593d54

verified ·

1 Parent(s): d46de93

Update gradio_app.py

Browse files

Files changed (1) hide show

gradio_app.py +58 -46

gradio_app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import spaces  # <--- QUAN TRỌNG: PHẢI ĐỂ DÒNG ĐẦU TIÊN
 import os
 import time
 import threading
@@ -9,15 +9,13 @@ import io
 import tempfile
 import numpy as np
-# Các thư viện khác import sau spaces
 import torch
 import soundfile as sf
 from pydub import AudioSegment
 import gradio as gr
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-# Import thư viện nội bộ
 from vieneu_tts import VieNeuTTS
 # --- KHỞI TẠO FASTAPI ---
@@ -25,9 +23,11 @@ app = FastAPI()
 print("⏳ Đang khởi động VieNeu-TTS...")
-# --- 1. SETUP MODEL ---
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"🖥️ Sử dụng thiết bị (Global): {device.upper()}")
 # Cache
 CACHE_DIR = "./reference_cache"
@@ -35,7 +35,6 @@ os.makedirs(CACHE_DIR, exist_ok=True)
 reference_cache = {}
 reference_cache_lock = threading.Lock()
-# Hàm Cache Helper
 def get_cache_path(cache_key):
     # Build the pickle path for a cache key: the file name is the MD5 hex
     # digest of the key plus ".pkl", placed under CACHE_DIR.
     key_hash = hashlib.md5(cache_key.encode()).hexdigest()
     return os.path.join(CACHE_DIR, f"{key_hash}.pkl")
@@ -54,16 +53,16 @@ def save_cache_to_disk(cache_key, ref_codes):
         with open(cache_path, 'wb') as f: pickle.dump(ref_codes, f)
     except Exception: pass
-# Load Model
 try:
-    print("📦 Đang tải model vào bộ nhớ...")
     tts = VieNeuTTS(
         backbone_repo="pnnbao-ump/VieNeu-TTS",
-        backbone_device=device,
         codec_repo="neuphonic/neucodec",
-        codec_device=device
     )
-    print("✅ Model đã tải xong!")
 except Exception as e:
     print(f"⚠️ Lỗi tải model: {e}")
     tts = None
@@ -82,12 +81,31 @@ VOICE_SAMPLES = {
     "Nhỏ Ngọt Ngào": {"audio": "./sample/Nhỏ Ngọt Ngào.wav", "text": "./sample/Nhỏ Ngọt Ngào.txt"},
 }
-# --- 3. CORE LOGIC (Dùng chung cho cả API và UI) ---
-# QUAN TRỌNG: Decorator GPU
 @spaces.GPU
 def core_synthesize(text, voice_choice, speed_factor):
-    # Lấy thông tin giọng
     voice_info = VOICE_SAMPLES.get(voice_choice)
     if not voice_info:
         raise ValueError("Giọng không tồn tại")
@@ -95,41 +113,41 @@ def core_synthesize(text, voice_choice, speed_factor):
     ref_audio_path = voice_info["audio"]
     ref_text_path = voice_info["text"]
-    # Load reference text
     with open(ref_text_path, "r", encoding="utf-8") as f:
         ref_text_raw = f.read()
-    # Encode reference (Cache logic)
     cache_key = f"preset:{voice_choice}"
     with reference_cache_lock:
         if cache_key in reference_cache:
             ref_codes = reference_cache[cache_key]
         else:
             ref_codes = load_cache_from_disk(cache_key)
             if ref_codes is None:
-                # Đảm bảo dọn dẹp bộ nhớ trước khi encode
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                ref_codes = tts.encode_reference(ref_audio_path)
-                save_cache_to_disk(cache_key, ref_codes)
             reference_cache[cache_key] = ref_codes
-    # Infer
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
     wav = tts.infer(text, ref_codes, ref_text_raw)
-    # Speed
     if speed_factor != 1.0:
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
             sf.write(tmp.name, wav, 24000)
             tmp_path = tmp.name
         sound = AudioSegment.from_wav(tmp_path)
         new_frame_rate = int(sound.frame_rate * speed_factor)
         sound_stretched = sound._spawn(sound.raw_data, overrides={'frame_rate': new_frame_rate})
         sound_stretched = sound_stretched.set_frame_rate(24000)
         wav = np.array(sound_stretched.get_array_of_samples()).astype(np.float32) / 32768.0
         if sound_stretched.channels == 2:
             wav = wav.reshape((-1, 2)).mean(axis=1)
@@ -137,16 +155,17 @@ def core_synthesize(text, voice_choice, speed_factor):
     return wav
-# The custom-voice path has its own function, which also needs the GPU
 @spaces.GPU
 def custom_synthesize_logic(text, ref_audio_path, ref_text_raw):
     # Encode the caller-supplied reference audio, then synthesize `text` from
     # those codes and the reference transcript; returns tts.infer's result.
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
     ref_codes = tts.encode_reference(ref_audio_path)
     wav = tts.infer(text, ref_codes, ref_text_raw)
     return wav
-# --- 4. API ENDPOINTS (Cho Client App kết nối) ---
 class FastTTSRequest(BaseModel):
     text: str
     voice_choice: str
@@ -161,15 +180,14 @@ async def get_voices():
 async def fast_tts(request: FastTTSRequest):
     try:
         start = time.time()
-        # Gọi hàm đã được decorate @spaces.GPU
         wav = core_synthesize(request.text, request.voice_choice, request.speed_factor)
         process_time = time.time() - start
-        # Convert to Base64
         audio_buffer = io.BytesIO()
         sf.write(audio_buffer, wav, 24000, format='WAV')
-        audio_bytes = audio_buffer.getvalue()
-        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
         return {
             "status": "success",
@@ -179,7 +197,7 @@ async def fast_tts(request: FastTTSRequest):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
-# --- 5. GRADIO UI SETUP ---
 theme = gr.themes.Soft()
 css = ".container { max-width: 900px; margin: auto; }"
@@ -190,7 +208,7 @@ def ui_synthesize(text, voice, custom_audio, custom_text, mode, speed):
             wav = custom_synthesize_logic(text, custom_audio, custom_text)
         else:
             wav = core_synthesize(text, voice, speed)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             sf.write(tmp.name, wav, 24000)
             path = tmp.name
@@ -204,17 +222,14 @@ with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
     with gr.Row():
         with gr.Column():
             inp_text = gr.Textbox(label="Văn bản", lines=3, value="Xin chào Việt Nam")
             with gr.Tabs() as tabs:
                 with gr.TabItem("Giọng mẫu", id="preset_mode"):
                     inp_voice = gr.Dropdown(list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn giọng")
                 with gr.TabItem("Custom", id="custom_mode"):
                     inp_audio = gr.Audio(type="filepath")
                     inp_ref_text = gr.Textbox(label="Lời thoại mẫu")
             inp_speed = gr.Slider(0.5, 2.0, value=1.0, label="Tốc độ")
             btn = gr.Button("Đọc ngay", variant="primary")
         with gr.Column():
             out_audio = gr.Audio(label="Kết quả", autoplay=True)
             out_status = gr.Textbox(label="Trạng thái")
@@ -222,13 +237,10 @@ with gr.Blocks(theme=theme, css=css, title="VieNeu-TTS") as demo:
     mode_state = gr.Textbox(visible=False, value="preset_mode")
     tabs.children[0].select(lambda: "preset_mode", None, mode_state)
     tabs.children[1].select(lambda: "custom_mode", None, mode_state)
     btn.click(ui_synthesize, [inp_text, inp_voice, inp_audio, inp_ref_text, mode_state, inp_speed], [out_audio, out_status])
-# --- 6. MOUNT GRADIO VÀO FASTAPI ---
 app = gr.mount_gradio_app(app, demo, path="/")
-# --- 7. RUN THE SERVER ---
 if __name__ == "__main__":
     # uvicorn is imported only when the file is run as a script.
     import uvicorn
     # Serve `app` on all interfaces, port 7860.
     uvicorn.run(app, host="0.0.0.0", port=7860)

+import spaces  # <--- LUÔN ĐỂ ĐẦU TIÊN
 import os
 import time
 import threading
 import tempfile
 import numpy as np
+# Import the remaining libraries
 import torch
 import soundfile as sf
 from pydub import AudioSegment
 import gradio as gr
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from vieneu_tts import VieNeuTTS
 # --- KHỞI TẠO FASTAPI ---
 print("⏳ Đang khởi động VieNeu-TTS...")
+# --- 1. MODEL SETUP (REVISED FOR ZEROGPU) ---
+# IMPORTANT: on ZeroGPU, startup MUST use the CPU.
+# The GPU is only activated inside functions decorated with @spaces.GPU.
+device = "cpu"
+print(f"🖥️ Thiết bị khởi động (Global): {device.upper()} (Sẽ chuyển sang CUDA khi chạy)")
 # Cache
 CACHE_DIR = "./reference_cache"
 reference_cache = {}
 reference_cache_lock = threading.Lock()
 def get_cache_path(cache_key):
     """Map a cache key to its pickle file inside ``CACHE_DIR``.

     The file name is the MD5 hex digest of the key followed by ``.pkl``.
     """
     digest = hashlib.md5(cache_key.encode()).hexdigest()
     filename = f"{digest}.pkl"
     return os.path.join(CACHE_DIR, filename)
         with open(cache_path, 'wb') as f: pickle.dump(ref_codes, f)
     except Exception: pass
+# Load the model onto the CPU first
 try:
+    print("📦 Đang tải model vào RAM (CPU)...")
     tts = VieNeuTTS(
         backbone_repo="pnnbao-ump/VieNeu-TTS",
+        backbone_device="cpu", # must be CPU
         codec_repo="neuphonic/neucodec",
+        codec_device="cpu"     # must be CPU
     )
+    print("✅ Model đã tải xong (Ready on CPU)!")
 except Exception as e:
     # On failure, report the error and leave `tts` as None.
     print(f"⚠️ Lỗi tải model: {e}")
     tts = None
     "Nhỏ Ngọt Ngào": {"audio": "./sample/Nhỏ Ngọt Ngào.wav", "text": "./sample/Nhỏ Ngọt Ngào.txt"},
 }
+# --- 3. CORE LOGIC (ZeroGPU Optimization) ---
+def move_model_to_cuda():
+    """Hàm helper để đẩy model sang GPU khi cần"""
+    if torch.cuda.is_available():
+        # Kiểm tra xem model đã ở trên GPU chưa để tránh move thừa
+        # VieNeuTTS lưu model trong self.backbone và self.codec
+        try:
+            # Move backbone
+            if next(tts.backbone.parameters()).device.type != 'cuda':
+                print("   🚀 Moving model to GPU...")
+                tts.backbone.to("cuda")
+            # Move codec
+            if next(tts.codec.parameters()).device.type != 'cuda':
+                tts.codec.to("cuda")
+        except Exception as e:
+            print(f"⚠️ Lỗi khi move model sang GPU: {e}")
 @spaces.GPU
 def core_synthesize(text, voice_choice, speed_factor):
     """Synthesize ``text`` with a preset voice and return the waveform.

     Args:
         text: Text to speak.
         voice_choice: Key into ``VOICE_SAMPLES`` selecting the reference voice.
         speed_factor: Playback-speed multiplier; ``1.0`` leaves the audio as
             produced by the model.

     Returns:
         The waveform returned by ``tts.infer`` or, when ``speed_factor`` is
         not ``1.0``, a mono float32 NumPy array at 24 kHz.

     Raises:
         RuntimeError: If the model failed to load at startup (``tts`` is None).
         ValueError: If ``voice_choice`` is unknown or ``speed_factor`` is not
             positive.
     """
     # Fail with a clear message instead of an AttributeError on None.
     if tts is None:
         raise RuntimeError("Model chưa được tải")
     # A zero or negative factor would produce an invalid frame rate below.
     if speed_factor <= 0:
         raise ValueError("Tốc độ phải lớn hơn 0")
     # 1. Move the model to the GPU (only done inside an @spaces.GPU function).
     move_model_to_cuda()
     # Mirror move_model_to_cuda(): stay on the CPU when CUDA is unavailable
     # rather than unconditionally calling .to("cuda").
     target_device = "cuda" if torch.cuda.is_available() else "cpu"
     # 2. Look up the preset voice.
     voice_info = VOICE_SAMPLES.get(voice_choice)
     if not voice_info:
         raise ValueError("Giọng không tồn tại")
     ref_audio_path = voice_info["audio"]
     ref_text_path = voice_info["text"]
     with open(ref_text_path, "r", encoding="utf-8") as f:
         ref_text_raw = f.read()
     # 3. Get the reference codes: memory cache, then disk cache, then encode.
     cache_key = f"preset:{voice_choice}"
     with reference_cache_lock:
         if cache_key in reference_cache:
             ref_codes = reference_cache[cache_key]
         else:
             ref_codes = load_cache_from_disk(cache_key)
             if ref_codes is None:
                 ref_codes = tts.encode_reference(ref_audio_path)
                 # Persist a CPU copy so the pickle does not hold a GPU tensor.
                 save_cache_to_disk(cache_key, ref_codes.cpu() if isinstance(ref_codes, torch.Tensor) else ref_codes)
         # Make sure tensor codes sit on the device used for inference.
         if isinstance(ref_codes, torch.Tensor):
             ref_codes = ref_codes.to(target_device)
         reference_cache[cache_key] = ref_codes
     # 4. Inference.
     wav = tts.infer(text, ref_codes, ref_text_raw)
     # 5. Speed control (CPU).
     if speed_factor != 1.0:
         # Reserve a temp path; the handle is closed before writing to it.
         with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
             tmp_path = tmp.name
         try:
             sf.write(tmp_path, wav, 24000)
             sound = AudioSegment.from_wav(tmp_path)
         finally:
             # delete=False means nothing else removes this file.
             os.remove(tmp_path)
         # Reinterpret the same samples at a scaled frame rate, then resample
         # back to 24 kHz so the output rate stays fixed.
         new_frame_rate = int(sound.frame_rate * speed_factor)
         sound_stretched = sound._spawn(sound.raw_data, overrides={'frame_rate': new_frame_rate})
         sound_stretched = sound_stretched.set_frame_rate(24000)
         # Convert 16-bit integer samples to float32 in [-1, 1).
         wav = np.array(sound_stretched.get_array_of_samples()).astype(np.float32) / 32768.0
         if sound_stretched.channels == 2:
             # Down-mix interleaved stereo to mono.
             wav = wav.reshape((-1, 2)).mean(axis=1)
     return wav
 @spaces.GPU
 def custom_synthesize_logic(text, ref_audio_path, ref_text_raw):
     """Synthesize ``text`` using a caller-supplied reference voice.

     Moves the model to the GPU, encodes ``ref_audio_path`` into reference
     codes, and returns the result of ``tts.infer`` for ``text`` with those
     codes and the reference transcript ``ref_text_raw``.
     """
     move_model_to_cuda()
     reference_codes = tts.encode_reference(ref_audio_path)
     return tts.infer(text, reference_codes, ref_text_raw)
+# --- 4. API ---
 class FastTTSRequest(BaseModel):
     text: str
     voice_choice: str
 async def fast_tts(request: FastTTSRequest):
     try:
         start = time.time()
+        # Gọi hàm đã decorate
         wav = core_synthesize(request.text, request.voice_choice, request.speed_factor)
         process_time = time.time() - start
+        # Base64
         audio_buffer = io.BytesIO()
         sf.write(audio_buffer, wav, 24000, format='WAV')
+        audio_base64 = base64.b64encode(audio_buffer.getvalue()).decode('utf-8')
         return {
             "status": "success",
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+# --- 5. UI ---
 theme = gr.themes.Soft()
 css = ".container { max-width: 900px; margin: auto; }"
             wav = custom_synthesize_logic(text, custom_audio, custom_text)
         else:
             wav = core_synthesize(text, voice, speed)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             sf.write(tmp.name, wav, 24000)
             path = tmp.name
     with gr.Row():
         with gr.Column():
             inp_text = gr.Textbox(label="Văn bản", lines=3, value="Xin chào Việt Nam")
             with gr.Tabs() as tabs:
                 with gr.TabItem("Giọng mẫu", id="preset_mode"):
                     inp_voice = gr.Dropdown(list(VOICE_SAMPLES.keys()), value="Tuyên (nam miền Bắc)", label="Chọn giọng")
                 with gr.TabItem("Custom", id="custom_mode"):
                     inp_audio = gr.Audio(type="filepath")
                     inp_ref_text = gr.Textbox(label="Lời thoại mẫu")
             inp_speed = gr.Slider(0.5, 2.0, value=1.0, label="Tốc độ")
             btn = gr.Button("Đọc ngay", variant="primary")
         with gr.Column():
             out_audio = gr.Audio(label="Kết quả", autoplay=True)
             out_status = gr.Textbox(label="Trạng thái")
     mode_state = gr.Textbox(visible=False, value="preset_mode")
     tabs.children[0].select(lambda: "preset_mode", None, mode_state)
     tabs.children[1].select(lambda: "custom_mode", None, mode_state)
     btn.click(ui_synthesize, [inp_text, inp_voice, inp_audio, inp_ref_text, mode_state, inp_speed], [out_audio, out_status])
 app = gr.mount_gradio_app(app, demo, path="/")
 if __name__ == "__main__":
     # uvicorn is imported only when the file is run as a script.
     import uvicorn
     # Serve `app` (FastAPI with the Gradio UI mounted at "/") on all
     # interfaces, port 7860.
     uvicorn.run(app, host="0.0.0.0", port=7860)