Character_Based_AI_Paper_Tutor_audio

Runtime error

App Files Files Community

LappyundTexas committed on Feb 27

Commit

bc2252a

verified ·

1 Parent(s): 11a961b

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -42

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import re
 import zipfile
 from pathlib import Path
 import numpy as np
 import soundfile as sf
 import gradio as gr
 import torch
 from qwen_tts import Qwen3TTSModel
 ASSETS_DIR = Path("assets")
@@ -19,64 +21,86 @@ FEMALE_REF_TXT = ASSETS_DIR / "female_ref.txt"
 TMP_DIR = Path("tmp_outputs")
 TMP_DIR.mkdir(parents=True, exist_ok=True)
 def read_text(path: Path) -> str:
     return path.read_text(encoding="utf-8").strip()
-def load_model():
-    # Zero GPU typically provides a CUDA GPU when the Space is running.
-    # Use bfloat16 on GPU to reduce memory.
-    use_cuda = torch.cuda.is_available()
     return Qwen3TTSModel.from_pretrained(
         "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-        device_map="cuda:0" if use_cuda else "cpu",
-        dtype=torch.bfloat16 if use_cuda else torch.float32,
-        # If you later confirm that flash-attn is available, you can add: attn_implementation="flash_attention_2"
     )
-MODEL = load_model()
-def build_prompt(ref_wav: Path, ref_txt: Path):
-    if not ref_wav.exists():
-        raise RuntimeError(f"Missing {ref_wav}. Please upload it to assets/.")
-    if not ref_txt.exists():
-        raise RuntimeError(f"Missing {ref_txt}. Please upload it to assets/.")
-    ref_text = read_text(ref_txt)
-    # Prompt caching in memory only (Zero GPU has no persistent storage)
-    prompt = MODEL.create_voice_clone_prompt(
-        ref_audio=str(ref_wav),
-        ref_text=ref_text,
-        x_vector_only_mode=False,
-    )
-    return prompt
-# Build prompts at startup (one-time per container lifetime)
-MALE_PROMPT = build_prompt(MALE_REF_WAV, MALE_REF_TXT)
-FEMALE_PROMPT = build_prompt(FEMALE_REF_WAV, FEMALE_REF_TXT)
 def chunk_text(text: str, max_chars: int = 500):
-    """
-    Split long text into chunks suitable for TTS.
-    - split by blank lines
-    - then split by sentence boundaries (. ! ?)
-    - keep each chunk <= max_chars (hard cut if needed)
-    """
     text = text.strip()
     if not text:
         return []
     text = re.sub(r"\r\n", "\n", text)
     paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
     sent_split = re.compile(r"(?<=[\.\!\?])\s+")
-    chunks = []
     for p in paras:
         sents = sent_split.split(p)
         buf = ""
@@ -89,7 +113,6 @@ def chunk_text(text: str, max_chars: int = 500):
             else:
                 if buf:
                     chunks.append(buf)
-                # if one sentence is too long, hard cut
                 while len(s) > max_chars:
                     chunks.append(s[:max_chars])
                     s = s[max_chars:]
@@ -100,13 +123,24 @@ def chunk_text(text: str, max_chars: int = 500):
     return chunks
 def synthesize(text: str, voice: str, max_chars: int):
-    prompt = MALE_PROMPT if voice == "male" else FEMALE_PROMPT
     parts = chunk_text(text, max_chars=max_chars)
     if not parts:
-        raise gr.Error("Empty text.")
-    # create a per-request folder under tmp_outputs
     run_id = str(abs(hash((voice, text))) % (10**12))
     run_dir = TMP_DIR / run_id
     chunks_dir = run_dir / "chunks"
@@ -117,7 +151,7 @@ def synthesize(text: str, voice: str, max_chars: int):
     sr_out = None
     for i, t in enumerate(parts, start=1):
-        wavs, sr = MODEL.generate_voice_clone(
             text=t,
             language="English",
             voice_clone_prompt=prompt,
@@ -146,7 +180,11 @@ def synthesize(text: str, voice: str, max_chars: int):
 with gr.Blocks() as demo:
-    gr.Markdown("# Paper Reading TTS (Zero GPU dev)\nTwo fixed cloned voices (male/female). Returns WAV.")
     text_in = gr.Textbox(label="Text", lines=10, placeholder="Paste paper summary/paragraphs here...")
     voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
@@ -164,4 +202,5 @@ with gr.Blocks() as demo:
         api_name="/tts",
     )
-demo.queue().launch()

 import re
 import zipfile
 from pathlib import Path
+import threading
 import numpy as np
 import soundfile as sf
 import gradio as gr
 import torch
+import spaces  # ✅ required for ZeroGPU
 from qwen_tts import Qwen3TTSModel
 ASSETS_DIR = Path("assets")
 TMP_DIR = Path("tmp_outputs")
 TMP_DIR.mkdir(parents=True, exist_ok=True)
+# ----------------------------
+# Global caches (per container)
+# ----------------------------
+_MODEL = None
+_MALE_PROMPT = None
+_FEMALE_PROMPT = None
+_CACHE_LOCK = threading.Lock()
 def read_text(path: Path) -> str:
     return path.read_text(encoding="utf-8").strip()
+def _load_model_cpu_only():
+    """
+    Load model on CPU WITHOUT touching CUDA.
+    This is safe to call at startup if you ever need it (we won't).
+    """
     return Qwen3TTSModel.from_pretrained(
         "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+        device_map="cpu",
+        dtype=torch.float32,
     )
+def _ensure_assets_exist():
+    for p in [MALE_REF_WAV, MALE_REF_TXT, FEMALE_REF_WAV, FEMALE_REF_TXT]:
+        if not p.exists():
+            raise RuntimeError(f"Missing {p}. Please upload it to assets/.")
+def _ensure_model_and_prompts(device: str):
+    """
+    Ensure model and prompts are loaded/cached.
+    Must be called INSIDE a @spaces.GPU function so CUDA is available when device='cuda'.
+    """
+    global _MODEL, _MALE_PROMPT, _FEMALE_PROMPT
+    _ensure_assets_exist()
+    with _CACHE_LOCK:
+        if _MODEL is None:
+            # device is either 'cuda' or 'cpu'
+            dtype = torch.bfloat16 if device == "cuda" else torch.float32
+            device_map = "cuda:0" if device == "cuda" else "cpu"
+            _MODEL = Qwen3TTSModel.from_pretrained(
+                "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+                device_map=device_map,
+                dtype=dtype,
+                # Enable this only after confirming flash-attn works in this environment (force-installing it on ZeroGPU is generally not recommended)
+                # attn_implementation="flash_attention_2",
+            )
+        # Prompts depend on model; cache them too
+        if _MALE_PROMPT is None:
+            _MALE_PROMPT = _MODEL.create_voice_clone_prompt(
+                ref_audio=str(MALE_REF_WAV),
+                ref_text=read_text(MALE_REF_TXT),
+                x_vector_only_mode=False,
+            )
+        if _FEMALE_PROMPT is None:
+            _FEMALE_PROMPT = _MODEL.create_voice_clone_prompt(
+                ref_audio=str(FEMALE_REF_WAV),
+                ref_text=read_text(FEMALE_REF_TXT),
+                x_vector_only_mode=False,
+            )
 def chunk_text(text: str, max_chars: int = 500):
     text = text.strip()
     if not text:
         return []
     text = re.sub(r"\r\n", "\n", text)
     paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
     sent_split = re.compile(r"(?<=[\.\!\?])\s+")
+    chunks = []
     for p in paras:
         sents = sent_split.split(p)
         buf = ""
             else:
                 if buf:
                     chunks.append(buf)
                 while len(s) > max_chars:
                     chunks.append(s[:max_chars])
                     s = s[max_chars:]
     return chunks
+@spaces.GPU(duration=120)  # ✅ keep within ZeroGPU limits; adjust if your Space allows
 def synthesize(text: str, voice: str, max_chars: int):
+    text = (text or "").strip()
+    if not text:
+        raise gr.Error("Empty text.")
+    # On ZeroGPU, CUDA becomes available only inside this function
+    use_cuda = torch.cuda.is_available()
+    device = "cuda" if use_cuda else "cpu"
+    # Load model + prompts lazily (inside GPU function)
+    _ensure_model_and_prompts(device=device)
+    prompt = _MALE_PROMPT if voice == "male" else _FEMALE_PROMPT
     parts = chunk_text(text, max_chars=max_chars)
     if not parts:
+        raise gr.Error("No valid text chunks after splitting.")
     run_id = str(abs(hash((voice, text))) % (10**12))
     run_dir = TMP_DIR / run_id
     chunks_dir = run_dir / "chunks"
     sr_out = None
     for i, t in enumerate(parts, start=1):
+        wavs, sr = _MODEL.generate_voice_clone(
             text=t,
             language="English",
             voice_clone_prompt=prompt,
 with gr.Blocks() as demo:
+    gr.Markdown(
+        "# Paper Reading TTS (ZeroGPU)\n"
+        "Two fixed cloned voices (male/female). Returns WAV + ZIP of chunks.\n"
+        "Tip: keep chunks small to avoid ZeroGPU timeouts."
+    )
     text_in = gr.Textbox(label="Text", lines=10, placeholder="Paste paper summary/paragraphs here...")
     voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
         api_name="/tts",
     )
+# ✅ Disable SSR to reduce instability in Spaces (recommended while debugging)
+demo.queue().launch(ssr_mode=False)