Character_Based_AI_Paper_Tutor_audio

Runtime error

App Files Files Community

LappyundTexas committed on Feb 27

Commit

5e0ac88

verified ·

1 Parent(s): ac1b8c5

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -110

app.py CHANGED Viewed

@@ -1,14 +1,13 @@
-import re
-import zipfile
-from pathlib import Path
 import threading
 import numpy as np
 import soundfile as sf
 import gradio as gr
 import torch
-import spaces  # ✅ required for ZeroGPU
 from qwen_tts import Qwen3TTSModel
 ASSETS_DIR = Path("assets")
@@ -34,18 +33,6 @@ def read_text(path: Path) -> str:
     return path.read_text(encoding="utf-8").strip()
-def _load_model_cpu_only():
-    """
-    Load model on CPU WITHOUT touching CUDA.
-    This is safe to call at startup if you ever need it (we won't).
-    """
-    return Qwen3TTSModel.from_pretrained(
-        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-        device_map="cpu",
-        dtype=torch.float32,
-    )
 def _ensure_assets_exist():
     for p in [MALE_REF_WAV, MALE_REF_TXT, FEMALE_REF_WAV, FEMALE_REF_TXT]:
         if not p.exists():
@@ -63,7 +50,6 @@ def _ensure_model_and_prompts(device: str):
     with _CACHE_LOCK:
         if _MODEL is None:
-            # device is either 'cuda' or 'cpu'
             dtype = torch.bfloat16 if device == "cuda" else torch.float32
             device_map = "cuda:0" if device == "cuda" else "cpu"
@@ -71,11 +57,10 @@ def _ensure_model_and_prompts(device: str):
                 "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
                 device_map=device_map,
                 dtype=dtype,
-                # Enable this only after confirming flash-attn works in this environment (force-installing it on ZeroGPU is generally discouraged)
                 # attn_implementation="flash_attention_2",
             )
-        # Prompts depend on model; cache them too
         if _MALE_PROMPT is None:
             _MALE_PROMPT = _MODEL.create_voice_clone_prompt(
                 ref_audio=str(MALE_REF_WAV),
@@ -91,116 +76,71 @@ def _ensure_model_and_prompts(device: str):
             )
-def chunk_text(text: str, max_chars: int = 500):
-    text = text.strip()
-    if not text:
-        return []
-    text = re.sub(r"\r\n", "\n", text)
-    paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
-    sent_split = re.compile(r"(?<=[\.\!\?])\s+")
-    chunks = []
-    for p in paras:
-        sents = sent_split.split(p)
-        buf = ""
-        for s in sents:
-            s = s.strip()
-            if not s:
-                continue
-            if len(buf) + len(s) + 1 <= max_chars:
-                buf = (buf + " " + s).strip()
-            else:
-                if buf:
-                    chunks.append(buf)
-                while len(s) > max_chars:
-                    chunks.append(s[:max_chars])
-                    s = s[max_chars:]
-                buf = s
-        if buf:
-            chunks.append(buf)
-    return chunks
-@spaces.GPU(duration=120)  # ✅ keep within ZeroGPU limits; adjust if your Space allows
-def synthesize(text: str, voice: str, max_chars: int):
     text = (text or "").strip()
     if not text:
         raise gr.Error("Empty text.")
-    # On ZeroGPU, CUDA becomes available only inside this function
     use_cuda = torch.cuda.is_available()
     device = "cuda" if use_cuda else "cpu"
-    # Load model + prompts lazily (inside GPU function)
     _ensure_model_and_prompts(device=device)
-    prompt = _MALE_PROMPT if voice == "male" else _FEMALE_PROMPT
-    parts = chunk_text(text, max_chars=max_chars)
-    if not parts:
-        raise gr.Error("No valid text chunks after splitting.")
-    run_id = str(abs(hash((voice, text))) % (10**12))
-    run_dir = TMP_DIR / run_id
-    chunks_dir = run_dir / "chunks"
-    chunks_dir.mkdir(parents=True, exist_ok=True)
-    wav_arrays = []
-    chunk_files = []
-    sr_out = None
-    for i, t in enumerate(parts, start=1):
-        wavs, sr = _MODEL.generate_voice_clone(
-            text=t,
-            language="English",
-            voice_clone_prompt=prompt,
-        )
-        wav = wavs[0]
-        if sr_out is None:
-            sr_out = sr
-        if sr != sr_out:
-            raise gr.Error(f"Sample rate mismatch: got {sr}, expected {sr_out}")
-        chunk_path = chunks_dir / f"{i:03d}.wav"
-        sf.write(str(chunk_path), wav, sr_out)
-        chunk_files.append(str(chunk_path))
-        wav_arrays.append(wav.astype(np.float32))
-    combined = np.concatenate(wav_arrays, axis=0)
-    combined_path = run_dir / "combined.wav"
-    sf.write(str(combined_path), combined, sr_out)
-    zip_path = run_dir / "chunks.zip"
-    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
-        for p in chunk_files:
-            zf.write(p, arcname=Path(p).name)
-    return str(combined_path), str(zip_path), parts
 with gr.Blocks() as demo:
     gr.Markdown(
-        "# Paper Reading TTS (ZeroGPU)\n"
-        "Two fixed cloned voices (male/female). Returns WAV + ZIP of chunks.\n"
-        "Tip: keep chunks small to avoid ZeroGPU timeouts."
     )
-    text_in = gr.Textbox(label="Text", lines=10, placeholder="Paste paper summary/paragraphs here...")
     voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
-    max_chars_in = gr.Slider(200, 1200, value=500, step=50, label="Max chars per chunk")
-    btn = gr.Button("Generate WAV")
-    out_audio = gr.Audio(label="Combined WAV", type="filepath")
-    out_zip = gr.File(label="Chunks ZIP (each segment is a wav)")
-    out_chunks = gr.JSON(label="Chunked text preview")
     btn.click(
-        fn=synthesize,
-        inputs=[text_in, voice_in, max_chars_in],
-        outputs=[out_audio, out_zip, out_chunks],
-        api_name="/tts",
     )
-# ✅ Disable SSR to reduce instability in Spaces (recommended while debugging)
 demo.queue().launch(ssr_mode=False)

 import threading
+import uuid
+from pathlib import Path
 import numpy as np
 import soundfile as sf
 import gradio as gr
 import torch
+import spaces  # required for ZeroGPU
 from qwen_tts import Qwen3TTSModel
 ASSETS_DIR = Path("assets")
     return path.read_text(encoding="utf-8").strip()
 def _ensure_assets_exist():
     for p in [MALE_REF_WAV, MALE_REF_TXT, FEMALE_REF_WAV, FEMALE_REF_TXT]:
         if not p.exists():
     with _CACHE_LOCK:
         if _MODEL is None:
             dtype = torch.bfloat16 if device == "cuda" else torch.float32
             device_map = "cuda:0" if device == "cuda" else "cpu"
                 "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
                 device_map=device_map,
                 dtype=dtype,
+                # Force-installing flash-attn is generally not recommended in ZeroGPU environments
                 # attn_implementation="flash_attention_2",
             )
         if _MALE_PROMPT is None:
             _MALE_PROMPT = _MODEL.create_voice_clone_prompt(
                 ref_audio=str(MALE_REF_WAV),
             )
+def _get_prompt(voice: str):
+    if voice == "male":
+        return _MALE_PROMPT
+    if voice == "female":
+        return _FEMALE_PROMPT
+    raise gr.Error("voice must be 'male' or 'female'.")
+@spaces.GPU(duration=120)
+def tts_chunk(text: str, voice: str, language: str = "English"):
+    """
+    Voice Service API:
+      /tts_chunk(text, voice, language) -> wav filepath
+    - text: a SINGLE chunk (short text)
+    - voice: 'male' | 'female'
+    - returns: path to a generated .wav file
+    """
     text = (text or "").strip()
     if not text:
         raise gr.Error("Empty text.")
+    if len(text) > 2000:
+        # Hard upper limit: reject an over-long chunk mistakenly sent by upstream instead of letting the request time out
+        raise gr.Error("Text too long for chunk-level API. Please split upstream (PDF Space).")
     use_cuda = torch.cuda.is_available()
     device = "cuda" if use_cuda else "cpu"
     _ensure_model_and_prompts(device=device)
+    prompt = _get_prompt(voice)
+    wavs, sr = _MODEL.generate_voice_clone(
+        text=text,
+        language=language,
+        voice_clone_prompt=prompt,
+    )
+    wav = wavs[0].astype(np.float32)
+    out_name = f"{voice}_{uuid.uuid4().hex}.wav"
+    out_path = TMP_DIR / out_name
+    sf.write(str(out_path), wav, sr)
+    return str(out_path)
 with gr.Blocks() as demo:
     gr.Markdown(
+        "# Voice Service (ZeroGPU)\n"
+        "Chunk-level TTS API only: `/tts_chunk(text, voice) -> wav`.\n"
+        "- Upstream (PDF Space) must split text into chunks.\n"
+        "- This Space does NOT concatenate or zip.\n"
     )
+    text_in = gr.Textbox(label="Text (ONE chunk)", lines=6, placeholder="A single paragraph / sentence chunk ...")
     voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
+    lang_in = gr.Dropdown(choices=["English", "Chinese"], value="English", label="Language")
+    btn = gr.Button("Generate WAV (chunk)")
+    out_audio = gr.Audio(label="WAV", type="filepath")
     btn.click(
+        fn=tts_chunk,
+        inputs=[text_in, voice_in, lang_in],
+        outputs=[out_audio],
+        api_name="/tts_chunk",
     )
 demo.queue().launch(ssr_mode=False)