LappyundTexas commited on
Commit
5e0ac88
·
verified ·
1 Parent(s): ac1b8c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -110
app.py CHANGED
@@ -1,14 +1,13 @@
1
- import re
2
- import zipfile
3
- from pathlib import Path
4
  import threading
 
 
5
 
6
  import numpy as np
7
  import soundfile as sf
8
  import gradio as gr
9
  import torch
10
 
11
- import spaces # required for ZeroGPU
12
  from qwen_tts import Qwen3TTSModel
13
 
14
  ASSETS_DIR = Path("assets")
@@ -34,18 +33,6 @@ def read_text(path: Path) -> str:
34
  return path.read_text(encoding="utf-8").strip()
35
 
36
 
37
- def _load_model_cpu_only():
38
- """
39
- Load model on CPU WITHOUT touching CUDA.
40
- This is safe to call at startup if you ever need it (we won't).
41
- """
42
- return Qwen3TTSModel.from_pretrained(
43
- "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
44
- device_map="cpu",
45
- dtype=torch.float32,
46
- )
47
-
48
-
49
  def _ensure_assets_exist():
50
  for p in [MALE_REF_WAV, MALE_REF_TXT, FEMALE_REF_WAV, FEMALE_REF_TXT]:
51
  if not p.exists():
@@ -63,7 +50,6 @@ def _ensure_model_and_prompts(device: str):
63
 
64
  with _CACHE_LOCK:
65
  if _MODEL is None:
66
- # device is either 'cuda' or 'cpu'
67
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
68
  device_map = "cuda:0" if device == "cuda" else "cpu"
69
 
@@ -71,11 +57,10 @@ def _ensure_model_and_prompts(device: str):
71
  "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
72
  device_map=device_map,
73
  dtype=dtype,
74
- # 如果你确认 flash-attn 在此环境可用再打开(ZeroGPU通常不建议强装
75
  # attn_implementation="flash_attention_2",
76
  )
77
 
78
- # Prompts depend on model; cache them too
79
  if _MALE_PROMPT is None:
80
  _MALE_PROMPT = _MODEL.create_voice_clone_prompt(
81
  ref_audio=str(MALE_REF_WAV),
@@ -91,116 +76,71 @@ def _ensure_model_and_prompts(device: str):
91
  )
92
 
93
 
94
- def chunk_text(text: str, max_chars: int = 500):
95
- text = text.strip()
96
- if not text:
97
- return []
98
-
99
- text = re.sub(r"\r\n", "\n", text)
100
- paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
101
- sent_split = re.compile(r"(?<=[\.\!\?])\s+")
102
-
103
- chunks = []
104
- for p in paras:
105
- sents = sent_split.split(p)
106
- buf = ""
107
- for s in sents:
108
- s = s.strip()
109
- if not s:
110
- continue
111
- if len(buf) + len(s) + 1 <= max_chars:
112
- buf = (buf + " " + s).strip()
113
- else:
114
- if buf:
115
- chunks.append(buf)
116
- while len(s) > max_chars:
117
- chunks.append(s[:max_chars])
118
- s = s[max_chars:]
119
- buf = s
120
- if buf:
121
- chunks.append(buf)
122
-
123
- return chunks
124
-
125
-
126
- @spaces.GPU(duration=120) # ✅ keep within ZeroGPU limits; adjust if your Space allows
127
- def synthesize(text: str, voice: str, max_chars: int):
128
  text = (text or "").strip()
129
  if not text:
130
  raise gr.Error("Empty text.")
 
 
 
131
 
132
- # On ZeroGPU, CUDA becomes available only inside this function
133
  use_cuda = torch.cuda.is_available()
134
  device = "cuda" if use_cuda else "cpu"
135
 
136
- # Load model + prompts lazily (inside GPU function)
137
  _ensure_model_and_prompts(device=device)
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
- prompt = _MALE_PROMPT if voice == "male" else _FEMALE_PROMPT
140
- parts = chunk_text(text, max_chars=max_chars)
141
- if not parts:
142
- raise gr.Error("No valid text chunks after splitting.")
143
-
144
- run_id = str(abs(hash((voice, text))) % (10**12))
145
- run_dir = TMP_DIR / run_id
146
- chunks_dir = run_dir / "chunks"
147
- chunks_dir.mkdir(parents=True, exist_ok=True)
148
-
149
- wav_arrays = []
150
- chunk_files = []
151
- sr_out = None
152
-
153
- for i, t in enumerate(parts, start=1):
154
- wavs, sr = _MODEL.generate_voice_clone(
155
- text=t,
156
- language="English",
157
- voice_clone_prompt=prompt,
158
- )
159
- wav = wavs[0]
160
- if sr_out is None:
161
- sr_out = sr
162
- if sr != sr_out:
163
- raise gr.Error(f"Sample rate mismatch: got {sr}, expected {sr_out}")
164
-
165
- chunk_path = chunks_dir / f"{i:03d}.wav"
166
- sf.write(str(chunk_path), wav, sr_out)
167
- chunk_files.append(str(chunk_path))
168
- wav_arrays.append(wav.astype(np.float32))
169
-
170
- combined = np.concatenate(wav_arrays, axis=0)
171
- combined_path = run_dir / "combined.wav"
172
- sf.write(str(combined_path), combined, sr_out)
173
-
174
- zip_path = run_dir / "chunks.zip"
175
- with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
176
- for p in chunk_files:
177
- zf.write(p, arcname=Path(p).name)
178
-
179
- return str(combined_path), str(zip_path), parts
180
 
181
 
182
  with gr.Blocks() as demo:
183
  gr.Markdown(
184
- "# Paper Reading TTS (ZeroGPU)\n"
185
- "Two fixed cloned voices (male/female). Returns WAV + ZIP of chunks.\n"
186
- "Tip: keep chunks small to avoid ZeroGPU timeouts."
 
187
  )
188
 
189
- text_in = gr.Textbox(label="Text", lines=10, placeholder="Paste paper summary/paragraphs here...")
190
  voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
191
- max_chars_in = gr.Slider(200, 1200, value=500, step=50, label="Max chars per chunk")
192
- btn = gr.Button("Generate WAV")
193
 
194
- out_audio = gr.Audio(label="Combined WAV", type="filepath")
195
- out_zip = gr.File(label="Chunks ZIP (each segment is a wav)")
196
- out_chunks = gr.JSON(label="Chunked text preview")
197
 
198
  btn.click(
199
- fn=synthesize,
200
- inputs=[text_in, voice_in, max_chars_in],
201
- outputs=[out_audio, out_zip, out_chunks],
202
- api_name="/tts",
203
  )
204
 
205
- # ✅ Disable SSR to reduce instability in Spaces (recommended while debugging)
206
  demo.queue().launch(ssr_mode=False)
 
 
 
 
1
  import threading
2
+ import uuid
3
+ from pathlib import Path
4
 
5
  import numpy as np
6
  import soundfile as sf
7
  import gradio as gr
8
  import torch
9
 
10
+ import spaces # required for ZeroGPU
11
  from qwen_tts import Qwen3TTSModel
12
 
13
  ASSETS_DIR = Path("assets")
 
33
  return path.read_text(encoding="utf-8").strip()
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def _ensure_assets_exist():
37
  for p in [MALE_REF_WAV, MALE_REF_TXT, FEMALE_REF_WAV, FEMALE_REF_TXT]:
38
  if not p.exists():
 
50
 
51
  with _CACHE_LOCK:
52
  if _MODEL is None:
 
53
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
54
  device_map = "cuda:0" if device == "cuda" else "cpu"
55
 
 
57
  "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
58
  device_map=device_map,
59
  dtype=dtype,
60
+ # Force-installing flash-attn is generally not recommended in a ZeroGPU environment
61
  # attn_implementation="flash_attention_2",
62
  )
63
 
 
64
  if _MALE_PROMPT is None:
65
  _MALE_PROMPT = _MODEL.create_voice_clone_prompt(
66
  ref_audio=str(MALE_REF_WAV),
 
76
  )
77
 
78
 
79
def _get_prompt(voice: str):
    """Look up the cached voice-clone prompt for *voice*.

    Valid voices are 'male' and 'female'; anything else raises gr.Error.
    Assumes _ensure_model_and_prompts() has already populated the globals.
    """
    prompt_by_voice = {"male": _MALE_PROMPT, "female": _FEMALE_PROMPT}
    if voice not in prompt_by_voice:
        raise gr.Error("voice must be 'male' or 'female'.")
    return prompt_by_voice[voice]
85
+
86
+
87
@spaces.GPU(duration=120)
def tts_chunk(text: str, voice: str, language: str = "English"):
    """Chunk-level TTS endpoint: synthesize ONE short text chunk into a wav file.

    Voice Service API:
    /tts_chunk(text, voice, language) -> wav filepath
    - text: a SINGLE chunk (short text); upstream (the PDF Space) does the splitting
    - voice: 'male' | 'female'
    - language: passed straight through to the TTS model
    - returns: path (str) to the generated .wav file

    Raises gr.Error on empty input, oversized input, or an unknown voice.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        raise gr.Error("Empty text.")
    # Hard cap: reject an oversized chunk up front rather than timing out
    # mid-generation if the upstream splitter misbehaves.
    if len(cleaned) > 2000:
        raise gr.Error("Text too long for chunk-level API. Please split upstream (PDF Space).")

    # On ZeroGPU, CUDA only becomes visible inside this @spaces.GPU-decorated call.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    _ensure_model_and_prompts(device=device)
    prompt = _get_prompt(voice)

    # generate_voice_clone returns (list-of-waveforms, sample_rate); we use the first.
    wavs, sr = _MODEL.generate_voice_clone(
        text=cleaned,
        language=language,
        voice_clone_prompt=prompt,
    )
    audio = wavs[0].astype(np.float32)

    # Unique filename per request so concurrent calls never clobber each other.
    out_path = TMP_DIR / f"{voice}_{uuid.uuid4().hex}.wav"
    sf.write(str(out_path), audio, sr)

    return str(out_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
# Gradio UI / API surface: exposes exactly one chunk-level endpoint (/tts_chunk).
# No concatenation or zipping happens in this Space — that lives upstream.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Voice Service (ZeroGPU)\n"
        "Chunk-level TTS API only: `/tts_chunk(text, voice) -> wav`.\n"
        "- Upstream (PDF Space) must split text into chunks.\n"
        "- This Space does NOT concatenate or zip.\n"
    )

    # Inputs: one short text chunk, a fixed voice choice, and the synthesis language.
    text_in = gr.Textbox(label="Text (ONE chunk)", lines=6, placeholder="A single paragraph / sentence chunk ...")
    voice_in = gr.Radio(choices=["male", "female"], value="male", label="Voice")
    lang_in = gr.Dropdown(choices=["English", "Chinese"], value="English", label="Language")
    btn = gr.Button("Generate WAV (chunk)")

    # Output: tts_chunk returns a path string, so type="filepath" matches.
    out_audio = gr.Audio(label="WAV", type="filepath")

    # api_name makes this wiring callable programmatically as /tts_chunk.
    btn.click(
        fn=tts_chunk,
        inputs=[text_in, voice_in, lang_in],
        outputs=[out_audio],
        api_name="/tts_chunk",
    )

# queue() enables request queuing; ssr_mode=False disables server-side rendering
# (reduces instability on Spaces while debugging).
demo.queue().launch(ssr_mode=False)