Jekyll2000 committed on
Commit
a15f8bb
·
verified ·
1 Parent(s): 010981c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -46
app.py CHANGED
@@ -1,17 +1,30 @@
1
  import io
2
  import re
 
3
  import zipfile
4
  import numpy as np
5
  import streamlit as st
6
  import soundfile as sf
7
 
8
  import torch
9
- from transformers import pipeline, AutoProcessor
 
10
 
11
  import lameenc # MP3 encoder (no ffmpeg needed)
12
 
13
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
14
 
 
 
 
 
 
 
 
 
 
 
 
15
  # -----------------------------
16
  # Helpers
17
  # -----------------------------
@@ -34,11 +47,13 @@ DEFAULT_LANGS = [
34
  ("Vietnamese", "vi"),
35
  ]
36
 
 
37
  def pick_device():
38
  if torch.cuda.is_available():
39
  return "cuda", 0, torch.float16
40
  return "cpu", -1, torch.float32
41
 
 
42
  def normalize_audio(x: np.ndarray) -> np.ndarray:
43
  x = x.astype(np.float32)
44
  peak = float(np.max(np.abs(x))) if x.size else 0.0
@@ -46,16 +61,17 @@ def normalize_audio(x: np.ndarray) -> np.ndarray:
46
  x = x / max(peak, 1e-8)
47
  return x
48
 
 
49
  def make_silence(sr: int, ms: int) -> np.ndarray:
50
  n = int(sr * (ms / 1000.0))
51
  return np.zeros(n, dtype=np.float32)
52
 
 
53
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
54
  text = re.sub(r"\r\n", "\n", text).strip()
55
  if not text:
56
  return []
57
 
58
- # Sentence-ish split (works across many languages reasonably)
59
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
60
  chunks = []
61
  cur = ""
@@ -69,9 +85,8 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
69
  if cur:
70
  chunks.append(cur)
71
  if len(p) > max_chars:
72
- # hard-split huge segments
73
  for i in range(0, len(p), max_chars):
74
- chunks.append(p[i:i+max_chars])
75
  cur = ""
76
  else:
77
  cur = p
@@ -80,11 +95,8 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
80
  chunks.append(cur)
81
  return chunks
82
 
 
83
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
84
- """
85
- Tag-based control. If you later confirm a different schema from Qwen's demo,
86
- you only need to change this function.
87
- """
88
  tags = []
89
  if lang:
90
  tags.append(f"[LANG={lang}]")
@@ -94,8 +106,8 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
94
  tags.append(f"[INSTRUCTION={instruction}]")
95
  return " ".join(tags + [text])
96
 
 
97
  def safe_get_speakers(proc, pipe_obj):
98
- # Try processor attributes
99
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
100
  if hasattr(proc, attr):
101
  val = getattr(proc, attr)
@@ -104,7 +116,6 @@ def safe_get_speakers(proc, pipe_obj):
104
  if isinstance(val, (list, tuple)):
105
  return sorted(set(map(str, val)))
106
 
107
- # Try model config attributes
108
  model = getattr(pipe_obj, "model", None)
109
  cfg = getattr(model, "config", None) if model is not None else None
110
  if cfg is not None:
@@ -118,14 +129,15 @@ def safe_get_speakers(proc, pipe_obj):
118
 
119
  return []
120
 
 
121
  def try_reference_audio(wav_bytes: bytes):
122
  audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
123
  if audio.ndim > 1:
124
  audio = audio.mean(axis=1)
125
  return {"array": audio, "sampling_rate": sr}
126
 
 
127
  def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
128
- # Try with reference audio if supported; otherwise fall back gracefully
129
  if ref_audio is not None:
130
  try:
131
  return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
@@ -135,27 +147,25 @@ def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
135
  pass
136
  return pipe_obj(prompt, **gen_kwargs)
137
 
 
138
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
139
  x = np.clip(x, -1.0, 1.0)
140
  pcm = (x * 32767.0).astype(np.int16)
141
  return pcm.tobytes()
142
 
 
143
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
144
- """
145
- Encode mono float32 audio (-1..1) to MP3 bytes using lameenc.
146
- No ffmpeg required.
147
- """
148
  enc = lameenc.Encoder()
149
  enc.set_bit_rate(int(bitrate_kbps))
150
  enc.set_in_sample_rate(int(sr))
151
  enc.set_channels(1)
152
- enc.set_quality(2) # 2=high quality, 7=faster
153
-
154
  pcm_bytes = float_to_int16_pcm(audio_float32)
155
  mp3 = enc.encode(pcm_bytes)
156
  mp3 += enc.flush()
157
  return mp3
158
 
 
159
  def sanitize_filename(name: str) -> str:
160
  name = name.strip().replace("\\", "_").replace("/", "_")
161
  name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
@@ -164,6 +174,7 @@ def sanitize_filename(name: str) -> str:
164
  name = "chapter"
165
  return name
166
 
 
167
  @st.cache_resource(show_spinner=False)
168
  def load_tts():
169
  device, device_id, dtype = pick_device()
@@ -193,7 +204,6 @@ colA, colB = st.columns([2, 1], gap="large")
193
  with colB:
194
  st.subheader("Controls")
195
 
196
- # Language
197
  lang_label = st.selectbox(
198
  "Language",
199
  options=[x[0] for x in DEFAULT_LANGS],
@@ -202,7 +212,6 @@ with colB:
202
  )
203
  lang = dict(DEFAULT_LANGS).get(lang_label)
204
 
205
- # Speakers
206
  st.markdown("### Voice / Speaker")
207
  speaker = None
208
  if detected_speakers:
@@ -224,7 +233,6 @@ with colB:
224
  if custom_speaker:
225
  speaker = custom_speaker
226
 
227
- # Instruction
228
  st.markdown("### Instruction Control")
229
  instruction = st.text_area(
230
  "Instruction (style/emotion/pacing/etc.)",
@@ -234,7 +242,6 @@ with colB:
234
  if instruction == "":
235
  instruction = None
236
 
237
- # Optional reference voice
238
  st.markdown("### Optional: Reference Voice")
239
  ref_file = st.file_uploader(
240
  "Upload reference WAV (optional)",
@@ -242,7 +249,6 @@ with colB:
242
  help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
243
  )
244
 
245
- # Long text chunking
246
  st.markdown("### Long Text Settings")
247
  max_chars = st.slider(
248
  "Chunk size (characters)",
@@ -260,7 +266,6 @@ with colB:
260
  step=50,
261
  )
262
 
263
- # Generation params
264
  st.markdown("### Generation Parameters")
265
  max_new_tokens = st.slider(
266
  "max_new_tokens",
@@ -278,7 +283,6 @@ with colB:
278
  step=0.1,
279
  )
280
 
281
- # MP3 export
282
  st.markdown("### MP3 Export")
283
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
284
  normalize = st.checkbox("Normalize output audio", value=True)
@@ -292,7 +296,6 @@ with colA:
292
  horizontal=True,
293
  )
294
 
295
- # Shared ref audio prep
296
  ref_audio = None
297
  if ref_file is not None:
298
  try:
@@ -318,7 +321,6 @@ with colA:
318
  stitched = None
319
  out_sr = None
320
 
321
- # chunk-level progress
322
  for i, chunk in enumerate(chunks, start=1):
323
  st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
324
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
@@ -338,23 +340,19 @@ with colA:
338
  out_sr = int(sr)
339
  else:
340
  if int(sr) != out_sr:
341
- # usually consistent; warn once
342
  st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
343
  if gap_ms > 0:
344
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
345
  else:
346
  stitched = np.concatenate([stitched, audio])
347
 
348
- # update overall progress bar
349
  frac = i / len(chunks)
350
  st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
351
 
352
- # encode mp3
353
- mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
354
- return mp3_bytes
355
 
356
  # -----------------------------
357
- # Single mode
358
  # -----------------------------
359
  if input_mode == "Single chapter (paste/upload)":
360
  single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
@@ -400,7 +398,7 @@ with colA:
400
  )
401
 
402
  # -----------------------------
403
- # Batch mode
404
  # -----------------------------
405
  else:
406
  st.markdown("Upload multiple `.txt` files (each file = one chapter).")
@@ -411,12 +409,6 @@ with colA:
411
  key="batch_txts",
412
  )
413
 
414
- if batch_files:
415
- total_chars = 0
416
- for f in batch_files:
417
- total_chars += len(f.getvalue())
418
- st.write(f"**Files:** {len(batch_files)} | **Total bytes:** {total_chars:,}")
419
-
420
  st.divider()
421
 
422
  if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
@@ -427,9 +419,8 @@ with colA:
427
  st.session_state["_progress"] = st.progress(0)
428
  st.session_state["_status"] = st.empty()
429
 
430
- # Generate each file -> mp3, and pack into ZIP
431
  zip_buf = io.BytesIO()
432
- results_preview = [] # (name, mp3_bytes) for in-page audio preview
433
 
434
  with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
435
  n = len(batch_files)
@@ -439,24 +430,23 @@ with colA:
439
  mp3_name = f"{base}.mp3"
440
  label = f"{idx}/{n} {base}"
441
 
442
- # allocate progress range per file
443
  base_prog = (idx - 1) / n
444
  span_prog = 1.0 / n
445
 
446
  try:
447
- mp3_bytes = generate_mp3_from_text(raw, label=label, progress_base=base_prog, progress_span=span_prog)
 
 
448
  except Exception as e:
449
  st.error(f"Failed on file '{f.name}': {e}")
450
  st.stop()
451
 
452
  zf.writestr(mp3_name, mp3_bytes)
453
-
454
- # Keep a small preview list (all, but could be large; still OK)
455
  results_preview.append((mp3_name, mp3_bytes))
456
 
457
  st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
458
-
459
  zip_buf.seek(0)
 
460
  st.download_button(
461
  "Download ZIP (all MP3s)",
462
  data=zip_buf.getvalue(),
 
1
  import io
2
  import re
3
+ import os
4
  import zipfile
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
8
 
9
  import torch
10
+ from transformers import AutoProcessor
11
+ from transformers.pipelines import pipeline
12
 
13
  import lameenc # MP3 encoder (no ffmpeg needed)
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
17
+ # Show a clear error if torch isn't functional
18
+ try:
19
+ _ = torch.tensor([1.0])
20
+ except Exception as e:
21
+ st.error(
22
+ "PyTorch is not available or failed to initialize.\n\n"
23
+ "Fix: Set `python_version: \"3.10\"` in README.md and pin a supported torch build in requirements.txt.\n\n"
24
+ f"Details: {e}"
25
+ )
26
+ st.stop()
27
+
28
  # -----------------------------
29
  # Helpers
30
  # -----------------------------
 
47
  ("Vietnamese", "vi"),
48
  ]
49
 
50
+
51
  def pick_device():
52
  if torch.cuda.is_available():
53
  return "cuda", 0, torch.float16
54
  return "cpu", -1, torch.float32
55
 
56
+
57
  def normalize_audio(x: np.ndarray) -> np.ndarray:
58
  x = x.astype(np.float32)
59
  peak = float(np.max(np.abs(x))) if x.size else 0.0
 
61
  x = x / max(peak, 1e-8)
62
  return x
63
 
64
+
65
  def make_silence(sr: int, ms: int) -> np.ndarray:
66
  n = int(sr * (ms / 1000.0))
67
  return np.zeros(n, dtype=np.float32)
68
 
69
+
70
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
71
  text = re.sub(r"\r\n", "\n", text).strip()
72
  if not text:
73
  return []
74
 
 
75
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
76
  chunks = []
77
  cur = ""
 
85
  if cur:
86
  chunks.append(cur)
87
  if len(p) > max_chars:
 
88
  for i in range(0, len(p), max_chars):
89
+ chunks.append(p[i:i + max_chars])
90
  cur = ""
91
  else:
92
  cur = p
 
95
  chunks.append(cur)
96
  return chunks
97
 
98
+
99
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
 
 
 
 
100
  tags = []
101
  if lang:
102
  tags.append(f"[LANG={lang}]")
 
106
  tags.append(f"[INSTRUCTION={instruction}]")
107
  return " ".join(tags + [text])
108
 
109
+
110
  def safe_get_speakers(proc, pipe_obj):
 
111
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
112
  if hasattr(proc, attr):
113
  val = getattr(proc, attr)
 
116
  if isinstance(val, (list, tuple)):
117
  return sorted(set(map(str, val)))
118
 
 
119
  model = getattr(pipe_obj, "model", None)
120
  cfg = getattr(model, "config", None) if model is not None else None
121
  if cfg is not None:
 
129
 
130
  return []
131
 
132
+
133
  def try_reference_audio(wav_bytes: bytes):
134
  audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
135
  if audio.ndim > 1:
136
  audio = audio.mean(axis=1)
137
  return {"array": audio, "sampling_rate": sr}
138
 
139
+
140
  def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
 
141
  if ref_audio is not None:
142
  try:
143
  return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
 
147
  pass
148
  return pipe_obj(prompt, **gen_kwargs)
149
 
150
+
151
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
152
  x = np.clip(x, -1.0, 1.0)
153
  pcm = (x * 32767.0).astype(np.int16)
154
  return pcm.tobytes()
155
 
156
+
157
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
 
 
 
 
158
  enc = lameenc.Encoder()
159
  enc.set_bit_rate(int(bitrate_kbps))
160
  enc.set_in_sample_rate(int(sr))
161
  enc.set_channels(1)
162
+ enc.set_quality(2)
 
163
  pcm_bytes = float_to_int16_pcm(audio_float32)
164
  mp3 = enc.encode(pcm_bytes)
165
  mp3 += enc.flush()
166
  return mp3
167
 
168
+
169
  def sanitize_filename(name: str) -> str:
170
  name = name.strip().replace("\\", "_").replace("/", "_")
171
  name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
 
174
  name = "chapter"
175
  return name
176
 
177
+
178
  @st.cache_resource(show_spinner=False)
179
  def load_tts():
180
  device, device_id, dtype = pick_device()
 
204
  with colB:
205
  st.subheader("Controls")
206
 
 
207
  lang_label = st.selectbox(
208
  "Language",
209
  options=[x[0] for x in DEFAULT_LANGS],
 
212
  )
213
  lang = dict(DEFAULT_LANGS).get(lang_label)
214
 
 
215
  st.markdown("### Voice / Speaker")
216
  speaker = None
217
  if detected_speakers:
 
233
  if custom_speaker:
234
  speaker = custom_speaker
235
 
 
236
  st.markdown("### Instruction Control")
237
  instruction = st.text_area(
238
  "Instruction (style/emotion/pacing/etc.)",
 
242
  if instruction == "":
243
  instruction = None
244
 
 
245
  st.markdown("### Optional: Reference Voice")
246
  ref_file = st.file_uploader(
247
  "Upload reference WAV (optional)",
 
249
  help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
250
  )
251
 
 
252
  st.markdown("### Long Text Settings")
253
  max_chars = st.slider(
254
  "Chunk size (characters)",
 
266
  step=50,
267
  )
268
 
 
269
  st.markdown("### Generation Parameters")
270
  max_new_tokens = st.slider(
271
  "max_new_tokens",
 
283
  step=0.1,
284
  )
285
 
 
286
  st.markdown("### MP3 Export")
287
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
288
  normalize = st.checkbox("Normalize output audio", value=True)
 
296
  horizontal=True,
297
  )
298
 
 
299
  ref_audio = None
300
  if ref_file is not None:
301
  try:
 
321
  stitched = None
322
  out_sr = None
323
 
 
324
  for i, chunk in enumerate(chunks, start=1):
325
  st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
326
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
 
340
  out_sr = int(sr)
341
  else:
342
  if int(sr) != out_sr:
 
343
  st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
344
  if gap_ms > 0:
345
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
346
  else:
347
  stitched = np.concatenate([stitched, audio])
348
 
 
349
  frac = i / len(chunks)
350
  st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
351
 
352
+ return encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
 
 
353
 
354
  # -----------------------------
355
+ # Single
356
  # -----------------------------
357
  if input_mode == "Single chapter (paste/upload)":
358
  single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
 
398
  )
399
 
400
  # -----------------------------
401
+ # Batch
402
  # -----------------------------
403
  else:
404
  st.markdown("Upload multiple `.txt` files (each file = one chapter).")
 
409
  key="batch_txts",
410
  )
411
 
 
 
 
 
 
 
412
  st.divider()
413
 
414
  if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
 
419
  st.session_state["_progress"] = st.progress(0)
420
  st.session_state["_status"] = st.empty()
421
 
 
422
  zip_buf = io.BytesIO()
423
+ results_preview = []
424
 
425
  with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
426
  n = len(batch_files)
 
430
  mp3_name = f"{base}.mp3"
431
  label = f"{idx}/{n} {base}"
432
 
 
433
  base_prog = (idx - 1) / n
434
  span_prog = 1.0 / n
435
 
436
  try:
437
+ mp3_bytes = generate_mp3_from_text(
438
+ raw, label=label, progress_base=base_prog, progress_span=span_prog
439
+ )
440
  except Exception as e:
441
  st.error(f"Failed on file '{f.name}': {e}")
442
  st.stop()
443
 
444
  zf.writestr(mp3_name, mp3_bytes)
 
 
445
  results_preview.append((mp3_name, mp3_bytes))
446
 
447
  st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
 
448
  zip_buf.seek(0)
449
+
450
  st.download_button(
451
  "Download ZIP (all MP3s)",
452
  data=zip_buf.getvalue(),