Jekyll2000 committed on
Commit
e24e6a2
·
verified ·
1 Parent(s): db6c05c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -83
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import io
2
  import re
3
- import math
4
- import os
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
@@ -11,7 +10,6 @@ from transformers import pipeline, AutoProcessor
11
 
12
  import lameenc # MP3 encoder (no ffmpeg needed)
13
 
14
-
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
17
  # -----------------------------
@@ -57,6 +55,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
57
  if not text:
58
  return []
59
 
 
60
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
61
  chunks = []
62
  cur = ""
@@ -70,6 +69,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
70
  if cur:
71
  chunks.append(cur)
72
  if len(p) > max_chars:
 
73
  for i in range(0, len(p), max_chars):
74
  chunks.append(p[i:i+max_chars])
75
  cur = ""
@@ -81,7 +81,10 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
81
  return chunks
82
 
83
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
84
- # Adjust tag format if you later confirm the model expects different tokens
 
 
 
85
  tags = []
86
  if lang:
87
  tags.append(f"[LANG={lang}]")
@@ -92,6 +95,7 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
92
  return " ".join(tags + [text])
93
 
94
  def safe_get_speakers(proc, pipe_obj):
 
95
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
96
  if hasattr(proc, attr):
97
  val = getattr(proc, attr)
@@ -100,6 +104,7 @@ def safe_get_speakers(proc, pipe_obj):
100
  if isinstance(val, (list, tuple)):
101
  return sorted(set(map(str, val)))
102
 
 
103
  model = getattr(pipe_obj, "model", None)
104
  cfg = getattr(model, "config", None) if model is not None else None
105
  if cfg is not None:
@@ -120,6 +125,7 @@ def try_reference_audio(wav_bytes: bytes):
120
  return {"array": audio, "sampling_rate": sr}
121
 
122
  def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
 
123
  if ref_audio is not None:
124
  try:
125
  return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
@@ -140,16 +146,23 @@ def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192)
140
  No ffmpeg required.
141
  """
142
  enc = lameenc.Encoder()
143
- enc.set_bit_rate(bitrate_kbps)
144
- enc.set_in_sample_rate(sr)
145
  enc.set_channels(1)
146
- enc.set_quality(2) # 2=high, 7=fast
147
 
148
  pcm_bytes = float_to_int16_pcm(audio_float32)
149
  mp3 = enc.encode(pcm_bytes)
150
  mp3 += enc.flush()
151
  return mp3
152
 
 
 
 
 
 
 
 
153
 
154
  @st.cache_resource(show_spinner=False)
155
  def load_tts():
@@ -170,9 +183,9 @@ def load_tts():
170
  # -----------------------------
171
  st.set_page_config(page_title="Haseeb's TTS", layout="wide")
172
  st.title("🎧 Haseeb's TTS")
173
- st.caption("Audiobook Generator • MP3 Output • Language • Voices • Instruction Control")
174
 
175
- with st.spinner("Loading model (first run can take a while)..."):
176
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
177
 
178
  colA, colB = st.columns([2, 1], gap="large")
@@ -180,6 +193,7 @@ colA, colB = st.columns([2, 1], gap="large")
180
  with colB:
181
  st.subheader("Controls")
182
 
 
183
  lang_label = st.selectbox(
184
  "Language",
185
  options=[x[0] for x in DEFAULT_LANGS],
@@ -188,6 +202,7 @@ with colB:
188
  )
189
  lang = dict(DEFAULT_LANGS).get(lang_label)
190
 
 
191
  st.markdown("### Voice / Speaker")
192
  speaker = None
193
  if detected_speakers:
@@ -199,7 +214,7 @@ with colB:
199
  )
200
  speaker = None if speaker_choice == "(none)" else speaker_choice
201
  else:
202
- st.info("No speaker list detected from model config. You can still type a custom speaker name below.")
203
 
204
  custom_speaker = st.text_input(
205
  "Custom speaker name (optional)",
@@ -209,6 +224,7 @@ with colB:
209
  if custom_speaker:
210
  speaker = custom_speaker
211
 
 
212
  st.markdown("### Instruction Control")
213
  instruction = st.text_area(
214
  "Instruction (style/emotion/pacing/etc.)",
@@ -218,6 +234,7 @@ with colB:
218
  if instruction == "":
219
  instruction = None
220
 
 
221
  st.markdown("### Optional: Reference Voice")
222
  ref_file = st.file_uploader(
223
  "Upload reference WAV (optional)",
@@ -225,14 +242,15 @@ with colB:
225
  help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
226
  )
227
 
228
- st.markdown("### Long Text (Audiobook)")
 
229
  max_chars = st.slider(
230
  "Chunk size (characters)",
231
  min_value=600,
232
  max_value=3000,
233
  value=1400,
234
  step=100,
235
- help="10,000 chars will be split into multiple chunks then stitched.",
236
  )
237
  gap_ms = st.slider(
238
  "Silence between chunks (ms)",
@@ -242,6 +260,7 @@ with colB:
242
  step=50,
243
  )
244
 
 
245
  st.markdown("### Generation Parameters")
246
  max_new_tokens = st.slider(
247
  "max_new_tokens",
@@ -259,6 +278,7 @@ with colB:
259
  step=0.1,
260
  )
261
 
 
262
  st.markdown("### MP3 Export")
263
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
264
  normalize = st.checkbox("Normalize output audio", value=True)
@@ -266,73 +286,48 @@ with colB:
266
  with colA:
267
  st.subheader("Input")
268
 
269
- input_mode = st.radio("Input mode", ["Paste text", "Upload .txt"], horizontal=True)
270
-
271
- text = ""
272
- if input_mode == "Paste text":
273
- text = st.text_area(
274
- "Chapter text",
275
- value="",
276
- height=420,
277
- placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
278
- )
279
- else:
280
- txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
281
- if txt_file is not None:
282
- text = txt_file.read().decode("utf-8", errors="ignore")
283
-
284
- st.write(f"**Characters:** {len(text):,}")
285
 
286
- st.divider()
 
 
 
 
 
 
 
287
 
288
- generate = st.button("Generate MP3 Audiobook", type="primary", use_container_width=True)
 
 
 
289
 
290
- if generate:
291
- if not text.strip():
292
- st.error("Please provide some text.")
293
- st.stop()
294
 
295
- chunks = split_text_into_chunks(text, max_chars=max_chars)
296
  if not chunks:
297
- st.error("Text chunking failed (empty chunks).")
298
- st.stop()
299
-
300
- st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
301
-
302
- ref_audio = None
303
- if ref_file is not None:
304
- try:
305
- ref_audio = try_reference_audio(ref_file.read())
306
- except Exception as e:
307
- st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
308
- ref_audio = None
309
-
310
- gen_kwargs = {
311
- "max_new_tokens": int(max_new_tokens),
312
- "temperature": float(temperature),
313
- }
314
-
315
- progress = st.progress(0)
316
- status = st.empty()
317
 
318
  stitched = None
319
  out_sr = None
320
 
 
321
  for i, chunk in enumerate(chunks, start=1):
322
- status.write(f"Generating chunk {i}/{len(chunks)}")
323
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
324
 
325
- try:
326
- out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
327
- except Exception as e:
328
- st.error(f"Generation failed on chunk {i}: {e}")
329
- st.stop()
330
-
331
  audio = out.get("audio", None)
332
  sr = out.get("sampling_rate", None)
333
  if audio is None or sr is None:
334
- st.error(f"Unexpected pipeline output on chunk {i}.")
335
- st.stop()
336
 
337
  audio = np.asarray(audio, dtype=np.float32)
338
  if normalize:
@@ -343,33 +338,142 @@ with colA:
343
  out_sr = int(sr)
344
  else:
345
  if int(sr) != out_sr:
346
- st.warning(
347
- f"Chunk {i} sample rate {sr} != {out_sr}. "
348
- "Stitching anyway (best if consistent)."
349
- )
350
  if gap_ms > 0:
351
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
352
  else:
353
  stitched = np.concatenate([stitched, audio])
354
 
355
- progress.progress(int((i / len(chunks)) * 100))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
- status.write(" Done. Encoding MP3…")
 
358
 
359
- try:
360
- mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
361
- except Exception as e:
362
- st.error(f"MP3 encoding failed: {e}")
363
- st.stop()
 
 
364
 
365
- st.audio(mp3_bytes, format="audio/mp3")
 
 
 
 
366
 
367
- st.download_button(
368
- "Download MP3",
369
- data=mp3_bytes,
370
- file_name="audiobook_chapter.mp3",
371
- mime="audio/mpeg",
372
- use_container_width=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  )
374
 
375
- st.success("Generated MP3 audiobook successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import io
2
  import re
3
+ import zipfile
 
4
  import numpy as np
5
  import streamlit as st
6
  import soundfile as sf
 
10
 
11
  import lameenc # MP3 encoder (no ffmpeg needed)
12
 
 
13
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
14
 
15
  # -----------------------------
 
55
  if not text:
56
  return []
57
 
58
+ # Sentence-ish split (works across many languages reasonably)
59
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
60
  chunks = []
61
  cur = ""
 
69
  if cur:
70
  chunks.append(cur)
71
  if len(p) > max_chars:
72
+ # hard-split huge segments
73
  for i in range(0, len(p), max_chars):
74
  chunks.append(p[i:i+max_chars])
75
  cur = ""
 
81
  return chunks
82
 
83
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
84
+ """
85
+ Tag-based control. If you later confirm a different schema from Qwen's demo,
86
+ you only need to change this function.
87
+ """
88
  tags = []
89
  if lang:
90
  tags.append(f"[LANG={lang}]")
 
95
  return " ".join(tags + [text])
96
 
97
  def safe_get_speakers(proc, pipe_obj):
98
+ # Try processor attributes
99
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
100
  if hasattr(proc, attr):
101
  val = getattr(proc, attr)
 
104
  if isinstance(val, (list, tuple)):
105
  return sorted(set(map(str, val)))
106
 
107
+ # Try model config attributes
108
  model = getattr(pipe_obj, "model", None)
109
  cfg = getattr(model, "config", None) if model is not None else None
110
  if cfg is not None:
 
125
  return {"array": audio, "sampling_rate": sr}
126
 
127
  def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
128
+ # Try with reference audio if supported; otherwise fall back gracefully
129
  if ref_audio is not None:
130
  try:
131
  return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
 
146
  No ffmpeg required.
147
  """
148
  enc = lameenc.Encoder()
149
+ enc.set_bit_rate(int(bitrate_kbps))
150
+ enc.set_in_sample_rate(int(sr))
151
  enc.set_channels(1)
152
+ enc.set_quality(2) # 2=high quality, 7=faster
153
 
154
  pcm_bytes = float_to_int16_pcm(audio_float32)
155
  mp3 = enc.encode(pcm_bytes)
156
  mp3 += enc.flush()
157
  return mp3
158
 
159
def sanitize_filename(name: str) -> str:
    """Reduce *name* to a conservative, portable file-name stem.

    Path separators are replaced with underscores, every character outside
    ``[a-zA-Z0-9._ -]`` is dropped, runs of whitespace collapse to a single
    space, and leading/trailing spaces and dots are trimmed.

    Args:
        name: Raw name, e.g. an uploaded file's name without its extension.

    Returns:
        The cleaned stem, or ``"chapter"`` when nothing usable remains
        (empty input, all-non-ASCII titles, or dot-only names such as "..").
    """
    # Neutralise both separator styles first so "a/b" becomes "a_b" instead
    # of silently merging into "ab" in the character filter below.
    name = name.strip().replace("\\", "_").replace("/", "_")
    name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
    name = re.sub(r"\s+", " ", name)
    # Trim dots as well as spaces: a stem of "." / ".." or one with a leading
    # dot yields hidden or special names, and trailing dots/spaces are not
    # valid on Windows.
    name = name.strip(" .")
    return name or "chapter"
166
 
167
  @st.cache_resource(show_spinner=False)
168
  def load_tts():
 
183
  # -----------------------------
184
  st.set_page_config(page_title="Haseeb's TTS", layout="wide")
185
  st.title("🎧 Haseeb's TTS")
186
+ st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
187
 
188
+ with st.spinner("Loading model (first run can take a while)"):
189
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
190
 
191
  colA, colB = st.columns([2, 1], gap="large")
 
193
  with colB:
194
  st.subheader("Controls")
195
 
196
+ # Language
197
  lang_label = st.selectbox(
198
  "Language",
199
  options=[x[0] for x in DEFAULT_LANGS],
 
202
  )
203
  lang = dict(DEFAULT_LANGS).get(lang_label)
204
 
205
+ # Speakers
206
  st.markdown("### Voice / Speaker")
207
  speaker = None
208
  if detected_speakers:
 
214
  )
215
  speaker = None if speaker_choice == "(none)" else speaker_choice
216
  else:
217
+ st.info("No speaker list detected. You can still type a custom speaker name below.")
218
 
219
  custom_speaker = st.text_input(
220
  "Custom speaker name (optional)",
 
224
  if custom_speaker:
225
  speaker = custom_speaker
226
 
227
+ # Instruction
228
  st.markdown("### Instruction Control")
229
  instruction = st.text_area(
230
  "Instruction (style/emotion/pacing/etc.)",
 
234
  if instruction == "":
235
  instruction = None
236
 
237
+ # Optional reference voice
238
  st.markdown("### Optional: Reference Voice")
239
  ref_file = st.file_uploader(
240
  "Upload reference WAV (optional)",
 
242
  help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
243
  )
244
 
245
+ # Long text chunking
246
+ st.markdown("### Long Text Settings")
247
  max_chars = st.slider(
248
  "Chunk size (characters)",
249
  min_value=600,
250
  max_value=3000,
251
  value=1400,
252
  step=100,
253
+ help="Long chapters (10,000+ chars) are split into chunks, generated, then stitched.",
254
  )
255
  gap_ms = st.slider(
256
  "Silence between chunks (ms)",
 
260
  step=50,
261
  )
262
 
263
+ # Generation params
264
  st.markdown("### Generation Parameters")
265
  max_new_tokens = st.slider(
266
  "max_new_tokens",
 
278
  step=0.1,
279
  )
280
 
281
+ # MP3 export
282
  st.markdown("### MP3 Export")
283
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
284
  normalize = st.checkbox("Normalize output audio", value=True)
 
286
  with colA:
287
  st.subheader("Input")
288
 
289
+ input_mode = st.radio(
290
+ "Mode",
291
+ ["Single chapter (paste/upload)", "Batch mode (upload multiple .txt)"],
292
+ horizontal=True,
293
+ )
 
 
 
 
 
 
 
 
 
 
 
294
 
295
+ # Shared ref audio prep
296
+ ref_audio = None
297
+ if ref_file is not None:
298
+ try:
299
+ ref_audio = try_reference_audio(ref_file.read())
300
+ except Exception as e:
301
+ st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
302
+ ref_audio = None
303
 
304
+ gen_kwargs = {
305
+ "max_new_tokens": int(max_new_tokens),
306
+ "temperature": float(temperature),
307
+ }
308
 
309
+ def generate_mp3_from_text(chapter_text: str, label: str, progress_base: float = 0.0, progress_span: float = 1.0):
310
+ chapter_text = chapter_text.strip()
311
+ if not chapter_text:
312
+ raise ValueError("Empty text")
313
 
314
+ chunks = split_text_into_chunks(chapter_text, max_chars=max_chars)
315
  if not chunks:
316
+ raise ValueError("Chunking produced no chunks")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
  stitched = None
319
  out_sr = None
320
 
321
+ # chunk-level progress
322
  for i, chunk in enumerate(chunks, start=1):
323
+ st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
324
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
325
 
326
+ out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
 
 
 
 
 
327
  audio = out.get("audio", None)
328
  sr = out.get("sampling_rate", None)
329
  if audio is None or sr is None:
330
+ raise RuntimeError("Unexpected pipeline output")
 
331
 
332
  audio = np.asarray(audio, dtype=np.float32)
333
  if normalize:
 
338
  out_sr = int(sr)
339
  else:
340
  if int(sr) != out_sr:
341
+ # usually consistent; warn once
342
+ st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
 
 
343
  if gap_ms > 0:
344
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
345
  else:
346
  stitched = np.concatenate([stitched, audio])
347
 
348
+ # update overall progress bar
349
+ frac = i / len(chunks)
350
+ st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
351
+
352
+ # encode mp3
353
+ mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
354
+ return mp3_bytes
355
+
356
+ # -----------------------------
357
+ # Single mode
358
+ # -----------------------------
359
# NOTE(review): the hunk header places this region under `with colA:`, whose
# indentation is not recoverable from this view -- re-indent to that nesting
# level when applying.
if input_mode == "Single chapter (paste/upload)":
    # One chapter, either pasted into a text area or uploaded as a .txt file.
    single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)

    text = ""
    if single_submode == "Paste text":
        text = st.text_area(
            "Chapter text",
            value="",
            height=420,
            placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
        )
    else:
        txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="single_txt")
        if txt_file is not None:
            text = txt_file.read().decode("utf-8", errors="ignore")

    st.write(f"**Characters:** {len(text):,}")
    st.divider()

    if st.button("Generate MP3", type="primary", use_container_width=True):
        if not text.strip():
            st.error("Please provide some text.")
            st.stop()

        # generate_mp3_from_text reports through these two session_state slots.
        st.session_state["_progress"] = st.progress(0)
        st.session_state["_status"] = st.empty()

        try:
            mp3_bytes = generate_mp3_from_text(text, label="Single")
        except Exception as e:
            st.error(f"Generation failed: {e}")
            st.stop()

        st.session_state["_status"].write("✅ Done.")
        st.audio(mp3_bytes, format="audio/mp3")
        st.download_button(
            "Download MP3",
            data=mp3_bytes,
            file_name="audiobook_chapter.mp3",
            mime="audio/mpeg",
            use_container_width=True,
        )

# -----------------------------
# Batch mode
# -----------------------------
else:
    # `os` is not among the file-level imports visible in this revision, yet
    # os.path.splitext is needed below; import it here so batch mode does not
    # fail with NameError.
    import os

    st.markdown("Upload multiple `.txt` files (each file = one chapter).")
    batch_files = st.file_uploader(
        "Upload chapter .txt files",
        type=["txt"],
        accept_multiple_files=True,
        key="batch_txts",
    )

    if batch_files:
        total_bytes = sum(len(f.getvalue()) for f in batch_files)
        st.write(f"**Files:** {len(batch_files)} | **Total bytes:** {total_bytes:,}")

    st.divider()

    if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
        if not batch_files:
            st.error("Please upload at least one .txt file.")
            st.stop()

        st.session_state["_progress"] = st.progress(0)
        st.session_state["_status"] = st.empty()

        # Generate each file -> mp3, and pack into ZIP
        zip_buf = io.BytesIO()
        results_preview = []  # (name, mp3_bytes) for in-page audio preview
        used_names = set()    # output names already taken in this batch

        with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            n = len(batch_files)
            for idx, f in enumerate(batch_files, start=1):
                # getvalue() returns the whole payload regardless of the
                # current stream position (read() would depend on it).
                raw = f.getvalue().decode("utf-8", errors="ignore")
                base = sanitize_filename(os.path.splitext(f.name)[0])

                # Different uploads can sanitize to the same stem (for example
                # non-ASCII titles all collapse to the fallback). Give each a
                # unique name so the ZIP has no duplicate entries and the
                # per-file download buttons get distinct widget keys.
                mp3_name = f"{base}.mp3"
                suffix = 2
                while mp3_name in used_names:
                    mp3_name = f"{base}_{suffix}.mp3"
                    suffix += 1
                used_names.add(mp3_name)

                label = f"{idx}/{n} {base}"

                # allocate progress range per file
                base_prog = (idx - 1) / n
                span_prog = 1.0 / n

                try:
                    mp3_bytes = generate_mp3_from_text(
                        raw,
                        label=label,
                        progress_base=base_prog,
                        progress_span=span_prog,
                    )
                except Exception as e:
                    st.error(f"Failed on file '{f.name}': {e}")
                    st.stop()

                zf.writestr(mp3_name, mp3_bytes)

                # Every result is kept for preview; this holds all MP3s in memory.
                results_preview.append((mp3_name, mp3_bytes))

        st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")

        zip_buf.seek(0)
        st.download_button(
            "Download ZIP (all MP3s)",
            data=zip_buf.getvalue(),
            file_name="audiobook_mp3_batch.zip",
            mime="application/zip",
            use_container_width=True,
        )

        st.markdown("### Preview")
        for name, mp3_bytes in results_preview:
            with st.expander(name, expanded=False):
                st.audio(mp3_bytes, format="audio/mp3")
                st.download_button(
                    f"Download {name}",
                    data=mp3_bytes,
                    file_name=name,
                    mime="audio/mpeg",
                    use_container_width=True,
                    key=f"dl_{name}",
                )