Jekyll2000 committed on
Commit
d0cc5a4
·
verified ·
1 Parent(s): 084ef93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -270
app.py CHANGED
@@ -1,68 +1,23 @@
1
  import io
2
- import re
3
  import os
 
4
  import zipfile
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
8
 
9
  import torch
10
- from transformers import AutoProcessor
11
- from transformers.pipelines import pipeline
 
12
 
13
- import lameenc # MP3 encoder (no ffmpeg needed)
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
17
- # Torch sanity check
18
- try:
19
- _ = torch.tensor([1.0])
20
- except Exception as e:
21
- st.error(
22
- "PyTorch failed to initialize.\n\n"
23
- "Make sure your Space is using python_version: '3.10' and torch is installed correctly.\n\n"
24
- f"Details: {e}"
25
- )
26
- st.stop()
27
 
28
  # -----------------------------
29
- # Helpers
30
  # -----------------------------
31
- DEFAULT_LANGS = [
32
- ("Auto", None),
33
- ("English", "en"),
34
- ("Chinese (Simplified)", "zh"),
35
- ("Japanese", "ja"),
36
- ("Korean", "ko"),
37
- ("French", "fr"),
38
- ("German", "de"),
39
- ("Spanish", "es"),
40
- ("Portuguese", "pt"),
41
- ("Italian", "it"),
42
- ("Russian", "ru"),
43
- ("Arabic", "ar"),
44
- ("Hindi", "hi"),
45
- ("Turkish", "tr"),
46
- ("Indonesian", "id"),
47
- ("Vietnamese", "vi"),
48
- ]
49
-
50
- def pick_device():
51
- if torch.cuda.is_available():
52
- return "cuda", 0, torch.float16
53
- return "cpu", -1, torch.float32
54
-
55
- def normalize_audio(x: np.ndarray) -> np.ndarray:
56
- x = x.astype(np.float32)
57
- peak = float(np.max(np.abs(x))) if x.size else 0.0
58
- if peak > 0:
59
- x = x / max(peak, 1e-8)
60
- return x
61
-
62
- def make_silence(sr: int, ms: int) -> np.ndarray:
63
- n = int(sr * (ms / 1000.0))
64
- return np.zeros(n, dtype=np.float32)
65
-
66
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
67
  text = re.sub(r"\r\n", "\n", text).strip()
68
  if not text:
@@ -91,91 +46,80 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
91
  chunks.append(cur)
92
  return chunks
93
 
94
- def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
95
- tags = []
96
- if lang:
97
- tags.append(f"[LANG={lang}]")
98
- if speaker:
99
- tags.append(f"[SPEAKER={speaker}]")
100
- if instruction:
101
- tags.append(f"[INSTRUCTION={instruction}]")
102
- return " ".join(tags + [text])
103
-
104
- def safe_get_speakers(proc, pipe_obj):
105
- for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
106
- if hasattr(proc, attr):
107
- val = getattr(proc, attr)
108
- if isinstance(val, dict):
109
- return sorted(set(map(str, val.keys())))
110
- if isinstance(val, (list, tuple)):
111
- return sorted(set(map(str, val)))
112
-
113
- model = getattr(pipe_obj, "model", None)
114
- cfg = getattr(model, "config", None) if model is not None else None
115
- if cfg is not None:
116
- for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
117
- if hasattr(cfg, attr):
118
- val = getattr(cfg, attr)
119
- if isinstance(val, dict):
120
- return sorted(set(map(str, val.keys())))
121
- if isinstance(val, (list, tuple)):
122
- return sorted(set(map(str, val)))
123
-
124
- return []
125
-
126
- def try_reference_audio(wav_bytes: bytes):
127
- audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
128
- if audio.ndim > 1:
129
- audio = audio.mean(axis=1)
130
- return {"array": audio, "sampling_rate": sr}
131
-
132
- def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
133
- if ref_audio is not None:
134
- try:
135
- return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
136
- except TypeError:
137
- pass
138
- except Exception:
139
- pass
140
- return pipe_obj(prompt, **gen_kwargs)
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
143
  x = np.clip(x, -1.0, 1.0)
144
- pcm = (x * 32767.0).astype(np.int16)
145
- return pcm.tobytes()
146
 
147
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
148
  enc = lameenc.Encoder()
149
  enc.set_bit_rate(int(bitrate_kbps))
150
  enc.set_in_sample_rate(int(sr))
151
  enc.set_channels(1)
152
- enc.set_quality(2)
153
  mp3 = enc.encode(float_to_int16_pcm(audio_float32))
154
  mp3 += enc.flush()
155
  return mp3
156
 
 
157
  def sanitize_filename(name: str) -> str:
158
  name = name.strip().replace("\\", "_").replace("/", "_")
159
  name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
160
  name = re.sub(r"\s+", " ", name).strip()
161
  return name or "chapter"
162
 
 
 
 
 
 
 
 
 
 
 
 
163
  @st.cache_resource(show_spinner=False)
164
- def load_tts():
165
- device, device_id, dtype = pick_device()
166
-
167
- # IMPORTANT: trust_remote_code=True for new architectures
168
- pipe_obj = pipeline(
169
- task="text-to-audio",
170
- model=MODEL_ID,
171
- device=device_id,
172
- torch_dtype=dtype,
173
- trust_remote_code=True,
174
  )
175
 
176
- proc = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
177
- speakers = safe_get_speakers(proc, pipe_obj)
178
- return pipe_obj, proc, speakers, device, dtype
 
 
 
 
 
 
 
 
 
 
 
179
 
180
  # -----------------------------
181
  # UI
@@ -184,179 +128,109 @@ st.set_page_config(page_title="Haseeb's TTS", layout="wide")
184
  st.title("🎧 Haseeb's TTS")
185
  st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
186
 
 
 
 
 
 
 
 
187
  with st.spinner("Loading model (first run can take a while)…"):
188
- pipe_obj, proc, detected_speakers, device, dtype = load_tts()
189
 
190
  colA, colB = st.columns([2, 1], gap="large")
191
 
192
  with colB:
193
  st.subheader("Controls")
 
194
 
195
- lang_label = st.selectbox(
196
- "Language",
197
- options=[x[0] for x in DEFAULT_LANGS],
198
- index=1,
199
- help="Select a language tag to steer pronunciation. 'Auto' disables the language tag.",
200
- )
201
- lang = dict(DEFAULT_LANGS).get(lang_label)
202
-
203
- st.markdown("### Voice / Speaker")
204
- speaker = None
205
- if detected_speakers:
206
- speaker_choice = st.selectbox(
207
- "Detected speakers",
208
- options=["(none)"] + detected_speakers,
209
- index=0,
210
- help="Speakers detected from model config/processor.",
211
- )
212
- speaker = None if speaker_choice == "(none)" else speaker_choice
213
- else:
214
- st.info("No speaker list detected. You can still type a custom speaker name below.")
215
 
216
- custom_speaker = st.text_input(
217
- "Custom speaker name (optional)",
218
- value="",
219
- help="If the model supports speaker conditioning by name/tag, enter it here.",
220
- ).strip()
221
- if custom_speaker:
222
- speaker = custom_speaker
223
 
224
- st.markdown("### Instruction Control")
225
- instruction = st.text_area(
226
- "Instruction (style/emotion/pacing/etc.)",
227
  value="Warm, clear narration. Medium pace. Slightly expressive.",
228
  height=90,
 
229
  ).strip()
230
- if instruction == "":
231
- instruction = None
232
-
233
- st.markdown("### Optional: Reference Voice")
234
- ref_file = st.file_uploader(
235
- "Upload reference WAV (optional)",
236
- type=["wav"],
237
- help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
238
- )
239
 
 
240
  st.markdown("### Long Text Settings")
241
- max_chars = st.slider(
242
- "Chunk size (characters)",
243
- min_value=600,
244
- max_value=3000,
245
- value=1400,
246
- step=100,
247
- help="Long chapters (10,000+ chars) are split into chunks, generated, then stitched.",
248
- )
249
- gap_ms = st.slider(
250
- "Silence between chunks (ms)",
251
- min_value=0,
252
- max_value=1200,
253
- value=250,
254
- step=50,
255
- )
256
 
 
257
  st.markdown("### Generation Parameters")
258
- max_new_tokens = st.slider(
259
- "max_new_tokens",
260
- min_value=256,
261
- max_value=4096,
262
- value=2048,
263
- step=128,
264
- help="Higher can produce longer audio per chunk but uses more compute/memory.",
265
- )
266
- temperature = st.slider(
267
- "temperature",
268
- min_value=0.1,
269
- max_value=1.5,
270
- value=0.9,
271
- step=0.1,
272
- )
273
 
 
274
  st.markdown("### MP3 Export")
275
- mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
276
- normalize = st.checkbox("Normalize output audio", value=True)
277
 
278
  with colA:
279
  st.subheader("Input")
280
 
281
- input_mode = st.radio(
282
- "Mode",
283
- ["Single chapter (paste/upload)", "Batch mode (upload multiple .txt)"],
284
- horizontal=True,
285
- )
286
 
287
- ref_audio = None
288
- if ref_file is not None:
289
- try:
290
- ref_audio = try_reference_audio(ref_file.read())
291
- except Exception as e:
292
- st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
293
- ref_audio = None
294
-
295
- gen_kwargs = {
296
- "max_new_tokens": int(max_new_tokens),
297
- "temperature": float(temperature),
298
- }
299
-
300
- def generate_mp3_from_text(chapter_text: str, label: str, progress_base: float = 0.0, progress_span: float = 1.0):
301
- chapter_text = chapter_text.strip()
302
- if not chapter_text:
303
- raise ValueError("Empty text")
304
-
305
- chunks = split_text_into_chunks(chapter_text, max_chars=max_chars)
306
  if not chunks:
307
- raise ValueError("Chunking produced no chunks")
308
 
309
  stitched = None
310
- out_sr = None
311
 
312
  for i, chunk in enumerate(chunks, start=1):
313
- st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
314
- prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
315
-
316
- out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
317
- audio = out.get("audio", None)
318
- sr = out.get("sampling_rate", None)
319
- if audio is None or sr is None:
320
- raise RuntimeError("Unexpected pipeline output")
 
321
 
322
- audio = np.asarray(audio, dtype=np.float32)
323
- if normalize:
324
  audio = normalize_audio(audio)
325
 
326
  if stitched is None:
327
  stitched = audio
328
- out_sr = int(sr)
329
  else:
330
- if int(sr) != out_sr:
331
- st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
332
  if gap_ms > 0:
333
- stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
334
  else:
335
  stitched = np.concatenate([stitched, audio])
336
 
337
  frac = i / len(chunks)
338
- st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
339
-
340
- return encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
341
 
342
- # -----------------------------
343
- # Single mode
344
- # -----------------------------
345
- if input_mode == "Single chapter (paste/upload)":
346
- single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
347
 
 
 
348
  text = ""
349
- if single_submode == "Paste text":
350
- text = st.text_area(
351
- "Chapter text",
352
- value="",
353
- height=420,
354
- placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
355
- )
356
  else:
357
- txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="single_txt")
358
- if txt_file is not None:
359
- text = txt_file.read().decode("utf-8", errors="ignore")
360
 
361
  st.write(f"**Characters:** {len(text):,}")
362
  st.divider()
@@ -366,16 +240,16 @@ with colA:
366
  st.error("Please provide some text.")
367
  st.stop()
368
 
369
- st.session_state["_progress"] = st.progress(0)
370
- st.session_state["_status"] = st.empty()
371
 
372
  try:
373
- mp3_bytes = generate_mp3_from_text(text, label="Single")
374
  except Exception as e:
375
  st.error(f"Generation failed: {e}")
376
  st.stop()
377
 
378
- st.session_state["_status"].write("✅ Done.")
379
  st.audio(mp3_bytes, format="audio/mp3")
380
  st.download_button(
381
  "Download MP3",
@@ -385,54 +259,43 @@ with colA:
385
  use_container_width=True,
386
  )
387
 
388
- # -----------------------------
389
- # Batch mode
390
- # -----------------------------
391
  else:
392
  st.markdown("Upload multiple `.txt` files (each file = one chapter).")
393
- batch_files = st.file_uploader(
394
- "Upload chapter .txt files",
395
- type=["txt"],
396
- accept_multiple_files=True,
397
- key="batch_txts",
398
- )
399
 
400
  st.divider()
401
 
402
  if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
403
- if not batch_files:
404
  st.error("Please upload at least one .txt file.")
405
  st.stop()
406
 
407
- st.session_state["_progress"] = st.progress(0)
408
- st.session_state["_status"] = st.empty()
409
 
410
  zip_buf = io.BytesIO()
411
- results_preview = []
412
 
413
  with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
414
- n = len(batch_files)
415
- for idx, f in enumerate(batch_files, start=1):
416
  raw = f.read().decode("utf-8", errors="ignore")
417
  base = sanitize_filename(os.path.splitext(f.name)[0])
418
  mp3_name = f"{base}.mp3"
419
- label = f"{idx}/{n} {base}"
420
 
421
  base_prog = (idx - 1) / n
422
  span_prog = 1.0 / n
423
 
424
  try:
425
- mp3_bytes = generate_mp3_from_text(
426
- raw, label=label, progress_base=base_prog, progress_span=span_prog
427
- )
428
  except Exception as e:
429
- st.error(f"Failed on file '{f.name}': {e}")
430
  st.stop()
431
 
432
  zf.writestr(mp3_name, mp3_bytes)
433
- results_preview.append((mp3_name, mp3_bytes))
434
 
435
- st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
436
  zip_buf.seek(0)
437
 
438
  st.download_button(
@@ -444,7 +307,7 @@ with colA:
444
  )
445
 
446
  st.markdown("### Preview")
447
- for name, mp3_bytes in results_preview:
448
  with st.expander(name, expanded=False):
449
  st.audio(mp3_bytes, format="audio/mp3")
450
  st.download_button(
 
1
  import io
 
2
  import os
3
+ import re
4
  import zipfile
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
8
 
9
  import torch
10
+ import lameenc
11
+
12
+ from qwen_tts import Qwen3TTSModel # official package API (recommended by Qwen docs)
13
 
 
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  # -----------------------------
19
+ # Text chunking (10k+ chars)
20
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
22
  text = re.sub(r"\r\n", "\n", text).strip()
23
  if not text:
 
46
  chunks.append(cur)
47
  return chunks
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ def make_silence(sr: int, ms: int) -> np.ndarray:
51
+ n = int(sr * (ms / 1000.0))
52
+ return np.zeros(n, dtype=np.float32)
53
+
54
+
55
+ def normalize_audio(x: np.ndarray) -> np.ndarray:
56
+ x = x.astype(np.float32)
57
+ peak = float(np.max(np.abs(x))) if x.size else 0.0
58
+ if peak > 0:
59
+ x = x / max(peak, 1e-8)
60
+ return x
61
+
62
+
63
+ # -----------------------------
64
+ # MP3 encoding (no ffmpeg)
65
+ # -----------------------------
66
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
67
  x = np.clip(x, -1.0, 1.0)
68
+ return (x * 32767.0).astype(np.int16).tobytes()
69
+
70
 
71
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
72
  enc = lameenc.Encoder()
73
  enc.set_bit_rate(int(bitrate_kbps))
74
  enc.set_in_sample_rate(int(sr))
75
  enc.set_channels(1)
76
+ enc.set_quality(2) # 2=high quality
77
  mp3 = enc.encode(float_to_int16_pcm(audio_float32))
78
  mp3 += enc.flush()
79
  return mp3
80
 
81
+
82
  def sanitize_filename(name: str) -> str:
83
  name = name.strip().replace("\\", "_").replace("/", "_")
84
  name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
85
  name = re.sub(r"\s+", " ", name).strip()
86
  return name or "chapter"
87
 
88
+
89
+ # -----------------------------
90
+ # Model loading (qwen-tts)
91
+ # -----------------------------
92
+ def pick_device_and_dtype():
93
+ if torch.cuda.is_available():
94
+ # bfloat16 is recommended in Qwen docs examples for modern GPUs
95
+ return "cuda:0", torch.bfloat16
96
+ return "cpu", torch.float32
97
+
98
+
99
  @st.cache_resource(show_spinner=False)
100
+ def load_qwen_tts():
101
+ device_map, dtype = pick_device_and_dtype()
102
+
103
+ model = Qwen3TTSModel.from_pretrained(
104
+ MODEL_ID,
105
+ device_map=device_map,
106
+ dtype=dtype,
 
 
 
107
  )
108
 
109
+ # Try to read supported languages/speakers from the model
110
+ # (These helper methods are documented by Qwen for CustomVoice models)
111
+ try:
112
+ speakers = model.get_supported_speakers()
113
+ except Exception:
114
+ speakers = []
115
+
116
+ try:
117
+ languages = model.get_supported_languages()
118
+ except Exception:
119
+ languages = []
120
+
121
+ return model, speakers, languages, device_map, str(dtype)
122
+
123
 
124
  # -----------------------------
125
  # UI
 
128
  st.title("🎧 Haseeb's TTS")
129
  st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
130
 
131
+ # Torch sanity check
132
+ try:
133
+ _ = torch.tensor([1.0])
134
+ except Exception as e:
135
+ st.error(f"PyTorch failed to initialize: {e}")
136
+ st.stop()
137
+
138
  with st.spinner("Loading model (first run can take a while)…"):
139
+ tts_model, supported_speakers, supported_langs, device_map, dtype_str = load_qwen_tts()
140
 
141
  colA, colB = st.columns([2, 1], gap="large")
142
 
143
  with colB:
144
  st.subheader("Controls")
145
+ st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
146
 
147
+ # Language dropdown (fallback list if model doesn't provide)
148
+ fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
149
+ lang_options = supported_langs if supported_langs else fallback_langs
150
+ language = st.selectbox("Language", options=lang_options, index=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
+ # Speaker dropdown (fallback common names from Qwen docs snippet)
153
+ fallback_speakers = ["Vivian", "Ryan"]
154
+ spk_options = supported_speakers if supported_speakers else fallback_speakers
155
+ speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
 
 
 
156
 
157
+ # Instruction control
158
+ instruct = st.text_area(
159
+ "Instruction (style/emotion/pacing)",
160
  value="Warm, clear narration. Medium pace. Slightly expressive.",
161
  height=90,
162
+ help="Leave empty for neutral/default speaking style.",
163
  ).strip()
 
 
 
 
 
 
 
 
 
164
 
165
+ # Long chapter handling
166
  st.markdown("### Long Text Settings")
167
+ max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
168
+ gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
+ # Generation params
171
  st.markdown("### Generation Parameters")
172
+ max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256, help="Increase for longer audio per chunk (more compute).")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
+ # MP3
175
  st.markdown("### MP3 Export")
176
+ mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
177
+ do_normalize = st.checkbox("Normalize output audio", value=True)
178
 
179
  with colA:
180
  st.subheader("Input")
181
 
182
+ mode = st.radio("Mode", ["Single chapter", "Batch (multiple .txt)"], horizontal=True)
 
 
 
 
183
 
184
+ progress = st.progress(0)
185
+ status = st.empty()
186
+
187
+ def synth_one_mp3(text: str, label: str, base_prog: float, span_prog: float) -> bytes:
188
+ chunks = split_text_into_chunks(text, max_chars=max_chars)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  if not chunks:
190
+ raise ValueError("No text chunks produced.")
191
 
192
  stitched = None
193
+ sr_out = None
194
 
195
  for i, chunk in enumerate(chunks, start=1):
196
+ status.write(f"{label}: chunk {i}/{len(chunks)}")
197
+
198
+ wavs, sr = tts_model.generate_custom_voice(
199
+ text=chunk,
200
+ language=language if language else "Auto",
201
+ speaker=speaker,
202
+ instruct=instruct if instruct else "",
203
+ max_new_tokens=int(max_new_tokens),
204
+ )
205
 
206
+ audio = np.asarray(wavs[0], dtype=np.float32)
207
+ if do_normalize:
208
  audio = normalize_audio(audio)
209
 
210
  if stitched is None:
211
  stitched = audio
212
+ sr_out = int(sr)
213
  else:
 
 
214
  if gap_ms > 0:
215
+ stitched = np.concatenate([stitched, make_silence(sr_out, gap_ms), audio])
216
  else:
217
  stitched = np.concatenate([stitched, audio])
218
 
219
  frac = i / len(chunks)
220
+ progress.progress(int((base_prog + frac * span_prog) * 100))
 
 
221
 
222
+ return encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
 
 
 
 
223
 
224
+ if mode == "Single chapter":
225
+ input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
226
  text = ""
227
+
228
+ if input_type == "Paste text":
229
+ text = st.text_area("Chapter text", height=420, placeholder="Paste your chapter text here…")
 
 
 
 
230
  else:
231
+ f = st.file_uploader("Upload a .txt file", type=["txt"])
232
+ if f is not None:
233
+ text = f.read().decode("utf-8", errors="ignore")
234
 
235
  st.write(f"**Characters:** {len(text):,}")
236
  st.divider()
 
240
  st.error("Please provide some text.")
241
  st.stop()
242
 
243
+ progress.progress(0)
244
+ status.write("Starting…")
245
 
246
  try:
247
+ mp3_bytes = synth_one_mp3(text, "Single", 0.0, 1.0)
248
  except Exception as e:
249
  st.error(f"Generation failed: {e}")
250
  st.stop()
251
 
252
+ status.write("✅ Done.")
253
  st.audio(mp3_bytes, format="audio/mp3")
254
  st.download_button(
255
  "Download MP3",
 
259
  use_container_width=True,
260
  )
261
 
 
 
 
262
  else:
263
  st.markdown("Upload multiple `.txt` files (each file = one chapter).")
264
+ files = st.file_uploader("Upload chapter .txt files", type=["txt"], accept_multiple_files=True)
 
 
 
 
 
265
 
266
  st.divider()
267
 
268
  if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
269
+ if not files:
270
  st.error("Please upload at least one .txt file.")
271
  st.stop()
272
 
273
+ progress.progress(0)
274
+ status.write("Starting batch…")
275
 
276
  zip_buf = io.BytesIO()
277
+ previews = []
278
 
279
  with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
280
+ n = len(files)
281
+ for idx, f in enumerate(files, start=1):
282
  raw = f.read().decode("utf-8", errors="ignore")
283
  base = sanitize_filename(os.path.splitext(f.name)[0])
284
  mp3_name = f"{base}.mp3"
 
285
 
286
  base_prog = (idx - 1) / n
287
  span_prog = 1.0 / n
288
 
289
  try:
290
+ mp3_bytes = synth_one_mp3(raw, f"{idx}/{n} {base}", base_prog, span_prog)
 
 
291
  except Exception as e:
292
+ st.error(f"Failed on '{f.name}': {e}")
293
  st.stop()
294
 
295
  zf.writestr(mp3_name, mp3_bytes)
296
+ previews.append((mp3_name, mp3_bytes))
297
 
298
+ status.write("✅ Batch complete.")
299
  zip_buf.seek(0)
300
 
301
  st.download_button(
 
307
  )
308
 
309
  st.markdown("### Preview")
310
+ for name, mp3_bytes in previews:
311
  with st.expander(name, expanded=False):
312
  st.audio(mp3_bytes, format="audio/mp3")
313
  st.download_button(