Jekyll2000 commited on
Commit
db6c05c
·
verified ·
1 Parent(s): 1c065e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -88
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import io
2
  import re
3
- import base64
 
4
  import numpy as np
5
  import streamlit as st
6
  import soundfile as sf
@@ -8,6 +9,8 @@ import soundfile as sf
8
  import torch
9
  from transformers import pipeline, AutoProcessor
10
 
 
 
11
 
12
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
13
 
@@ -40,7 +43,7 @@ def pick_device():
40
 
41
  def normalize_audio(x: np.ndarray) -> np.ndarray:
42
  x = x.astype(np.float32)
43
- peak = np.max(np.abs(x)) if x.size else 0.0
44
  if peak > 0:
45
  x = x / max(peak, 1e-8)
46
  return x
@@ -50,16 +53,10 @@ def make_silence(sr: int, ms: int) -> np.ndarray:
50
  return np.zeros(n, dtype=np.float32)
51
 
52
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
53
- """
54
- Chunk long text into <= max_chars chunks.
55
- Uses sentence-ish boundaries where possible.
56
- """
57
  text = re.sub(r"\r\n", "\n", text).strip()
58
  if not text:
59
  return []
60
 
61
- # Split into "sentences" while keeping separators
62
- # Works decently for many languages, not perfect.
63
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
64
  chunks = []
65
  cur = ""
@@ -72,7 +69,6 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
72
  else:
73
  if cur:
74
  chunks.append(cur)
75
- # If a single part is huge, hard-split it
76
  if len(p) > max_chars:
77
  for i in range(0, len(p), max_chars):
78
  chunks.append(p[i:i+max_chars])
@@ -85,10 +81,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
85
  return chunks
86
 
87
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
88
- """
89
- Many instructionable TTS models accept tags.
90
- If your model expects a different schema, adjust here.
91
- """
92
  tags = []
93
  if lang:
94
  tags.append(f"[LANG={lang}]")
@@ -99,22 +92,14 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
99
  return " ".join(tags + [text])
100
 
101
  def safe_get_speakers(proc, pipe_obj):
102
- """
103
- Try to discover speakers/voices from processor/config.
104
- If none found, return empty list.
105
- """
106
- candidates = []
107
- # From processor
108
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
109
  if hasattr(proc, attr):
110
  val = getattr(proc, attr)
111
  if isinstance(val, dict):
112
- candidates = list(val.keys())
113
- return sorted(set(map(str, candidates)))
114
  if isinstance(val, (list, tuple)):
115
  return sorted(set(map(str, val)))
116
 
117
- # From model config
118
  model = getattr(pipe_obj, "model", None)
119
  cfg = getattr(model, "config", None) if model is not None else None
120
  if cfg is not None:
@@ -122,28 +107,55 @@ def safe_get_speakers(proc, pipe_obj):
122
  if hasattr(cfg, attr):
123
  val = getattr(cfg, attr)
124
  if isinstance(val, dict):
125
- candidates = list(val.keys())
126
- return sorted(set(map(str, candidates)))
127
  if isinstance(val, (list, tuple)):
128
  return sorted(set(map(str, val)))
129
 
130
  return []
131
 
132
  def try_reference_audio(wav_bytes: bytes):
133
- """
134
- Load a reference wav file into a dict compatible with HF audio pipelines.
135
- """
136
  audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
137
  if audio.ndim > 1:
138
  audio = audio.mean(axis=1)
139
  return {"array": audio, "sampling_rate": sr}
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  @st.cache_resource(show_spinner=False)
143
  def load_tts():
144
  device, device_id, dtype = pick_device()
145
  pipe_obj = pipeline(
146
- task="text-to-audio", # alias: "text-to-speech"
147
  model=MODEL_ID,
148
  device=device_id,
149
  torch_dtype=dtype,
@@ -153,33 +165,12 @@ def load_tts():
153
  return pipe_obj, proc, speakers, device, dtype
154
 
155
 
156
- def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
157
- """
158
- Run pipeline for one chunk.
159
- If ref_audio isn't supported by this model/pipeline, ignore it gracefully.
160
- """
161
- if ref_audio is not None:
162
- try:
163
- out = pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
164
- return out
165
- except TypeError:
166
- # pipeline/model does not accept ref_audio
167
- pass
168
- except Exception:
169
- # any other issue: also fall back without ref audio
170
- pass
171
-
172
- out = pipe_obj(prompt, **gen_kwargs)
173
- return out
174
-
175
-
176
  # -----------------------------
177
  # UI
178
  # -----------------------------
179
- st.set_page_config(page_title="Qwen3 TTS Audiobook Generator", layout="wide")
180
-
181
- st.title("🎧 Qwen3 TTS Audiobook Generator")
182
- st.caption(f"Model: `{MODEL_ID}`")
183
 
184
  with st.spinner("Loading model (first run can take a while)..."):
185
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
@@ -189,57 +180,51 @@ colA, colB = st.columns([2, 1], gap="large")
189
  with colB:
190
  st.subheader("Controls")
191
 
192
- # Language
193
  lang_label = st.selectbox(
194
  "Language",
195
  options=[x[0] for x in DEFAULT_LANGS],
196
- index=1, # English default
197
- help="Select a language tag to steer pronunciation. 'Auto' disables language tag.",
198
  )
199
  lang = dict(DEFAULT_LANGS).get(lang_label)
200
 
201
- # Speakers / voices
202
  st.markdown("### Voice / Speaker")
 
203
  if detected_speakers:
204
  speaker_choice = st.selectbox(
205
  "Detected speakers",
206
  options=["(none)"] + detected_speakers,
207
  index=0,
208
- help="Speakers detected from model config/processor. If empty, use Custom speaker field below.",
209
  )
210
  speaker = None if speaker_choice == "(none)" else speaker_choice
211
  else:
212
- st.info("No speaker list detected from model config. You can still provide a speaker name below.")
213
- speaker = None
214
 
215
  custom_speaker = st.text_input(
216
  "Custom speaker name (optional)",
217
  value="",
218
- help="If your model supports speaker conditioning by name/tag, enter it here.",
219
  ).strip()
220
  if custom_speaker:
221
  speaker = custom_speaker
222
 
223
- # Instruction control
224
  st.markdown("### Instruction Control")
225
  instruction = st.text_area(
226
  "Instruction (style/emotion/pacing/etc.)",
227
  value="Warm, clear narration. Medium pace. Slightly expressive.",
228
  height=90,
229
- help="Free-form style instruction. Example: 'Calm, slow, deep voice. Dramatic pauses.'",
230
  ).strip()
231
  if instruction == "":
232
  instruction = None
233
 
234
- # Reference voice (optional)
235
  st.markdown("### Optional: Reference Voice")
236
  ref_file = st.file_uploader(
237
  "Upload reference WAV (optional)",
238
  type=["wav"],
239
- help="If the model supports 'CustomVoice' conditioning, this can steer voice cloning. If unsupported, it will be ignored.",
240
  )
241
 
242
- # Long-text chunking
243
  st.markdown("### Long Text (Audiobook)")
244
  max_chars = st.slider(
245
  "Chunk size (characters)",
@@ -247,7 +232,7 @@ with colB:
247
  max_value=3000,
248
  value=1400,
249
  step=100,
250
- help="10,000 chars will be split into multiple chunks then stitched into one WAV.",
251
  )
252
  gap_ms = st.slider(
253
  "Silence between chunks (ms)",
@@ -255,10 +240,8 @@ with colB:
255
  max_value=1200,
256
  value=250,
257
  step=50,
258
- help="Adds a small pause between chunks.",
259
  )
260
 
261
- # Generation parameters (audio length etc.)
262
  st.markdown("### Generation Parameters")
263
  max_new_tokens = st.slider(
264
  "max_new_tokens",
@@ -274,9 +257,10 @@ with colB:
274
  max_value=1.5,
275
  value=0.9,
276
  step=0.1,
277
- help="Sampling temperature (if supported by the model).",
278
  )
279
 
 
 
280
  normalize = st.checkbox("Normalize output audio", value=True)
281
 
282
  with colA:
@@ -290,7 +274,7 @@ with colA:
290
  "Chapter text",
291
  value="",
292
  height=420,
293
- placeholder="Paste up to ~10,000+ characters here. The app will chunk and stitch.",
294
  )
295
  else:
296
  txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
@@ -301,8 +285,7 @@ with colA:
301
 
302
  st.divider()
303
 
304
- # Run
305
- generate = st.button("Generate Audiobook WAV", type="primary", use_container_width=True)
306
 
307
  if generate:
308
  if not text.strip():
@@ -316,7 +299,6 @@ with colA:
316
 
317
  st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
318
 
319
- # Prepare reference audio (optional)
320
  ref_audio = None
321
  if ref_file is not None:
322
  try:
@@ -336,7 +318,6 @@ with colA:
336
  stitched = None
337
  out_sr = None
338
 
339
- # Generate each chunk and stitch
340
  for i, chunk in enumerate(chunks, start=1):
341
  status.write(f"Generating chunk {i}/{len(chunks)} …")
342
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
@@ -361,12 +342,10 @@ with colA:
361
  stitched = audio
362
  out_sr = int(sr)
363
  else:
364
- # If sample rates differ, you should resample. Most pipelines keep it consistent.
365
  if int(sr) != out_sr:
366
  st.warning(
367
  f"Chunk {i} sample rate {sr} != {out_sr}. "
368
- "For best results, adjust to a consistent sample rate. "
369
- "Stitching anyway."
370
  )
371
  if gap_ms > 0:
372
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
@@ -375,21 +354,22 @@ with colA:
375
 
376
  progress.progress(int((i / len(chunks)) * 100))
377
 
378
- status.write("✅ Done. Preparing download…")
379
 
380
- # Write WAV to bytes
381
- wav_buf = io.BytesIO()
382
- sf.write(wav_buf, stitched, out_sr, format="WAV")
383
- wav_bytes = wav_buf.getvalue()
 
384
 
385
- st.audio(wav_bytes, format="audio/wav")
386
 
387
  st.download_button(
388
- "Download WAV",
389
- data=wav_bytes,
390
- file_name="audiobook_chapter.wav",
391
- mime="audio/wav",
392
  use_container_width=True,
393
  )
394
 
395
- st.success("Generated audiobook chapter WAV successfully.")
 
1
  import io
2
  import re
3
+ import math
4
+ import os
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
 
9
  import torch
10
  from transformers import pipeline, AutoProcessor
11
 
12
+ import lameenc # MP3 encoder (no ffmpeg needed)
13
+
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
 
43
 
44
def normalize_audio(x: np.ndarray) -> np.ndarray:
    """Scale audio so its absolute peak is 1.0, returned as float32.

    Empty or all-zero input is returned unchanged (aside from the
    float32 cast), since there is no peak to scale by.
    """
    samples = x.astype(np.float32)
    if samples.size == 0:
        return samples
    peak = float(np.abs(samples).max())
    if peak > 0:
        samples = samples / max(peak, 1e-8)
    return samples
 
53
  return np.zeros(n, dtype=np.float32)
54
 
55
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
 
 
 
 
56
  text = re.sub(r"\r\n", "\n", text).strip()
57
  if not text:
58
  return []
59
 
 
 
60
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
61
  chunks = []
62
  cur = ""
 
69
  else:
70
  if cur:
71
  chunks.append(cur)
 
72
  if len(p) > max_chars:
73
  for i in range(0, len(p), max_chars):
74
  chunks.append(p[i:i+max_chars])
 
81
  return chunks
82
 
83
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
84
+ # Adjust tag format if you later confirm the model expects different tokens
 
 
 
85
  tags = []
86
  if lang:
87
  tags.append(f"[LANG={lang}]")
 
92
  return " ".join(tags + [text])
93
 
94
  def safe_get_speakers(proc, pipe_obj):
 
 
 
 
 
 
95
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
96
  if hasattr(proc, attr):
97
  val = getattr(proc, attr)
98
  if isinstance(val, dict):
99
+ return sorted(set(map(str, val.keys())))
 
100
  if isinstance(val, (list, tuple)):
101
  return sorted(set(map(str, val)))
102
 
 
103
  model = getattr(pipe_obj, "model", None)
104
  cfg = getattr(model, "config", None) if model is not None else None
105
  if cfg is not None:
 
107
  if hasattr(cfg, attr):
108
  val = getattr(cfg, attr)
109
  if isinstance(val, dict):
110
+ return sorted(set(map(str, val.keys())))
 
111
  if isinstance(val, (list, tuple)):
112
  return sorted(set(map(str, val)))
113
 
114
  return []
115
 
116
def try_reference_audio(wav_bytes: bytes):
    """Decode WAV bytes into the dict shape HF audio pipelines accept.

    Multi-channel input is downmixed to mono by averaging channels.
    Returns {"array": float32 samples, "sampling_rate": int}.
    """
    data, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
    if data.ndim > 1:
        data = data.mean(axis=1)
    return {"array": data, "sampling_rate": rate}
121
 
122
def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
    """Synthesize one text chunk with the TTS pipeline.

    If *ref_audio* is given it is forwarded as a voice-cloning hint. Should
    the pipeline reject it — TypeError from an unsupported keyword, or any
    runtime failure while conditioning — we deliberately fall back to plain
    synthesis instead of aborting the whole audiobook run.

    Returns whatever the pipeline returns (presumably a dict with audio
    data and a sampling rate — confirm against the pipeline used).
    """
    if ref_audio is not None:
        try:
            return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
        except Exception:
            # Best-effort fallback: covers TypeError (kwarg unsupported)
            # and any other conditioning failure — retry without ref audio.
            # (The original had separate TypeError/Exception clauses doing
            # the same thing; TypeError is an Exception subclass.)
            pass
    return pipe_obj(prompt, **gen_kwargs)
131
+
132
def float_to_int16_pcm(x: np.ndarray) -> bytes:
    """Convert float audio to raw native-endian int16 PCM bytes.

    Values outside [-1, 1] are clipped before scaling by 32767.
    """
    clipped = np.clip(x, -1.0, 1.0)
    return (clipped * 32767.0).astype(np.int16).tobytes()
136
+
137
def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
    """Encode mono float32 audio (-1..1) to MP3 bytes using lameenc.

    Pure-Python encoder path — no ffmpeg binary required.

    Args:
        audio_float32: mono samples in [-1, 1].
        sr: input sample rate in Hz.
        bitrate_kbps: target MP3 bitrate.
    """
    encoder = lameenc.Encoder()
    encoder.set_bit_rate(bitrate_kbps)
    encoder.set_in_sample_rate(sr)
    encoder.set_channels(1)
    encoder.set_quality(2)  # 2 = high quality, 7 = fastest

    raw_pcm = float_to_int16_pcm(audio_float32)
    mp3_data = encoder.encode(raw_pcm)
    mp3_data += encoder.flush()  # flush() emits the final frames
    return mp3_data
152
+
153
 
154
  @st.cache_resource(show_spinner=False)
155
  def load_tts():
156
  device, device_id, dtype = pick_device()
157
  pipe_obj = pipeline(
158
+ task="text-to-audio",
159
  model=MODEL_ID,
160
  device=device_id,
161
  torch_dtype=dtype,
 
165
  return pipe_obj, proc, speakers, device, dtype
166
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # -----------------------------
169
  # UI
170
  # -----------------------------
171
+ st.set_page_config(page_title="Haseeb's TTS", layout="wide")
172
+ st.title("🎧 Haseeb's TTS")
173
+ st.caption("Audiobook Generator MP3 Output • Language • Voices • Instruction Control")
 
174
 
175
  with st.spinner("Loading model (first run can take a while)..."):
176
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
 
180
  with colB:
181
  st.subheader("Controls")
182
 
 
183
  lang_label = st.selectbox(
184
  "Language",
185
  options=[x[0] for x in DEFAULT_LANGS],
186
+ index=1,
187
+ help="Select a language tag to steer pronunciation. 'Auto' disables the language tag.",
188
  )
189
  lang = dict(DEFAULT_LANGS).get(lang_label)
190
 
 
191
  st.markdown("### Voice / Speaker")
192
+ speaker = None
193
  if detected_speakers:
194
  speaker_choice = st.selectbox(
195
  "Detected speakers",
196
  options=["(none)"] + detected_speakers,
197
  index=0,
198
+ help="Speakers detected from model config/processor.",
199
  )
200
  speaker = None if speaker_choice == "(none)" else speaker_choice
201
  else:
202
+ st.info("No speaker list detected from model config. You can still type a custom speaker name below.")
 
203
 
204
  custom_speaker = st.text_input(
205
  "Custom speaker name (optional)",
206
  value="",
207
+ help="If the model supports speaker conditioning by name/tag, enter it here.",
208
  ).strip()
209
  if custom_speaker:
210
  speaker = custom_speaker
211
 
 
212
  st.markdown("### Instruction Control")
213
  instruction = st.text_area(
214
  "Instruction (style/emotion/pacing/etc.)",
215
  value="Warm, clear narration. Medium pace. Slightly expressive.",
216
  height=90,
 
217
  ).strip()
218
  if instruction == "":
219
  instruction = None
220
 
 
221
  st.markdown("### Optional: Reference Voice")
222
  ref_file = st.file_uploader(
223
  "Upload reference WAV (optional)",
224
  type=["wav"],
225
+ help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
226
  )
227
 
 
228
  st.markdown("### Long Text (Audiobook)")
229
  max_chars = st.slider(
230
  "Chunk size (characters)",
 
232
  max_value=3000,
233
  value=1400,
234
  step=100,
235
+ help="10,000 chars will be split into multiple chunks then stitched.",
236
  )
237
  gap_ms = st.slider(
238
  "Silence between chunks (ms)",
 
240
  max_value=1200,
241
  value=250,
242
  step=50,
 
243
  )
244
 
 
245
  st.markdown("### Generation Parameters")
246
  max_new_tokens = st.slider(
247
  "max_new_tokens",
 
257
  max_value=1.5,
258
  value=0.9,
259
  step=0.1,
 
260
  )
261
 
262
+ st.markdown("### MP3 Export")
263
+ mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
264
  normalize = st.checkbox("Normalize output audio", value=True)
265
 
266
  with colA:
 
274
  "Chapter text",
275
  value="",
276
  height=420,
277
+ placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
278
  )
279
  else:
280
  txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
 
285
 
286
  st.divider()
287
 
288
+ generate = st.button("Generate MP3 Audiobook", type="primary", use_container_width=True)
 
289
 
290
  if generate:
291
  if not text.strip():
 
299
 
300
  st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
301
 
 
302
  ref_audio = None
303
  if ref_file is not None:
304
  try:
 
318
  stitched = None
319
  out_sr = None
320
 
 
321
  for i, chunk in enumerate(chunks, start=1):
322
  status.write(f"Generating chunk {i}/{len(chunks)} …")
323
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
 
342
  stitched = audio
343
  out_sr = int(sr)
344
  else:
 
345
  if int(sr) != out_sr:
346
  st.warning(
347
  f"Chunk {i} sample rate {sr} != {out_sr}. "
348
+ "Stitching anyway (best if consistent)."
 
349
  )
350
  if gap_ms > 0:
351
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
 
354
 
355
  progress.progress(int((i / len(chunks)) * 100))
356
 
357
+ status.write("✅ Done. Encoding MP3…")
358
 
359
+ try:
360
+ mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
361
+ except Exception as e:
362
+ st.error(f"MP3 encoding failed: {e}")
363
+ st.stop()
364
 
365
+ st.audio(mp3_bytes, format="audio/mp3")
366
 
367
  st.download_button(
368
+ "Download MP3",
369
+ data=mp3_bytes,
370
+ file_name="audiobook_chapter.mp3",
371
+ mime="audio/mpeg",
372
  use_container_width=True,
373
  )
374
 
375
+ st.success("Generated MP3 audiobook successfully.")