Jekyll2000 committed on
Commit
ab39842
·
verified ·
1 Parent(s): d0cc5a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -36
app.py CHANGED
@@ -9,24 +9,18 @@ import soundfile as sf
9
  import torch
10
  import lameenc
11
 
12
- from qwen_tts import Qwen3TTSModel # official package API (recommended by Qwen docs)
13
 
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
17
 
18
- # -----------------------------
19
- # Text chunking (10k+ chars)
20
- # -----------------------------
21
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
22
  text = re.sub(r"\r\n", "\n", text).strip()
23
  if not text:
24
  return []
25
-
26
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
27
- chunks = []
28
- cur = ""
29
-
30
  for p in parts:
31
  if not p:
32
  continue
@@ -41,7 +35,6 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
41
  cur = ""
42
  else:
43
  cur = p
44
-
45
  if cur:
46
  chunks.append(cur)
47
  return chunks
@@ -60,23 +53,27 @@ def normalize_audio(x: np.ndarray) -> np.ndarray:
60
  return x
61
 
62
 
63
- # -----------------------------
64
- # MP3 encoding (no ffmpeg)
65
- # -----------------------------
66
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
67
  x = np.clip(x, -1.0, 1.0)
68
  return (x * 32767.0).astype(np.int16).tobytes()
69
 
70
 
71
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
 
 
 
72
  enc = lameenc.Encoder()
73
  enc.set_bit_rate(int(bitrate_kbps))
74
  enc.set_in_sample_rate(int(sr))
75
  enc.set_channels(1)
76
- enc.set_quality(2) # 2=high quality
 
77
  mp3 = enc.encode(float_to_int16_pcm(audio_float32))
78
  mp3 += enc.flush()
79
- return mp3
 
 
 
80
 
81
 
82
  def sanitize_filename(name: str) -> str:
@@ -86,12 +83,8 @@ def sanitize_filename(name: str) -> str:
86
  return name or "chapter"
87
 
88
 
89
- # -----------------------------
90
- # Model loading (qwen-tts)
91
- # -----------------------------
92
  def pick_device_and_dtype():
93
  if torch.cuda.is_available():
94
- # bfloat16 is recommended in Qwen docs examples for modern GPUs
95
  return "cuda:0", torch.bfloat16
96
  return "cpu", torch.float32
97
 
@@ -106,8 +99,6 @@ def load_qwen_tts():
106
  dtype=dtype,
107
  )
108
 
109
- # Try to read supported languages/speakers from the model
110
- # (These helper methods are documented by Qwen for CustomVoice models)
111
  try:
112
  speakers = model.get_supported_speakers()
113
  except Exception:
@@ -144,34 +135,27 @@ with colB:
144
  st.subheader("Controls")
145
  st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
146
 
147
- # Language dropdown (fallback list if model doesn't provide)
148
  fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
149
  lang_options = supported_langs if supported_langs else fallback_langs
150
  language = st.selectbox("Language", options=lang_options, index=0)
151
 
152
- # Speaker dropdown (fallback common names from Qwen docs snippet)
153
  fallback_speakers = ["Vivian", "Ryan"]
154
  spk_options = supported_speakers if supported_speakers else fallback_speakers
155
  speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
156
 
157
- # Instruction control
158
  instruct = st.text_area(
159
  "Instruction (style/emotion/pacing)",
160
  value="Warm, clear narration. Medium pace. Slightly expressive.",
161
  height=90,
162
- help="Leave empty for neutral/default speaking style.",
163
  ).strip()
164
 
165
- # Long chapter handling
166
  st.markdown("### Long Text Settings")
167
  max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
168
  gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
169
 
170
- # Generation params
171
  st.markdown("### Generation Parameters")
172
- max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256, help="Increase for longer audio per chunk (more compute).")
173
 
174
- # MP3
175
  st.markdown("### MP3 Export")
176
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
177
  do_normalize = st.checkbox("Normalize output audio", value=True)
@@ -219,7 +203,8 @@ with colA:
219
  frac = i / len(chunks)
220
  progress.progress(int((base_prog + frac * span_prog) * 100))
221
 
222
- return encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
 
223
 
224
  if mode == "Single chapter":
225
  input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
@@ -250,10 +235,11 @@ with colA:
250
  st.stop()
251
 
252
  status.write("✅ Done.")
253
- st.audio(mp3_bytes, format="audio/mp3")
 
254
  st.download_button(
255
  "Download MP3",
256
- data=mp3_bytes,
257
  file_name="audiobook_chapter.mp3",
258
  mime="audio/mpeg",
259
  use_container_width=True,
@@ -292,8 +278,8 @@ with colA:
292
  st.error(f"Failed on '{f.name}': {e}")
293
  st.stop()
294
 
295
- zf.writestr(mp3_name, mp3_bytes)
296
- previews.append((mp3_name, mp3_bytes))
297
 
298
  status.write("✅ Batch complete.")
299
  zip_buf.seek(0)
@@ -307,12 +293,12 @@ with colA:
307
  )
308
 
309
  st.markdown("### Preview")
310
- for name, mp3_bytes in previews:
311
  with st.expander(name, expanded=False):
312
- st.audio(mp3_bytes, format="audio/mp3")
313
  st.download_button(
314
  f"Download {name}",
315
- data=mp3_bytes,
316
  file_name=name,
317
  mime="audio/mpeg",
318
  use_container_width=True,
 
9
  import torch
10
  import lameenc
11
 
12
+ from qwen_tts import Qwen3TTSModel # official package API
13
 
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
17
 
 
 
 
18
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
19
  text = re.sub(r"\r\n", "\n", text).strip()
20
  if not text:
21
  return []
 
22
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
23
+ chunks, cur = [], ""
 
 
24
  for p in parts:
25
  if not p:
26
  continue
 
35
  cur = ""
36
  else:
37
  cur = p
 
38
  if cur:
39
  chunks.append(cur)
40
  return chunks
 
53
  return x
54
 
55
 
 
 
 
56
  def float_to_int16_pcm(x: np.ndarray) -> bytes:
57
  x = np.clip(x, -1.0, 1.0)
58
  return (x * 32767.0).astype(np.int16).tobytes()
59
 
60
 
61
  def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
62
+ """
63
+ Always return **bytes** (not bytearray) for Streamlit compatibility.
64
+ """
65
  enc = lameenc.Encoder()
66
  enc.set_bit_rate(int(bitrate_kbps))
67
  enc.set_in_sample_rate(int(sr))
68
  enc.set_channels(1)
69
+ enc.set_quality(2)
70
+
71
  mp3 = enc.encode(float_to_int16_pcm(audio_float32))
72
  mp3 += enc.flush()
73
+
74
+ # lameenc sometimes returns bytearray depending on build;
75
+ # Streamlit requires bytes.
76
+ return bytes(mp3)
77
 
78
 
79
  def sanitize_filename(name: str) -> str:
 
83
  return name or "chapter"
84
 
85
 
 
 
 
86
  def pick_device_and_dtype():
87
  if torch.cuda.is_available():
 
88
  return "cuda:0", torch.bfloat16
89
  return "cpu", torch.float32
90
 
 
99
  dtype=dtype,
100
  )
101
 
 
 
102
  try:
103
  speakers = model.get_supported_speakers()
104
  except Exception:
 
135
  st.subheader("Controls")
136
  st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
137
 
 
138
  fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
139
  lang_options = supported_langs if supported_langs else fallback_langs
140
  language = st.selectbox("Language", options=lang_options, index=0)
141
 
 
142
  fallback_speakers = ["Vivian", "Ryan"]
143
  spk_options = supported_speakers if supported_speakers else fallback_speakers
144
  speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
145
 
 
146
  instruct = st.text_area(
147
  "Instruction (style/emotion/pacing)",
148
  value="Warm, clear narration. Medium pace. Slightly expressive.",
149
  height=90,
 
150
  ).strip()
151
 
 
152
  st.markdown("### Long Text Settings")
153
  max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
154
  gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
155
 
 
156
  st.markdown("### Generation Parameters")
157
+ max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256)
158
 
 
159
  st.markdown("### MP3 Export")
160
  mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
161
  do_normalize = st.checkbox("Normalize output audio", value=True)
 
203
  frac = i / len(chunks)
204
  progress.progress(int((base_prog + frac * span_prog) * 100))
205
 
206
+ mp3_bytes = encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
207
+ return bytes(mp3_bytes) # ensure bytes
208
 
209
  if mode == "Single chapter":
210
  input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
 
235
  st.stop()
236
 
237
  status.write("✅ Done.")
238
+ st.audio(bytes(mp3_bytes), format="audio/mp3")
239
+
240
  st.download_button(
241
  "Download MP3",
242
+ data=bytes(mp3_bytes),
243
  file_name="audiobook_chapter.mp3",
244
  mime="audio/mpeg",
245
  use_container_width=True,
 
278
  st.error(f"Failed on '{f.name}': {e}")
279
  st.stop()
280
 
281
+ zf.writestr(mp3_name, bytes(mp3_bytes))
282
+ previews.append((mp3_name, bytes(mp3_bytes)))
283
 
284
  status.write("✅ Batch complete.")
285
  zip_buf.seek(0)
 
293
  )
294
 
295
  st.markdown("### Preview")
296
+ for name, mp3_b in previews:
297
  with st.expander(name, expanded=False):
298
+ st.audio(bytes(mp3_b), format="audio/mp3")
299
  st.download_button(
300
  f"Download {name}",
301
+ data=bytes(mp3_b),
302
  file_name=name,
303
  mime="audio/mpeg",
304
  use_container_width=True,