Spaces:

Jekyll2000
/

MY_TTS

Sleeping

App Files Files Community

Jekyll2000 committed on Feb 18

Commit

ab39842

verified ·

1 Parent(s): d0cc5a4

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -36

app.py CHANGED Viewed

@@ -9,24 +9,18 @@ import soundfile as sf
 import torch
 import lameenc
-from qwen_tts import Qwen3TTSModel  # official package API (recommended by Qwen docs)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
-# -----------------------------
-# Text chunking (10k+ chars)
-# -----------------------------
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
         return []
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
-    chunks = []
-    cur = ""
     for p in parts:
         if not p:
             continue
@@ -41,7 +35,6 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
                 cur = ""
             else:
                 cur = p
     if cur:
         chunks.append(cur)
     return chunks
@@ -60,23 +53,27 @@ def normalize_audio(x: np.ndarray) -> np.ndarray:
     return x
-# -----------------------------
-# MP3 encoding (no ffmpeg)
-# -----------------------------
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     """Convert float audio samples to raw 16-bit PCM bytes.

     Samples are limited to the range [-1.0, 1.0], scaled by 32767, and cast
     to int16 (the cast discards the fractional part) before serialization.
     """
     clipped = np.clip(x, -1.0, 1.0)
     scaled = clipped * 32767.0
     pcm = scaled.astype(np.int16)
     return pcm.tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
-    enc.set_quality(2)  # 2=high quality
     mp3 = enc.encode(float_to_int16_pcm(audio_float32))
     mp3 += enc.flush()
-    return mp3
 def sanitize_filename(name: str) -> str:
@@ -86,12 +83,8 @@ def sanitize_filename(name: str) -> str:
     return name or "chapter"
-# -----------------------------
-# Model loading (qwen-tts)
-# -----------------------------
 def pick_device_and_dtype():
     if torch.cuda.is_available():
-        # bfloat16 is recommended in Qwen docs examples for modern GPUs
         return "cuda:0", torch.bfloat16
     return "cpu", torch.float32
@@ -106,8 +99,6 @@ def load_qwen_tts():
         dtype=dtype,
     )
-    # Try to read supported languages/speakers from the model
-    # (These helper methods are documented by Qwen for CustomVoice models)
     try:
         speakers = model.get_supported_speakers()
     except Exception:
@@ -144,34 +135,27 @@ with colB:
     st.subheader("Controls")
     st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
-    # Language dropdown (fallback list if model doesn't provide)
     fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
     lang_options = supported_langs if supported_langs else fallback_langs
     language = st.selectbox("Language", options=lang_options, index=0)
-    # Speaker dropdown (fallback common names from Qwen docs snippet)
     fallback_speakers = ["Vivian", "Ryan"]
     spk_options = supported_speakers if supported_speakers else fallback_speakers
     speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
-    # Instruction control
     instruct = st.text_area(
         "Instruction (style/emotion/pacing)",
         value="Warm, clear narration. Medium pace. Slightly expressive.",
         height=90,
-        help="Leave empty for neutral/default speaking style.",
     ).strip()
-    # Long chapter handling
     st.markdown("### Long Text Settings")
     max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
     gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
-    # Generation params
     st.markdown("### Generation Parameters")
-    max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256, help="Increase for longer audio per chunk (more compute).")
-    # MP3
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
     do_normalize = st.checkbox("Normalize output audio", value=True)
@@ -219,7 +203,8 @@ with colA:
             frac = i / len(chunks)
             progress.progress(int((base_prog + frac * span_prog) * 100))
-        return encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
     if mode == "Single chapter":
         input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
@@ -250,10 +235,11 @@ with colA:
                 st.stop()
             status.write("✅ Done.")
-            st.audio(mp3_bytes, format="audio/mp3")
             st.download_button(
                 "Download MP3",
-                data=mp3_bytes,
                 file_name="audiobook_chapter.mp3",
                 mime="audio/mpeg",
                 use_container_width=True,
@@ -292,8 +278,8 @@ with colA:
                         st.error(f"Failed on '{f.name}': {e}")
                         st.stop()
-                    zf.writestr(mp3_name, mp3_bytes)
-                    previews.append((mp3_name, mp3_bytes))
             status.write("✅ Batch complete.")
             zip_buf.seek(0)
@@ -307,12 +293,12 @@ with colA:
             )
             st.markdown("### Preview")
-            for name, mp3_bytes in previews:
                 with st.expander(name, expanded=False):
-                    st.audio(mp3_bytes, format="audio/mp3")
                     st.download_button(
                         f"Download {name}",
-                        data=mp3_bytes,
                         file_name=name,
                         mime="audio/mpeg",
                         use_container_width=True,

 import torch
 import lameenc
+from qwen_tts import Qwen3TTSModel  # official package API
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
         return []
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
+    chunks, cur = [], ""
     for p in parts:
         if not p:
             continue
                 cur = ""
             else:
                 cur = p
     if cur:
         chunks.append(cur)
     return chunks
     return x
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     """Convert float audio samples to raw 16-bit PCM bytes.

     Samples are limited to the range [-1.0, 1.0], scaled by 32767, and cast
     to int16 (the cast discards the fractional part) before serialization.
     """
     clipped = np.clip(x, -1.0, 1.0)
     scaled = clipped * 32767.0
     pcm = scaled.astype(np.int16)
     return pcm.tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
+    """
+    Always return **bytes** (not bytearray) for Streamlit compatibility.
+    """
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
+    enc.set_quality(2)
     mp3 = enc.encode(float_to_int16_pcm(audio_float32))
     mp3 += enc.flush()
+    # lameenc sometimes returns bytearray depending on build;
+    # Streamlit requires bytes.
+    return bytes(mp3)
 def sanitize_filename(name: str) -> str:
     return name or "chapter"
 def pick_device_and_dtype():
     """Choose the device string and torch dtype to run with.

     Returns ("cuda:0", torch.bfloat16) when CUDA is available, otherwise
     ("cpu", torch.float32).
     """
     has_cuda = torch.cuda.is_available()
     device = "cuda:0" if has_cuda else "cpu"
     dtype = torch.bfloat16 if has_cuda else torch.float32
     return device, dtype
         dtype=dtype,
     )
     try:
         speakers = model.get_supported_speakers()
     except Exception:
     st.subheader("Controls")
     st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
     fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
     lang_options = supported_langs if supported_langs else fallback_langs
     language = st.selectbox("Language", options=lang_options, index=0)
     fallback_speakers = ["Vivian", "Ryan"]
     spk_options = supported_speakers if supported_speakers else fallback_speakers
     speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
     instruct = st.text_area(
         "Instruction (style/emotion/pacing)",
         value="Warm, clear narration. Medium pace. Slightly expressive.",
         height=90,
     ).strip()
     st.markdown("### Long Text Settings")
     max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
     gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
     st.markdown("### Generation Parameters")
+    max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256)
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
     do_normalize = st.checkbox("Normalize output audio", value=True)
             frac = i / len(chunks)
             progress.progress(int((base_prog + frac * span_prog) * 100))
+        mp3_bytes = encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
+        return bytes(mp3_bytes)  # ensure bytes
     if mode == "Single chapter":
         input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
                 st.stop()
             status.write("✅ Done.")
+            st.audio(bytes(mp3_bytes), format="audio/mp3")
             st.download_button(
                 "Download MP3",
+                data=bytes(mp3_bytes),
                 file_name="audiobook_chapter.mp3",
                 mime="audio/mpeg",
                 use_container_width=True,
                         st.error(f"Failed on '{f.name}': {e}")
                         st.stop()
+                    zf.writestr(mp3_name, bytes(mp3_bytes))
+                    previews.append((mp3_name, bytes(mp3_bytes)))
             status.write("✅ Batch complete.")
             zip_buf.seek(0)
             )
             st.markdown("### Preview")
+            for name, mp3_b in previews:
                 with st.expander(name, expanded=False):
+                    st.audio(bytes(mp3_b), format="audio/mp3")
                     st.download_button(
                         f"Download {name}",
+                        data=bytes(mp3_b),
                         file_name=name,
                         mime="audio/mpeg",
                         use_container_width=True,