Spaces:

Jekyll2000
/

MY_TTS

Sleeping

App Files Community

Jekyll2000 committed on Feb 18

Commit

a15f8bb

verified ·

1 Parent(s): 010981c

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -46

app.py CHANGED Viewed

@@ -1,17 +1,30 @@
 import io
 import re
 import zipfile
 import numpy as np
 import streamlit as st
 import soundfile as sf
 import torch
-from transformers import pipeline, AutoProcessor
 import lameenc  # MP3 encoder (no ffmpeg needed)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 # -----------------------------
 # Helpers
 # -----------------------------
@@ -34,11 +47,13 @@ DEFAULT_LANGS = [
     ("Vietnamese", "vi"),
 ]
 def pick_device():
     """Choose the compute target for inference.
     Returns a ``(device_name, device_index, dtype)`` tuple:
     ``("cuda", 0, torch.float16)`` when CUDA is available, otherwise
     ``("cpu", -1, torch.float32)``.
     """
     has_gpu = torch.cuda.is_available()
     if not has_gpu:
         return "cpu", -1, torch.float32
     return "cuda", 0, torch.float16
 def normalize_audio(x: np.ndarray) -> np.ndarray:
     x = x.astype(np.float32)
     peak = float(np.max(np.abs(x))) if x.size else 0.0
@@ -46,16 +61,17 @@ def normalize_audio(x: np.ndarray) -> np.ndarray:
         x = x / max(peak, 1e-8)
     return x
 def make_silence(sr: int, ms: int) -> np.ndarray:
     """Return ``ms`` milliseconds of mono silence at sample rate ``sr``.
     The result is a 1-D float32 array of zeros whose length is
     ``int(sr * ms / 1000)``. A negative duration (or sample rate) yields an
     empty array instead of letting ``np.zeros`` raise ``ValueError`` on a
     negative length.
     """
     # Clamp at zero so a stray negative gap cannot abort a long synthesis run.
     n = max(0, int(sr * (ms / 1000.0)))
     return np.zeros(n, dtype=np.float32)
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
         return []
-    # Sentence-ish split (works across many languages reasonably)
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
     chunks = []
     cur = ""
@@ -69,9 +85,8 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
             if cur:
                 chunks.append(cur)
             if len(p) > max_chars:
-                # hard-split huge segments
                 for i in range(0, len(p), max_chars):
-                    chunks.append(p[i:i+max_chars])
                 cur = ""
             else:
                 cur = p
@@ -80,11 +95,8 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
         chunks.append(cur)
     return chunks
 def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
-    """
-    Tag-based control. If you later confirm a different schema from Qwen's demo,
-    you only need to change this function.
-    """
     tags = []
     if lang:
         tags.append(f"[LANG={lang}]")
@@ -94,8 +106,8 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
         tags.append(f"[INSTRUCTION={instruction}]")
     return " ".join(tags + [text])
 def safe_get_speakers(proc, pipe_obj):
-    # Try processor attributes
     for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
         if hasattr(proc, attr):
             val = getattr(proc, attr)
@@ -104,7 +116,6 @@ def safe_get_speakers(proc, pipe_obj):
             if isinstance(val, (list, tuple)):
                 return sorted(set(map(str, val)))
-    # Try model config attributes
     model = getattr(pipe_obj, "model", None)
     cfg = getattr(model, "config", None) if model is not None else None
     if cfg is not None:
@@ -118,14 +129,15 @@ def safe_get_speakers(proc, pipe_obj):
     return []
 def try_reference_audio(wav_bytes: bytes):
     """Decode uploaded audio bytes into a single-channel reference clip.
     The bytes are read with soundfile as float32. When the decoded array has
     more than one dimension it is averaged over axis 1 to collapse it to 1-D.
     Returns a dict with the samples under ``"array"`` and the rate reported by
     soundfile under ``"sampling_rate"``.
     """
     samples, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
     mono = samples.mean(axis=1) if samples.ndim > 1 else samples
     return {"array": mono, "sampling_rate": rate}
 def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
-    # Try with reference audio if supported; otherwise fall back gracefully
     if ref_audio is not None:
         try:
             return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
@@ -135,27 +147,25 @@ def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
             pass
     return pipe_obj(prompt, **gen_kwargs)
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     """Convert float audio in the range -1..1 to raw signed 16-bit PCM bytes.
     Samples outside [-1, 1] are clipped before scaling by 32767. NaN samples
     are replaced with 0.0 first, because casting NaN to int16 is undefined
     and would otherwise put arbitrary values into the encoded audio. The
     input array is not modified. Bytes are in the platform's native order.
     """
     # nan_to_num copies by default, so the caller's buffer stays untouched.
     x = np.nan_to_num(x, nan=0.0)
     x = np.clip(x, -1.0, 1.0)
     pcm = (x * 32767.0).astype(np.int16)
     return pcm.tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
-    """
-    Encode mono float32 audio (-1..1) to MP3 bytes using lameenc.
-    No ffmpeg required.
-    """
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
-    enc.set_quality(2)  # 2=high quality, 7=faster
     pcm_bytes = float_to_int16_pcm(audio_float32)
     mp3 = enc.encode(pcm_bytes)
     mp3 += enc.flush()
     return mp3
 def sanitize_filename(name: str) -> str:
     name = name.strip().replace("\\", "_").replace("/", "_")
     name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
@@ -164,6 +174,7 @@ def sanitize_filename(name: str) -> str:
         name = "chapter"
     return name
 @st.cache_resource(show_spinner=False)
 def load_tts():
     device, device_id, dtype = pick_device()
@@ -193,7 +204,6 @@ colA, colB = st.columns([2, 1], gap="large")
 with colB:
     st.subheader("Controls")
-    # Language
     lang_label = st.selectbox(
         "Language",
         options=[x[0] for x in DEFAULT_LANGS],
@@ -202,7 +212,6 @@ with colB:
     )
     lang = dict(DEFAULT_LANGS).get(lang_label)
-    # Speakers
     st.markdown("### Voice / Speaker")
     speaker = None
     if detected_speakers:
@@ -224,7 +233,6 @@ with colB:
     if custom_speaker:
         speaker = custom_speaker
-    # Instruction
     st.markdown("### Instruction Control")
     instruction = st.text_area(
         "Instruction (style/emotion/pacing/etc.)",
@@ -234,7 +242,6 @@ with colB:
     if instruction == "":
         instruction = None
-    # Optional reference voice
     st.markdown("### Optional: Reference Voice")
     ref_file = st.file_uploader(
         "Upload reference WAV (optional)",
@@ -242,7 +249,6 @@ with colB:
         help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
     )
-    # Long text chunking
     st.markdown("### Long Text Settings")
     max_chars = st.slider(
         "Chunk size (characters)",
@@ -260,7 +266,6 @@ with colB:
         step=50,
     )
-    # Generation params
     st.markdown("### Generation Parameters")
     max_new_tokens = st.slider(
         "max_new_tokens",
@@ -278,7 +283,6 @@ with colB:
         step=0.1,
     )
-    # MP3 export
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
     normalize = st.checkbox("Normalize output audio", value=True)
@@ -292,7 +296,6 @@ with colA:
         horizontal=True,
     )
-    # Shared ref audio prep
     ref_audio = None
     if ref_file is not None:
         try:
@@ -318,7 +321,6 @@ with colA:
         stitched = None
         out_sr = None
-        # chunk-level progress
         for i, chunk in enumerate(chunks, start=1):
             st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
             prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
@@ -338,23 +340,19 @@ with colA:
                 out_sr = int(sr)
             else:
                 if int(sr) != out_sr:
-                    # usually consistent; warn once
                     st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
                 if gap_ms > 0:
                     stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
-            # update overall progress bar
             frac = i / len(chunks)
             st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
-        # encode mp3
-        mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
-        return mp3_bytes
     # -----------------------------
-    # Single mode
     # -----------------------------
     if input_mode == "Single chapter (paste/upload)":
         single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
@@ -400,7 +398,7 @@ with colA:
             )
     # -----------------------------
-    # Batch mode
     # -----------------------------
     else:
         st.markdown("Upload multiple `.txt` files (each file = one chapter).")
@@ -411,12 +409,6 @@ with colA:
             key="batch_txts",
         )
-        if batch_files:
-            total_chars = 0
-            for f in batch_files:
-                total_chars += len(f.getvalue())
-            st.write(f"**Files:** {len(batch_files)}  |  **Total bytes:** {total_chars:,}")
         st.divider()
         if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
@@ -427,9 +419,8 @@ with colA:
             st.session_state["_progress"] = st.progress(0)
             st.session_state["_status"] = st.empty()
-            # Generate each file -> mp3, and pack into ZIP
             zip_buf = io.BytesIO()
-            results_preview = []  # (name, mp3_bytes) for in-page audio preview
             with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
                 n = len(batch_files)
@@ -439,24 +430,23 @@ with colA:
                     mp3_name = f"{base}.mp3"
                     label = f"{idx}/{n} {base}"
-                    # allocate progress range per file
                     base_prog = (idx - 1) / n
                     span_prog = 1.0 / n
                     try:
-                        mp3_bytes = generate_mp3_from_text(raw, label=label, progress_base=base_prog, progress_span=span_prog)
                     except Exception as e:
                         st.error(f"Failed on file '{f.name}': {e}")
                         st.stop()
                     zf.writestr(mp3_name, mp3_bytes)
-                    # Keep a small preview list (all, but could be large; still OK)
                     results_preview.append((mp3_name, mp3_bytes))
             st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
             zip_buf.seek(0)
             st.download_button(
                 "Download ZIP (all MP3s)",
                 data=zip_buf.getvalue(),

 import io
 import re
+import os
 import zipfile
 import numpy as np
 import streamlit as st
 import soundfile as sf
 import torch
+from transformers import AutoProcessor
+from transformers.pipelines import pipeline
 import lameenc  # MP3 encoder (no ffmpeg needed)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+# Show a clear error if torch isn't functional
+try:
+    _ = torch.tensor([1.0])
+except Exception as e:
+    st.error(
+        "PyTorch is not available or failed to initialize.\n\n"
+        "Fix: Set `python_version: \"3.10\"` in README.md and pin a supported torch build in requirements.txt.\n\n"
+        f"Details: {e}"
+    )
+    st.stop()
 # -----------------------------
 # Helpers
 # -----------------------------
     ("Vietnamese", "vi"),
 ]
 def pick_device():
     """Choose the compute target for inference.
     Returns a ``(device_name, device_index, dtype)`` tuple:
     ``("cuda", 0, torch.float16)`` when CUDA is available, otherwise
     ``("cpu", -1, torch.float32)``.
     """
     has_gpu = torch.cuda.is_available()
     if not has_gpu:
         return "cpu", -1, torch.float32
     return "cuda", 0, torch.float16
 def normalize_audio(x: np.ndarray) -> np.ndarray:
     x = x.astype(np.float32)
     peak = float(np.max(np.abs(x))) if x.size else 0.0
         x = x / max(peak, 1e-8)
     return x
 def make_silence(sr: int, ms: int) -> np.ndarray:
     """Return ``ms`` milliseconds of mono silence at sample rate ``sr``.
     The result is a 1-D float32 array of zeros whose length is
     ``int(sr * ms / 1000)``. A negative duration (or sample rate) yields an
     empty array instead of letting ``np.zeros`` raise ``ValueError`` on a
     negative length.
     """
     # Clamp at zero so a stray negative gap cannot abort a long synthesis run.
     n = max(0, int(sr * (ms / 1000.0)))
     return np.zeros(n, dtype=np.float32)
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
         return []
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
     chunks = []
     cur = ""
             if cur:
                 chunks.append(cur)
             if len(p) > max_chars:
                 for i in range(0, len(p), max_chars):
+                    chunks.append(p[i:i + max_chars])
                 cur = ""
             else:
                 cur = p
         chunks.append(cur)
     return chunks
 def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
     tags = []
     if lang:
         tags.append(f"[LANG={lang}]")
         tags.append(f"[INSTRUCTION={instruction}]")
     return " ".join(tags + [text])
 def safe_get_speakers(proc, pipe_obj):
     for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
         if hasattr(proc, attr):
             val = getattr(proc, attr)
             if isinstance(val, (list, tuple)):
                 return sorted(set(map(str, val)))
     model = getattr(pipe_obj, "model", None)
     cfg = getattr(model, "config", None) if model is not None else None
     if cfg is not None:
     return []
 def try_reference_audio(wav_bytes: bytes):
     """Decode uploaded audio bytes into a single-channel reference clip.
     The bytes are read with soundfile as float32. When the decoded array has
     more than one dimension it is averaged over axis 1 to collapse it to 1-D.
     Returns a dict with the samples under ``"array"`` and the rate reported by
     soundfile under ``"sampling_rate"``.
     """
     samples, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
     mono = samples.mean(axis=1) if samples.ndim > 1 else samples
     return {"array": mono, "sampling_rate": rate}
 def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
     if ref_audio is not None:
         try:
             return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
             pass
     return pipe_obj(prompt, **gen_kwargs)
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     """Convert float audio in the range -1..1 to raw signed 16-bit PCM bytes.
     Samples outside [-1, 1] are clipped before scaling by 32767. NaN samples
     are replaced with 0.0 first, because casting NaN to int16 is undefined
     and would otherwise put arbitrary values into the encoded audio. The
     input array is not modified. Bytes are in the platform's native order.
     """
     # nan_to_num copies by default, so the caller's buffer stays untouched.
     x = np.nan_to_num(x, nan=0.0)
     x = np.clip(x, -1.0, 1.0)
     pcm = (x * 32767.0).astype(np.int16)
     return pcm.tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
     """Encode mono float32 audio (-1..1) to MP3 bytes using lameenc.
     No ffmpeg required. ``sr`` is the input sample rate and ``bitrate_kbps``
     the MP3 bit rate (default 192); both are coerced to ``int`` before being
     passed to the encoder. Returns the encoded bytes, including whatever
     ``flush()`` emits at the end.
     """
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     # One channel: this helper only handles mono input.
     enc.set_channels(1)
     # 2=high quality, 7=faster
+    enc.set_quality(2)
     # Convert the float samples to 16-bit PCM bytes before encoding.
     pcm_bytes = float_to_int16_pcm(audio_float32)
     mp3 = enc.encode(pcm_bytes)
     # NOTE(review): flush() presumably returns the encoder's remaining
     # buffered frames — confirm against the lameenc docs.
     mp3 += enc.flush()
     return mp3
 def sanitize_filename(name: str) -> str:
     name = name.strip().replace("\\", "_").replace("/", "_")
     name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
         name = "chapter"
     return name
 @st.cache_resource(show_spinner=False)
 def load_tts():
     device, device_id, dtype = pick_device()
 with colB:
     st.subheader("Controls")
     lang_label = st.selectbox(
         "Language",
         options=[x[0] for x in DEFAULT_LANGS],
     )
     lang = dict(DEFAULT_LANGS).get(lang_label)
     st.markdown("### Voice / Speaker")
     speaker = None
     if detected_speakers:
     if custom_speaker:
         speaker = custom_speaker
     st.markdown("### Instruction Control")
     instruction = st.text_area(
         "Instruction (style/emotion/pacing/etc.)",
     if instruction == "":
         instruction = None
     st.markdown("### Optional: Reference Voice")
     ref_file = st.file_uploader(
         "Upload reference WAV (optional)",
         help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
     )
     st.markdown("### Long Text Settings")
     max_chars = st.slider(
         "Chunk size (characters)",
         step=50,
     )
     st.markdown("### Generation Parameters")
     max_new_tokens = st.slider(
         "max_new_tokens",
         step=0.1,
     )
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
     normalize = st.checkbox("Normalize output audio", value=True)
         horizontal=True,
     )
     ref_audio = None
     if ref_file is not None:
         try:
         stitched = None
         out_sr = None
         for i, chunk in enumerate(chunks, start=1):
             st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
             prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
                 out_sr = int(sr)
             else:
                 if int(sr) != out_sr:
                     st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
                 if gap_ms > 0:
                     stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
             frac = i / len(chunks)
             st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
+        return encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
     # -----------------------------
+    # Single
     # -----------------------------
     if input_mode == "Single chapter (paste/upload)":
         single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
             )
     # -----------------------------
+    # Batch
     # -----------------------------
     else:
         st.markdown("Upload multiple `.txt` files (each file = one chapter).")
             key="batch_txts",
         )
         st.divider()
         if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
             st.session_state["_progress"] = st.progress(0)
             st.session_state["_status"] = st.empty()
             zip_buf = io.BytesIO()
+            results_preview = []
             with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
                 n = len(batch_files)
                     mp3_name = f"{base}.mp3"
                     label = f"{idx}/{n} {base}"
                     base_prog = (idx - 1) / n
                     span_prog = 1.0 / n
                     try:
+                        mp3_bytes = generate_mp3_from_text(
+                            raw, label=label, progress_base=base_prog, progress_span=span_prog
+                        )
                     except Exception as e:
                         st.error(f"Failed on file '{f.name}': {e}")
                         st.stop()
                     zf.writestr(mp3_name, mp3_bytes)
                     results_preview.append((mp3_name, mp3_bytes))
             st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
             zip_buf.seek(0)
             st.download_button(
                 "Download ZIP (all MP3s)",
                 data=zip_buf.getvalue(),