Jekyll2000 commited on
Commit
db6c05c
·
verified ·
1 Parent(s): 1c065e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -88
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import io
2
  import re
3
- import base64
 
4
  import numpy as np
5
  import streamlit as st
6
  import soundfile as sf
@@ -8,6 +9,8 @@ import soundfile as sf
8
  import torch
9
  from transformers import pipeline, AutoProcessor
10
 
 
 
11
 
12
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
13
 
@@ -40,7 +43,7 @@ def pick_device():
40
 
41
  def normalize_audio(x: np.ndarray) -> np.ndarray:
42
  x = x.astype(np.float32)
43
- peak = np.max(np.abs(x)) if x.size else 0.0
44
  if peak > 0:
45
  x = x / max(peak, 1e-8)
46
  return x
@@ -50,16 +53,10 @@ def make_silence(sr: int, ms: int) -> np.ndarray:
50
  return np.zeros(n, dtype=np.float32)
51
 
52
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
53
- """
54
- Chunk long text into <= max_chars chunks.
55
- Uses sentence-ish boundaries where possible.
56
- """
57
  text = re.sub(r"\r\n", "\n", text).strip()
58
  if not text:
59
  return []
60
 
61
- # Split into "sentences" while keeping separators
62
- # Works decently for many languages, not perfect.
63
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
64
  chunks = []
65
  cur = ""
@@ -72,7 +69,6 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
72
  else:
73
  if cur:
74
  chunks.append(cur)
75
- # If a single part is huge, hard-split it
76
  if len(p) > max_chars:
77
  for i in range(0, len(p), max_chars):
78
  chunks.append(p[i:i+max_chars])
@@ -85,10 +81,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
85
  return chunks
86
 
87
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
88
- """
89
- Many instructionable TTS models accept tags.
90
- If your model expects a different schema, adjust here.
91
- """
92
  tags = []
93
  if lang:
94
  tags.append(f"[LANG={lang}]")
@@ -99,22 +92,14 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
99
  return " ".join(tags + [text])
100
 
101
  def safe_get_speakers(proc, pipe_obj):
102
- """
103
- Try to discover speakers/voices from processor/config.
104
- If none found, return empty list.
105
- """
106
- candidates = []
107
- # From processor
108
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
109
  if hasattr(proc, attr):
110
  val = getattr(proc, attr)
111
  if isinstance(val, dict):
112
- candidates = list(val.keys())
113
- return sorted(set(map(str, candidates)))
114
  if isinstance(val, (list, tuple)):
115
  return sorted(set(map(str, val)))
116
 
117
- # From model config
118
  model = getattr(pipe_obj, "model", None)
119
  cfg = getattr(model, "config", None) if model is not None else None
120
  if cfg is not None:
@@ -122,28 +107,55 @@ def safe_get_speakers(proc, pipe_obj):
122
  if hasattr(cfg, attr):
123
  val = getattr(cfg, attr)
124
  if isinstance(val, dict):
125
- candidates = list(val.keys())
126
- return sorted(set(map(str, candidates)))
127
  if isinstance(val, (list, tuple)):
128
  return sorted(set(map(str, val)))
129
 
130
  return []
131
 
132
  def try_reference_audio(wav_bytes: bytes):
133
- """
134
- Load a reference wav file into a dict compatible with HF audio pipelines.
135
- """
136
  audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
137
  if audio.ndim > 1:
138
  audio = audio.mean(axis=1)
139
  return {"array": audio, "sampling_rate": sr}
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  @st.cache_resource(show_spinner=False)
143
  def load_tts():
144
  device, device_id, dtype = pick_device()
145
  pipe_obj = pipeline(
146
- task="text-to-audio", # alias: "text-to-speech"
147
  model=MODEL_ID,
148
  device=device_id,
149
  torch_dtype=dtype,
@@ -153,33 +165,12 @@ def load_tts():
153
  return pipe_obj, proc, speakers, device, dtype
154
 
155
 
156
- def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
157
- """
158
- Run pipeline for one chunk.
159
- If ref_audio isn't supported by this model/pipeline, ignore it gracefully.
160
- """
161
- if ref_audio is not None:
162
- try:
163
- out = pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
164
- return out
165
- except TypeError:
166
- # pipeline/model does not accept ref_audio
167
- pass
168
- except Exception:
169
- # any other issue: also fall back without ref audio
170
- pass
171
-
172
- out = pipe_obj(prompt, **gen_kwargs)
173
- return out
174
-
175
-
176
  # -----------------------------
177
  # UI
178
  # -----------------------------
179
- st.set_page_config(page_title="Qwen3 TTS Audiobook Generator", layout="wide")
180
-
181
- st.title("🎧 Qwen3 TTS Audiobook Generator")
182
- st.caption(f"Model: `{MODEL_ID}`")
183
 
184
  with st.spinner("Loading model (first run can take a while)..."):
185
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
@@ -189,57 +180,51 @@ colA, colB = st.columns([2, 1], gap="large")
189
  with colB:
190
  st.subheader("Controls")
191
 
192
- # Language
193
  lang_label = st.selectbox(
194
  "Language",
195
  options=[x[0] for x in DEFAULT_LANGS],
196
- index=1, # English default
197
- help="Select a language tag to steer pronunciation. 'Auto' disables language tag.",
198
  )
199
  lang = dict(DEFAULT_LANGS).get(lang_label)
200
 
201
- # Speakers / voices
202
  st.markdown("### Voice / Speaker")
 
203
  if detected_speakers:
204
  speaker_choice = st.selectbox(
205
  "Detected speakers",
206
  options=["(none)"] + detected_speakers,
207
  index=0,
208
- help="Speakers detected from model config/processor. If empty, use Custom speaker field below.",
209
  )
210
  speaker = None if speaker_choice == "(none)" else speaker_choice
211
  else:
212
- st.info("No speaker list detected from model config. You can still provide a speaker name below.")
213
- speaker = None
214
 
215
  custom_speaker = st.text_input(
216
  "Custom speaker name (optional)",
217
  value="",
218
- help="If your model supports speaker conditioning by name/tag, enter it here.",
219
  ).strip()
220
  if custom_speaker:
221
  speaker = custom_speaker
222
 
223
- # Instruction control
224
  st.markdown("### Instruction Control")
225
  instruction = st.text_area(
226
  "Instruction (style/emotion/pacing/etc.)",
227
  value="Warm, clear narration. Medium pace. Slightly expressive.",
228
  height=90,
229
- help="Free-form style instruction. Example: 'Calm, slow, deep voice. Dramatic pauses.'",
230
  ).strip()
231
  if instruction == "":
232
  instruction = None
233
 
234
- # Reference voice (optional)
235
  st.markdown("### Optional: Reference Voice")
236
  ref_file = st.file_uploader(
237
  "Upload reference WAV (optional)",
238
  type=["wav"],
239
- help="If the model supports 'CustomVoice' conditioning, this can steer voice cloning. If unsupported, it will be ignored.",
240
  )
241
 
242
- # Long-text chunking
243
  st.markdown("### Long Text (Audiobook)")
244
  max_chars = st.slider(
245
  "Chunk size (characters)",
@@ -247,7 +232,7 @@ with colB:
247
  max_value=3000,
248
  value=1400,
249
  step=100,
250
- help="10,000 chars will be split into multiple chunks then stitched into one WAV.",
251
  )
252
  gap_ms = st.slider(
253
  "Silence between chunks (ms)",
@@ -255,10 +240,8 @@ with colB:
255
  max_value=1200,
256
  value=250,
257
  step=50,
258
- help="Adds a small pause between chunks.",
259
  )
260
 
261
- # Generation parameters (audio length etc.)
262
  st.markdown("### Generation Parameters")
263
  max_new_tokens = st.slider(
264
  "max_new_tokens",
@@ -274,9 +257,10 @@ with colB:
274
  max_value=1.5,
275
  value=0.9,
276
  step=0.1,
277
- help="Sampling temperature (if supported by the model).",
278
  )
279
 
 
 
280
  normalize = st.checkbox("Normalize output audio", value=True)
281
 
282
  with colA:
@@ -290,7 +274,7 @@ with colA:
290
  "Chapter text",
291
  value="",
292
  height=420,
293
- placeholder="Paste up to ~10,000+ characters here. The app will chunk and stitch.",
294
  )
295
  else:
296
  txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
@@ -301,8 +285,7 @@ with colA:
301
 
302
  st.divider()
303
 
304
- # Run
305
- generate = st.button("Generate Audiobook WAV", type="primary", use_container_width=True)
306
 
307
  if generate:
308
  if not text.strip():
@@ -316,7 +299,6 @@ with colA:
316
 
317
  st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
318
 
319
- # Prepare reference audio (optional)
320
  ref_audio = None
321
  if ref_file is not None:
322
  try:
@@ -336,7 +318,6 @@ with colA:
336
  stitched = None
337
  out_sr = None
338
 
339
- # Generate each chunk and stitch
340
  for i, chunk in enumerate(chunks, start=1):
341
  status.write(f"Generating chunk {i}/{len(chunks)} …")
342
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
@@ -361,12 +342,10 @@ with colA:
361
  stitched = audio
362
  out_sr = int(sr)
363
  else:
364
- # If sample rates differ, you should resample. Most pipelines keep it consistent.
365
  if int(sr) != out_sr:
366
  st.warning(
367
  f"Chunk {i} sample rate {sr} != {out_sr}. "
368
- "For best results, adjust to a consistent sample rate. "
369
- "Stitching anyway."
370
  )
371
  if gap_ms > 0:
372
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
@@ -375,21 +354,22 @@ with colA:
375
 
376
  progress.progress(int((i / len(chunks)) * 100))
377
 
378
- status.write("✅ Done. Preparing download…")
379
 
380
- # Write WAV to bytes
381
- wav_buf = io.BytesIO()
382
- sf.write(wav_buf, stitched, out_sr, format="WAV")
383
- wav_bytes = wav_buf.getvalue()
 
384
 
385
- st.audio(wav_bytes, format="audio/wav")
386
 
387
  st.download_button(
388
- "Download WAV",
389
- data=wav_bytes,
390
- file_name="audiobook_chapter.wav",
391
- mime="audio/wav",
392
  use_container_width=True,
393
  )
394
 
395
- st.success("Generated audiobook chapter WAV successfully.")
 
1
  import io
2
  import re
3
+ import math
4
+ import os
5
  import numpy as np
6
  import streamlit as st
7
  import soundfile as sf
 
9
  import torch
10
  from transformers import pipeline, AutoProcessor
11
 
12
+ import lameenc # MP3 encoder (no ffmpeg needed)
13
+
14
 
15
  MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
16
 
 
43
 
44
def normalize_audio(x: np.ndarray) -> np.ndarray:
    """Scale audio so its absolute peak is 1.0, returned as float32.

    Empty or all-zero input is returned unchanged (aside from the
    float32 cast), since there is no peak to scale by.
    """
    samples = x.astype(np.float32)
    if samples.size == 0:
        return samples
    peak = float(np.abs(samples).max())
    if peak > 0:
        samples = samples / max(peak, 1e-8)
    return samples
 
53
  return np.zeros(n, dtype=np.float32)
54
 
55
  def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
 
 
 
 
56
  text = re.sub(r"\r\n", "\n", text).strip()
57
  if not text:
58
  return []
59
 
 
 
60
  parts = re.split(r"(?<=[\.\!\?\。\!\?\n])\s+", text)
61
  chunks = []
62
  cur = ""
 
69
  else:
70
  if cur:
71
  chunks.append(cur)
 
72
  if len(p) > max_chars:
73
  for i in range(0, len(p), max_chars):
74
  chunks.append(p[i:i+max_chars])
 
81
  return chunks
82
 
83
  def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
84
+ # Adjust tag format if you later confirm the model expects different tokens
 
 
 
85
  tags = []
86
  if lang:
87
  tags.append(f"[LANG={lang}]")
 
92
  return " ".join(tags + [text])
93
 
94
  def safe_get_speakers(proc, pipe_obj):
 
 
 
 
 
 
95
  for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
96
  if hasattr(proc, attr):
97
  val = getattr(proc, attr)
98
  if isinstance(val, dict):
99
+ return sorted(set(map(str, val.keys())))
 
100
  if isinstance(val, (list, tuple)):
101
  return sorted(set(map(str, val)))
102
 
 
103
  model = getattr(pipe_obj, "model", None)
104
  cfg = getattr(model, "config", None) if model is not None else None
105
  if cfg is not None:
 
107
  if hasattr(cfg, attr):
108
  val = getattr(cfg, attr)
109
  if isinstance(val, dict):
110
+ return sorted(set(map(str, val.keys())))
 
111
  if isinstance(val, (list, tuple)):
112
  return sorted(set(map(str, val)))
113
 
114
  return []
115
 
116
def try_reference_audio(wav_bytes: bytes):
    """Decode WAV bytes into the dict shape HF audio pipelines accept.

    Multi-channel input is downmixed to mono by averaging channels.
    Returns {"array": float32 samples, "sampling_rate": int}.
    """
    data, rate = sf.read(io.BytesIO(wav_bytes), dtype="float32")
    if data.ndim > 1:
        data = data.mean(axis=1)
    return {"array": data, "sampling_rate": rate}
121
 
122
def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
    """Synthesize one text chunk with the TTS pipeline.

    If *ref_audio* is given it is forwarded as a voice-cloning hint. Should
    the pipeline reject it — TypeError from an unsupported keyword, or any
    runtime failure while conditioning — we deliberately fall back to plain
    synthesis instead of aborting the whole audiobook run.

    Returns whatever the pipeline returns (presumably a dict with audio
    data and a sampling rate — confirm against the pipeline used).
    """
    if ref_audio is not None:
        try:
            return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
        except Exception:
            # Best-effort fallback: covers TypeError (kwarg unsupported)
            # and any other conditioning failure — retry without ref audio.
            # (The original had separate TypeError/Exception clauses doing
            # the same thing; TypeError is an Exception subclass.)
            pass
    return pipe_obj(prompt, **gen_kwargs)
131
+
132
def float_to_int16_pcm(x: np.ndarray) -> bytes:
    """Convert float audio to raw native-endian int16 PCM bytes.

    Values outside [-1, 1] are clipped before scaling by 32767.
    """
    clipped = np.clip(x, -1.0, 1.0)
    return (clipped * 32767.0).astype(np.int16).tobytes()
136
+
137
def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
    """Encode mono float32 audio (-1..1) to MP3 bytes using lameenc.

    Pure-Python encoder path — no ffmpeg binary required.

    Args:
        audio_float32: mono samples in [-1, 1].
        sr: input sample rate in Hz.
        bitrate_kbps: target MP3 bitrate.
    """
    encoder = lameenc.Encoder()
    encoder.set_bit_rate(bitrate_kbps)
    encoder.set_in_sample_rate(sr)
    encoder.set_channels(1)
    encoder.set_quality(2)  # 2 = high quality, 7 = fastest

    raw_pcm = float_to_int16_pcm(audio_float32)
    mp3_data = encoder.encode(raw_pcm)
    mp3_data += encoder.flush()  # flush() emits the final frames
    return mp3_data
152
+
153
 
154
  @st.cache_resource(show_spinner=False)
155
  def load_tts():
156
  device, device_id, dtype = pick_device()
157
  pipe_obj = pipeline(
158
+ task="text-to-audio",
159
  model=MODEL_ID,
160
  device=device_id,
161
  torch_dtype=dtype,
 
165
  return pipe_obj, proc, speakers, device, dtype
166
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  # -----------------------------
169
  # UI
170
  # -----------------------------
171
+ st.set_page_config(page_title="Haseeb's TTS", layout="wide")
172
+ st.title("🎧 Haseeb's TTS")
173
+ st.caption("Audiobook Generator MP3 Output • Language • Voices • Instruction Control")
 
174
 
175
  with st.spinner("Loading model (first run can take a while)..."):
176
  pipe_obj, proc, detected_speakers, device, dtype = load_tts()
 
180
  with colB:
181
  st.subheader("Controls")
182
 
 
183
  lang_label = st.selectbox(
184
  "Language",
185
  options=[x[0] for x in DEFAULT_LANGS],
186
+ index=1,
187
+ help="Select a language tag to steer pronunciation. 'Auto' disables the language tag.",
188
  )
189
  lang = dict(DEFAULT_LANGS).get(lang_label)
190
 
 
191
  st.markdown("### Voice / Speaker")
192
+ speaker = None
193
  if detected_speakers:
194
  speaker_choice = st.selectbox(
195
  "Detected speakers",
196
  options=["(none)"] + detected_speakers,
197
  index=0,
198
+ help="Speakers detected from model config/processor.",
199
  )
200
  speaker = None if speaker_choice == "(none)" else speaker_choice
201
  else:
202
+ st.info("No speaker list detected from model config. You can still type a custom speaker name below.")
 
203
 
204
  custom_speaker = st.text_input(
205
  "Custom speaker name (optional)",
206
  value="",
207
+ help="If the model supports speaker conditioning by name/tag, enter it here.",
208
  ).strip()
209
  if custom_speaker:
210
  speaker = custom_speaker
211
 
 
212
  st.markdown("### Instruction Control")
213
  instruction = st.text_area(
214
  "Instruction (style/emotion/pacing/etc.)",
215
  value="Warm, clear narration. Medium pace. Slightly expressive.",
216
  height=90,
 
217
  ).strip()
218
  if instruction == "":
219
  instruction = None
220
 
 
221
  st.markdown("### Optional: Reference Voice")
222
  ref_file = st.file_uploader(
223
  "Upload reference WAV (optional)",
224
  type=["wav"],
225
+ help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
226
  )
227
 
 
228
  st.markdown("### Long Text (Audiobook)")
229
  max_chars = st.slider(
230
  "Chunk size (characters)",
 
232
  max_value=3000,
233
  value=1400,
234
  step=100,
235
+ help="10,000 chars will be split into multiple chunks then stitched.",
236
  )
237
  gap_ms = st.slider(
238
  "Silence between chunks (ms)",
 
240
  max_value=1200,
241
  value=250,
242
  step=50,
 
243
  )
244
 
 
245
  st.markdown("### Generation Parameters")
246
  max_new_tokens = st.slider(
247
  "max_new_tokens",
 
257
  max_value=1.5,
258
  value=0.9,
259
  step=0.1,
 
260
  )
261
 
262
+ st.markdown("### MP3 Export")
263
+ mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
264
  normalize = st.checkbox("Normalize output audio", value=True)
265
 
266
  with colA:
 
274
  "Chapter text",
275
  value="",
276
  height=420,
277
+ placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
278
  )
279
  else:
280
  txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
 
285
 
286
  st.divider()
287
 
288
+ generate = st.button("Generate MP3 Audiobook", type="primary", use_container_width=True)
 
289
 
290
  if generate:
291
  if not text.strip():
 
299
 
300
  st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
301
 
 
302
  ref_audio = None
303
  if ref_file is not None:
304
  try:
 
318
  stitched = None
319
  out_sr = None
320
 
 
321
  for i, chunk in enumerate(chunks, start=1):
322
  status.write(f"Generating chunk {i}/{len(chunks)} …")
323
  prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
 
342
  stitched = audio
343
  out_sr = int(sr)
344
  else:
 
345
  if int(sr) != out_sr:
346
  st.warning(
347
  f"Chunk {i} sample rate {sr} != {out_sr}. "
348
+ "Stitching anyway (best if consistent)."
 
349
  )
350
  if gap_ms > 0:
351
  stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
 
354
 
355
  progress.progress(int((i / len(chunks)) * 100))
356
 
357
+ status.write("✅ Done. Encoding MP3…")
358
 
359
+ try:
360
+ mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
361
+ except Exception as e:
362
+ st.error(f"MP3 encoding failed: {e}")
363
+ st.stop()
364
 
365
+ st.audio(mp3_bytes, format="audio/mp3")
366
 
367
  st.download_button(
368
+ "Download MP3",
369
+ data=mp3_bytes,
370
+ file_name="audiobook_chapter.mp3",
371
+ mime="audio/mpeg",
372
  use_container_width=True,
373
  )
374
 
375
+ st.success("Generated MP3 audiobook successfully.")