Spaces:

Jekyll2000
/

MY_TTS

Sleeping

App Files Files Community

Jekyll2000 committed on Feb 18

Commit

e24e6a2

verified ·

1 Parent(s): db6c05c

Update app.py

Browse files

Files changed (1) hide show

app.py +187 -83

app.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import io
 import re
-import math
-import os
 import numpy as np
 import streamlit as st
 import soundfile as sf
@@ -11,7 +10,6 @@ from transformers import pipeline, AutoProcessor
 import lameenc  # MP3 encoder (no ffmpeg needed)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 # -----------------------------
@@ -57,6 +55,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     if not text:
         return []
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
     chunks = []
     cur = ""
@@ -70,6 +69,7 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
             if cur:
                 chunks.append(cur)
             if len(p) > max_chars:
                 for i in range(0, len(p), max_chars):
                     chunks.append(p[i:i+max_chars])
                 cur = ""
@@ -81,7 +81,10 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     return chunks
 def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
-    # Adjust tag format if you later confirm the model expects different tokens
     tags = []
     if lang:
         tags.append(f"[LANG={lang}]")
@@ -92,6 +95,7 @@ def format_prompt(text: str, lang: str | None, speaker: str | None, instruction:
     return " ".join(tags + [text])
 def safe_get_speakers(proc, pipe_obj):
     for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
         if hasattr(proc, attr):
             val = getattr(proc, attr)
@@ -100,6 +104,7 @@ def safe_get_speakers(proc, pipe_obj):
             if isinstance(val, (list, tuple)):
                 return sorted(set(map(str, val)))
     model = getattr(pipe_obj, "model", None)
     cfg = getattr(model, "config", None) if model is not None else None
     if cfg is not None:
@@ -120,6 +125,7 @@ def try_reference_audio(wav_bytes: bytes):
     return {"array": audio, "sampling_rate": sr}
 def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
     if ref_audio is not None:
         try:
             return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
@@ -140,16 +146,23 @@ def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192)
     No ffmpeg required.
     """
     enc = lameenc.Encoder()
-    enc.set_bit_rate(bitrate_kbps)
-    enc.set_in_sample_rate(sr)
     enc.set_channels(1)
-    enc.set_quality(2)  # 2=high, 7=fast
     pcm_bytes = float_to_int16_pcm(audio_float32)
     mp3 = enc.encode(pcm_bytes)
     mp3 += enc.flush()
     return mp3
 @st.cache_resource(show_spinner=False)
 def load_tts():
@@ -170,9 +183,9 @@ def load_tts():
 # -----------------------------
 st.set_page_config(page_title="Haseeb's TTS", layout="wide")
 st.title("🎧 Haseeb's TTS")
-st.caption("Audiobook Generator • MP3 Output • Language • Voices • Instruction Control")
-with st.spinner("Loading model (first run can take a while)..."):
     pipe_obj, proc, detected_speakers, device, dtype = load_tts()
 colA, colB = st.columns([2, 1], gap="large")
@@ -180,6 +193,7 @@ colA, colB = st.columns([2, 1], gap="large")
 with colB:
     st.subheader("Controls")
     lang_label = st.selectbox(
         "Language",
         options=[x[0] for x in DEFAULT_LANGS],
@@ -188,6 +202,7 @@ with colB:
     )
     lang = dict(DEFAULT_LANGS).get(lang_label)
     st.markdown("### Voice / Speaker")
     speaker = None
     if detected_speakers:
@@ -199,7 +214,7 @@ with colB:
         )
         speaker = None if speaker_choice == "(none)" else speaker_choice
     else:
-        st.info("No speaker list detected from model config. You can still type a custom speaker name below.")
     custom_speaker = st.text_input(
         "Custom speaker name (optional)",
@@ -209,6 +224,7 @@ with colB:
     if custom_speaker:
         speaker = custom_speaker
     st.markdown("### Instruction Control")
     instruction = st.text_area(
         "Instruction (style/emotion/pacing/etc.)",
@@ -218,6 +234,7 @@ with colB:
     if instruction == "":
         instruction = None
     st.markdown("### Optional: Reference Voice")
     ref_file = st.file_uploader(
         "Upload reference WAV (optional)",
@@ -225,14 +242,15 @@ with colB:
         help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
     )
-    st.markdown("### Long Text (Audiobook)")
     max_chars = st.slider(
         "Chunk size (characters)",
         min_value=600,
         max_value=3000,
         value=1400,
         step=100,
-        help="10,000 chars will be split into multiple chunks then stitched.",
     )
     gap_ms = st.slider(
         "Silence between chunks (ms)",
@@ -242,6 +260,7 @@ with colB:
         step=50,
     )
     st.markdown("### Generation Parameters")
     max_new_tokens = st.slider(
         "max_new_tokens",
@@ -259,6 +278,7 @@ with colB:
         step=0.1,
     )
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
     normalize = st.checkbox("Normalize output audio", value=True)
@@ -266,73 +286,48 @@ with colB:
 with colA:
     st.subheader("Input")
-    input_mode = st.radio("Input mode", ["Paste text", "Upload .txt"], horizontal=True)
-    text = ""
-    if input_mode == "Paste text":
-        text = st.text_area(
-            "Chapter text",
-            value="",
-            height=420,
-            placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
-        )
-    else:
-        txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="txt_uploader")
-        if txt_file is not None:
-            text = txt_file.read().decode("utf-8", errors="ignore")
-    st.write(f"**Characters:** {len(text):,}")
-    st.divider()
-    generate = st.button("Generate MP3 Audiobook", type="primary", use_container_width=True)
-    if generate:
-        if not text.strip():
-            st.error("Please provide some text.")
-            st.stop()
-        chunks = split_text_into_chunks(text, max_chars=max_chars)
         if not chunks:
-            st.error("Text chunking failed (empty chunks).")
-            st.stop()
-        st.info(f"Split into **{len(chunks)}** chunk(s). Generating audio…")
-        ref_audio = None
-        if ref_file is not None:
-            try:
-                ref_audio = try_reference_audio(ref_file.read())
-            except Exception as e:
-                st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
-                ref_audio = None
-        gen_kwargs = {
-            "max_new_tokens": int(max_new_tokens),
-            "temperature": float(temperature),
-        }
-        progress = st.progress(0)
-        status = st.empty()
         stitched = None
         out_sr = None
         for i, chunk in enumerate(chunks, start=1):
-            status.write(f"Generating chunk {i}/{len(chunks)} …")
             prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
-            try:
-                out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
-            except Exception as e:
-                st.error(f"Generation failed on chunk {i}: {e}")
-                st.stop()
             audio = out.get("audio", None)
             sr = out.get("sampling_rate", None)
             if audio is None or sr is None:
-                st.error(f"Unexpected pipeline output on chunk {i}.")
-                st.stop()
             audio = np.asarray(audio, dtype=np.float32)
             if normalize:
@@ -343,33 +338,142 @@ with colA:
                 out_sr = int(sr)
             else:
                 if int(sr) != out_sr:
-                    st.warning(
-                        f"Chunk {i} sample rate {sr} != {out_sr}. "
-                        "Stitching anyway (best if consistent)."
-                    )
                 if gap_ms > 0:
                     stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
-            progress.progress(int((i / len(chunks)) * 100))
-        status.write("✅ Done. Encoding MP3…")
-        try:
-            mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
-        except Exception as e:
-            st.error(f"MP3 encoding failed: {e}")
-            st.stop()
-        st.audio(mp3_bytes, format="audio/mp3")
-        st.download_button(
-            "Download MP3",
-            data=mp3_bytes,
-            file_name="audiobook_chapter.mp3",
-            mime="audio/mpeg",
-            use_container_width=True,
         )
-        st.success("Generated MP3 audiobook successfully.")

 import io
+import os
 import re
+import zipfile
 import numpy as np
 import streamlit as st
 import soundfile as sf
 import lameenc  # MP3 encoder (no ffmpeg needed)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 # -----------------------------
     if not text:
         return []
+    # Sentence-ish split (works across many languages reasonably)
     parts = re.split(r"(?<=[\.\!\?\。\！\？\n])\s+", text)
     chunks = []
     cur = ""
             if cur:
                 chunks.append(cur)
             if len(p) > max_chars:
+                # hard-split huge segments
                 for i in range(0, len(p), max_chars):
                     chunks.append(p[i:i+max_chars])
                 cur = ""
     return chunks
 def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
+    """
+    Tag-based control. If you later confirm a different schema from Qwen's demo,
+    you only need to change this function.
+    """
     tags = []
     if lang:
         tags.append(f"[LANG={lang}]")
     return " ".join(tags + [text])
 def safe_get_speakers(proc, pipe_obj):
+    # Try processor attributes
     for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
         if hasattr(proc, attr):
             val = getattr(proc, attr)
             if isinstance(val, (list, tuple)):
                 return sorted(set(map(str, val)))
+    # Try model config attributes
     model = getattr(pipe_obj, "model", None)
     cfg = getattr(model, "config", None) if model is not None else None
     if cfg is not None:
     return {"array": audio, "sampling_rate": sr}
 def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
+    # Try with reference audio if supported; otherwise fall back gracefully
     if ref_audio is not None:
         try:
             return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
     No ffmpeg required.
     """
     enc = lameenc.Encoder()
+    enc.set_bit_rate(int(bitrate_kbps))
+    enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
+    enc.set_quality(2)  # 2=high quality, 7=faster
     pcm_bytes = float_to_int16_pcm(audio_float32)
     mp3 = enc.encode(pcm_bytes)
     mp3 += enc.flush()
     return mp3
+def sanitize_filename(name: str) -> str:
+    """
+    Reduce *name* to a flat, filesystem-friendly file-name stem.
+
+    Backslashes and forward slashes become underscores, every character
+    other than ASCII letters, digits, ".", "_", " " and "-" is dropped, and
+    runs of whitespace collapse to a single space. Returns "chapter" when
+    nothing usable is left.
+    """
+    flattened = name.strip().translate(str.maketrans({"\\": "_", "/": "_"}))
+    kept = re.sub(r"[^a-zA-Z0-9._ -]+", "", flattened)
+    cleaned = re.sub(r"\s+", " ", kept).strip()
+    return cleaned or "chapter"
 @st.cache_resource(show_spinner=False)
 def load_tts():
 # -----------------------------
 st.set_page_config(page_title="Haseeb's TTS", layout="wide")
 st.title("🎧 Haseeb's TTS")
+st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
+with st.spinner("Loading model (first run can take a while)…"):
     pipe_obj, proc, detected_speakers, device, dtype = load_tts()
 colA, colB = st.columns([2, 1], gap="large")
 with colB:
     st.subheader("Controls")
+    # Language
     lang_label = st.selectbox(
         "Language",
         options=[x[0] for x in DEFAULT_LANGS],
     )
     lang = dict(DEFAULT_LANGS).get(lang_label)
+    # Speakers
     st.markdown("### Voice / Speaker")
     speaker = None
     if detected_speakers:
         )
         speaker = None if speaker_choice == "(none)" else speaker_choice
     else:
+        st.info("No speaker list detected. You can still type a custom speaker name below.")
     custom_speaker = st.text_input(
         "Custom speaker name (optional)",
     if custom_speaker:
         speaker = custom_speaker
+    # Instruction
     st.markdown("### Instruction Control")
     instruction = st.text_area(
         "Instruction (style/emotion/pacing/etc.)",
     if instruction == "":
         instruction = None
+    # Optional reference voice
     st.markdown("### Optional: Reference Voice")
     ref_file = st.file_uploader(
         "Upload reference WAV (optional)",
         help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
     )
+    # Long text chunking
+    st.markdown("### Long Text Settings")
     max_chars = st.slider(
         "Chunk size (characters)",
         min_value=600,
         max_value=3000,
         value=1400,
         step=100,
+        help="Long chapters (10,000+ chars) are split into chunks, generated, then stitched.",
     )
     gap_ms = st.slider(
         "Silence between chunks (ms)",
         step=50,
     )
+    # Generation params
     st.markdown("### Generation Parameters")
     max_new_tokens = st.slider(
         "max_new_tokens",
         step=0.1,
     )
+    # MP3 export
     st.markdown("### MP3 Export")
     mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
     normalize = st.checkbox("Normalize output audio", value=True)
 with colA:
     st.subheader("Input")
+    input_mode = st.radio(
+        "Mode",
+        ["Single chapter (paste/upload)", "Batch mode (upload multiple .txt)"],
+        horizontal=True,
+    )
+    # Shared ref audio prep
+    ref_audio = None
+    if ref_file is not None:
+        try:
+            ref_audio = try_reference_audio(ref_file.read())
+        except Exception as e:
+            st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
+            ref_audio = None
+    gen_kwargs = {
+        "max_new_tokens": int(max_new_tokens),
+        "temperature": float(temperature),
+    }
+    def generate_mp3_from_text(chapter_text: str, label: str, progress_base: float = 0.0, progress_span: float = 1.0):
+        chapter_text = chapter_text.strip()
+        if not chapter_text:
+            raise ValueError("Empty text")
+        chunks = split_text_into_chunks(chapter_text, max_chars=max_chars)
         if not chunks:
+            raise ValueError("Chunking produced no chunks")
         stitched = None
         out_sr = None
+        # chunk-level progress
         for i, chunk in enumerate(chunks, start=1):
+            st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
             prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
+            out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
             audio = out.get("audio", None)
             sr = out.get("sampling_rate", None)
             if audio is None or sr is None:
+                raise RuntimeError("Unexpected pipeline output")
             audio = np.asarray(audio, dtype=np.float32)
             if normalize:
                 out_sr = int(sr)
             else:
                 if int(sr) != out_sr:
+                    # usually consistent; warns for every chunk whose rate differs
+                    st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
                 if gap_ms > 0:
                     stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
+            # update overall progress bar
+            frac = i / len(chunks)
+            st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
+        # encode mp3
+        mp3_bytes = encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
+        return mp3_bytes
+    # -----------------------------
+    # Single mode
+    # -----------------------------
+    if input_mode == "Single chapter (paste/upload)":
+        single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
+        text = ""
+        if single_submode == "Paste text":
+            text = st.text_area(
+                "Chapter text",
+                value="",
+                height=420,
+                placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
+            )
+        else:
+            txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="single_txt")
+            if txt_file is not None:
+                text = txt_file.read().decode("utf-8", errors="ignore")
+        st.write(f"**Characters:** {len(text):,}")
+        st.divider()
+        if st.button("Generate MP3", type="primary", use_container_width=True):
+            if not text.strip():
+                st.error("Please provide some text.")
+                st.stop()
+            st.session_state["_progress"] = st.progress(0)
+            st.session_state["_status"] = st.empty()
+            try:
+                mp3_bytes = generate_mp3_from_text(text, label="Single")
+            except Exception as e:
+                st.error(f"Generation failed: {e}")
+                st.stop()
+            st.session_state["_status"].write("✅ Done.")
+            st.audio(mp3_bytes, format="audio/mp3")
+            st.download_button(
+                "Download MP3",
+                data=mp3_bytes,
+                file_name="audiobook_chapter.mp3",
+                mime="audio/mpeg",
+                use_container_width=True,
+            )
+    # -----------------------------
+    # Batch mode
+    # -----------------------------
+    else:
+        st.markdown("Upload multiple `.txt` files (each file = one chapter).")
+        batch_files = st.file_uploader(
+            "Upload chapter .txt files",
+            type=["txt"],
+            accept_multiple_files=True,
+            key="batch_txts",
         )
+        if batch_files:
+            total_chars = 0
+            for f in batch_files:
+                total_chars += len(f.getvalue())
+            st.write(f"**Files:** {len(batch_files)}  |  **Total bytes:** {total_chars:,}")
+        st.divider()
+        if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
+            if not batch_files:
+                st.error("Please upload at least one .txt file.")
+                st.stop()
+            st.session_state["_progress"] = st.progress(0)
+            st.session_state["_status"] = st.empty()
+            # Generate each file -> mp3, and pack into ZIP
+            zip_buf = io.BytesIO()
+            results_preview = []  # (name, mp3_bytes) for in-page audio preview
+            with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
+                n = len(batch_files)
+                used_names = set()  # MP3 names already written to the ZIP
+                for idx, f in enumerate(batch_files, start=1):
+                    raw = f.read().decode("utf-8", errors="ignore")
+                    base = sanitize_filename(os.path.splitext(f.name)[0])
+                    # Distinct uploads can sanitize to the same base name; a repeated
+                    # name would clash inside the ZIP and reuse the preview widget
+                    # key built from it, so repeats get a counter suffix.
+                    mp3_name = f"{base}.mp3"
+                    dup = 2
+                    while mp3_name in used_names:
+                        mp3_name = f"{base} ({dup}).mp3"
+                        dup += 1
+                    used_names.add(mp3_name)
+                    label = f"{idx}/{n} {base}"
+                    # allocate progress range per file
+                    base_prog = (idx - 1) / n
+                    span_prog = 1.0 / n
+                    try:
+                        mp3_bytes = generate_mp3_from_text(raw, label=label, progress_base=base_prog, progress_span=span_prog)
+                    except Exception as e:
+                        st.error(f"Failed on file '{f.name}': {e}")
+                        st.stop()
+                    zf.writestr(mp3_name, mp3_bytes)
+                    # Keep every MP3 in memory for the in-page preview below.
+                    results_preview.append((mp3_name, mp3_bytes))
+            st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
+            zip_buf.seek(0)
+            st.download_button(
+                "Download ZIP (all MP3s)",
+                data=zip_buf.getvalue(),
+                file_name="audiobook_mp3_batch.zip",
+                mime="application/zip",
+                use_container_width=True,
+            )
+            st.markdown("### Preview")
+            for name, mp3_bytes in results_preview:
+                with st.expander(name, expanded=False):
+                    st.audio(mp3_bytes, format="audio/mp3")
+                    st.download_button(
+                        f"Download {name}",
+                        data=mp3_bytes,
+                        file_name=name,
+                        mime="audio/mpeg",
+                        use_container_width=True,
+                        key=f"dl_{name}",
+                    )