Spaces:

Jekyll2000
/

MY_TTS

Sleeping

App Files Community

Jekyll2000 committed on Feb 18

Commit

d0cc5a4

verified ·

1 Parent(s): 084ef93

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -270

app.py CHANGED Viewed

@@ -1,68 +1,23 @@
 import io
-import re
 import os
 import zipfile
 import numpy as np
 import streamlit as st
 import soundfile as sf
 import torch
-from transformers import AutoProcessor
-from transformers.pipelines import pipeline
-import lameenc  # MP3 encoder (no ffmpeg needed)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
-# Torch sanity check
-try:
-    _ = torch.tensor([1.0])
-except Exception as e:
-    st.error(
-        "PyTorch failed to initialize.\n\n"
-        "Make sure your Space is using python_version: '3.10' and torch is installed correctly.\n\n"
-        f"Details: {e}"
-    )
-    st.stop()
 # -----------------------------
-# Helpers
 # -----------------------------
-DEFAULT_LANGS = [
-    ("Auto", None),
-    ("English", "en"),
-    ("Chinese (Simplified)", "zh"),
-    ("Japanese", "ja"),
-    ("Korean", "ko"),
-    ("French", "fr"),
-    ("German", "de"),
-    ("Spanish", "es"),
-    ("Portuguese", "pt"),
-    ("Italian", "it"),
-    ("Russian", "ru"),
-    ("Arabic", "ar"),
-    ("Hindi", "hi"),
-    ("Turkish", "tr"),
-    ("Indonesian", "id"),
-    ("Vietnamese", "vi"),
-]
-def pick_device():
-    if torch.cuda.is_available():
-        return "cuda", 0, torch.float16
-    return "cpu", -1, torch.float32
-def normalize_audio(x: np.ndarray) -> np.ndarray:
-    x = x.astype(np.float32)
-    peak = float(np.max(np.abs(x))) if x.size else 0.0
-    if peak > 0:
-        x = x / max(peak, 1e-8)
-    return x
-def make_silence(sr: int, ms: int) -> np.ndarray:
-    n = int(sr * (ms / 1000.0))
-    return np.zeros(n, dtype=np.float32)
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
@@ -91,91 +46,80 @@ def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
         chunks.append(cur)
     return chunks
-def format_prompt(text: str, lang: str | None, speaker: str | None, instruction: str | None) -> str:
-    tags = []
-    if lang:
-        tags.append(f"[LANG={lang}]")
-    if speaker:
-        tags.append(f"[SPEAKER={speaker}]")
-    if instruction:
-        tags.append(f"[INSTRUCTION={instruction}]")
-    return " ".join(tags + [text])
-def safe_get_speakers(proc, pipe_obj):
-    for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
-        if hasattr(proc, attr):
-            val = getattr(proc, attr)
-            if isinstance(val, dict):
-                return sorted(set(map(str, val.keys())))
-            if isinstance(val, (list, tuple)):
-                return sorted(set(map(str, val)))
-    model = getattr(pipe_obj, "model", None)
-    cfg = getattr(model, "config", None) if model is not None else None
-    if cfg is not None:
-        for attr in ("speakers", "speaker_ids", "speaker_map", "voice_names", "voices"):
-            if hasattr(cfg, attr):
-                val = getattr(cfg, attr)
-                if isinstance(val, dict):
-                    return sorted(set(map(str, val.keys())))
-                if isinstance(val, (list, tuple)):
-                    return sorted(set(map(str, val)))
-    return []
-def try_reference_audio(wav_bytes: bytes):
-    audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32")
-    if audio.ndim > 1:
-        audio = audio.mean(axis=1)
-    return {"array": audio, "sampling_rate": sr}
-def synthesize_chunk(pipe_obj, prompt: str, gen_kwargs: dict, ref_audio=None):
-    if ref_audio is not None:
-        try:
-            return pipe_obj(prompt, ref_audio=ref_audio, **gen_kwargs)
-        except TypeError:
-            pass
-        except Exception:
-            pass
-    return pipe_obj(prompt, **gen_kwargs)
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     x = np.clip(x, -1.0, 1.0)
-    pcm = (x * 32767.0).astype(np.int16)
-    return pcm.tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
-    enc.set_quality(2)
     mp3 = enc.encode(float_to_int16_pcm(audio_float32))
     mp3 += enc.flush()
     return mp3
 def sanitize_filename(name: str) -> str:
     name = name.strip().replace("\\", "_").replace("/", "_")
     name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
     name = re.sub(r"\s+", " ", name).strip()
     return name or "chapter"
 @st.cache_resource(show_spinner=False)
-def load_tts():
-    device, device_id, dtype = pick_device()
-    # IMPORTANT: trust_remote_code=True for new architectures
-    pipe_obj = pipeline(
-        task="text-to-audio",
-        model=MODEL_ID,
-        device=device_id,
-        torch_dtype=dtype,
-        trust_remote_code=True,
     )
-    proc = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-    speakers = safe_get_speakers(proc, pipe_obj)
-    return pipe_obj, proc, speakers, device, dtype
 # -----------------------------
 # UI
@@ -184,179 +128,109 @@ st.set_page_config(page_title="Haseeb's TTS", layout="wide")
 st.title("🎧 Haseeb's TTS")
 st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
 with st.spinner("Loading model (first run can take a while)…"):
-    pipe_obj, proc, detected_speakers, device, dtype = load_tts()
 colA, colB = st.columns([2, 1], gap="large")
 with colB:
     st.subheader("Controls")
-    lang_label = st.selectbox(
-        "Language",
-        options=[x[0] for x in DEFAULT_LANGS],
-        index=1,
-        help="Select a language tag to steer pronunciation. 'Auto' disables the language tag.",
-    )
-    lang = dict(DEFAULT_LANGS).get(lang_label)
-    st.markdown("### Voice / Speaker")
-    speaker = None
-    if detected_speakers:
-        speaker_choice = st.selectbox(
-            "Detected speakers",
-            options=["(none)"] + detected_speakers,
-            index=0,
-            help="Speakers detected from model config/processor.",
-        )
-        speaker = None if speaker_choice == "(none)" else speaker_choice
-    else:
-        st.info("No speaker list detected. You can still type a custom speaker name below.")
-    custom_speaker = st.text_input(
-        "Custom speaker name (optional)",
-        value="",
-        help="If the model supports speaker conditioning by name/tag, enter it here.",
-    ).strip()
-    if custom_speaker:
-        speaker = custom_speaker
-    st.markdown("### Instruction Control")
-    instruction = st.text_area(
-        "Instruction (style/emotion/pacing/etc.)",
         value="Warm, clear narration. Medium pace. Slightly expressive.",
         height=90,
     ).strip()
-    if instruction == "":
-        instruction = None
-    st.markdown("### Optional: Reference Voice")
-    ref_file = st.file_uploader(
-        "Upload reference WAV (optional)",
-        type=["wav"],
-        help="If the model supports voice cloning, this may help. If unsupported, it will be ignored.",
-    )
     st.markdown("### Long Text Settings")
-    max_chars = st.slider(
-        "Chunk size (characters)",
-        min_value=600,
-        max_value=3000,
-        value=1400,
-        step=100,
-        help="Long chapters (10,000+ chars) are split into chunks, generated, then stitched.",
-    )
-    gap_ms = st.slider(
-        "Silence between chunks (ms)",
-        min_value=0,
-        max_value=1200,
-        value=250,
-        step=50,
-    )
     st.markdown("### Generation Parameters")
-    max_new_tokens = st.slider(
-        "max_new_tokens",
-        min_value=256,
-        max_value=4096,
-        value=2048,
-        step=128,
-        help="Higher can produce longer audio per chunk but uses more compute/memory.",
-    )
-    temperature = st.slider(
-        "temperature",
-        min_value=0.1,
-        max_value=1.5,
-        value=0.9,
-        step=0.1,
-    )
     st.markdown("### MP3 Export")
-    mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", options=[96, 128, 160, 192, 256, 320], index=3)
-    normalize = st.checkbox("Normalize output audio", value=True)
 with colA:
     st.subheader("Input")
-    input_mode = st.radio(
-        "Mode",
-        ["Single chapter (paste/upload)", "Batch mode (upload multiple .txt)"],
-        horizontal=True,
-    )
-    ref_audio = None
-    if ref_file is not None:
-        try:
-            ref_audio = try_reference_audio(ref_file.read())
-        except Exception as e:
-            st.warning(f"Could not read reference WAV. Ignoring it. ({e})")
-            ref_audio = None
-    gen_kwargs = {
-        "max_new_tokens": int(max_new_tokens),
-        "temperature": float(temperature),
-    }
-    def generate_mp3_from_text(chapter_text: str, label: str, progress_base: float = 0.0, progress_span: float = 1.0):
-        chapter_text = chapter_text.strip()
-        if not chapter_text:
-            raise ValueError("Empty text")
-        chunks = split_text_into_chunks(chapter_text, max_chars=max_chars)
         if not chunks:
-            raise ValueError("Chunking produced no chunks")
         stitched = None
-        out_sr = None
         for i, chunk in enumerate(chunks, start=1):
-            st.session_state["_status"].write(f"{label}: chunk {i}/{len(chunks)}")
-            prompt = format_prompt(chunk, lang=lang, speaker=speaker, instruction=instruction)
-            out = synthesize_chunk(pipe_obj, prompt, gen_kwargs=gen_kwargs, ref_audio=ref_audio)
-            audio = out.get("audio", None)
-            sr = out.get("sampling_rate", None)
-            if audio is None or sr is None:
-                raise RuntimeError("Unexpected pipeline output")
-            audio = np.asarray(audio, dtype=np.float32)
-            if normalize:
                 audio = normalize_audio(audio)
             if stitched is None:
                 stitched = audio
-                out_sr = int(sr)
             else:
-                if int(sr) != out_sr:
-                    st.warning(f"{label}: sample rate changed ({sr} != {out_sr}). Stitching anyway.")
                 if gap_ms > 0:
-                    stitched = np.concatenate([stitched, make_silence(out_sr, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
             frac = i / len(chunks)
-            st.session_state["_progress"].progress(int((progress_base + frac * progress_span) * 100))
-        return encode_mp3_mono(stitched, out_sr, bitrate_kbps=int(mp3_bitrate))
-    # -----------------------------
-    # Single mode
-    # -----------------------------
-    if input_mode == "Single chapter (paste/upload)":
-        single_submode = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
         text = ""
-        if single_submode == "Paste text":
-            text = st.text_area(
-                "Chapter text",
-                value="",
-                height=420,
-                placeholder="Paste up to ~10,000+ characters here. The app will chunk, generate, stitch, then export MP3.",
-            )
         else:
-            txt_file = st.file_uploader("Upload a .txt file", type=["txt"], key="single_txt")
-            if txt_file is not None:
-                text = txt_file.read().decode("utf-8", errors="ignore")
         st.write(f"**Characters:** {len(text):,}")
         st.divider()
@@ -366,16 +240,16 @@ with colA:
                 st.error("Please provide some text.")
                 st.stop()
-            st.session_state["_progress"] = st.progress(0)
-            st.session_state["_status"] = st.empty()
             try:
-                mp3_bytes = generate_mp3_from_text(text, label="Single")
             except Exception as e:
                 st.error(f"Generation failed: {e}")
                 st.stop()
-            st.session_state["_status"].write("✅ Done.")
             st.audio(mp3_bytes, format="audio/mp3")
             st.download_button(
                 "Download MP3",
@@ -385,54 +259,43 @@ with colA:
                 use_container_width=True,
             )
-    # -----------------------------
-    # Batch mode
-    # -----------------------------
     else:
         st.markdown("Upload multiple `.txt` files (each file = one chapter).")
-        batch_files = st.file_uploader(
-            "Upload chapter .txt files",
-            type=["txt"],
-            accept_multiple_files=True,
-            key="batch_txts",
-        )
         st.divider()
         if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
-            if not batch_files:
                 st.error("Please upload at least one .txt file.")
                 st.stop()
-            st.session_state["_progress"] = st.progress(0)
-            st.session_state["_status"] = st.empty()
             zip_buf = io.BytesIO()
-            results_preview = []
             with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
-                n = len(batch_files)
-                for idx, f in enumerate(batch_files, start=1):
                     raw = f.read().decode("utf-8", errors="ignore")
                     base = sanitize_filename(os.path.splitext(f.name)[0])
                     mp3_name = f"{base}.mp3"
-                    label = f"{idx}/{n} {base}"
                     base_prog = (idx - 1) / n
                     span_prog = 1.0 / n
                     try:
-                        mp3_bytes = generate_mp3_from_text(
-                            raw, label=label, progress_base=base_prog, progress_span=span_prog
-                        )
                     except Exception as e:
-                        st.error(f"Failed on file '{f.name}': {e}")
                         st.stop()
                     zf.writestr(mp3_name, mp3_bytes)
-                    results_preview.append((mp3_name, mp3_bytes))
-            st.session_state["_status"].write("✅ Batch complete. Download your ZIP below.")
             zip_buf.seek(0)
             st.download_button(
@@ -444,7 +307,7 @@ with colA:
             )
             st.markdown("### Preview")
-            for name, mp3_bytes in results_preview:
                 with st.expander(name, expanded=False):
                     st.audio(mp3_bytes, format="audio/mp3")
                     st.download_button(

 import io
 import os
+import re
 import zipfile
 import numpy as np
 import streamlit as st
 import soundfile as sf
 import torch
+import lameenc
+from qwen_tts import Qwen3TTSModel  # official package API (recommended by Qwen docs)
 MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
 # -----------------------------
+# Text chunking (10k+ chars)
 # -----------------------------
 def split_text_into_chunks(text: str, max_chars: int) -> list[str]:
     text = re.sub(r"\r\n", "\n", text).strip()
     if not text:
         chunks.append(cur)
     return chunks
+def make_silence(sr: int, ms: int) -> np.ndarray:
+    n = int(sr * (ms / 1000.0))
+    return np.zeros(n, dtype=np.float32)
+def normalize_audio(x: np.ndarray) -> np.ndarray:
+    x = x.astype(np.float32)
+    peak = float(np.max(np.abs(x))) if x.size else 0.0
+    if peak > 0:
+        x = x / max(peak, 1e-8)
+    return x
+# -----------------------------
+# MP3 encoding (no ffmpeg)
+# -----------------------------
 def float_to_int16_pcm(x: np.ndarray) -> bytes:
     x = np.clip(x, -1.0, 1.0)
+    return (x * 32767.0).astype(np.int16).tobytes()
 def encode_mp3_mono(audio_float32: np.ndarray, sr: int, bitrate_kbps: int = 192) -> bytes:
     enc = lameenc.Encoder()
     enc.set_bit_rate(int(bitrate_kbps))
     enc.set_in_sample_rate(int(sr))
     enc.set_channels(1)
+    enc.set_quality(2)  # 2=high quality
     mp3 = enc.encode(float_to_int16_pcm(audio_float32))
     mp3 += enc.flush()
     return mp3
 def sanitize_filename(name: str) -> str:
     name = name.strip().replace("\\", "_").replace("/", "_")
     name = re.sub(r"[^a-zA-Z0-9._ -]+", "", name)
     name = re.sub(r"\s+", " ", name).strip()
     return name or "chapter"
+# -----------------------------
+# Model loading (qwen-tts)
+# -----------------------------
+def pick_device_and_dtype():
+    if torch.cuda.is_available():
+        # bfloat16 is recommended in Qwen docs examples for modern GPUs
+        return "cuda:0", torch.bfloat16
+    return "cpu", torch.float32
 @st.cache_resource(show_spinner=False)
+def load_qwen_tts():
+    device_map, dtype = pick_device_and_dtype()
+    model = Qwen3TTSModel.from_pretrained(
+        MODEL_ID,
+        device_map=device_map,
+        dtype=dtype,
     )
+    # Try to read supported languages/speakers from the model
+    # (These helper methods are documented by Qwen for CustomVoice models)
+    try:
+        speakers = model.get_supported_speakers()
+    except Exception:
+        speakers = []
+    try:
+        languages = model.get_supported_languages()
+    except Exception:
+        languages = []
+    return model, speakers, languages, device_map, str(dtype)
 # -----------------------------
 # UI
 st.title("🎧 Haseeb's TTS")
 st.caption("Audiobook Generator • MP3 Output • Batch Mode • Language • Voices • Instruction Control")
+# Torch sanity check
+try:
+    _ = torch.tensor([1.0])
+except Exception as e:
+    st.error(f"PyTorch failed to initialize: {e}")
+    st.stop()
 with st.spinner("Loading model (first run can take a while)…"):
+    tts_model, supported_speakers, supported_langs, device_map, dtype_str = load_qwen_tts()
 colA, colB = st.columns([2, 1], gap="large")
 with colB:
     st.subheader("Controls")
+    st.caption(f"Device: `{device_map}` • dtype: `{dtype_str}`")
+    # Language dropdown (fallback list if model doesn't provide)
+    fallback_langs = ["Auto", "Chinese", "English", "Japanese", "Korean", "German", "French", "Russian", "Portuguese", "Spanish", "Italian"]
+    lang_options = supported_langs if supported_langs else fallback_langs
+    language = st.selectbox("Language", options=lang_options, index=0)
+    # Speaker dropdown (fallback common names from Qwen docs snippet)
+    fallback_speakers = ["Vivian", "Ryan"]
+    spk_options = supported_speakers if supported_speakers else fallback_speakers
+    speaker = st.selectbox("Speaker / Voice", options=spk_options, index=0)
+    # Instruction control
+    instruct = st.text_area(
+        "Instruction (style/emotion/pacing)",
         value="Warm, clear narration. Medium pace. Slightly expressive.",
         height=90,
+        help="Leave empty for neutral/default speaking style.",
     ).strip()
+    # Long chapter handling
     st.markdown("### Long Text Settings")
+    max_chars = st.slider("Chunk size (characters)", 600, 3000, 1400, 100)
+    gap_ms = st.slider("Silence between chunks (ms)", 0, 1200, 250, 50)
+    # Generation params
     st.markdown("### Generation Parameters")
+    max_new_tokens = st.slider("max_new_tokens", 256, 8192, 4096, 256, help="Increase for longer audio per chunk (more compute).")
+    # MP3
     st.markdown("### MP3 Export")
+    mp3_bitrate = st.selectbox("MP3 bitrate (kbps)", [96, 128, 160, 192, 256, 320], index=3)
+    do_normalize = st.checkbox("Normalize output audio", value=True)
 with colA:
     st.subheader("Input")
+    mode = st.radio("Mode", ["Single chapter", "Batch (multiple .txt)"], horizontal=True)
+    progress = st.progress(0)
+    status = st.empty()
+    def synth_one_mp3(text: str, label: str, base_prog: float, span_prog: float) -> bytes:
+        chunks = split_text_into_chunks(text, max_chars=max_chars)
         if not chunks:
+            raise ValueError("No text chunks produced.")
         stitched = None
+        sr_out = None
         for i, chunk in enumerate(chunks, start=1):
+            status.write(f"{label}: chunk {i}/{len(chunks)}")
+            wavs, sr = tts_model.generate_custom_voice(
+                text=chunk,
+                language=language if language else "Auto",
+                speaker=speaker,
+                instruct=instruct if instruct else "",
+                max_new_tokens=int(max_new_tokens),
+            )
+            audio = np.asarray(wavs[0], dtype=np.float32)
+            if do_normalize:
                 audio = normalize_audio(audio)
             if stitched is None:
                 stitched = audio
+                sr_out = int(sr)
             else:
                 if gap_ms > 0:
+                    stitched = np.concatenate([stitched, make_silence(sr_out, gap_ms), audio])
                 else:
                     stitched = np.concatenate([stitched, audio])
             frac = i / len(chunks)
+            progress.progress(int((base_prog + frac * span_prog) * 100))
+        return encode_mp3_mono(stitched, sr_out, bitrate_kbps=int(mp3_bitrate))
+    if mode == "Single chapter":
+        input_type = st.radio("Input type", ["Paste text", "Upload .txt"], horizontal=True)
         text = ""
+        if input_type == "Paste text":
+            text = st.text_area("Chapter text", height=420, placeholder="Paste your chapter text here…")
         else:
+            f = st.file_uploader("Upload a .txt file", type=["txt"])
+            if f is not None:
+                text = f.read().decode("utf-8", errors="ignore")
         st.write(f"**Characters:** {len(text):,}")
         st.divider()
                 st.error("Please provide some text.")
                 st.stop()
+            progress.progress(0)
+            status.write("Starting…")
             try:
+                mp3_bytes = synth_one_mp3(text, "Single", 0.0, 1.0)
             except Exception as e:
                 st.error(f"Generation failed: {e}")
                 st.stop()
+            status.write("✅ Done.")
             st.audio(mp3_bytes, format="audio/mp3")
             st.download_button(
                 "Download MP3",
                 use_container_width=True,
             )
     else:
         st.markdown("Upload multiple `.txt` files (each file = one chapter).")
+        files = st.file_uploader("Upload chapter .txt files", type=["txt"], accept_multiple_files=True)
         st.divider()
         if st.button("Generate MP3s (Batch)", type="primary", use_container_width=True):
+            if not files:
                 st.error("Please upload at least one .txt file.")
                 st.stop()
+            progress.progress(0)
+            status.write("Starting batch…")
             zip_buf = io.BytesIO()
+            previews = []
             with zipfile.ZipFile(zip_buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
+                n = len(files)
+                for idx, f in enumerate(files, start=1):
                     raw = f.read().decode("utf-8", errors="ignore")
                     base = sanitize_filename(os.path.splitext(f.name)[0])
                     mp3_name = f"{base}.mp3"
                     base_prog = (idx - 1) / n
                     span_prog = 1.0 / n
                     try:
+                        mp3_bytes = synth_one_mp3(raw, f"{idx}/{n} {base}", base_prog, span_prog)
                     except Exception as e:
+                        st.error(f"Failed on '{f.name}': {e}")
                         st.stop()
                     zf.writestr(mp3_name, mp3_bytes)
+                    previews.append((mp3_name, mp3_bytes))
+            status.write("✅ Batch complete.")
             zip_buf.seek(0)
             st.download_button(
             )
             st.markdown("### Preview")
+            for name, mp3_bytes in previews:
                 with st.expander(name, expanded=False):
                     st.audio(mp3_bytes, format="audio/mp3")
                     st.download_button(