Spaces:

broadfield-dev
/

qweb3-tts-cpu

Paused

App Files Files Community

broadfield-dev committed on Feb 17

Commit

0348257

verified ·

1 Parent(s): 27f7d20

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -59

app.py CHANGED Viewed

@@ -7,19 +7,18 @@ from qwen_tts import Qwen3TTSModel
 import os
 import warnings
-# Suppress some warnings if desired
 warnings.filterwarnings("ignore", category=UserWarning)
 # ────────────────────────────────────────────────
-#  Globals & helpers
 # ────────────────────────────────────────────────
 MODELS = {
-    "1.7B-CustomVoice":    "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
-    "0.6B-CustomVoice":    "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
-    "1.7B-VoiceDesign":    "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
-    "1.7B-Base":           "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
-    "0.6B-Base":           "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
 }
 loaded_models = {}
@@ -29,7 +28,7 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
     if key in loaded_models:
         return loaded_models[key]
-    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) … (first time may take 1–3 min)")
     repo_id = MODELS[model_key]
     dtype = torch.float32 if dtype_str == "float32" else torch.float16
@@ -42,7 +41,7 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
             low_cpu_mem_usage=True,
         )
     except Exception as e:
-        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry the other precision or smaller model.")
     loaded_models[key] = model
     progress(0.9, desc="Model ready.")
@@ -50,62 +49,63 @@ def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()
 # ────────────────────────────────────────────────
-#  Inference functions (unchanged except safety)
 # ────────────────────────────────────────────────
 def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
     if not text.strip():
-        return None, "Please enter text."
     model = get_model(model_key, precision, progress)
-    progress(0.4, desc="Generating… (can take 10–60s on CPU)")
     try:
         wavs, sr = model.generate_custom_voice(
             text=text,
             language=lang if lang != "Auto" else None,
             speaker=speaker,
             instruct=instruct.strip() or None,
-            max_new_tokens=1200,
         )
         path = "/tmp/output_custom.wav"
-        sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
-        info = f"**Generated** with {model_key} | lang={lang} | speaker={speaker}"
-        if instruct: info += f" | instruct={instruct}"
         return path, info
     except Exception as e:
         return None, f"**Error**: {str(e)}"
-# (repeat similar small changes for infer_voice_design and infer_voice_clone if needed)
-# For brevity I'm only showing one – apply the same pattern:
-# - Use /tmp/ for output paths
-# - Add try/except with user-friendly message
-# - Shorten max_new_tokens if generations are too slow
 def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
     if not text.strip() or not instruct.strip():
-        return None, "Text and voice description required."
     model = get_model(model_key, precision, progress)
-    progress(0.4, desc="Generating…")
     try:
         wavs, sr = model.generate_voice_design(
             text=text,
             language=lang if lang != "Auto" else None,
             instruct=instruct,
-            max_new_tokens=1200,
         )
         path = "/tmp/output_design.wav"
-        sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
-        return path, f"**Voice Design** — {model_key} | lang={lang}"
     except Exception as e:
         return None, f"**Error**: {str(e)}"
 def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
     if not text.strip():
-        return None, "Enter text."
     if not ref_audio:
         return None, "Upload reference audio."
     model = get_model(model_key, precision, progress)
-    progress(0.3, desc="Processing reference…")
     try:
         wavs, sr = model.generate_voice_clone(
             text=text,
@@ -113,73 +113,118 @@ def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key,
             ref_audio=ref_audio,
             ref_text=ref_text.strip() or None,
             x_vector_only_mode=x_vector_only,
-            max_new_tokens=1200,
         )
         path = "/tmp/output_clone.wav"
-        sf.write(path, wavs[0] if isinstance(wavs, (list, tuple)) else wavs, sr)
-        return path, f"**Cloned voice** — {model_key} | x-vector-only={x_vector_only}"
     except Exception as e:
         return None, f"**Error**: {str(e)}"
 # ────────────────────────────────────────────────
-#  UI
 # ────────────────────────────────────────────────
 css = """
-.radio-horizontal .radio-container { flex-direction: row !important; flex-wrap: wrap !important; gap: 1.2rem !important; }
-.radio-horizontal label { margin-right: 1.5rem !important; }
 """
-with gr.Blocks() as demo:   # ← no theme/css here anymore
-    gr.Markdown("# Qwen3-TTS All-Variants Demo  \nCPU • 0.6B & 1.7B • CustomVoice / VoiceDesign / Base")
-    with gr.Tab("CustomVoice (preset speakers + instruct)"):
-        gr.Markdown("**Qwen3-TTS-12Hz-(0.6B|1.7B)-CustomVoice**  \n9 built-in voices + style control")
-        cv_model = gr.Radio(
-            choices=["1.7B-CustomVoice", "0.6B-CustomVoice"],
-            value="1.7B-CustomVoice",
-            label="Model size",
-            elem_classes=["radio-horizontal"]   # ← CSS class for horizontal
-        )
-        cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision (float16 faster but riskier on CPU)")
         with gr.Row():
-            cv_text    = gr.Textbox(label="Text", lines=3, placeholder="今天天气很好，我们去公园吧～", value="这是一个测试。希望声音听起来自然一些。")
             cv_lang    = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
             cv_speaker = gr.Dropdown(
                 ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                 value="Vivian", label="Speaker"
             )
-        cv_instruct = gr.Textbox(label="Style instruction (optional)", placeholder="用特别温柔又带点撒娇的语气说", lines=2)
         cv_btn = gr.Button("Generate", variant="primary")
-        cv_out_audio = gr.Audio(label="Output", type="filepath", autoplay=False)
-        cv_out_info  = gr.Markdown()
         cv_btn.click(
             infer_custom_voice,
             inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
-            outputs=[cv_out_audio, cv_out_info]
         )
-    # ... Add the other tabs (VoiceDesign, Base/Clone) similarly ...
-    # Just copy-paste the structure and change the inference fn / inputs
     gr.Markdown("""
 **Notes**
-• First load per model variant can take 1–5 min (download + CPU RAM allocation).
-• Use **0.6B** models + **float32** if 1.7B crashes (RAM limit on free Spaces ~12–16 GB).
-• Audio may warn about SoX missing → generations should still work via soundfile/torchaudio fallback.
-• Official inference: https://github.com/QwenLM/Qwen3-TTS
     """)
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        theme=gr.themes.Soft(),   # ← moved here
-        css=css                   # ← moved here
     )

 import os
 import warnings
 warnings.filterwarnings("ignore", category=UserWarning)
 # ────────────────────────────────────────────────
+# Globals & Model Loader
 # ────────────────────────────────────────────────
 MODELS = {
+    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
+    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
+    "1.7B-Base":        "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+    "0.6B-Base":        "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
 }
 loaded_models = {}
     if key in loaded_models:
         return loaded_models[key]
+    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
     repo_id = MODELS[model_key]
     dtype = torch.float32 if dtype_str == "float32" else torch.float16
             low_cpu_mem_usage=True,
         )
     except Exception as e:
+        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
     loaded_models[key] = model
     progress(0.9, desc="Model ready.")
 # ────────────────────────────────────────────────
+# Inference functions – full generation (non-streaming)
 # ────────────────────────────────────────────────
 def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
     if not text.strip():
+        return None, "Please enter some text."
     model = get_model(model_key, precision, progress)
+    progress(0.4, desc="Generating …")
     try:
         wavs, sr = model.generate_custom_voice(
             text=text,
             language=lang if lang != "Auto" else None,
             speaker=speaker,
             instruct=instruct.strip() or None,
+            max_new_tokens=1500,  # reasonable safety limit
         )
         path = "/tmp/output_custom.wav"
+        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
+        info = f"**Generated with {model_key}**  \nlang: {lang}  \nspeaker: {speaker}  \ninstruct: {instruct or '(none)'}"
         return path, info
     except Exception as e:
         return None, f"**Error**: {str(e)}"
 def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
     if not text.strip() or not instruct.strip():
+        return None, "Text and voice instruction required."
     model = get_model(model_key, precision, progress)
+    progress(0.4, desc="Generating …")
     try:
         wavs, sr = model.generate_voice_design(
             text=text,
             language=lang if lang != "Auto" else None,
             instruct=instruct,
+            max_new_tokens=1500,
         )
         path = "/tmp/output_design.wav"
+        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
+        info = f"**Voice Design – {model_key}**  \nlang: {lang}  \ninstruct: {instruct}"
+        return path, info
     except Exception as e:
         return None, f"**Error**: {str(e)}"
 def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
     if not text.strip():
+        return None, "Enter text to synthesize."
     if not ref_audio:
         return None, "Upload reference audio."
     model = get_model(model_key, precision, progress)
+    progress(0.3, desc="Processing reference …")
     try:
         wavs, sr = model.generate_voice_clone(
             text=text,
             ref_audio=ref_audio,
             ref_text=ref_text.strip() or None,
             x_vector_only_mode=x_vector_only,
+            max_new_tokens=1500,
         )
         path = "/tmp/output_clone.wav"
+        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
+        info = f"**Voice Clone – {model_key}**  \nlang: {lang}  \nx-vector-only: {x_vector_only}"
+        return path, info
     except Exception as e:
         return None, f"**Error**: {str(e)}"
 # ────────────────────────────────────────────────
+# UI – all tabs completed
 # ────────────────────────────────────────────────
 css = """
+.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
+.radio-row > div { min-width: 140px; }
 """
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming (full generation only)")
+    with gr.Tab("CustomVoice – Preset speakers + instruct"):
+        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
+        with gr.Row(elem_classes="radio-row"):
+            cv_model     = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
+            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
         with gr.Row():
+            cv_text    = gr.Textbox(label="Text to speak", lines=4, value="这是一个测试。希望声音听起来自然一些。")
             cv_lang    = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"], value="Auto", label="Language")
             cv_speaker = gr.Dropdown(
                 ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
                 value="Vivian", label="Speaker"
             )
+        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="用特别愤怒的语气说")
         cv_btn = gr.Button("Generate", variant="primary")
+        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
+        cv_info  = gr.Markdown()
         cv_btn.click(
             infer_custom_voice,
             inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
+            outputs=[cv_audio, cv_info]
+        )
+    with gr.Tab("Voice Design – Describe any voice"):
+        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
+        with gr.Row(elem_classes="radio-row"):
+            vd_model     = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
+            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
+        vd_text     = gr.Textbox(label="Text to speak", lines=4, value="哥哥，你回来啦，人家等了好久，要抱抱！")
+        vd_lang     = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
+        vd_instruct = gr.Textbox(
+            label="Voice description / instruction",
+            lines=4,
+            value="体现撒娇稚嫩的萝莉女声，音调偏高且起伏明显，黏人、做作又刻意卖萌的感觉"
         )
+        vd_btn = gr.Button("Generate", variant="primary")
+        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
+        vd_info  = gr.Markdown()
+        vd_btn.click(
+            infer_voice_design,
+            inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
+            outputs=[vd_audio, vd_info]
+        )
+    with gr.Tab("Base – Voice Clone from reference audio"):
+        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
+        with gr.Row(elem_classes="radio-row"):
+            cl_model     = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
+            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
+        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
+        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
+        with gr.Row():
+            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload", "microphone"])
+            cl_ref_text  = gr.Textbox(label="Transcript of reference (optional but improves quality)", lines=2)
+        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, no transcript needed, lower quality)", value=False)
+        cl_btn = gr.Button("Clone & Generate", variant="primary")
+        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
+        cl_info  = gr.Markdown()
+        cl_btn.click(
+            infer_voice_clone,
+            inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
+            outputs=[cl_audio, cl_info]
+        )
     gr.Markdown("""
 **Notes**
+• First generation per model loads weights (may take 1–5 min).
+• Use **float32** if **float16** causes crashes (common on CPU).
+• **0.6B** models are faster / lighter on CPU.
+• No streaming yet in official qwen-tts package — generations are full-text → full-audio.
+• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
     """)
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        theme=gr.themes.Soft(),
+        css=css,
     )