BlueV2

Running

notmax123 committed about 1 month ago

Commit

56e7960

1 Parent(s): 615a636

Simplify Gradio voice cloning and auto-run synthesis

If reference audio is present, use it; otherwise use the selected saved voice. Auto-synthesize on upload; refresh Blue theme for V2.

Made-with: Cursor

Files changed (1) hide show

app.py +51 -58

app.py CHANGED Viewed

@@ -660,16 +660,11 @@ def style_from_wav(ref_wav: str) -> Style:
     return style_from_dict(payload)
-def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved"):
-    if voice_source != "upload":
-        return (
-            '<div class="ref-status muted">Using the saved voice dropdown. '
-            'Uploaded reference audio is ignored unless Voice source is set to "Uploaded reference".</div>'
-        )
     if not ref_wav:
         return (
-            '<div class="ref-status warn">Upload or record a clean 3-12 second clip: one speaker, '
-            'no music, no background noise, no long silence.</div>'
         )
     try:
         import soundfile as sf
@@ -681,13 +676,13 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
             msg = "Too short for cloning; use at least 3 seconds."
         elif dur > 20.0:
             level = "warn"
-            msg = "Long clips work, but the exporter only uses the early reference frames. Trim to the cleanest 3-12 seconds."
         elif channels > 2:
             level = "warn"
-            msg = "Many channels detected; mono or stereo speech is best."
         else:
             level = "ok"
-            msg = "Ready. This upload will override the saved voice for the next generation."
         return (
             f'<div class="ref-status {level}">'
             f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
@@ -697,17 +692,11 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
         return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
-def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps: int, speed: float,
                     cfg_scale: float, ref_wav: Optional[str] = None):
     t0 = time.time()
-    using_ref = voice_source == "upload"
     if using_ref:
-        if not ref_wav:
-            err = (
-                '<div class="stats-bar"><span class="stat-pill">'
-                'Reference voice selected, but no audio was uploaded.</span></div>'
-            )
-            return None, err
         try:
             style = style_from_wav(ref_wav)
         except Exception as e:
@@ -717,7 +706,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
         if not VOICE_STYLES:
             err = (
                 '<div class="stats-bar"><span class="stat-pill">'
-                'No saved v2 voices are installed. Choose "Uploaded reference" and upload audio.</span></div>'
             )
             return None, err
         style = VOICE_STYLES[voice]
@@ -731,7 +720,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
     rtf = proc_time / audio_dur if audio_dur > 0 else 0
     stats = (
         f'<div class="stats-bar">'
-        f'<span class="stat-pill">Voice: {"uploaded reference" if using_ref else html.escape(voice)}</span>'
         f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
         f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
         f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
@@ -833,37 +822,38 @@ def _load_font_face() -> str:
 css = _load_font_face() + """
 @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
 * { box-sizing: border-box; }
-body, .gradio-container { background:#0a0a0f !important; font-family:'EuclidCircularB',sans-serif !important; color:#e8e8f0 !important; }
 .gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
 .app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
-.app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#60a5fa 0%,#a78bfa 50%,#34d399 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0 0 0.5rem; }
-.app-header p { color:#6b7280; font-size:1rem; margin:0 0 1rem; }
-.app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #2a3f5c; border-radius:999px; background:rgba(96,165,250,0.08); }
-.card { background:#111118; border:1px solid #1e1e2e; border-radius:16px; padding:1.5rem; margin-bottom:1rem; }
-.big-input textarea { background:#0d0d14 !important; border:1px solid #2a2a3e !important; border-radius:10px !important; color:#e8e8f0 !important; font-size:1.1rem !important; line-height:1.6 !important; padding:1rem !important; unicode-bidi:plaintext !important; }
-.big-input textarea:focus { border-color:#60a5fa !important; outline:none !important; }
 .controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
 .ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
 .ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
 .ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
-.gen-btn { background:linear-gradient(135deg,#3b82f6,#8b5cf6) !important; border:none !important; border-radius:10px !important; color:#fff !important; font-size:1rem !important; font-weight:600 !important; padding:0.75rem 2rem !important; width:100% !important; margin-top:1rem !important; }
-.gen-btn:hover { opacity:0.85 !important; }
-.gradio-audio { background:#111118 !important; border:1px solid #1e1e2e !important; border-radius:12px !important; }
 .stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
-.stat-pill { background:#1a1a2e; border:1px solid #2a2a4e; border-radius:20px; padding:0.3rem 0.9rem; font-family:'JetBrains Mono',monospace; font-size:0.8rem; color:#a78bfa; }
-.gradio-dropdown select, .gradio-dropdown input { background:#0d0d14 !important; border:1px solid #2a2a3e !important; color:#e8e8f0 !important; border-radius:8px !important; }
-.ref-panel { margin-top:1rem; padding:1rem; border:1px solid #24304a; border-radius:12px; background:#0d111c; }
-.ref-panel label { color:#c7d2fe !important; }
 .ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
-.ref-status.ok { color:#a7f3d0; background:rgba(16,185,129,0.10); border:1px solid rgba(16,185,129,0.25); }
 .ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
-.ref-status.muted { color:#9ca3af; background:rgba(148,163,184,0.08); border:1px solid rgba(148,163,184,0.18); }
-.ref-help { color:#9ca3af; font-size:0.86rem; line-height:1.45; margin-top:0.5rem; }
 """
-with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
     gr.HTML(
-        '<div class="app-header"><h1>BlueTTS</h1>'
         '<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
         '<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
     )
@@ -893,20 +883,16 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
                 cfg_input   = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
         with gr.Column(elem_classes="ref-panel"):
-            voice_source_input = gr.Radio(
-                choices=[("Saved voice", "saved"), ("Uploaded reference", "upload")],
-                value="saved" if VOICE_STYLES else "upload",
-                label="Voice source",
             )
             ref_wav_input = gr.Audio(
-                label="Reference audio for uploaded voice",
                 sources=["upload", "microphone"], type="filepath",
             )
-            gr.HTML(
-                '<div class="ref-help">For best cloning: upload 3-12 seconds of clean speech, one speaker, '
-                'no music/noise, no long silence. This is used only when Voice source is "Uploaded reference".</div>'
-            )
-            ref_status = gr.HTML(_reference_audio_status(None, "saved"))
         btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
     audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
@@ -914,21 +900,28 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
     gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
-    voice_source_input.change(
-        _reference_audio_status,
-        inputs=[ref_wav_input, voice_source_input],
-        outputs=[ref_status],
-    )
     ref_wav_input.change(
         _reference_audio_status,
-        inputs=[ref_wav_input, voice_source_input],
         outputs=[ref_status],
     )
     btn.click(
         synthesize_text,
-        inputs=[text_input, voice_source_input, voice_input, lang_input, steps_input, speed_input, cfg_input, ref_wav_input],
-        outputs=[audio_out, stats_out],
     )
     gr.HTML("""

     return style_from_dict(payload)
+def _reference_audio_status(ref_wav: Optional[str]):
     if not ref_wav:
         return (
+            '<div class="ref-status muted">No reference uploaded — '
+            'using the saved voice above. Upload or record a clip to clone a custom voice.</div>'
         )
     try:
         import soundfile as sf
             msg = "Too short for cloning; use at least 3 seconds."
         elif dur > 20.0:
             level = "warn"
+            msg = "Long clips work, but only the early frames are used. Trim to the cleanest 3-12 seconds."
         elif channels > 2:
             level = "warn"
+            msg = "Many channels detected; mono or stereo speech works best."
         else:
             level = "ok"
+            msg = "Cloned voice ready — this upload will be used for generation."
         return (
             f'<div class="ref-status {level}">'
             f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
         return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
+def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
                     cfg_scale: float, ref_wav: Optional[str] = None):
     t0 = time.time()
+    using_ref = bool(ref_wav)
     if using_ref:
         try:
             style = style_from_wav(ref_wav)
         except Exception as e:
         if not VOICE_STYLES:
             err = (
                 '<div class="stats-bar"><span class="stat-pill">'
+                'No saved voices installed. Upload a reference clip to clone a voice.</span></div>'
             )
             return None, err
         style = VOICE_STYLES[voice]
     rtf = proc_time / audio_dur if audio_dur > 0 else 0
     stats = (
         f'<div class="stats-bar">'
+        f'<span class="stat-pill">Voice: {"cloned from upload" if using_ref else html.escape(voice)}</span>'
         f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
         f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
         f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
 css = _load_font_face() + """
 @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
 * { box-sizing: border-box; }
+body, .gradio-container { background:#06101f !important; font-family:'EuclidCircularB',sans-serif !important; color:#e6efff !important; }
 .gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
 .app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
+.app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#38bdf8 0%,#3b82f6 50%,#1d4ed8 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0 0 0.5rem; }
+.app-header p { color:#7ea3d4; font-size:1rem; margin:0 0 1rem; }
+.app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #1e40af; border-radius:999px; background:rgba(59,130,246,0.12); }
+.card { background:#0b1a30; border:1px solid #163056; border-radius:16px; padding:1.5rem; margin-bottom:1rem; }
+.big-input textarea { background:#081327 !important; border:1px solid #1e3a66 !important; border-radius:10px !important; color:#e6efff !important; font-size:1.1rem !important; line-height:1.6 !important; padding:1rem !important; unicode-bidi:plaintext !important; }
+.big-input textarea:focus { border-color:#3b82f6 !important; outline:none !important; box-shadow:0 0 0 3px rgba(59,130,246,0.18) !important; }
 .controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
 .ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
 .ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
 .ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
+.gen-btn { background:linear-gradient(135deg,#2563eb,#1d4ed8) !important; border:none !important; border-radius:10px !important; color:#fff !important; font-size:1rem !important; font-weight:600 !important; padding:0.75rem 2rem !important; width:100% !important; margin-top:1rem !important; box-shadow:0 6px 18px rgba(37,99,235,0.35) !important; }
+.gen-btn:hover { opacity:0.9 !important; filter:brightness(1.05); }
+.gradio-audio { background:#0b1a30 !important; border:1px solid #163056 !important; border-radius:12px !important; }
 .stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
+.stat-pill { background:#0e2545; border:1px solid #1e40af; border-radius:20px; padding:0.3rem 0.9rem; font-family:'JetBrains Mono',monospace; font-size:0.8rem; color:#93c5fd; }
+.gradio-dropdown select, .gradio-dropdown input { background:#081327 !important; border:1px solid #1e3a66 !important; color:#e6efff !important; border-radius:8px !important; }
+.ref-panel { margin-top:1rem; padding:1rem; border:1px dashed #1e40af; border-radius:12px; background:#091a34; }
+.ref-panel label { color:#bfdbfe !important; }
+.ref-panel h3 { color:#dbeafe; margin:0 0 0.25rem; font-size:1rem; font-weight:600; }
 .ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
+.ref-status.ok { color:#bae6fd; background:rgba(14,165,233,0.12); border:1px solid rgba(14,165,233,0.35); }
 .ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
+.ref-status.muted { color:#93a6c4; background:rgba(59,130,246,0.08); border:1px solid rgba(59,130,246,0.20); }
+.ref-help { color:#7ea3d4; font-size:0.86rem; line-height:1.45; margin-top:0.5rem; }
 """
+with gr.Blocks(title="BlueTTS V2 — Multilingual TTS") as demo:
     gr.HTML(
+        '<div class="app-header"><h1>BlueTTS V2</h1>'
         '<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
         '<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
     )
                 cfg_input   = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
         with gr.Column(elem_classes="ref-panel"):
+            gr.HTML(
+                '<h3 style="color:#dbeafe;margin:0 0 0.25rem;font-size:1rem;font-weight:600;">Clone a voice (optional)</h3>'
+                '<div class="ref-help">Upload or record 3-12 seconds of clean speech to clone it. '
+                'Leave empty to use the saved voice selected above. Generation starts automatically when you upload.</div>'
             )
             ref_wav_input = gr.Audio(
+                label="Reference audio",
                 sources=["upload", "microphone"], type="filepath",
             )
+            ref_status = gr.HTML(_reference_audio_status(None))
         btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
     audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
     gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
+    synth_inputs = [text_input, voice_input, lang_input, steps_input, speed_input, cfg_input, ref_wav_input]
+    synth_outputs = [audio_out, stats_out]
+    def _auto_synth(text, voice, lang, steps, speed, cfg_scale, ref_wav):
+        if not ref_wav:
+            return gr.update(), gr.update()
+        return synthesize_text(text, voice, lang, steps, speed, cfg_scale, ref_wav)
     ref_wav_input.change(
         _reference_audio_status,
+        inputs=[ref_wav_input],
         outputs=[ref_status],
+    ).then(
+        _auto_synth,
+        inputs=synth_inputs,
+        outputs=synth_outputs,
     )
     btn.click(
         synthesize_text,
+        inputs=synth_inputs,
+        outputs=synth_outputs,
     )
     gr.HTML("""