Simplify Gradio voice cloning and auto-run synthesis
Browse files
If reference audio is present, use it; otherwise use the selected saved voice. Auto-synthesize on upload; refresh Blue theme for V2.
Made-with: Cursor
app.py
CHANGED
|
@@ -660,16 +660,11 @@ def style_from_wav(ref_wav: str) -> Style:
|
|
| 660 |
return style_from_dict(payload)
|
| 661 |
|
| 662 |
|
| 663 |
-
def _reference_audio_status(ref_wav: Optional[str]
|
| 664 |
-
if voice_source != "upload":
|
| 665 |
-
return (
|
| 666 |
-
'<div class="ref-status muted">Using the saved voice dropdown. '
|
| 667 |
-
'Uploaded reference audio is ignored unless Voice source is set to "Uploaded reference".</div>'
|
| 668 |
-
)
|
| 669 |
if not ref_wav:
|
| 670 |
return (
|
| 671 |
-
'<div class="ref-status
|
| 672 |
-
'
|
| 673 |
)
|
| 674 |
try:
|
| 675 |
import soundfile as sf
|
|
@@ -681,13 +676,13 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
|
|
| 681 |
msg = "Too short for cloning; use at least 3 seconds."
|
| 682 |
elif dur > 20.0:
|
| 683 |
level = "warn"
|
| 684 |
-
msg = "Long clips work, but
|
| 685 |
elif channels > 2:
|
| 686 |
level = "warn"
|
| 687 |
-
msg = "Many channels detected; mono or stereo speech
|
| 688 |
else:
|
| 689 |
level = "ok"
|
| 690 |
-
msg = "
|
| 691 |
return (
|
| 692 |
f'<div class="ref-status {level}">'
|
| 693 |
f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
|
|
@@ -697,17 +692,11 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
|
|
| 697 |
return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
|
| 698 |
|
| 699 |
|
| 700 |
-
def synthesize_text(text: str,
|
| 701 |
cfg_scale: float, ref_wav: Optional[str] = None):
|
| 702 |
t0 = time.time()
|
| 703 |
-
using_ref =
|
| 704 |
if using_ref:
|
| 705 |
-
if not ref_wav:
|
| 706 |
-
err = (
|
| 707 |
-
'<div class="stats-bar"><span class="stat-pill">'
|
| 708 |
-
'Reference voice selected, but no audio was uploaded.</span></div>'
|
| 709 |
-
)
|
| 710 |
-
return None, err
|
| 711 |
try:
|
| 712 |
style = style_from_wav(ref_wav)
|
| 713 |
except Exception as e:
|
|
@@ -717,7 +706,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
|
|
| 717 |
if not VOICE_STYLES:
|
| 718 |
err = (
|
| 719 |
'<div class="stats-bar"><span class="stat-pill">'
|
| 720 |
-
'No saved
|
| 721 |
)
|
| 722 |
return None, err
|
| 723 |
style = VOICE_STYLES[voice]
|
|
@@ -731,7 +720,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
|
|
| 731 |
rtf = proc_time / audio_dur if audio_dur > 0 else 0
|
| 732 |
stats = (
|
| 733 |
f'<div class="stats-bar">'
|
| 734 |
-
f'<span class="stat-pill">Voice: {"
|
| 735 |
f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
|
| 736 |
f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
|
| 737 |
f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
|
|
@@ -833,37 +822,38 @@ def _load_font_face() -> str:
|
|
| 833 |
css = _load_font_face() + """
|
| 834 |
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
|
| 835 |
* { box-sizing: border-box; }
|
| 836 |
-
body, .gradio-container { background:#
|
| 837 |
.gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
|
| 838 |
.app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
|
| 839 |
-
.app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#
|
| 840 |
-
.app-header p { color:#
|
| 841 |
-
.app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #
|
| 842 |
-
.card { background:#
|
| 843 |
-
.big-input textarea { background:#
|
| 844 |
-
.big-input textarea:focus { border-color:#
|
| 845 |
.controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
|
| 846 |
.ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
|
| 847 |
.ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
|
| 848 |
.ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
|
| 849 |
-
.gen-btn { background:linear-gradient(135deg,#
|
| 850 |
-
.gen-btn:hover { opacity:0.
|
| 851 |
-
.gradio-audio { background:#
|
| 852 |
.stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
|
| 853 |
-
.stat-pill { background:#
|
| 854 |
-
.gradio-dropdown select, .gradio-dropdown input { background:#
|
| 855 |
-
.ref-panel { margin-top:1rem; padding:1rem; border:1px
|
| 856 |
-
.ref-panel label { color:#
|
|
|
|
| 857 |
.ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
|
| 858 |
-
.ref-status.ok { color:#
|
| 859 |
.ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
|
| 860 |
-
.ref-status.muted { color:#
|
| 861 |
-
.ref-help { color:#
|
| 862 |
"""
|
| 863 |
|
| 864 |
-
with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
|
| 865 |
gr.HTML(
|
| 866 |
-
'<div class="app-header"><h1>BlueTTS</h1>'
|
| 867 |
'<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
|
| 868 |
'<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
|
| 869 |
)
|
|
@@ -893,20 +883,16 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
|
|
| 893 |
cfg_input = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
|
| 894 |
|
| 895 |
with gr.Column(elem_classes="ref-panel"):
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
)
|
| 901 |
ref_wav_input = gr.Audio(
|
| 902 |
-
label="Reference audio
|
| 903 |
sources=["upload", "microphone"], type="filepath",
|
| 904 |
)
|
| 905 |
-
gr.HTML(
|
| 906 |
-
'<div class="ref-help">For best cloning: upload 3-12 seconds of clean speech, one speaker, '
|
| 907 |
-
'no music/noise, no long silence. This is used only when Voice source is "Uploaded reference".</div>'
|
| 908 |
-
)
|
| 909 |
-
ref_status = gr.HTML(_reference_audio_status(None, "saved"))
|
| 910 |
|
| 911 |
btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
|
| 912 |
audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
|
|
@@ -914,21 +900,28 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
|
|
| 914 |
|
| 915 |
gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
|
| 916 |
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
|
|
|
|
|
|
|
|
|
| 922 |
ref_wav_input.change(
|
| 923 |
_reference_audio_status,
|
| 924 |
-
inputs=[ref_wav_input
|
| 925 |
outputs=[ref_status],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 926 |
)
|
| 927 |
|
| 928 |
btn.click(
|
| 929 |
synthesize_text,
|
| 930 |
-
inputs=
|
| 931 |
-
outputs=
|
| 932 |
)
|
| 933 |
|
| 934 |
gr.HTML("""
|
|
|
|
| 660 |
return style_from_dict(payload)
|
| 661 |
|
| 662 |
|
| 663 |
+
def _reference_audio_status(ref_wav: Optional[str]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
if not ref_wav:
|
| 665 |
return (
|
| 666 |
+
'<div class="ref-status muted">No reference uploaded — '
|
| 667 |
+
'using the saved voice above. Upload or record a clip to clone a custom voice.</div>'
|
| 668 |
)
|
| 669 |
try:
|
| 670 |
import soundfile as sf
|
|
|
|
| 676 |
msg = "Too short for cloning; use at least 3 seconds."
|
| 677 |
elif dur > 20.0:
|
| 678 |
level = "warn"
|
| 679 |
+
msg = "Long clips work, but only the early frames are used. Trim to the cleanest 3-12 seconds."
|
| 680 |
elif channels > 2:
|
| 681 |
level = "warn"
|
| 682 |
+
msg = "Many channels detected; mono or stereo speech works best."
|
| 683 |
else:
|
| 684 |
level = "ok"
|
| 685 |
+
msg = "Cloned voice ready — this upload will be used for generation."
|
| 686 |
return (
|
| 687 |
f'<div class="ref-status {level}">'
|
| 688 |
f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
|
|
|
|
| 692 |
return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
|
| 693 |
|
| 694 |
|
| 695 |
+
def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
|
| 696 |
cfg_scale: float, ref_wav: Optional[str] = None):
|
| 697 |
t0 = time.time()
|
| 698 |
+
using_ref = bool(ref_wav)
|
| 699 |
if using_ref:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
try:
|
| 701 |
style = style_from_wav(ref_wav)
|
| 702 |
except Exception as e:
|
|
|
|
| 706 |
if not VOICE_STYLES:
|
| 707 |
err = (
|
| 708 |
'<div class="stats-bar"><span class="stat-pill">'
|
| 709 |
+
'No saved voices installed. Upload a reference clip to clone a voice.</span></div>'
|
| 710 |
)
|
| 711 |
return None, err
|
| 712 |
style = VOICE_STYLES[voice]
|
|
|
|
| 720 |
rtf = proc_time / audio_dur if audio_dur > 0 else 0
|
| 721 |
stats = (
|
| 722 |
f'<div class="stats-bar">'
|
| 723 |
+
f'<span class="stat-pill">Voice: {"cloned from upload" if using_ref else html.escape(voice)}</span>'
|
| 724 |
f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
|
| 725 |
f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
|
| 726 |
f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
|
|
|
|
| 822 |
css = _load_font_face() + """
|
| 823 |
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
|
| 824 |
* { box-sizing: border-box; }
|
| 825 |
+
body, .gradio-container { background:#06101f !important; font-family:'EuclidCircularB',sans-serif !important; color:#e6efff !important; }
|
| 826 |
.gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
|
| 827 |
.app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
|
| 828 |
+
.app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#38bdf8 0%,#3b82f6 50%,#1d4ed8 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0 0 0.5rem; }
|
| 829 |
+
.app-header p { color:#7ea3d4; font-size:1rem; margin:0 0 1rem; }
|
| 830 |
+
.app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #1e40af; border-radius:999px; background:rgba(59,130,246,0.12); }
|
| 831 |
+
.card { background:#0b1a30; border:1px solid #163056; border-radius:16px; padding:1.5rem; margin-bottom:1rem; }
|
| 832 |
+
.big-input textarea { background:#081327 !important; border:1px solid #1e3a66 !important; border-radius:10px !important; color:#e6efff !important; font-size:1.1rem !important; line-height:1.6 !important; padding:1rem !important; unicode-bidi:plaintext !important; }
|
| 833 |
+
.big-input textarea:focus { border-color:#3b82f6 !important; outline:none !important; box-shadow:0 0 0 3px rgba(59,130,246,0.18) !important; }
|
| 834 |
.controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
|
| 835 |
.ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
|
| 836 |
.ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
|
| 837 |
.ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
|
| 838 |
+
.gen-btn { background:linear-gradient(135deg,#2563eb,#1d4ed8) !important; border:none !important; border-radius:10px !important; color:#fff !important; font-size:1rem !important; font-weight:600 !important; padding:0.75rem 2rem !important; width:100% !important; margin-top:1rem !important; box-shadow:0 6px 18px rgba(37,99,235,0.35) !important; }
|
| 839 |
+
.gen-btn:hover { opacity:0.9 !important; filter:brightness(1.05); }
|
| 840 |
+
.gradio-audio { background:#0b1a30 !important; border:1px solid #163056 !important; border-radius:12px !important; }
|
| 841 |
.stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
|
| 842 |
+
.stat-pill { background:#0e2545; border:1px solid #1e40af; border-radius:20px; padding:0.3rem 0.9rem; font-family:'JetBrains Mono',monospace; font-size:0.8rem; color:#93c5fd; }
|
| 843 |
+
.gradio-dropdown select, .gradio-dropdown input { background:#081327 !important; border:1px solid #1e3a66 !important; color:#e6efff !important; border-radius:8px !important; }
|
| 844 |
+
.ref-panel { margin-top:1rem; padding:1rem; border:1px dashed #1e40af; border-radius:12px; background:#091a34; }
|
| 845 |
+
.ref-panel label { color:#bfdbfe !important; }
|
| 846 |
+
.ref-panel h3 { color:#dbeafe; margin:0 0 0.25rem; font-size:1rem; font-weight:600; }
|
| 847 |
.ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
|
| 848 |
+
.ref-status.ok { color:#bae6fd; background:rgba(14,165,233,0.12); border:1px solid rgba(14,165,233,0.35); }
|
| 849 |
.ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
|
| 850 |
+
.ref-status.muted { color:#93a6c4; background:rgba(59,130,246,0.08); border:1px solid rgba(59,130,246,0.20); }
|
| 851 |
+
.ref-help { color:#7ea3d4; font-size:0.86rem; line-height:1.45; margin-top:0.5rem; }
|
| 852 |
"""
|
| 853 |
|
| 854 |
+
with gr.Blocks(title="BlueTTS V2 — Multilingual TTS") as demo:
|
| 855 |
gr.HTML(
|
| 856 |
+
'<div class="app-header"><h1>BlueTTS V2</h1>'
|
| 857 |
'<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
|
| 858 |
'<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
|
| 859 |
)
|
|
|
|
| 883 |
cfg_input = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
|
| 884 |
|
| 885 |
with gr.Column(elem_classes="ref-panel"):
|
| 886 |
+
gr.HTML(
|
| 887 |
+
'<h3 style="color:#dbeafe;margin:0 0 0.25rem;font-size:1rem;font-weight:600;">Clone a voice (optional)</h3>'
|
| 888 |
+
'<div class="ref-help">Upload or record 3-12 seconds of clean speech to clone it. '
|
| 889 |
+
'Leave empty to use the saved voice selected above. Generation starts automatically when you upload.</div>'
|
| 890 |
)
|
| 891 |
ref_wav_input = gr.Audio(
|
| 892 |
+
label="Reference audio",
|
| 893 |
sources=["upload", "microphone"], type="filepath",
|
| 894 |
)
|
| 895 |
+
ref_status = gr.HTML(_reference_audio_status(None))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
|
| 897 |
btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
|
| 898 |
audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
|
|
|
|
| 900 |
|
| 901 |
gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
|
| 902 |
|
| 903 |
+
synth_inputs = [text_input, voice_input, lang_input, steps_input, speed_input, cfg_input, ref_wav_input]
|
| 904 |
+
synth_outputs = [audio_out, stats_out]
|
| 905 |
+
|
| 906 |
+
def _auto_synth(text, voice, lang, steps, speed, cfg_scale, ref_wav):
|
| 907 |
+
if not ref_wav:
|
| 908 |
+
return gr.update(), gr.update()
|
| 909 |
+
return synthesize_text(text, voice, lang, steps, speed, cfg_scale, ref_wav)
|
| 910 |
+
|
| 911 |
ref_wav_input.change(
|
| 912 |
_reference_audio_status,
|
| 913 |
+
inputs=[ref_wav_input],
|
| 914 |
outputs=[ref_status],
|
| 915 |
+
).then(
|
| 916 |
+
_auto_synth,
|
| 917 |
+
inputs=synth_inputs,
|
| 918 |
+
outputs=synth_outputs,
|
| 919 |
)
|
| 920 |
|
| 921 |
btn.click(
|
| 922 |
synthesize_text,
|
| 923 |
+
inputs=synth_inputs,
|
| 924 |
+
outputs=synth_outputs,
|
| 925 |
)
|
| 926 |
|
| 927 |
gr.HTML("""
|