notmax123 committed on
Commit
56e7960
·
1 Parent(s): 615a636

Simplify Gradio voice cloning and auto-run synthesis

Browse files

If reference audio is present, use it; otherwise use the selected saved voice. Auto-synthesize on upload; refresh Blue theme for V2.

Made-with: Cursor

Files changed (1) hide show
  1. app.py +51 -58
app.py CHANGED
@@ -660,16 +660,11 @@ def style_from_wav(ref_wav: str) -> Style:
660
  return style_from_dict(payload)
661
 
662
 
663
- def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved"):
664
- if voice_source != "upload":
665
- return (
666
- '<div class="ref-status muted">Using the saved voice dropdown. '
667
- 'Uploaded reference audio is ignored unless Voice source is set to "Uploaded reference".</div>'
668
- )
669
  if not ref_wav:
670
  return (
671
- '<div class="ref-status warn">Upload or record a clean 3-12 second clip: one speaker, '
672
- 'no music, no background noise, no long silence.</div>'
673
  )
674
  try:
675
  import soundfile as sf
@@ -681,13 +676,13 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
681
  msg = "Too short for cloning; use at least 3 seconds."
682
  elif dur > 20.0:
683
  level = "warn"
684
- msg = "Long clips work, but the exporter only uses the early reference frames. Trim to the cleanest 3-12 seconds."
685
  elif channels > 2:
686
  level = "warn"
687
- msg = "Many channels detected; mono or stereo speech is best."
688
  else:
689
  level = "ok"
690
- msg = "Ready. This upload will override the saved voice for the next generation."
691
  return (
692
  f'<div class="ref-status {level}">'
693
  f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
@@ -697,17 +692,11 @@ def _reference_audio_status(ref_wav: Optional[str], voice_source: str = "saved")
697
  return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
698
 
699
 
700
- def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps: int, speed: float,
701
  cfg_scale: float, ref_wav: Optional[str] = None):
702
  t0 = time.time()
703
- using_ref = voice_source == "upload"
704
  if using_ref:
705
- if not ref_wav:
706
- err = (
707
- '<div class="stats-bar"><span class="stat-pill">'
708
- 'Reference voice selected, but no audio was uploaded.</span></div>'
709
- )
710
- return None, err
711
  try:
712
  style = style_from_wav(ref_wav)
713
  except Exception as e:
@@ -717,7 +706,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
717
  if not VOICE_STYLES:
718
  err = (
719
  '<div class="stats-bar"><span class="stat-pill">'
720
- 'No saved v2 voices are installed. Choose "Uploaded reference" and upload audio.</span></div>'
721
  )
722
  return None, err
723
  style = VOICE_STYLES[voice]
@@ -731,7 +720,7 @@ def synthesize_text(text: str, voice_source: str, voice: str, lang: str, steps:
731
  rtf = proc_time / audio_dur if audio_dur > 0 else 0
732
  stats = (
733
  f'<div class="stats-bar">'
734
- f'<span class="stat-pill">Voice: {"uploaded reference" if using_ref else html.escape(voice)}</span>'
735
  f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
736
  f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
737
  f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
@@ -833,37 +822,38 @@ def _load_font_face() -> str:
833
  css = _load_font_face() + """
834
  @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
835
  * { box-sizing: border-box; }
836
- body, .gradio-container { background:#0a0a0f !important; font-family:'EuclidCircularB',sans-serif !important; color:#e8e8f0 !important; }
837
  .gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
838
  .app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
839
- .app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#60a5fa 0%,#a78bfa 50%,#34d399 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0 0 0.5rem; }
840
- .app-header p { color:#6b7280; font-size:1rem; margin:0 0 1rem; }
841
- .app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #2a3f5c; border-radius:999px; background:rgba(96,165,250,0.08); }
842
- .card { background:#111118; border:1px solid #1e1e2e; border-radius:16px; padding:1.5rem; margin-bottom:1rem; }
843
- .big-input textarea { background:#0d0d14 !important; border:1px solid #2a2a3e !important; border-radius:10px !important; color:#e8e8f0 !important; font-size:1.1rem !important; line-height:1.6 !important; padding:1rem !important; unicode-bidi:plaintext !important; }
844
- .big-input textarea:focus { border-color:#60a5fa !important; outline:none !important; }
845
  .controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
846
  .ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
847
  .ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
848
  .ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
849
- .gen-btn { background:linear-gradient(135deg,#3b82f6,#8b5cf6) !important; border:none !important; border-radius:10px !important; color:#fff !important; font-size:1rem !important; font-weight:600 !important; padding:0.75rem 2rem !important; width:100% !important; margin-top:1rem !important; }
850
- .gen-btn:hover { opacity:0.85 !important; }
851
- .gradio-audio { background:#111118 !important; border:1px solid #1e1e2e !important; border-radius:12px !important; }
852
  .stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
853
- .stat-pill { background:#1a1a2e; border:1px solid #2a2a4e; border-radius:20px; padding:0.3rem 0.9rem; font-family:'JetBrains Mono',monospace; font-size:0.8rem; color:#a78bfa; }
854
- .gradio-dropdown select, .gradio-dropdown input { background:#0d0d14 !important; border:1px solid #2a2a3e !important; color:#e8e8f0 !important; border-radius:8px !important; }
855
- .ref-panel { margin-top:1rem; padding:1rem; border:1px solid #24304a; border-radius:12px; background:#0d111c; }
856
- .ref-panel label { color:#c7d2fe !important; }
 
857
  .ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
858
- .ref-status.ok { color:#a7f3d0; background:rgba(16,185,129,0.10); border:1px solid rgba(16,185,129,0.25); }
859
  .ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
860
- .ref-status.muted { color:#9ca3af; background:rgba(148,163,184,0.08); border:1px solid rgba(148,163,184,0.18); }
861
- .ref-help { color:#9ca3af; font-size:0.86rem; line-height:1.45; margin-top:0.5rem; }
862
  """
863
 
864
- with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
865
  gr.HTML(
866
- '<div class="app-header"><h1>BlueTTS</h1>'
867
  '<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
868
  '<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
869
  )
@@ -893,20 +883,16 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
893
  cfg_input = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
894
 
895
  with gr.Column(elem_classes="ref-panel"):
896
- voice_source_input = gr.Radio(
897
- choices=[("Saved voice", "saved"), ("Uploaded reference", "upload")],
898
- value="saved" if VOICE_STYLES else "upload",
899
- label="Voice source",
900
  )
901
  ref_wav_input = gr.Audio(
902
- label="Reference audio for uploaded voice",
903
  sources=["upload", "microphone"], type="filepath",
904
  )
905
- gr.HTML(
906
- '<div class="ref-help">For best cloning: upload 3-12 seconds of clean speech, one speaker, '
907
- 'no music/noise, no long silence. This is used only when Voice source is "Uploaded reference".</div>'
908
- )
909
- ref_status = gr.HTML(_reference_audio_status(None, "saved"))
910
 
911
  btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
912
  audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
@@ -914,21 +900,28 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
914
 
915
  gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
916
 
917
- voice_source_input.change(
918
- _reference_audio_status,
919
- inputs=[ref_wav_input, voice_source_input],
920
- outputs=[ref_status],
921
- )
 
 
 
922
  ref_wav_input.change(
923
  _reference_audio_status,
924
- inputs=[ref_wav_input, voice_source_input],
925
  outputs=[ref_status],
 
 
 
 
926
  )
927
 
928
  btn.click(
929
  synthesize_text,
930
- inputs=[text_input, voice_source_input, voice_input, lang_input, steps_input, speed_input, cfg_input, ref_wav_input],
931
- outputs=[audio_out, stats_out],
932
  )
933
 
934
  gr.HTML("""
 
660
  return style_from_dict(payload)
661
 
662
 
663
+ def _reference_audio_status(ref_wav: Optional[str]):
 
 
 
 
 
664
  if not ref_wav:
665
  return (
666
+ '<div class="ref-status muted">No reference uploaded '
667
+ 'using the saved voice above. Upload or record a clip to clone a custom voice.</div>'
668
  )
669
  try:
670
  import soundfile as sf
 
676
  msg = "Too short for cloning; use at least 3 seconds."
677
  elif dur > 20.0:
678
  level = "warn"
679
+ msg = "Long clips work, but only the early frames are used. Trim to the cleanest 3-12 seconds."
680
  elif channels > 2:
681
  level = "warn"
682
+ msg = "Many channels detected; mono or stereo speech works best."
683
  else:
684
  level = "ok"
685
+ msg = "Cloned voice ready this upload will be used for generation."
686
  return (
687
  f'<div class="ref-status {level}">'
688
  f'Reference: {dur:.1f}s, {info.samplerate} Hz, {channels} channel(s). {html.escape(msg)}'
 
692
  return f'<div class="ref-status warn">Could not inspect uploaded audio: {html.escape(str(e))}</div>'
693
 
694
 
695
+ def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
696
  cfg_scale: float, ref_wav: Optional[str] = None):
697
  t0 = time.time()
698
+ using_ref = bool(ref_wav)
699
  if using_ref:
 
 
 
 
 
 
700
  try:
701
  style = style_from_wav(ref_wav)
702
  except Exception as e:
 
706
  if not VOICE_STYLES:
707
  err = (
708
  '<div class="stats-bar"><span class="stat-pill">'
709
+ 'No saved voices installed. Upload a reference clip to clone a voice.</span></div>'
710
  )
711
  return None, err
712
  style = VOICE_STYLES[voice]
 
720
  rtf = proc_time / audio_dur if audio_dur > 0 else 0
721
  stats = (
722
  f'<div class="stats-bar">'
723
+ f'<span class="stat-pill">Voice: {"cloned from upload" if using_ref else html.escape(voice)}</span>'
724
  f'<span class="stat-pill">⏱ {proc_time:.2f}s</span>'
725
  f'<span class="stat-pill">🔊 {audio_dur:.1f}s audio</span>'
726
  f'<span class="stat-pill">⚡ {rtf:.2f}x RTF</span>'
 
822
  css = _load_font_face() + """
823
  @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&display=swap');
824
  * { box-sizing: border-box; }
825
+ body, .gradio-container { background:#06101f !important; font-family:'EuclidCircularB',sans-serif !important; color:#e6efff !important; }
826
  .gradio-container { max-width:900px !important; margin:0 auto !important; padding:2rem 1.5rem !important; }
827
  .app-header { text-align:center; margin-bottom:2rem; padding:2rem 0 1rem; }
828
+ .app-header h1 { font-size:2.8rem; font-weight:600; letter-spacing:-0.03em; background:linear-gradient(135deg,#38bdf8 0%,#3b82f6 50%,#1d4ed8 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; margin:0 0 0.5rem; }
829
+ .app-header p { color:#7ea3d4; font-size:1rem; margin:0 0 1rem; }
830
+ .app-header .github-link { display:inline-flex; align-items:center; gap:0.4rem; margin-top:0.75rem; padding:0.45rem 1rem; font-size:0.9rem; font-weight:500; text-decoration:none !important; color:#93c5fd !important; border:1px solid #1e40af; border-radius:999px; background:rgba(59,130,246,0.12); }
831
+ .card { background:#0b1a30; border:1px solid #163056; border-radius:16px; padding:1.5rem; margin-bottom:1rem; }
832
+ .big-input textarea { background:#081327 !important; border:1px solid #1e3a66 !important; border-radius:10px !important; color:#e6efff !important; font-size:1.1rem !important; line-height:1.6 !important; padding:1rem !important; unicode-bidi:plaintext !important; }
833
+ .big-input textarea:focus { border-color:#3b82f6 !important; outline:none !important; box-shadow:0 0 0 3px rgba(59,130,246,0.18) !important; }
834
  .controls-row { margin-top:1rem; display:flex !important; flex-direction:column !important; gap:0.75rem !important; }
835
  .ctrl-row1, .ctrl-row2, .ctrl-row3 { display:flex !important; flex-direction:row !important; gap:0.75rem !important; width:100% !important; }
836
  .ctrl-lang { flex:2 !important; min-width:0 !important; } .ctrl-voice { flex:3 !important; min-width:0 !important; }
837
  .ctrl-steps, .ctrl-speed, .ctrl-cfg { flex:1 !important; min-width:0 !important; }
838
+ .gen-btn { background:linear-gradient(135deg,#2563eb,#1d4ed8) !important; border:none !important; border-radius:10px !important; color:#fff !important; font-size:1rem !important; font-weight:600 !important; padding:0.75rem 2rem !important; width:100% !important; margin-top:1rem !important; box-shadow:0 6px 18px rgba(37,99,235,0.35) !important; }
839
+ .gen-btn:hover { opacity:0.9 !important; filter:brightness(1.05); }
840
+ .gradio-audio { background:#0b1a30 !important; border:1px solid #163056 !important; border-radius:12px !important; }
841
  .stats-bar { display:flex; gap:0.75rem; flex-wrap:wrap; margin-top:0.75rem; padding:0.75rem 0; }
842
+ .stat-pill { background:#0e2545; border:1px solid #1e40af; border-radius:20px; padding:0.3rem 0.9rem; font-family:'JetBrains Mono',monospace; font-size:0.8rem; color:#93c5fd; }
843
+ .gradio-dropdown select, .gradio-dropdown input { background:#081327 !important; border:1px solid #1e3a66 !important; color:#e6efff !important; border-radius:8px !important; }
844
+ .ref-panel { margin-top:1rem; padding:1rem; border:1px dashed #1e40af; border-radius:12px; background:#091a34; }
845
+ .ref-panel label { color:#bfdbfe !important; }
846
+ .ref-panel h3 { color:#dbeafe; margin:0 0 0.25rem; font-size:1rem; font-weight:600; }
847
  .ref-status { margin-top:0.6rem; padding:0.75rem 0.9rem; border-radius:10px; font-size:0.9rem; line-height:1.4; }
848
+ .ref-status.ok { color:#bae6fd; background:rgba(14,165,233,0.12); border:1px solid rgba(14,165,233,0.35); }
849
  .ref-status.warn { color:#fde68a; background:rgba(245,158,11,0.10); border:1px solid rgba(245,158,11,0.25); }
850
+ .ref-status.muted { color:#93a6c4; background:rgba(59,130,246,0.08); border:1px solid rgba(59,130,246,0.20); }
851
+ .ref-help { color:#7ea3d4; font-size:0.86rem; line-height:1.45; margin-top:0.5rem; }
852
  """
853
 
854
+ with gr.Blocks(title="BlueTTS V2 — Multilingual TTS") as demo:
855
  gr.HTML(
856
+ '<div class="app-header"><h1>BlueTTS V2</h1>'
857
  '<p>Slim multilingual text-to-speech · English · Hebrew · Spanish · German · Italian</p>'
858
  '<a class="github-link" href="https://github.com/maxmelichov/BlueTTS" target="_blank">GitHub · maxmelichov/BlueTTS</a></div>'
859
  )
 
883
  cfg_input = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
884
 
885
  with gr.Column(elem_classes="ref-panel"):
886
+ gr.HTML(
887
+ '<h3 style="color:#dbeafe;margin:0 0 0.25rem;font-size:1rem;font-weight:600;">Clone a voice (optional)</h3>'
888
+ '<div class="ref-help">Upload or record 3-12 seconds of clean speech to clone it. '
889
+ 'Leave empty to use the saved voice selected above. Generation starts automatically when you upload.</div>'
890
  )
891
  ref_wav_input = gr.Audio(
892
+ label="Reference audio",
893
  sources=["upload", "microphone"], type="filepath",
894
  )
895
+ ref_status = gr.HTML(_reference_audio_status(None))
 
 
 
 
896
 
897
  btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
898
  audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
 
900
 
901
  gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
902
 
903
+ synth_inputs = [text_input, voice_input, lang_input, steps_input, speed_input, cfg_input, ref_wav_input]
904
+ synth_outputs = [audio_out, stats_out]
905
+
906
+ def _auto_synth(text, voice, lang, steps, speed, cfg_scale, ref_wav):
907
+ if not ref_wav:
908
+ return gr.update(), gr.update()
909
+ return synthesize_text(text, voice, lang, steps, speed, cfg_scale, ref_wav)
910
+
911
  ref_wav_input.change(
912
  _reference_audio_status,
913
+ inputs=[ref_wav_input],
914
  outputs=[ref_status],
915
+ ).then(
916
+ _auto_synth,
917
+ inputs=synth_inputs,
918
+ outputs=synth_outputs,
919
  )
920
 
921
  btn.click(
922
  synthesize_text,
923
+ inputs=synth_inputs,
924
+ outputs=synth_outputs,
925
  )
926
 
927
  gr.HTML("""