asbgig committed on
Commit
2c102d1
·
verified ·
1 Parent(s): 5d1c45d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -61
app.py CHANGED
@@ -1,106 +1,128 @@
1
- import os
2
- import gradio as gr
3
- import tempfile, os
4
  import numpy as np
5
  import soundfile as sf
 
6
  from TTS.api import TTS
7
 
8
- # If Coqui shows a CPML prompt when downloading models,
9
- # try pre-accept via env var (safe no-op if ignored).
10
- os.environ.setdefault("COQUI_TOS_AGREED", "y")
11
-
12
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
13
 
14
- # Lazy-load to avoid heavy import before Space is ready
15
- _tts_obj = None
16
  def get_tts():
17
- global _tts_obj
18
- if _tts_obj is None:
19
- _tts_obj = TTS(MODEL_NAME)
20
- return _tts_obj
 
 
 
 
 
 
21
 
22
  LANGS = [
23
- ("English", "en"), ("Urdu", "ur"), ("Hindi", "hi"), ("Arabic", "ar"),
24
- ("French", "fr"), ("German", "de"), ("Spanish", "es"), ("Italian", "it"),
25
- ("Portuguese", "pt"), ("Turkish", "tr"),
26
  ]
27
 
28
- def clean_text(text: str) -> str:
29
  return " ".join((text or "").strip().split())
30
 
31
  def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
32
  try:
33
- tts.tts_to_file(
34
- text=txt, file_path=out_path,
35
- speaker_wav=wav_path, language=lang, speed=speed,
36
- )
37
  except TypeError:
38
- tts.tts_to_file(
39
- text=txt, file_path=out_path,
40
- speaker_wav=wav_path, language=lang,
41
- )
42
 
43
- def tts_clone(text, ref_audio, language_code, speed, split_sentences):
44
- if ref_audio is None:
 
 
 
 
45
  raise gr.Error("Please upload a reference voice sample (10–60 seconds).")
 
46
  text = clean_text(text)
47
  if not text:
48
  raise gr.Error("Please enter some text.")
49
 
 
 
 
 
50
  tts = get_tts()
51
  wav_path = ref_audio
52
 
53
- chunks = [text]
54
- if split_sentences:
55
- import re
56
- chunks = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
57
-
58
  out_wavs = []
 
59
  with tempfile.TemporaryDirectory() as td:
60
  for i, chunk in enumerate(chunks, 1):
 
61
  out_path = os.path.join(td, f"part_{i}.wav")
62
  synth_to_file_safe(tts, chunk, out_path, wav_path, language_code, speed)
63
  data, sr = sf.read(out_path)
64
  out_wavs.append((data, sr))
65
 
66
- if len(out_wavs) == 1:
67
- final_data, sr = out_wavs[0]
68
- else:
69
- sr = out_wavs[0][1]
70
- final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
71
-
72
  final_path = os.path.join(td, "output.wav")
73
- sf.write(final_path, final_data, sr)
74
  return final_path
75
 
76
- with gr.Blocks(title="TalkClone - Voice Cloning & TTS", css="#warning{border-left:4px solid #22c55e;padding-left:8px;}") as demo:
77
- gr.Markdown("# TalkClone — Turn Text into Speech using a Reference Audio")
78
- gr.Markdown(
79
- "Upload a short, clean **reference voice** (10–60s), pick a **language**, type your **text**, and generate audio. "
80
- "For best results: no music/background noise, single speaker, 16kHz+ WAV/MP3."
81
- )
82
- with gr.Row():
83
- with gr.Column():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
85
  language = gr.Dropdown(choices=LANGS, value="en", label="Language")
86
- text = gr.Textbox(label="Text", lines=5, placeholder="Type your text here...")
87
  speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
88
  split = gr.Checkbox(value=True, label="Auto split long text by sentence")
89
- submit = gr.Button("Generate", variant="primary")
90
- gr.Markdown(
91
- '<div id="warning"><strong>Consent & Safety:</strong> Only clone voices you have explicit permission to use. '
92
- "Avoid public-figure impersonation and disclose AI-generated audio when required by law.</div>"
93
- )
94
- with gr.Column():
95
- output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
96
- download = gr.File(label="Download audio")
97
 
98
  def run_and_return(text, ref_audio, language, speed, split):
99
- out_path = tts_clone(text, ref_audio, language, speed, split)
100
- return out_path, out_path
101
 
102
- submit.click(run_and_return, inputs=[text, ref_audio, language, speed, split],
103
- outputs=[output, download])
 
104
 
105
  if __name__ == "__main__":
106
- demo.launch()
 
 
1
+ import os, re, tempfile
 
 
2
  import numpy as np
3
  import soundfile as sf
4
+ import gradio as gr
5
  from TTS.api import TTS
6
 
7
# -------- speed / device --------
# GPU use is ON unless the USE_GPU environment variable says otherwise.
# On a CPU-only Space set USE_GPU=0: that is the only way the long-text
# guard in tts_clone() (which checks `not USE_GPU`) becomes active.
# Accepted "on" spellings: 1, true, yes, on (case-insensitive).
USE_GPU = os.environ.get("USE_GPU", "1").strip().lower() in {"1", "true", "yes", "on"}

# Coqui model identifier passed to TTS() when the model is first loaded.
MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
10
 
11
# Process-wide cache for the loaded model; filled on the first get_tts() call.
_tts = None


def get_tts():
    """Return the shared TTS model, loading it on first use.

    The model named by ``MODEL_NAME`` is constructed once and cached in the
    module-level ``_tts``. When ``USE_GPU`` is true the model is moved to
    CUDA on a best-effort basis: if that fails, the CPU model is kept and a
    warning is logged instead of the failure being silently discarded.

    Returns:
        The cached ``TTS`` instance.
    """
    global _tts
    if _tts is None:
        model = TTS(MODEL_NAME)
        if USE_GPU:
            try:
                model = model.to("cuda")
            except Exception as exc:
                # Deliberate best-effort: keep serving from CPU, but make the
                # fallback visible in the logs so slow inference is explainable.
                import logging

                logging.getLogger(__name__).warning(
                    "Could not move the TTS model to CUDA (%s); staying on CPU.", exc
                )
        _tts = model
    return _tts
23
 
24
# (display label, language code) pairs offered in the language dropdown.
# Every code here must be one the XTTS v2 model accepts. "ur" (Urdu) was
# listed previously but is not in the model's documented language set, so
# selecting it could only fail at synthesis time; it has been removed.
LANGS = [
    ("English", "en"),
    ("Hindi", "hi"),
    ("Arabic", "ar"),
    ("French", "fr"),
    ("German", "de"),
    ("Spanish", "es"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Turkish", "tr"),
]
29
 
30
def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) becomes a single space. A falsy
    value such as ``None`` or ``""`` yields the empty string.
    """
    if not text:
        return ""
    words = text.split()
    return " ".join(words)
32
 
33
def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesize *txt* to *out_path*, tolerating backends without ``speed``.

    ``tts.tts_to_file`` is first called with the ``speed`` keyword. If that
    call raises ``TypeError`` it is repeated once without ``speed``.

    Args:
        tts: Object exposing ``tts_to_file(text=, file_path=, speaker_wav=, language=, ...)``.
        txt: Text to speak.
        out_path: Destination audio file path.
        wav_path: Reference speaker audio path.
        lang: Language code forwarded as ``language``.
        speed: Speaking-rate value forwarded as ``speed`` on the first attempt.
    """
    common_kwargs = {
        "text": txt,
        "file_path": out_path,
        "speaker_wav": wav_path,
        "language": lang,
    }
    try:
        tts.tts_to_file(**common_kwargs, speed=speed)
    except TypeError:
        # Retry without the keyword the backend rejected.
        tts.tts_to_file(**common_kwargs)
 
 
40
 
41
# Whitespace that follows a sentence terminator. Besides ASCII . ! ? this
# recognizes the Devanagari danda (U+0964), the Arabic question mark (U+061F)
# and the Arabic full stop (U+06D4), so Hindi and Arabic input is split too.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?\u0964\u061F\u06D4])\s+")


def split_sentences(text: str):
    """Split *text* into sentences at whitespace following a terminator.

    The terminator stays attached to its sentence. Empty fragments are
    discarded. If nothing survives (for example *text* is empty or only
    whitespace), a one-element list containing the original *text* is
    returned so callers always receive at least one chunk.

    Returns:
        A non-empty list of strings.
    """
    parts = [piece.strip() for piece in _SENTENCE_BOUNDARY.split(text) if piece.strip()]
    return parts or [text]
44
+
45
def tts_clone(text, ref_audio, language_code, speed, split_long, progress=gr.Progress()):
    """Synthesize *text* in the voice of *ref_audio* and return a WAV path.

    Args:
        text: Raw user text; whitespace is normalized with ``clean_text``.
        ref_audio: File path of the reference voice sample.
        language_code: Language code forwarded to the TTS backend.
        speed: Speaking-rate value forwarded to the TTS backend.
        split_long: When true, synthesize sentence by sentence and join
            the audio; otherwise synthesize the whole text in one call.
        progress: Gradio progress tracker, called once per chunk.

    Returns:
        Path of a WAV file holding the full utterance. The file is created
        outside the scratch directory so it still exists after this function
        returns and the caller can read it.

    Raises:
        gr.Error: If the reference audio or text is missing, or the text
            exceeds 600 characters while ``USE_GPU`` is false.
    """
    if not ref_audio:
        raise gr.Error("Please upload a reference voice sample (10–60 seconds).")

    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")

    # hard guard for CPU: very long text can take a long time
    if not USE_GPU and len(text) > 600:
        raise gr.Error("Text is long for CPU. Please try ≤ 600 characters, or switch the Space to a GPU for long texts.")

    tts = get_tts()
    chunks = split_sentences(text) if split_long else [text]
    total = max(1, len(chunks))

    pieces = []
    sample_rate = None
    # Per-chunk files are scratch data only; the directory is removed as soon
    # as every chunk has been read back into memory.
    with tempfile.TemporaryDirectory() as td:
        for i, chunk in enumerate(chunks, 1):
            progress((i - 1) / total, desc=f"Generating part {i}/{len(chunks)}")
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, ref_audio, language_code, speed)
            data, sr = sf.read(part_path)
            if sample_rate is None:
                # All parts come from the same model; the first rate is used
                # for the joined output.
                sample_rate = sr
            pieces.append(data)

    final = pieces[0] if len(pieces) == 1 else np.concatenate(pieces, axis=0)

    # The result must outlive the scratch directory: previously it was written
    # inside `td`, which is deleted when the `with` block exits, so the path
    # handed back to the UI no longer existed.
    fd, final_path = tempfile.mkstemp(prefix="talkclone_", suffix=".wav")
    os.close(fd)
    sf.write(final_path, final, sample_rate)
    return final_path
76
 
77
# ---------------- UI ----------------
# Light theme based on Gradio's "Soft" preset with white surfaces and a thin
# grey border around blocks.
THEME = gr.themes.Soft(
    primary_hue="blue",
    neutral_hue="slate",
).set(
    body_background_fill="#ffffff",
    block_background_fill="#ffffff",
    block_border_width="1px",
    block_border_color="#e5e7eb",
    # NOTE(review): the original passed radius_xl="14px" here. `radius_xl` is
    # a core size variable chosen via the constructor's `radius_size`, not a
    # keyword of `.set()`, so that call is expected to raise TypeError.
    # `block_radius` is the `.set()` variable for block corner rounding —
    # confirm against the installed Gradio version.
    block_radius="14px",
)
87
+
88
# Extra stylesheet handed to gr.Blocks(css=...): constrains the page width,
# hides the Gradio footer and toolbar buttons, and tightens widget spacing.
CUSTOM_CSS = """
/* one-column layout */
.container {max-width: 880px; margin: 24px auto;}
/* hide footer (“Built with Gradio”) */
footer { display: none !important; }
/* hide top-right toolbar (API / Settings / etc.) */
button[aria-label="Use via API"],
button[aria-label="Settings"],
a[href*="gradio.app"] { display:none !important; }
/* tighten widgets */
.gradio-container .wrap {gap: 8px;}
"""
100
+
101
# Page layout: a heading, one group with the inputs and the Generate button,
# and one group with the playable result plus a download link.
# NOTE(review): the exact nesting of the widgets was reconstructed from a
# whitespace-stripped source — confirm against the real file.
with gr.Blocks(theme=THEME, css=CUSTOM_CSS, fill_height=True, title="TalkClone - Voice Cloning & TTS") as demo:
    gr.HTML(
        '<div class="container"><h1 style="margin:0 0 8px;font-weight:700;">TalkClone — Clone a voice & generate speech</h1>'
        '<p style="margin:0 0 16px;color:#334155;">Upload a clean reference (10–60s), choose language, enter text, then Generate.</p></div>'
    )

    with gr.Group():
        with gr.Column(scale=1):
            ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
            language = gr.Dropdown(choices=LANGS, value="en", label="Language")
            text = gr.Textbox(label="Text", lines=6, placeholder="Type your text here")
            speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
            split = gr.Checkbox(value=True, label="Auto split long text by sentence")
            generate = gr.Button("Generate", variant="primary", scale=1)

    with gr.Group():
        output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
        download = gr.File(label="Download audio")

    def run_and_return(text, ref_audio, language, speed, split, progress=gr.Progress()):
        """Run the synthesis and feed the same file to the player and the download box.

        ``progress`` is declared on the event handler itself because Gradio
        injects its tracker into the function wired to the event; it is then
        forwarded so the per-chunk updates made by ``tts_clone`` are reported.
        """
        p = tts_clone(text, ref_audio, language, speed, split, progress=progress)
        return p, p

    generate.click(
        run_and_return,
        inputs=[text, ref_audio, language, speed, split],
        outputs=[output, download],
    )
125
 
126
if __name__ == "__main__":
    # hide “Use via API”, keep errors off in production
    launch_options = {
        "show_api": False,
        "show_error": False,
    }
    demo.launch(**launch_options)