asbgig committed on
Commit
bf4353b
·
verified ·
1 Parent(s): 2c102d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -88
app.py CHANGED
@@ -1,128 +1,169 @@
1
- import os, re, tempfile
 
 
 
 
2
  import numpy as np
3
  import soundfile as sf
 
 
 
 
4
  import gradio as gr
5
  from TTS.api import TTS
6
 
7
- # -------- speed / device --------
8
- USE_GPU = os.environ.get("USE_GPU", "1") == "1" # set to 1 if you switch Space to GPU (T4, A10G)
 
9
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
10
 
11
- _tts = None
12
- def get_tts():
13
- global _tts
14
- if _tts is None:
15
- t = TTS(MODEL_NAME)
16
- try:
17
- if USE_GPU:
18
- t = t.to("cuda")
19
- except Exception:
20
- pass
21
- _tts = t
22
- return _tts
23
 
 
24
  LANGS = [
25
- ("English","en"),("Urdu","ur"),("Hindi","hi"),("Arabic","ar"),
26
- ("French","fr"),("German","de"),("Spanish","es"),("Italian","it"),
27
- ("Portuguese","pt"),("Turkish","tr"),
 
 
 
 
 
 
 
28
  ]
29
 
30
- def clean_text(text:str)->str:
 
31
  return " ".join((text or "").strip().split())
32
 
33
- def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
 
 
 
 
34
  try:
35
- tts.tts_to_file(text=txt, file_path=out_path,
36
- speaker_wav=wav_path, language=lang, speed=speed)
 
 
 
 
 
37
  except TypeError:
38
- tts.tts_to_file(text=txt, file_path=out_path,
39
- speaker_wav=wav_path, language=lang)
40
-
41
- def split_sentences(text:str):
42
- parts = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
43
- return parts or [text]
44
-
45
- def tts_clone(text, ref_audio, language_code, speed, split_long, progress=gr.Progress()):
46
- if not ref_audio:
47
- raise gr.Error("Please upload a reference voice sample (10–60 seconds).")
48
-
 
49
  text = clean_text(text)
50
  if not text:
51
  raise gr.Error("Please enter some text.")
52
 
53
- # hard guard for CPU: very long text can take a long time
54
- if not USE_GPU and len(text) > 600:
55
- raise gr.Error("Text is long for CPU. Please try ≤ 600 characters, or switch the Space to a GPU for long texts.")
56
-
57
- tts = get_tts()
58
  wav_path = ref_audio
59
 
60
- chunks = split_sentences(text) if split_long else [text]
61
- out_wavs = []
 
 
62
 
 
63
  with tempfile.TemporaryDirectory() as td:
64
  for i, chunk in enumerate(chunks, 1):
65
- progress((i-1)/max(1,len(chunks)), desc=f"Generating part {i}/{len(chunks)}")
66
  out_path = os.path.join(td, f"part_{i}.wav")
67
- synth_to_file_safe(tts, chunk, out_path, wav_path, language_code, speed)
68
  data, sr = sf.read(out_path)
69
  out_wavs.append((data, sr))
70
 
71
- sr = out_wavs[0][1]
72
- final = out_wavs[0][0] if len(out_wavs)==1 else np.concatenate([d for d,_ in out_wavs], axis=0)
 
 
 
 
 
 
73
  final_path = os.path.join(td, "output.wav")
74
- sf.write(final_path, final, sr)
75
  return final_path
76
 
77
- # ---------------- UI ----------------
78
- THEME = gr.themes.Soft(
79
- primary_hue="blue", neutral_hue="slate"
80
- ).set(
81
- body_background_fill="#ffffff",
82
- block_background_fill="#ffffff",
83
- block_border_width="1px",
84
- block_border_color="#e5e7eb",
85
- radius_xl="14px"
86
- )
87
 
88
- CUSTOM_CSS = """
89
- /* one-column layout */
90
- .container {max-width: 880px; margin: 24px auto;}
91
- /* hide footer (“Built with Gradio”) */
92
- footer { display: none !important; }
93
- /* hide top-right toolbar (API / Settings / etc.) */
94
- button[aria-label="Use via API"],
95
- button[aria-label="Settings"],
96
- a[href*="gradio.app"] { display:none !important; }
97
- /* tighten widgets */
98
- .gradio-container .wrap {gap: 8px;}
99
  """
100
 
101
- with gr.Blocks(theme=THEME, css=CUSTOM_CSS, fill_height=True, title="TalkClone - Voice Cloning & TTS") as demo:
102
- gr.HTML('<div class="container"><h1 style="margin:0 0 8px;font-weight:700;">TalkClone — Clone a voice & generate speech</h1>'
103
- '<p style="margin:0 0 16px;color:#334155;">Upload a clean reference (10–60s), choose language, enter text, then Generate.</p></div>')
104
-
105
- with gr.Group():
106
- with gr.Column(scale=1):
107
- ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
108
- language = gr.Dropdown(choices=LANGS, value="en", label="Language")
109
- text = gr.Textbox(label="Text", lines=6, placeholder="Type your text here…")
110
- speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
111
- split = gr.Checkbox(value=True, label="Auto split long text by sentence")
112
- generate = gr.Button("Generate", variant="primary", scale=1)
113
 
114
- with gr.Group():
115
- output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
116
- download = gr.File(label="Download audio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  def run_and_return(text, ref_audio, language, speed, split):
119
- p = tts_clone(text, ref_audio, language, speed, split)
120
- return p, p
121
 
122
- generate.click(run_and_return,
123
- inputs=[text, ref_audio, language, speed, split],
124
- outputs=[output, download])
 
 
125
 
126
  if __name__ == "__main__":
127
- # hide “Use via API”, keep errors off in production
128
- demo.launch(show_api=False, show_error=False)
 
 
 
 
 
 
1
+ # app.py — TalkClone (HF Space, one-column, footer hidden)
2
+
3
+ import os
4
+ import tempfile
5
+ import re
6
  import numpy as np
7
  import soundfile as sf
8
+
9
+ # --- Coqui XTTS license prompt (must be set in headless envs like Spaces)
10
+ os.environ.setdefault("COQUI_TOS_AGREED", "1")
11
+
12
  import gradio as gr
13
  from TTS.api import TTS
14
 
15
+ # ----------------------------
16
+ # Model: Coqui XTTS v2
17
+ # ----------------------------
18
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
19
 
20
+ # Try to use GPU when available (on HF, switch Space hardware to a GPU in Settings)
21
+ try:
22
+ import torch
23
+ USE_GPU = torch.cuda.is_available()
24
+ tts = TTS(MODEL_NAME, gpu=USE_GPU) # some versions accept gpu kwarg
25
+ except Exception:
26
+ # Fallback (older/newer API variations)
27
+ tts = TTS(MODEL_NAME)
 
 
 
 
28
 
29
+ # (label, value) pairs -> UI shows label, function receives code
30
  LANGS = [
31
+ ("English", "en"),
32
+ ("Urdu", "ur"),
33
+ ("Hindi", "hi"),
34
+ ("Arabic", "ar"),
35
+ ("French", "fr"),
36
+ ("German", "de"),
37
+ ("Spanish", "es"),
38
+ ("Italian", "it"),
39
+ ("Portuguese", "pt"),
40
+ ("Turkish", "tr"),
41
  ]
42
 
43
+ def clean_text(text: str) -> str:
44
+ """Trim and collapse whitespace."""
45
  return " ".join((text or "").strip().split())
46
 
47
+ def synth_to_file_safe(txt, out_path, wav_path, lang, speed):
48
+ """
49
+ Call XTTS with 'speed' if supported; fall back without it if not.
50
+ Some XTTS builds ignore/raise on speed, so we guard it.
51
+ """
52
  try:
53
+ tts.tts_to_file(
54
+ text=txt,
55
+ file_path=out_path,
56
+ speaker_wav=wav_path,
57
+ language=lang,
58
+ speed=speed,
59
+ )
60
  except TypeError:
61
+ # Older/newer variants may not accept "speed"
62
+ tts.tts_to_file(
63
+ text=txt,
64
+ file_path=out_path,
65
+ speaker_wav=wav_path,
66
+ language=lang,
67
+ )
68
+
69
+ def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
70
+ # Basic checks
71
+ if ref_audio is None:
72
+ raise gr.Error("Please upload a reference voice sample (10–60 seconds of clean speech).")
73
  text = clean_text(text)
74
  if not text:
75
  raise gr.Error("Please enter some text.")
76
 
77
+ # Gradio passes a file path when type='filepath'
 
 
 
 
78
  wav_path = ref_audio
79
 
80
+ # Split long text into sentences (keeps memory lower on CPU; speeds up first output chunk)
81
+ chunks = [text]
82
+ if split_sentences:
83
+ chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]
84
 
85
+ out_wavs = []
86
  with tempfile.TemporaryDirectory() as td:
87
  for i, chunk in enumerate(chunks, 1):
88
+ progress((i - 1) / max(len(chunks), 1), desc=f"Synthesizing part {i}/{len(chunks)}")
89
  out_path = os.path.join(td, f"part_{i}.wav")
90
+ synth_to_file_safe(chunk, out_path, wav_path, language_code, speed)
91
  data, sr = sf.read(out_path)
92
  out_wavs.append((data, sr))
93
 
94
+ # Concatenate all parts
95
+ if len(out_wavs) == 1:
96
+ final_data, sr = out_wavs[0]
97
+ else:
98
+ sr = out_wavs[0][1]
99
+ final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
100
+
101
+ # Save final output
102
  final_path = os.path.join(td, "output.wav")
103
+ sf.write(final_path, final_data, sr)
104
  return final_path
105
 
106
+ # ---- Minimal CSS: one column layout + hide footer / badges / settings
107
+ HIDE_CSS = """
108
+ /* one-column width */
109
+ .gradio-container { max-width: 880px !important; margin: 0 auto; }
 
 
 
 
 
 
110
 
111
+ /* hide footer + badges + "Use via API" strip */
112
+ footer, .footer, #footer { display: none !important; }
113
+ a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display: none !important; }
114
+ /* hide top-right settings gear / menu in many themes */
115
+ button[aria-label="Settings"], [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
 
 
 
 
 
 
116
  """
117
 
118
+ THEME = gr.themes.Soft(
119
+ primary_hue="indigo",
120
+ neutral_hue="slate",
121
+ ).set(
122
+ body_background_fill="*white",
123
+ button_primary_background_fill="*primary_500",
124
+ button_primary_background_fill_hover="*primary_600",
125
+ input_background_fill="*neutral_50",
126
+ input_border_color="*neutral_200",
127
+ )
 
 
128
 
129
+ with gr.Blocks(
130
+ title="TalkClone - Voice Cloning & TTS",
131
+ theme=THEME,
132
+ css=HIDE_CSS,
133
+ analytics_enabled=False
134
+ ) as demo:
135
+ gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
136
+ gr.Markdown(
137
+ "Upload a short, clean **reference voice** (10–60s), pick a **language**, paste your **text**, and click **Generate**.\n\n"
138
+ "**Tip for speed:** long paragraphs synthesize faster if you keep them under ~20–30 seconds per sentence. "
139
+ "For best cloning quality, avoid background music/noise."
140
+ )
141
+
142
+ ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
143
+ language = gr.Dropdown(choices=LANGS, value="en", label="Language")
144
+ text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
145
+ speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
146
+ split = gr.Checkbox(value=True, label="Auto split long text by sentence")
147
+ submit = gr.Button("Generate", variant="primary", scale=1)
148
+
149
+ output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
150
+ download = gr.File(label="Download audio")
151
 
152
  def run_and_return(text, ref_audio, language, speed, split):
153
+ out_path = tts_clone(text, ref_audio, language, speed, split)
154
+ return out_path, out_path
155
 
156
+ submit.click(
157
+ run_and_return,
158
+ inputs=[text, ref_audio, language, speed, split],
159
+ outputs=[output, download]
160
+ )
161
 
162
  if __name__ == "__main__":
163
+ # On HF Spaces, don't open browser; show_api=False hides "Use via API".
164
+ demo.queue(concurrency_count=1).launch(
165
+ server_name="0.0.0.0",
166
+ server_port=7860,
167
+ show_error=True,
168
+ show_api=False
169
+ )