asbgig committed on
Commit
a4b0424
·
verified ·
1 Parent(s): bf4353b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -78
app.py CHANGED
@@ -1,32 +1,36 @@
1
- # app.py — TalkClone (HF Space, one-column, footer hidden)
2
 
3
- import os
4
- import tempfile
5
- import re
6
  import numpy as np
7
  import soundfile as sf
 
8
 
9
- # --- Coqui XTTS license prompt (must be set in headless envs like Spaces)
10
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
11
 
12
- import gradio as gr
13
- from TTS.api import TTS
14
-
15
- # ----------------------------
16
- # Model: Coqui XTTS v2
17
- # ----------------------------
18
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
19
 
20
- # Try to use GPU when available (on HF, switch Space hardware to a GPU in Settings)
21
- try:
22
- import torch
23
- USE_GPU = torch.cuda.is_available()
24
- tts = TTS(MODEL_NAME, gpu=USE_GPU) # some versions accept gpu kwarg
25
- except Exception:
26
- # Fallback (older/newer API variations)
27
- tts = TTS(MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
- # (label, value) pairs -> UI shows label, function receives code
30
  LANGS = [
31
  ("English", "en"),
32
  ("Urdu", "ur"),
@@ -40,103 +44,76 @@ LANGS = [
40
  ("Turkish", "tr"),
41
  ]
42
 
43
- def clean_text(text: str) -> str:
44
- """Trim and collapse whitespace."""
45
- return " ".join((text or "").strip().split())
46
 
47
- def synth_to_file_safe(txt, out_path, wav_path, lang, speed):
48
- """
49
- Call XTTS with 'speed' if supported; fall back without it if not.
50
- Some XTTS builds ignore/raise on speed, so we guard it.
51
- """
52
  try:
53
  tts.tts_to_file(
54
- text=txt,
55
- file_path=out_path,
56
- speaker_wav=wav_path,
57
- language=lang,
58
- speed=speed,
59
  )
60
  except TypeError:
61
- # Older/newer variants may not accept "speed"
62
  tts.tts_to_file(
63
- text=txt,
64
- file_path=out_path,
65
- speaker_wav=wav_path,
66
- language=lang,
67
  )
68
 
69
  def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
70
- # Basic checks
71
  if ref_audio is None:
72
- raise gr.Error("Please upload a reference voice sample (10–60 seconds of clean speech).")
73
  text = clean_text(text)
74
  if not text:
75
  raise gr.Error("Please enter some text.")
76
 
77
- # Gradio passes a file path when type='filepath'
78
  wav_path = ref_audio
79
-
80
- # Split long text into sentences (keeps memory lower on CPU; speeds up first output chunk)
81
  chunks = [text]
82
  if split_sentences:
 
83
  chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]
84
 
 
 
85
  out_wavs = []
86
  with tempfile.TemporaryDirectory() as td:
87
  for i, chunk in enumerate(chunks, 1):
88
- progress((i - 1) / max(len(chunks), 1), desc=f"Synthesizing part {i}/{len(chunks)}")
89
- out_path = os.path.join(td, f"part_{i}.wav")
90
- synth_to_file_safe(chunk, out_path, wav_path, language_code, speed)
91
- data, sr = sf.read(out_path)
92
  out_wavs.append((data, sr))
93
 
94
- # Concatenate all parts
95
  if len(out_wavs) == 1:
96
  final_data, sr = out_wavs[0]
97
  else:
98
  sr = out_wavs[0][1]
99
  final_data = np.concatenate([d for d, _ in out_wavs], axis=0)
100
 
101
- # Save final output
102
  final_path = os.path.join(td, "output.wav")
103
  sf.write(final_path, final_data, sr)
104
  return final_path
105
 
106
- # ---- Minimal CSS: one column layout + hide footer / badges / settings
107
  HIDE_CSS = """
108
- /* one-column width */
109
  .gradio-container { max-width: 880px !important; margin: 0 auto; }
110
-
111
- /* hide footer + badges + "Use via API" strip */
112
- footer, .footer, #footer { display: none !important; }
113
- a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display: none !important; }
114
- /* hide top-right settings gear / menu in many themes */
115
- button[aria-label="Settings"], [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
116
  """
117
 
118
- THEME = gr.themes.Soft(
119
- primary_hue="indigo",
120
- neutral_hue="slate",
121
- ).set(
122
- body_background_fill="*white",
123
- button_primary_background_fill="*primary_500",
124
- button_primary_background_fill_hover="*primary_600",
125
- input_background_fill="*neutral_50",
126
- input_border_color="*neutral_200",
127
- )
128
-
129
  with gr.Blocks(
130
  title="TalkClone - Voice Cloning & TTS",
131
- theme=THEME,
132
  css=HIDE_CSS,
133
  analytics_enabled=False
134
  ) as demo:
135
  gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
136
  gr.Markdown(
137
- "Upload a short, clean **reference voice** (10–60s), pick a **language**, paste your **text**, and click **Generate**.\n\n"
138
- "**Tip for speed:** long paragraphs synthesize faster if you keep them under ~20–30 seconds per sentence. "
139
- "For best cloning quality, avoid background music/noise."
140
  )
141
 
142
  ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
@@ -144,14 +121,14 @@ with gr.Blocks(
144
  text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
145
  speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
146
  split = gr.Checkbox(value=True, label="Auto split long text by sentence")
147
- submit = gr.Button("Generate", variant="primary", scale=1)
148
 
149
  output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
150
  download = gr.File(label="Download audio")
151
 
152
  def run_and_return(text, ref_audio, language, speed, split):
153
- out_path = tts_clone(text, ref_audio, language, speed, split)
154
- return out_path, out_path
155
 
156
  submit.click(
157
  run_and_return,
@@ -160,10 +137,11 @@ with gr.Blocks(
160
  )
161
 
162
  if __name__ == "__main__":
163
- # On HF Spaces, don't open browser; show_api=False hides "Use via API".
 
164
  demo.queue(concurrency_count=1).launch(
165
  server_name="0.0.0.0",
166
- server_port=7860,
167
  show_error=True,
168
- show_api=False
169
  )
 
1
+ # app.py — TalkClone (HF Space, one-column, footer hidden, binds to $PORT)
2
 
3
+ import os, re, tempfile
 
 
4
  import numpy as np
5
  import soundfile as sf
6
+ import gradio as gr
7
 
8
+ # Accept Coqui license non-interactively (required on Spaces)
9
  os.environ.setdefault("COQUI_TOS_AGREED", "1")
10
 
 
 
 
 
 
 
11
  MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
12
 
13
+ # Lazy-load TTS so the Space starts quickly and fails less often
14
+ _tts = None
def get_tts():
    """Return the shared XTTS model, building it the first time it is needed.

    The instance is cached in the module-level ``_tts`` so that only the
    first call pays the model-construction cost. ``torch`` and ``TTS`` are
    imported inside this function rather than at module import time.
    """
    global _tts
    if _tts is None:
        # Ask for the GPU only when torch imports and reports CUDA;
        # any failure while probing means CPU.
        cuda_ok = False
        try:
            import torch
            cuda_ok = torch.cuda.is_available()
        except Exception:
            cuda_ok = False

        from TTS.api import TTS
        try:
            # Some versions accept gpu=...
            model = TTS(MODEL_NAME, gpu=cuda_ok)
        except TypeError:
            # ...the rest are constructed with the model name alone.
            model = TTS(MODEL_NAME)
        _tts = model
    return _tts
33
 
 
34
  LANGS = [
35
  ("English", "en"),
36
  ("Urdu", "ur"),
 
44
  ("Turkish", "tr"),
45
  ]
46
 
def clean_text(t: str) -> str:
    """Trim *t* and collapse each run of whitespace into a single space.

    A falsy *t* (``None`` or ``""``) yields the empty string.
    """
    if not t:
        return ""
    # split() with no separator already drops leading/trailing whitespace.
    words = t.split()
    return " ".join(words)
 
49
 
def synth_to_file_safe(tts, txt, out_path, wav_path, lang, speed):
    """Synthesize *txt* to *out_path*, tolerating engines without ``speed``.

    ``tts.tts_to_file`` is first called with the ``speed`` keyword. If that
    call raises ``TypeError`` (XTTS variants differ on "speed" support), it
    is called again with the same arguments minus ``speed``.
    """
    request = {
        "text": txt,
        "file_path": out_path,
        "speaker_wav": wav_path,
        "language": lang,
    }
    try:
        tts.tts_to_file(**request, speed=speed)
    except TypeError:
        tts.tts_to_file(**request)
62
 
def tts_clone(text, ref_audio, language_code, speed, split_sentences, progress=gr.Progress(track_tqdm=True)):
    """Synthesize *text* in the voice of *ref_audio* and return a WAV path.

    Args:
        text: Text to speak; whitespace is normalised with ``clean_text``.
        ref_audio: Path to the reference voice recording, or ``None``.
        language_code: Language code handed to the synthesizer.
        speed: Speed value handed to the synthesizer.
        split_sentences: When truthy, synthesize sentence by sentence and
            concatenate the parts.
        progress: Gradio progress reporter, called once per chunk.

    Returns:
        Path to ``output.wav`` in a freshly created temporary directory.
        The file still exists after this function returns.

    Raises:
        gr.Error: If no reference audio was supplied or the text is empty.
    """
    if ref_audio is None:
        raise gr.Error("Please upload a reference voice sample (10–60 seconds, clean speech).")
    text = clean_text(text)
    if not text:
        raise gr.Error("Please enter some text.")

    wav_path = ref_audio
    chunks = [text]
    if split_sentences:
        # Split on sentence boundaries including Urdu/Arabic punctuation
        chunks = [s.strip() for s in re.split(r'(?<=[.!?؟۔])\s+', text) if s.strip()]

    tts = get_tts()

    out_wavs = []
    # The per-chunk files are scratch data, so a self-cleaning directory is
    # the right home for them (and only for them).
    with tempfile.TemporaryDirectory() as td:
        for i, chunk in enumerate(chunks, 1):
            progress((i-1)/max(len(chunks),1), desc=f"Synthesizing {i}/{len(chunks)}")
            part_path = os.path.join(td, f"part_{i}.wav")
            synth_to_file_safe(tts, chunk, part_path, wav_path, language_code, speed)
            data, sr = sf.read(part_path)
            out_wavs.append((data, sr))

    if len(out_wavs) == 1:
        final_data, sr = out_wavs[0]
    else:
        # The first part's sample rate is used for the joined audio.
        sr = out_wavs[0][1]
        final_data = np.concatenate([d for d, _ in out_wavs], axis=0)

    # The result must outlive this call: the caller reads the file after we
    # return. Writing it inside the TemporaryDirectory above would hand back
    # a path that is deleted as soon as the ``with`` block exits, so use a
    # directory that is not removed automatically.
    out_dir = tempfile.mkdtemp(prefix="talkclone_")
    final_path = os.path.join(out_dir, "output.wav")
    sf.write(final_path, final_data, sr)
    return final_path
96
 
97
+ # ---- Minimal CSS: one column + hide footer / badges / settings
98
  HIDE_CSS = """
99
+ /* compact one-column center */
100
  .gradio-container { max-width: 880px !important; margin: 0 auto; }
101
+ /* hide footer & badges & embed/info areas */
102
+ footer, .footer, #footer, [data-testid="block-analytics"], [data-testid="embed-info"] { display:none !important; }
103
+ a[href*="gradio.live"], a[href*="gradio.app"], a[href*="hf.space"] { display:none !important; }
104
+ /* hide settings button in many themes */
105
+ button[aria-label="Settings"] { display:none !important; }
 
106
  """
107
 
 
 
 
 
 
 
 
 
 
 
 
108
  with gr.Blocks(
109
  title="TalkClone - Voice Cloning & TTS",
 
110
  css=HIDE_CSS,
111
  analytics_enabled=False
112
  ) as demo:
113
  gr.Markdown("## TalkClone — Turn Text into Speech from a Reference Voice")
114
  gr.Markdown(
115
+ "Upload a short **reference voice** (10–60s), choose **language**, enter **text**, click **Generate**.\n"
116
+ "**Tip:** Long texts are split by sentence for reliability; shorter sentences synthesize faster."
 
117
  )
118
 
119
  ref_audio = gr.Audio(label="Reference Voice (WAV/MP3)", type="filepath")
 
121
  text = gr.Textbox(label="Text", lines=6, placeholder="Type or paste your text here…")
122
  speed = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Speed")
123
  split = gr.Checkbox(value=True, label="Auto split long text by sentence")
124
+ submit = gr.Button("Generate", variant="primary")
125
 
126
  output = gr.Audio(label="Cloned Speech", type="filepath", interactive=False)
127
  download = gr.File(label="Download audio")
128
 
129
  def run_and_return(text, ref_audio, language, speed, split):
130
+ path = tts_clone(text, ref_audio, language, speed, split)
131
+ return path, path
132
 
133
  submit.click(
134
  run_and_return,
 
137
  )
138
 
139
  if __name__ == "__main__":
140
+ # IMPORTANT on Spaces: bind to the port Spaces gives you
141
+ port = int(os.environ.get("PORT", "7860"))
142
  demo.queue(concurrency_count=1).launch(
143
  server_name="0.0.0.0",
144
+ server_port=port,
145
  show_error=True,
146
+ show_api=False,
147
  )