tahirturk committed on
Commit
213e126
·
1 Parent(s): 449de0f
Files changed (1) hide show
  1. app.py +55 -205
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
 
2
  os.environ["GRADIO_NODE_LAUNCH_METHOD"] = "legacy"
 
3
  import random
4
  import numpy as np
5
  import torch
@@ -7,176 +9,86 @@ from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGE
7
  import gradio as gr
8
  import spaces
9
 
 
10
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
11
- print(f"🚀 Running on device: {DEVICE}")
12
 
13
- # --- Global Model Initialization ---
14
  MODEL = None
15
 
 
16
  LANGUAGE_CONFIG = {
17
- "ar": {
18
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
19
- "text": "في الشهر الماضي، وصلنا إلى معلم جديد بمليارين من المشاهدات على قناتنا على يوتيوب."
20
- },
21
- "da": {
22
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/da_m1.flac",
23
- "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
24
- },
25
- "de": {
26
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
27
- "text": "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe auf unserem YouTube-Kanal."
28
- },
29
- "el": {
30
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/el_m.flac",
31
- "text": "Τον περασμένο μήνα, φτάσαμε σε ένα νέο ορόσημο με δύο δισεκατομμύρια προβολές στο κανάλι μας στο YouTube."
32
- },
33
  "en": {
34
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
35
  "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
36
  },
37
- "es": {
38
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
39
- "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube."
40
- },
41
- "fi": {
42
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fi_m.flac",
43
- "text": "Viime kuussa saavutimme uuden virstanpylvään kahden miljardin katselukerran kanssa YouTube-kanavallamme."
44
- },
45
  "fr": {
46
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
47
  "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube."
48
  },
49
- "he": {
50
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/he_m1.flac",
51
- "text": "בחודש שעבר הגענו לאבן דרך חדשה עם שני מיליארד צפיות בערוץ היוטיוב שלנו."
52
- },
53
  "hi": {
54
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
55
  "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।"
56
  },
57
- "it": {
58
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/it_m1.flac",
59
- "text": "Il mese scorso abbiamo raggiunto un nuovo traguardo: due miliardi di visualizzazioni sul nostro canale YouTube."
60
  },
61
  "ja": {
62
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
63
  "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
64
  },
65
- "ko": {
66
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ko_f.flac",
67
- "text": "지난달 우리는 유튜브 채널에서 이십억 조회수라는 새로운 이정표에 도달했습니다."
68
- },
69
- "ms": {
70
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ms_f.flac",
71
- "text": "Bulan lepas, kami mencapai pencapaian baru dengan dua bilion tontonan di saluran YouTube kami."
72
- },
73
- "nl": {
74
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/nl_m.flac",
75
- "text": "Vorige maand bereikten we een nieuwe mijlpaal met twee miljard weergaven op ons YouTube-kanaal."
76
- },
77
- "no": {
78
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/no_f1.flac",
79
- "text": "Forrige måned nådde vi en ny milepæl med to milliarder visninger på YouTube-kanalen vår."
80
- },
81
- "pl": {
82
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pl_m.flac",
83
- "text": "W zeszłym miesiącu osiągnęliśmy nowy kamień milowy z dwoma miliardami wyświetleń na naszym kanale YouTube."
84
- },
85
- "pt": {
86
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pt_m1.flac",
87
- "text": "No mês passado, alcançámos um novo marco: dois mil milhões de visualizações no nosso canal do YouTube."
88
- },
89
- "ru": {
90
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ru_m.flac",
91
- "text": "В прошлом месяце мы достигли нового рубежа: два миллиарда просмотров на нашем YouTube-канале."
92
- },
93
- "sv": {
94
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sv_f.flac",
95
- "text": "Förra månaden nådde vi en ny milstolpe med två miljarder visningar på vår YouTube-kanal."
96
- },
97
- "sw": {
98
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sw_m.flac",
99
- "text": "Mwezi uliopita, tulifika hatua mpya ya maoni ya bilioni mbili kweny kituo chetu cha YouTube."
100
- },
101
- "tr": {
102
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/tr_m.flac",
103
- "text": "Geçen ay YouTube kanalımızda iki milyar görüntüleme ile yeni bir dönüm noktasına ulaştık."
104
- },
105
  "zh": {
106
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
107
  "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
108
  },
109
  }
110
 
111
- # --- UI Helpers ---
112
  def default_audio_for_ui(lang: str) -> str | None:
113
  return LANGUAGE_CONFIG.get(lang, {}).get("audio")
114
 
115
-
116
  def default_text_for_ui(lang: str) -> str:
117
  return LANGUAGE_CONFIG.get(lang, {}).get("text", "")
118
 
119
-
120
  def get_supported_languages_display() -> str:
121
- """Generate a formatted display of all supported languages."""
122
- language_items = []
123
- for code, name in sorted(SUPPORTED_LANGUAGES.items()):
124
- language_items.append(f"**{name}** (`{code}`)")
125
-
126
- # Split into 2 lines
127
- mid = len(language_items) // 2
128
- line1 = " • ".join(language_items[:mid])
129
- line2 = " • ".join(language_items[mid:])
130
-
131
  return f"""
132
- ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
133
- {line1}
134
 
135
- {line2}
136
  """
137
 
138
-
139
  def get_or_load_model():
140
- """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already,
141
- and ensures it's on the correct device."""
142
  global MODEL
143
  if MODEL is None:
144
- print("Model not loaded, initializing...")
145
  try:
146
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
147
- if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
148
  MODEL.to(DEVICE)
149
- print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
150
  except Exception as e:
151
- print(f"Error loading model: {e}")
152
  raise
153
  return MODEL
154
 
155
- # Attempt to load the model at startup.
156
  try:
157
  get_or_load_model()
158
  except Exception as e:
159
- print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
160
 
161
  def set_seed(seed: int):
162
- """Sets the random seed for reproducibility across torch, numpy, and random."""
163
  torch.manual_seed(seed)
164
  if DEVICE == "cuda":
165
  torch.cuda.manual_seed(seed)
166
  torch.cuda.manual_seed_all(seed)
167
  random.seed(seed)
168
  np.random.seed(seed)
169
-
170
- def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
171
- """
172
- Decide which audio prompt to use:
173
- - If user provided a path (upload/mic/url), use it.
174
- - Else, fall back to language-specific default (if any).
175
- """
176
- if provided_path and str(provided_path).strip():
177
- return provided_path
178
- return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
179
-
180
 
181
  @spaces.GPU
182
  def generate_tts_audio(
@@ -188,39 +100,15 @@ def generate_tts_audio(
188
  seed_num_input: int = 0,
189
  cfgw_input: float = 0.5
190
  ) -> tuple[int, np.ndarray]:
191
- """
192
- Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
193
- Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
194
-
195
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
196
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
197
- maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
198
-
199
- Args:
200
- text_input (str): The text to synthesize into speech (maximum 300 characters)
201
- language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
202
- audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
203
- exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
204
- temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
205
- seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
206
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
207
-
208
- Returns:
209
- tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
210
- """
211
- current_model = get_or_load_model()
212
-
213
- if current_model is None:
214
- raise RuntimeError("TTS model is not loaded.")
215
 
216
  if seed_num_input != 0:
217
  set_seed(int(seed_num_input))
218
 
219
- print(f"Generating audio for text: '{text_input[:50]}...'")
220
-
221
- # Handle optional audio prompt
222
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
223
-
224
  generate_kwargs = {
225
  "exaggeration": exaggeration_input,
226
  "temperature": temperature_input,
@@ -228,96 +116,58 @@ def generate_tts_audio(
228
  }
229
  if chosen_prompt:
230
  generate_kwargs["audio_prompt_path"] = chosen_prompt
231
- print(f"Using audio prompt: {chosen_prompt}")
232
  else:
233
- print("No audio prompt provided; using default voice.")
234
-
235
- wav = current_model.generate(
236
- text_input[:300], # Truncate text to max chars
237
- language_id=language_id,
238
- **generate_kwargs
239
- )
240
- print("Audio generation complete.")
241
- return (current_model.sr, wav.squeeze(0).numpy())
242
 
243
- with gr.Blocks() as demo:
244
- gr.Markdown(
245
- """
246
- # Chatterbox Multilingual Demo
247
- Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
248
-
249
- For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
250
- """
251
- )
252
-
253
- # Display supported languages
254
  gr.Markdown(get_supported_languages_display())
 
255
  with gr.Row():
256
  with gr.Column():
257
- initial_lang = "fr"
258
- text = gr.Textbox(
259
- value=default_text_for_ui(initial_lang),
260
- label="Text to synthesize (max chars 300)",
261
- max_lines=5
262
- )
263
-
264
  language_id = gr.Dropdown(
265
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
266
- value=initial_lang,
267
- label="Language",
268
- info="Select the language for text-to-speech synthesis"
269
  )
270
-
271
  ref_wav = gr.Audio(
272
  sources=["upload", "microphone"],
273
  type="filepath",
274
- label="Reference Audio File (Optional)",
275
  value=default_audio_for_ui(initial_lang)
276
  )
277
-
278
- gr.Markdown(
279
- "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
280
- elem_classes=["audio-note"]
281
- )
282
-
283
- exaggeration = gr.Slider(
284
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
285
- )
286
- cfg_weight = gr.Slider(
287
- 0.2, 1, step=.05, label="CFG/Pace", value=0.5
288
- )
289
 
290
  with gr.Accordion("More options", open=False):
291
- seed_num = gr.Number(value=0, label="Random seed (0 for random)")
292
- temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
293
 
294
- run_btn = gr.Button("Generate", variant="primary")
295
 
296
  with gr.Column():
297
- audio_output = gr.Audio(label="Output Audio")
298
 
299
- def on_language_change(lang, current_ref, current_text):
300
  return default_audio_for_ui(lang), default_text_for_ui(lang)
301
 
302
- language_id.change(
303
- fn=on_language_change,
304
- inputs=[language_id, ref_wav, text],
305
- outputs=[ref_wav, text],
306
- show_progress=False
307
- )
308
 
309
  run_btn.click(
310
  fn=generate_tts_audio,
311
- inputs=[
312
- text,
313
- language_id,
314
- ref_wav,
315
- exaggeration,
316
- temp,
317
- seed_num,
318
- cfg_weight,
319
- ],
320
- outputs=[audio_output],
321
  )
322
 
323
- demo.launch(mcp_server=True)
 
 
 
1
  import os
2
+ # 🛠️ Fix Node.js --import warning on Hugging Face Spaces / older Node versions
3
  os.environ["GRADIO_NODE_LAUNCH_METHOD"] = "legacy"
4
+
5
  import random
6
  import numpy as np
7
  import torch
 
9
  import gradio as gr
10
  import spaces
11
 
12
+ # --- Device Setup ---
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
+ print(f"🚀 Using device: {DEVICE}")
15
 
16
+ # --- Global Model Cache ---
17
  MODEL = None
18
 
19
+ # --- Language Defaults ---
20
  LANGUAGE_CONFIG = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "en": {
22
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
23
  "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
24
  },
 
 
 
 
 
 
 
 
25
  "fr": {
26
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
27
  "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube."
28
  },
 
 
 
 
29
  "hi": {
30
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
31
  "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।"
32
  },
33
+ "es": {
34
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
35
+ "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube."
36
  },
37
  "ja": {
38
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
39
  "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
40
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "zh": {
42
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
43
  "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
44
  },
45
  }
46
 
47
+ # --- Helpers ---
48
  def default_audio_for_ui(lang: str) -> str | None:
49
  return LANGUAGE_CONFIG.get(lang, {}).get("audio")
50
 
 
51
  def default_text_for_ui(lang: str) -> str:
52
  return LANGUAGE_CONFIG.get(lang, {}).get("text", "")
53
 
 
54
  def get_supported_languages_display() -> str:
55
+ items = [f"**{name}** (`{code}`)" for code, name in sorted(SUPPORTED_LANGUAGES.items())]
56
+ mid = len(items) // 2
 
 
 
 
 
 
 
 
57
  return f"""
58
+ ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)})
59
+ {' • '.join(items[:mid])}
60
 
61
+ {' • '.join(items[mid:])}
62
  """
63
 
64
+ # --- Model Loader ---
65
  def get_or_load_model():
 
 
66
  global MODEL
67
  if MODEL is None:
68
+ print("🔄 Loading Chatterbox Multilingual TTS model...")
69
  try:
70
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
71
+ if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
72
  MODEL.to(DEVICE)
73
+ print(f"Model ready on {getattr(MODEL, 'device', DEVICE)}")
74
  except Exception as e:
75
+ print(f"Error loading model: {e}")
76
  raise
77
  return MODEL
78
 
79
+ # Preload the model at import time (this call blocks); a failure is logged and loading is retried on the first request
80
  try:
81
  get_or_load_model()
82
  except Exception as e:
83
+ print(f"⚠️ Model preload failed: {e}")
84
 
85
  def set_seed(seed: int):
 
86
  torch.manual_seed(seed)
87
  if DEVICE == "cuda":
88
  torch.cuda.manual_seed(seed)
89
  torch.cuda.manual_seed_all(seed)
90
  random.seed(seed)
91
  np.random.seed(seed)
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  @spaces.GPU
94
  def generate_tts_audio(
 
100
  seed_num_input: int = 0,
101
  cfgw_input: float = 0.5
102
  ) -> tuple[int, np.ndarray]:
103
+ """Generate multilingual TTS output."""
104
+ model = get_or_load_model()
105
+ if model is None:
106
+ raise RuntimeError("Model not loaded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  if seed_num_input != 0:
109
  set_seed(int(seed_num_input))
110
 
 
 
 
111
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
 
112
  generate_kwargs = {
113
  "exaggeration": exaggeration_input,
114
  "temperature": temperature_input,
 
116
  }
117
  if chosen_prompt:
118
  generate_kwargs["audio_prompt_path"] = chosen_prompt
119
+ print(f"🎧 Using reference: {chosen_prompt}")
120
  else:
121
+ print("🎙️ No reference audio using default voice")
 
 
 
 
 
 
 
 
122
 
123
+ wav = model.generate(text_input[:300], language_id=language_id, **generate_kwargs)
124
+ print("✅ Audio generated.")
125
+ return (model.sr, wav.squeeze(0).numpy())
126
+
127
+ # --- Gradio UI ---
128
+ with gr.Blocks(title="Chatterbox Multilingual TTS") as demo:
129
+ gr.Markdown("# 🌍 Chatterbox Multilingual TTS Demo\nGenerate high-quality multilingual speech with optional voice cloning.")
 
 
 
 
130
  gr.Markdown(get_supported_languages_display())
131
+
132
  with gr.Row():
133
  with gr.Column():
134
+ initial_lang = "en"
135
+ text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text", max_lines=5)
 
 
 
 
 
136
  language_id = gr.Dropdown(
137
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
138
+ value=initial_lang, label="Language"
 
 
139
  )
 
140
  ref_wav = gr.Audio(
141
  sources=["upload", "microphone"],
142
  type="filepath",
143
+ label="Reference Audio (Optional)",
144
  value=default_audio_for_ui(initial_lang)
145
  )
146
+
147
+ exaggeration = gr.Slider(0.25, 2, 0.05, label="Exaggeration (emotion)", value=0.5)
148
+ cfg_weight = gr.Slider(0.2, 1, 0.05, label="CFG / Pace", value=0.5)
 
 
 
 
 
 
 
 
 
149
 
150
  with gr.Accordion("More options", open=False):
151
+ seed_num = gr.Number(value=0, label="Seed (0=random)")
152
+ temp = gr.Slider(0.05, 5, 0.05, label="Temperature", value=0.8)
153
 
154
+ run_btn = gr.Button("🎤 Generate", variant="primary")
155
 
156
  with gr.Column():
157
+ audio_output = gr.Audio(label="Generated Audio")
158
 
159
+ def on_lang_change(lang, _, __):
160
  return default_audio_for_ui(lang), default_text_for_ui(lang)
161
 
162
+ language_id.change(fn=on_lang_change, inputs=[language_id, ref_wav, text],
163
+ outputs=[ref_wav, text], show_progress=False)
 
 
 
 
164
 
165
  run_btn.click(
166
  fn=generate_tts_audio,
167
+ inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
168
+ outputs=[audio_output]
 
 
 
 
 
 
 
 
169
  )
170
 
171
+ # --- Launch ---
172
+ if __name__ == "__main__":
173
+ demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)