Biorrith committed on
Commit
9658111
·
1 Parent(s): 7a08d20

UI changes - only support 2 languages, hard-locked exaggeration, and formatting updates.

Browse files
Files changed (2) hide show
  1. app.py +46 -53
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import random
 
 
2
  import numpy as np
3
  import torch
4
- from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
5
- import gradio as gr
6
 
7
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
8
  print(f"🚀 Running on device: {DEVICE}")
@@ -12,19 +14,17 @@ MODEL = None
12
 
13
  LANGUAGE_CONFIG = {
14
  "da": {
15
- "audio_options": {
16
- "mic": "voices/mic.wav",
17
- "nic": "voices/nic.wav"
18
- },
19
  "default_audio": "voices/mic.wav", # Default to mic
20
- "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
21
  },
22
  "en": {
23
  "audio": "voices/en_f1.flac",
24
- "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
25
  },
26
  }
27
 
 
28
  # --- UI Helpers ---
29
  def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
30
  config = LANGUAGE_CONFIG.get(lang, {})
@@ -47,12 +47,12 @@ def get_supported_languages_display() -> str:
47
  language_items = []
48
  for code, name in sorted(SUPPORTED_LANGUAGES.items()):
49
  language_items.append(f"**{name}** (`{code}`)")
50
-
51
  # Split into 2 lines
52
  mid = len(language_items) // 2
53
  line1 = " • ".join(language_items[:mid])
54
  line2 = " • ".join(language_items[mid:])
55
-
56
  return f"""
57
  ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
58
  {line1}
@@ -69,7 +69,7 @@ def get_or_load_model():
69
  print("Model not loaded, initializing...")
70
  try:
71
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
72
- if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
73
  MODEL.to(DEVICE)
74
  print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
75
  except Exception as e:
@@ -77,12 +77,14 @@ def get_or_load_model():
77
  raise
78
  return MODEL
79
 
 
80
  # Attempt to load the model at startup.
81
  try:
82
  get_or_load_model()
83
  except Exception as e:
84
  print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
85
 
 
86
  def set_seed(seed: int):
87
  """Sets the random seed for reproducibility across torch, numpy, and random."""
88
  torch.manual_seed(seed)
@@ -91,7 +93,8 @@ def set_seed(seed: int):
91
  torch.cuda.manual_seed_all(seed)
92
  random.seed(seed)
93
  np.random.seed(seed)
94
-
 
95
  def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
96
  """
97
  Decide which audio prompt to use:
@@ -112,14 +115,14 @@ def generate_tts_audio(
112
  exaggeration_input: float = 0.5,
113
  temperature_input: float = 0.8,
114
  seed_num_input: int = 0,
115
- cfgw_input: float = 0.5
116
  ) -> tuple[int, np.ndarray]:
117
  """
118
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
119
  Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
120
-
121
- This tool synthesizes natural-sounding speech from input text. When a reference audio file
122
- is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
123
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
124
 
125
  Args:
@@ -129,7 +132,7 @@ def generate_tts_audio(
129
  exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
130
  temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
131
  seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
132
- cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
133
 
134
  Returns:
135
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
@@ -143,7 +146,7 @@ def generate_tts_audio(
143
  set_seed(int(seed_num_input))
144
 
145
  print(f"Generating audio for text: '{text_input[:50]}...'")
146
-
147
  # Handle optional audio prompt
148
  chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
149
 
@@ -157,71 +160,64 @@ def generate_tts_audio(
157
  print(f"Using audio prompt: {chosen_prompt}")
158
  else:
159
  print("No audio prompt provided; using default voice.")
160
-
161
  wav = current_model.generate(
162
  text_input[:300], # Truncate text to max chars
163
  language_id=language_id,
164
- **generate_kwargs
165
  )
166
  print("Audio generation complete.")
167
  return (current_model.sr, wav.squeeze(0).numpy())
168
 
 
169
  with gr.Blocks() as demo:
170
  gr.Markdown(
171
  """
172
  # Chatterbox Multilingual Demo
173
- Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
174
  """
175
  )
176
-
177
  # Display supported languages
178
  gr.Markdown(get_supported_languages_display())
179
  with gr.Row():
180
  with gr.Column():
181
  initial_lang = "da"
182
- text = gr.Textbox(
183
- value=default_text_for_ui(initial_lang),
184
- label="Text to synthesize (max chars 300)",
185
- max_lines=5
186
- )
187
-
188
  language_id = gr.Dropdown(
189
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
190
  value=initial_lang,
191
  label="Language",
192
- info="Select the language for text-to-speech synthesis"
193
  )
194
-
195
  danish_voice = gr.Dropdown(
196
  choices=get_danish_voice_options(),
197
  value="mic",
198
  label="Danish Voice Selection",
199
  info="Choose between different Danish voice options",
200
- visible=(initial_lang == "da")
201
  )
202
-
203
  ref_wav = gr.Audio(
204
  sources=["upload", "microphone"],
205
  type="filepath",
206
  label="Reference Audio File (Optional)",
207
- value=default_audio_for_ui(initial_lang)
208
  )
209
-
210
  gr.Markdown(
211
  "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
212
- elem_classes=["audio-note"]
213
- )
214
-
215
- exaggeration = gr.Slider(
216
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
217
- )
218
- cfg_weight = gr.Slider(
219
- 0.2, 1, step=.05, label="CFG/Pace", value=0.5
220
  )
221
 
 
 
 
222
  with gr.Accordion("More options", open=False):
223
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
224
- temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
225
 
226
  run_btn = gr.Button("Generate", variant="primary")
227
 
@@ -229,13 +225,13 @@ with gr.Blocks() as demo:
229
  audio_output = gr.Audio(label="Output Audio")
230
 
231
  def on_language_change(lang, current_ref, current_text):
232
- is_danish = (lang == "da")
233
  danish_voice_val = "mic" if is_danish else "mic" # Default to mic
234
  return (
235
- default_audio_for_ui(lang, danish_voice_val),
236
- default_text_for_ui(lang),
237
  gr.update(visible=is_danish), # Update Danish voice dropdown visibility
238
- danish_voice_val
239
  )
240
 
241
  def on_danish_voice_change(lang, danish_voice_val):
@@ -247,14 +243,11 @@ with gr.Blocks() as demo:
247
  fn=on_language_change,
248
  inputs=[language_id, ref_wav, text],
249
  outputs=[ref_wav, text, danish_voice, danish_voice],
250
- show_progress=False
251
  )
252
 
253
  danish_voice.change(
254
- fn=on_danish_voice_change,
255
- inputs=[language_id, danish_voice],
256
- outputs=[ref_wav],
257
- show_progress=False
258
  )
259
 
260
  run_btn.click(
@@ -272,4 +265,4 @@ with gr.Blocks() as demo:
272
  outputs=[audio_output],
273
  )
274
 
275
- demo.launch() #mcp_server=True
 
1
  import random
2
+
3
+ import gradio as gr
4
  import numpy as np
5
  import torch
6
+
7
+ from src.chatterbox.mtl_tts import SUPPORTED_LANGUAGES, ChatterboxMultilingualTTS
8
 
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
  print(f"🚀 Running on device: {DEVICE}")
 
14
 
15
  LANGUAGE_CONFIG = {
16
  "da": {
17
+ "audio_options": {"mic": "voices/mic.wav", "nic": "voices/nic.wav"},
 
 
 
18
  "default_audio": "voices/mic.wav", # Default to mic
19
+ "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal.",
20
  },
21
  "en": {
22
  "audio": "voices/en_f1.flac",
23
+ "text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
24
  },
25
  }
26
 
27
+
28
  # --- UI Helpers ---
29
  def default_audio_for_ui(lang: str, danish_voice: str = "mic") -> str | None:
30
  config = LANGUAGE_CONFIG.get(lang, {})
 
47
  language_items = []
48
  for code, name in sorted(SUPPORTED_LANGUAGES.items()):
49
  language_items.append(f"**{name}** (`{code}`)")
50
+
51
  # Split into 2 lines
52
  mid = len(language_items) // 2
53
  line1 = " • ".join(language_items[:mid])
54
  line2 = " • ".join(language_items[mid:])
55
+
56
  return f"""
57
  ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
58
  {line1}
 
69
  print("Model not loaded, initializing...")
70
  try:
71
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
72
+ if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
73
  MODEL.to(DEVICE)
74
  print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
75
  except Exception as e:
 
77
  raise
78
  return MODEL
79
 
80
+
81
  # Attempt to load the model at startup.
82
  try:
83
  get_or_load_model()
84
  except Exception as e:
85
  print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
86
 
87
+
88
  def set_seed(seed: int):
89
  """Sets the random seed for reproducibility across torch, numpy, and random."""
90
  torch.manual_seed(seed)
 
93
  torch.cuda.manual_seed_all(seed)
94
  random.seed(seed)
95
  np.random.seed(seed)
96
+
97
+
98
  def resolve_audio_prompt(language_id: str, provided_path: str | None, danish_voice: str = "mic") -> str | None:
99
  """
100
  Decide which audio prompt to use:
 
115
  exaggeration_input: float = 0.5,
116
  temperature_input: float = 0.8,
117
  seed_num_input: int = 0,
118
+ cfgw_input: float = 0.5,
119
  ) -> tuple[int, np.ndarray]:
120
  """
121
  Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
122
  Supported languages: English, French, German, Spanish, Italian, Portuguese, and Hindi.
123
+
124
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
125
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
126
  maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
127
 
128
  Args:
 
132
  exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
133
  temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
134
  seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
135
+ cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (0.2-1.0). Defaults to 0.5, 0 for language transfer.
136
 
137
  Returns:
138
  tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
 
146
  set_seed(int(seed_num_input))
147
 
148
  print(f"Generating audio for text: '{text_input[:50]}...'")
149
+
150
  # Handle optional audio prompt
151
  chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input, danish_voice_input)
152
 
 
160
  print(f"Using audio prompt: {chosen_prompt}")
161
  else:
162
  print("No audio prompt provided; using default voice.")
163
+
164
  wav = current_model.generate(
165
  text_input[:300], # Truncate text to max chars
166
  language_id=language_id,
167
+ **generate_kwargs,
168
  )
169
  print("Audio generation complete.")
170
  return (current_model.sr, wav.squeeze(0).numpy())
171
 
172
+
173
  with gr.Blocks() as demo:
174
  gr.Markdown(
175
  """
176
  # Chatterbox Multilingual Demo
177
+ Generate high-quality danish speech from text with reference audio styling.
178
  """
179
  )
180
+
181
  # Display supported languages
182
  gr.Markdown(get_supported_languages_display())
183
  with gr.Row():
184
  with gr.Column():
185
  initial_lang = "da"
186
+ text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text to synthesize (max chars 300)", max_lines=5)
187
+
 
 
 
 
188
  language_id = gr.Dropdown(
189
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
190
  value=initial_lang,
191
  label="Language",
192
+ info="Select the language for text-to-speech synthesis",
193
  )
194
+
195
  danish_voice = gr.Dropdown(
196
  choices=get_danish_voice_options(),
197
  value="mic",
198
  label="Danish Voice Selection",
199
  info="Choose between different Danish voice options",
200
+ visible=(initial_lang == "da"),
201
  )
202
+
203
  ref_wav = gr.Audio(
204
  sources=["upload", "microphone"],
205
  type="filepath",
206
  label="Reference Audio File (Optional)",
207
+ value=default_audio_for_ui(initial_lang),
208
  )
209
+
210
  gr.Markdown(
211
  "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
212
+ elem_classes=["audio-note"],
 
 
 
 
 
 
 
213
  )
214
 
215
+ exaggeration = 0.5
216
+ cfg_weight = gr.Slider(0.2, 1, step=0.05, label="CFG/Pace", value=0.5)
217
+
218
  with gr.Accordion("More options", open=False):
219
  seed_num = gr.Number(value=0, label="Random seed (0 for random)")
220
+ temp = gr.Slider(0.05, 5, step=0.05, label="Temperature", value=0.8)
221
 
222
  run_btn = gr.Button("Generate", variant="primary")
223
 
 
225
  audio_output = gr.Audio(label="Output Audio")
226
 
227
  def on_language_change(lang, current_ref, current_text):
228
+ is_danish = lang == "da"
229
  danish_voice_val = "mic" if is_danish else "mic" # Default to mic
230
  return (
231
+ default_audio_for_ui(lang, danish_voice_val),
232
+ default_text_for_ui(lang),
233
  gr.update(visible=is_danish), # Update Danish voice dropdown visibility
234
+ danish_voice_val,
235
  )
236
 
237
  def on_danish_voice_change(lang, danish_voice_val):
 
243
  fn=on_language_change,
244
  inputs=[language_id, ref_wav, text],
245
  outputs=[ref_wav, text, danish_voice, danish_voice],
246
+ show_progress=False,
247
  )
248
 
249
  danish_voice.change(
250
+ fn=on_danish_voice_change, inputs=[language_id, danish_voice], outputs=[ref_wav], show_progress=False
 
 
 
251
  )
252
 
253
  run_btn.click(
 
265
  outputs=[audio_output],
266
  )
267
 
268
+ demo.launch() # mcp_server=True
requirements.txt CHANGED
@@ -9,4 +9,4 @@ omegaconf==2.3.0
9
  resemble-perth==1.0.1
10
  silero-vad==5.1.2
11
  conformer==0.3.2
12
- safetensors
 
9
  resemble-perth==1.0.1
10
  silero-vad==5.1.2
11
  conformer==0.3.2
12
+ safetensors