tahirturk committed on
Commit
c7b0afd
·
1 Parent(s): 95cffa9
Files changed (2) hide show
  1. app.py +205 -53
  2. requirements.txt +13 -20
app.py CHANGED
@@ -5,86 +5,176 @@ from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGE
5
  import gradio as gr
6
  import spaces
7
 
8
- # ✅ Detect CUDA
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
- print(f" Model ready on {DEVICE}")
11
 
12
- # --- Global Model Cache ---
13
  MODEL = None
14
 
15
- # --- Language Defaults ---
16
  LANGUAGE_CONFIG = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  "en": {
18
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
19
  "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
20
  },
 
 
 
 
 
 
 
 
21
  "fr": {
22
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
23
  "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube."
24
  },
 
 
 
 
25
  "hi": {
26
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
27
  "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।"
28
  },
29
- "es": {
30
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
31
- "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube."
32
  },
33
  "ja": {
34
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
35
  "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
36
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  "zh": {
38
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
39
  "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
40
  },
41
  }
42
 
43
- # --- Helpers ---
44
  def default_audio_for_ui(lang: str) -> str | None:
45
  return LANGUAGE_CONFIG.get(lang, {}).get("audio")
46
 
 
47
  def default_text_for_ui(lang: str) -> str:
48
  return LANGUAGE_CONFIG.get(lang, {}).get("text", "")
49
 
 
50
  def get_supported_languages_display() -> str:
51
- items = [f"**{name}** (`{code}`)" for code, name in sorted(SUPPORTED_LANGUAGES.items())]
52
- mid = len(items) // 2
 
 
 
 
 
 
 
 
53
  return f"""
54
- ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)})
55
- {' • '.join(items[:mid])}
56
 
57
- {' • '.join(items[mid:])}
58
  """
59
 
60
- # --- Model Loader ---
61
  def get_or_load_model():
 
 
62
  global MODEL
63
  if MODEL is None:
64
- print("🔄 Loading Chatterbox Multilingual TTS model...")
65
  try:
66
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
67
- if hasattr(MODEL, "to") and str(MODEL.device) != DEVICE:
68
  MODEL.to(DEVICE)
69
- print(f"Model ready on {getattr(MODEL, 'device', DEVICE)}")
70
  except Exception as e:
71
- print(f"Error loading model: {e}")
72
  raise
73
  return MODEL
74
 
75
- # Preload model silently (non-blocking for Spaces)
76
  try:
77
  get_or_load_model()
78
  except Exception as e:
79
- print(f"⚠️ Model preload failed: {e}")
80
 
81
  def set_seed(seed: int):
 
82
  torch.manual_seed(seed)
83
  if DEVICE == "cuda":
84
  torch.cuda.manual_seed(seed)
85
  torch.cuda.manual_seed_all(seed)
86
  random.seed(seed)
87
  np.random.seed(seed)
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  @spaces.GPU
90
  def generate_tts_audio(
@@ -96,15 +186,39 @@ def generate_tts_audio(
96
  seed_num_input: int = 0,
97
  cfgw_input: float = 0.5
98
  ) -> tuple[int, np.ndarray]:
99
- """Generate multilingual TTS output."""
100
- model = get_or_load_model()
101
- if model is None:
102
- raise RuntimeError("Model not loaded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  if seed_num_input != 0:
105
  set_seed(int(seed_num_input))
106
 
 
 
 
107
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
 
108
  generate_kwargs = {
109
  "exaggeration": exaggeration_input,
110
  "temperature": temperature_input,
@@ -112,58 +226,96 @@ def generate_tts_audio(
112
  }
113
  if chosen_prompt:
114
  generate_kwargs["audio_prompt_path"] = chosen_prompt
115
- print(f"🎧 Using reference: {chosen_prompt}")
116
  else:
117
- print("🎙️ No reference audio using default voice")
118
-
119
- wav = model.generate(text_input[:300], language_id=language_id, **generate_kwargs)
120
- print("✅ Audio generated.")
121
- return (model.sr, wav.squeeze(0).numpy())
 
 
 
 
122
 
123
- # --- Gradio UI ---
124
- with gr.Blocks(title="Chatterbox Multilingual TTS") as demo:
125
- gr.Markdown("# 🌍 Chatterbox Multilingual TTS Demo\nGenerate high-quality multilingual speech with optional voice cloning.")
 
 
 
 
 
 
 
 
126
  gr.Markdown(get_supported_languages_display())
127
-
128
  with gr.Row():
129
  with gr.Column():
130
- initial_lang = "en"
131
- text = gr.Textbox(value=default_text_for_ui(initial_lang), label="Text", max_lines=5)
 
 
 
 
 
132
  language_id = gr.Dropdown(
133
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
134
- value=initial_lang, label="Language"
 
 
135
  )
 
136
  ref_wav = gr.Audio(
137
  sources=["upload", "microphone"],
138
  type="filepath",
139
- label="Reference Audio (Optional)",
140
  value=default_audio_for_ui(initial_lang)
141
  )
142
-
143
- exaggeration = gr.Slider(0.25, 2, 0.05, label="Exaggeration (emotion)", value=0.5)
144
- cfg_weight = gr.Slider(0.2, 1, 0.05, label="CFG / Pace", value=0.5)
 
 
 
 
 
 
 
 
 
145
 
146
  with gr.Accordion("More options", open=False):
147
- seed_num = gr.Number(value=0, label="Seed (0=random)")
148
- temp = gr.Slider(0.05, 5, 0.05, label="Temperature", value=0.8)
149
 
150
- run_btn = gr.Button("🎤 Generate", variant="primary")
151
 
152
  with gr.Column():
153
- audio_output = gr.Audio(label="Generated Audio")
154
 
155
- def on_lang_change(lang, _, __):
156
  return default_audio_for_ui(lang), default_text_for_ui(lang)
157
 
158
- language_id.change(fn=on_lang_change, inputs=[language_id, ref_wav, text],
159
- outputs=[ref_wav, text], show_progress=False)
 
 
 
 
160
 
161
  run_btn.click(
162
  fn=generate_tts_audio,
163
- inputs=[text, language_id, ref_wav, exaggeration, temp, seed_num, cfg_weight],
164
- outputs=[audio_output]
 
 
 
 
 
 
 
 
165
  )
166
 
167
- # --- Launch ---
168
- if __name__ == "__main__":
169
- demo.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
 
5
  import gradio as gr
6
  import spaces
7
 
 
8
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
+ print(f"🚀 Running on device: {DEVICE}")
10
 
11
+ # --- Global Model Initialization ---
12
  MODEL = None
13
 
 
14
  LANGUAGE_CONFIG = {
15
+ "ar": {
16
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
17
+ "text": "في الشهر الماضي، وصلنا إلى معلم جديد بمليارين من المشاهدات على قناتنا على يوتيوب."
18
+ },
19
+ "da": {
20
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/da_m1.flac",
21
+ "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
22
+ },
23
+ "de": {
24
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
25
+ "text": "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe auf unserem YouTube-Kanal."
26
+ },
27
+ "el": {
28
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/el_m.flac",
29
+ "text": "Τον περασμένο μήνα, φτάσαμε σε ένα νέο ορόσημο με δύο δισεκατομμύρια προβολές στο κανάλι μας στο YouTube."
30
+ },
31
  "en": {
32
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
33
  "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
34
  },
35
+ "es": {
36
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
37
+ "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube."
38
+ },
39
+ "fi": {
40
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fi_m.flac",
41
+ "text": "Viime kuussa saavutimme uuden virstanpylvään kahden miljardin katselukerran kanssa YouTube-kanavallamme."
42
+ },
43
  "fr": {
44
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
45
  "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube."
46
  },
47
+ "he": {
48
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/he_m1.flac",
49
+ "text": "בחודש שעבר הגענו לאבן דרך חדשה עם שני מיליארד צפיות בערוץ היוטיוב שלנו."
50
+ },
51
  "hi": {
52
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
53
  "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।"
54
  },
55
+ "it": {
56
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/it_m1.flac",
57
+ "text": "Il mese scorso abbiamo raggiunto un nuovo traguardo: due miliardi di visualizzazioni sul nostro canale YouTube."
58
  },
59
  "ja": {
60
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
61
  "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
62
  },
63
+ "ko": {
64
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ko_f.flac",
65
+ "text": "지난달 우리는 유튜브 채널에서 이십억 조회수라는 새로운 이정표에 도달했습니다."
66
+ },
67
+ "ms": {
68
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ms_f.flac",
69
+ "text": "Bulan lepas, kami mencapai pencapaian baru dengan dua bilion tontonan di saluran YouTube kami."
70
+ },
71
+ "nl": {
72
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/nl_m.flac",
73
+ "text": "Vorige maand bereikten we een nieuwe mijlpaal met twee miljard weergaven op ons YouTube-kanaal."
74
+ },
75
+ "no": {
76
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/no_f1.flac",
77
+ "text": "Forrige måned nådde vi en ny milepæl med to milliarder visninger på YouTube-kanalen vår."
78
+ },
79
+ "pl": {
80
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pl_m.flac",
81
+ "text": "W zeszłym miesiącu osiągnęliśmy nowy kamień milowy z dwoma miliardami wyświetleń na naszym kanale YouTube."
82
+ },
83
+ "pt": {
84
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pt_m1.flac",
85
+ "text": "No mês passado, alcançámos um novo marco: dois mil milhões de visualizações no nosso canal do YouTube."
86
+ },
87
+ "ru": {
88
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ru_m.flac",
89
+ "text": "В прошлом месяце мы достигли нового рубежа: два миллиарда просмотров на нашем YouTube-канале."
90
+ },
91
+ "sv": {
92
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sv_f.flac",
93
+ "text": "Förra månaden nådde vi en ny milstolpe med två miljarder visningar på vår YouTube-kanal."
94
+ },
95
+ "sw": {
96
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sw_m.flac",
97
+ "text": "Mwezi uliopita, tulifika hatua mpya ya maoni ya bilioni mbili kweny kituo chetu cha YouTube."
98
+ },
99
+ "tr": {
100
+ "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/tr_m.flac",
101
+ "text": "Geçen ay YouTube kanalımızda iki milyar görüntüleme ile yeni bir dönüm noktasına ulaştık."
102
+ },
103
  "zh": {
104
  "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
105
  "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
106
  },
107
  }
108
 
109
+ # --- UI Helpers ---
110
  def default_audio_for_ui(lang: str) -> str | None:
111
  return LANGUAGE_CONFIG.get(lang, {}).get("audio")
112
 
113
+
114
  def default_text_for_ui(lang: str) -> str:
115
  return LANGUAGE_CONFIG.get(lang, {}).get("text", "")
116
 
117
+
118
  def get_supported_languages_display() -> str:
119
+ """Generate a formatted display of all supported languages."""
120
+ language_items = []
121
+ for code, name in sorted(SUPPORTED_LANGUAGES.items()):
122
+ language_items.append(f"**{name}** (`{code}`)")
123
+
124
+ # Split into 2 lines
125
+ mid = len(language_items) // 2
126
+ line1 = " • ".join(language_items[:mid])
127
+ line2 = " • ".join(language_items[mid:])
128
+
129
  return f"""
130
+ ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
131
+ {line1}
132
 
133
+ {line2}
134
  """
135
 
136
+
137
  def get_or_load_model():
138
+ """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already,
139
+ and ensures it's on the correct device."""
140
  global MODEL
141
  if MODEL is None:
142
+ print("Model not loaded, initializing...")
143
  try:
144
  MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
145
+ if hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE:
146
  MODEL.to(DEVICE)
147
+ print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
148
  except Exception as e:
149
+ print(f"Error loading model: {e}")
150
  raise
151
  return MODEL
152
 
153
+ # Attempt to load the model at startup.
154
  try:
155
  get_or_load_model()
156
  except Exception as e:
157
+ print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {e}")
158
 
159
  def set_seed(seed: int):
160
+ """Sets the random seed for reproducibility across torch, numpy, and random."""
161
  torch.manual_seed(seed)
162
  if DEVICE == "cuda":
163
  torch.cuda.manual_seed(seed)
164
  torch.cuda.manual_seed_all(seed)
165
  random.seed(seed)
166
  np.random.seed(seed)
167
+
168
+ def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None:
169
+ """
170
+ Decide which audio prompt to use:
171
+ - If user provided a path (upload/mic/url), use it.
172
+ - Else, fall back to language-specific default (if any).
173
+ """
174
+ if provided_path and str(provided_path).strip():
175
+ return provided_path
176
+ return LANGUAGE_CONFIG.get(language_id, {}).get("audio")
177
+
178
 
179
  @spaces.GPU
180
  def generate_tts_audio(
 
186
  seed_num_input: int = 0,
187
  cfgw_input: float = 0.5
188
  ) -> tuple[int, np.ndarray]:
189
+ """
190
+ Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
191
+ Supports 23 languages (see SUPPORTED_LANGUAGES), including English, French, German, Spanish, Italian, Portuguese, and Hindi.
192
+
193
+ This tool synthesizes natural-sounding speech from input text. When a reference audio file
194
+ is provided, it captures the speaker's voice characteristics and speaking style. The generated audio
195
+ maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
196
+
197
+ Args:
198
+ text_input (str): The text to synthesize into speech (maximum 300 characters)
199
+ language_id (str): The language code for synthesis (e.g. en, fr, de, es, it, pt, hi)
200
+ audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
201
+ exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
202
+ temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
203
+ seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
204
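+ cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (UI slider range 0.2-1.0). Defaults to 0.5. Set to 0 for language transfer, i.e. to reduce accent carry-over when the reference clip's language differs from language_id.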
205
+
206
+ Returns:
207
+ tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)
208
+ """
209
+ current_model = get_or_load_model()
210
+
211
+ if current_model is None:
212
+ raise RuntimeError("TTS model is not loaded.")
213
 
214
  if seed_num_input != 0:
215
  set_seed(int(seed_num_input))
216
 
217
+ print(f"Generating audio for text: '{text_input[:50]}...'")
218
+
219
+ # Handle optional audio prompt
220
  chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
221
+
222
  generate_kwargs = {
223
  "exaggeration": exaggeration_input,
224
  "temperature": temperature_input,
 
226
  }
227
  if chosen_prompt:
228
  generate_kwargs["audio_prompt_path"] = chosen_prompt
229
+ print(f"Using audio prompt: {chosen_prompt}")
230
  else:
231
+ print("No audio prompt provided; using default voice.")
232
+
233
+ wav = current_model.generate(
234
+ text_input[:300], # Truncate text to max chars
235
+ language_id=language_id,
236
+ **generate_kwargs
237
+ )
238
+ print("Audio generation complete.")
239
+ return (current_model.sr, wav.squeeze(0).numpy())
240
 
241
+ with gr.Blocks() as demo:
242
+ gr.Markdown(
243
+ """
244
+ # Chatterbox Multilingual Demo
245
+ Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
246
+
247
+ For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
248
+ """
249
+ )
250
+
251
+ # Display supported languages
252
  gr.Markdown(get_supported_languages_display())
 
253
  with gr.Row():
254
  with gr.Column():
255
+ initial_lang = "fr"
256
+ text = gr.Textbox(
257
+ value=default_text_for_ui(initial_lang),
258
+ label="Text to synthesize (max chars 300)",
259
+ max_lines=5
260
+ )
261
+
262
  language_id = gr.Dropdown(
263
  choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
264
+ value=initial_lang,
265
+ label="Language",
266
+ info="Select the language for text-to-speech synthesis"
267
  )
268
+
269
  ref_wav = gr.Audio(
270
  sources=["upload", "microphone"],
271
  type="filepath",
272
+ label="Reference Audio File (Optional)",
273
  value=default_audio_for_ui(initial_lang)
274
  )
275
+
276
+ gr.Markdown(
277
+ "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
278
+ elem_classes=["audio-note"]
279
+ )
280
+
281
+ exaggeration = gr.Slider(
282
+ 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
283
+ )
284
+ cfg_weight = gr.Slider(
285
+ 0.2, 1, step=.05, label="CFG/Pace", value=0.5
286
+ )
287
 
288
  with gr.Accordion("More options", open=False):
289
+ seed_num = gr.Number(value=0, label="Random seed (0 for random)")
290
+ temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
291
 
292
+ run_btn = gr.Button("Generate", variant="primary")
293
 
294
  with gr.Column():
295
+ audio_output = gr.Audio(label="Output Audio")
296
 
297
+ def on_language_change(lang, current_ref, current_text):
298
  return default_audio_for_ui(lang), default_text_for_ui(lang)
299
 
300
+ language_id.change(
301
+ fn=on_language_change,
302
+ inputs=[language_id, ref_wav, text],
303
+ outputs=[ref_wav, text],
304
+ show_progress=False
305
+ )
306
 
307
  run_btn.click(
308
  fn=generate_tts_audio,
309
+ inputs=[
310
+ text,
311
+ language_id,
312
+ ref_wav,
313
+ exaggeration,
314
+ temp,
315
+ seed_num,
316
+ cfg_weight,
317
+ ],
318
+ outputs=[audio_output],
319
  )
320
 
321
+ demo.launch(mcp_server=True)
 
 
requirements.txt CHANGED
@@ -1,26 +1,19 @@
1
- # Core dependencies
2
- torch==2.4.1
3
- torchaudio==2.4.1
4
- transformers==4.46.3
5
- diffusers==0.29.0
6
- safetensors
7
- omegaconf==2.3.0
8
  numpy==1.26.0
9
  resampy==0.4.3
10
  librosa==0.10.0
11
- soundfile
12
- gradio==4.20.1 # ✅ Node-free stable build for T4 GPU (no "--import" issue)
13
- spaces==0.26.1
14
- tqdm
15
-
16
- # Speech/voice-related packages
17
  silero-vad==5.1.2
18
  conformer==0.3.2
19
- resemble-perth==1.0.1
20
- s3tokenizer
21
 
22
- # Language-specific utilities (optional, uncomment as needed)
23
- # spacy_pkuseg # Chinese text segmentation
24
- # pykakasi>=2.2.0 # Japanese text processing (Kanji → Hiragana)
25
- # russian-text-stresser @ git+https://github.com/Vuizur/add-stress-to-epub
26
- # dicta-onnx>=0.1.0 # Hebrew diacritization
 
 
1
+ gradio
 
 
 
 
 
 
2
  numpy==1.26.0
3
  resampy==0.4.3
4
  librosa==0.10.0
5
+ s3tokenizer
6
+ transformers==4.46.3
7
+ diffusers==0.29.0
8
+ omegaconf==2.3.0
9
+ resemble-perth==1.0.1
 
10
  silero-vad==5.1.2
11
  conformer==0.3.2
12
+ safetensors
 
13
 
14
+ # Optional language-specific dependencies
15
+ # The entries below are enabled by default; comment out any you do not need, and uncomment dicta-onnx for Hebrew:
16
+ spacy_pkuseg # For Chinese text segmentation
17
+ pykakasi>=2.2.0 # For Japanese text processing (Kanji to Hiragana)
18
+ russian-text-stresser @ git+https://github.com/Vuizur/add-stress-to-epub
19
+ # dicta-onnx>=0.1.0 # For Hebrew diacritization