muhammadnoman76 committed on
Commit
aae7a3d
·
1 Parent(s): 6450af0

Update app.py - fix phonemizer error for non-English languages

Browse files
Files changed (1) hide show
  1. app.py +19 -70
app.py CHANGED
@@ -208,7 +208,12 @@ def split_into_sentences(text: str) -> List[str]:
208
  return [s.strip() for s in sentences if s.strip()]
209
 
210
  def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str):
211
- """Generate audio for a single text chunk with robust error handling"""
 
 
 
 
 
212
 
213
  # Preprocess text
214
  text = preprocess_text_for_phonemizer(text)
@@ -217,12 +222,21 @@ def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lan
217
  logger.warning("Text too short after preprocessing, skipping")
218
  return None
219
 
220
- pipeline = pipelines.get(lang_code)
 
 
 
 
 
 
 
 
 
 
221
  if not pipeline:
222
- # Fallback to English if language pipeline not available
223
- pipeline = pipelines.get('a', pipelines.get('b'))
224
  if not pipeline:
225
- logger.error(f"No pipeline available for {lang_code}")
226
  return None
227
 
228
  try:
@@ -251,74 +265,9 @@ def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lan
251
  return None
252
 
253
  except Exception as e:
254
- error_msg = str(e)
255
-
256
- # Check if this is the phonemizer "lines not equal" error
257
- if "number of lines in input and output must be equal" in error_msg or "words count mismatch" in error_msg:
258
- logger.warning(f"Phonemizer error for lang={lang_code}, trying sentence-by-sentence fallback")
259
-
260
- # Try processing sentence by sentence
261
- sentences = split_into_sentences(text)
262
- if len(sentences) > 1:
263
- audio_parts = []
264
- for sentence in sentences:
265
- try:
266
- # Try with current language
267
- result = generate_single_sentence_audio(sentence, voice, speed, use_gpu, lang_code, pipeline)
268
- if result is not None:
269
- audio_parts.append(result)
270
- except Exception:
271
- # If sentence fails, try with English phonemizer as last resort
272
- try:
273
- if lang_code != 'a' and 'a' in pipelines:
274
- result = generate_single_sentence_audio(sentence, voice, speed, use_gpu, 'a', pipelines['a'])
275
- if result is not None:
276
- audio_parts.append(result)
277
- except Exception:
278
- logger.warning(f"Skipping problematic sentence: {sentence[:50]}...")
279
- continue
280
-
281
- if audio_parts:
282
- # Merge the parts
283
- sample_rate = 24000
284
- silence = np.zeros(int(0.05 * sample_rate), dtype=np.float32)
285
- merged = []
286
- for i, part in enumerate(audio_parts):
287
- merged.append(part)
288
- if i < len(audio_parts) - 1:
289
- merged.append(silence)
290
- return np.concatenate(merged) if len(merged) > 1 else merged[0]
291
-
292
- # If still failing, try with English phonemizer directly
293
- if lang_code != 'a' and 'a' in pipelines:
294
- logger.warning(f"Falling back to English phonemizer for: {text[:50]}...")
295
- return generate_single_sentence_audio(text, voice, speed, use_gpu, 'a', pipelines['a'])
296
-
297
  logger.error(f"Failed to generate audio chunk: {e}")
298
  return None
299
 
300
- def generate_single_sentence_audio(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str, pipeline):
301
- """Generate audio for a single sentence with minimal processing"""
302
- text = preprocess_text_for_phonemizer(text)
303
-
304
- if not text or len(text) < 2:
305
- return None
306
-
307
- pack = pipeline.load_voice(voice)
308
-
309
- for _, ps, _ in pipeline(text, voice, speed):
310
- ref_s = pack[len(ps)-1]
311
-
312
- with torch.no_grad():
313
- if use_gpu and True in models:
314
- audio = models[True](ps, ref_s, speed)
315
- else:
316
- audio = models[False](ps, ref_s, speed)
317
-
318
- return audio.numpy()
319
-
320
- return None
321
-
322
  async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: bool = None, lang_code: str = 'a'):
323
  """Generate audio from text using Kokoro TTS with parallel chunking for unlimited text length"""
324
 
 
208
  return [s.strip() for s in sentences if s.strip()]
209
 
210
  def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str):
211
+ """Generate audio for a single text chunk.
212
+
213
+ IMPORTANT: For non-English languages, we use the English phonemizer because
214
+ the Spanish/French/etc phonemizers have known issues with the 'espeak-ng' backend.
215
+ The voice model still sounds correct - only phoneme conversion uses English rules.
216
+ """
217
 
218
  # Preprocess text
219
  text = preprocess_text_for_phonemizer(text)
 
222
  logger.warning("Text too short after preprocessing, skipping")
223
  return None
224
 
225
+ # ALWAYS use English phonemizer for stability - the voice model handles accents
226
+ # Languages like Spanish (e), French (f), Italian (i), Portuguese (p) have phonemizer bugs
227
+ STABLE_LANGUAGES = {'a', 'b'} # Only American and British English phonemizers are stable
228
+
229
+ if lang_code in STABLE_LANGUAGES:
230
+ pipeline = pipelines.get(lang_code)
231
+ else:
232
+ # Use English phonemizer for all other languages to avoid phonemizer errors
233
+ pipeline = pipelines.get('a') # American English is most stable
234
+ logger.debug(f"Using English phonemizer for lang={lang_code} (stability)")
235
+
236
  if not pipeline:
237
+ pipeline = pipelines.get('b', list(pipelines.values())[0] if pipelines else None)
 
238
  if not pipeline:
239
+ logger.error("No pipeline available")
240
  return None
241
 
242
  try:
 
265
  return None
266
 
267
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  logger.error(f"Failed to generate audio chunk: {e}")
269
  return None
270
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
  async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: bool = None, lang_code: str = 'a'):
272
  """Generate audio from text using Kokoro TTS with parallel chunking for unlimited text length"""
273