Spaces:

muhammadnoman76
/

text_to_speech_3

Running

App Files Files Community

muhammadnoman76 committed on 19 days ago

Commit

aae7a3d

1 Parent(s): 6450af0

Update app.py - fix phonemizer error for non-English languages

Browse files

Files changed (1) hide show

app.py +19 -70

app.py CHANGED Viewed

@@ -208,7 +208,12 @@ def split_into_sentences(text: str) -> List[str]:
     return [s.strip() for s in sentences if s.strip()]
 def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str):
-    """Generate audio for a single text chunk with robust error handling"""
     # Preprocess text
     text = preprocess_text_for_phonemizer(text)
@@ -217,12 +222,21 @@ def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lan
         logger.warning("Text too short after preprocessing, skipping")
         return None
-    pipeline = pipelines.get(lang_code)
     if not pipeline:
-        # Fallback to English if language pipeline not available
-        pipeline = pipelines.get('a', pipelines.get('b'))
         if not pipeline:
-            logger.error(f"No pipeline available for {lang_code}")
             return None
     try:
@@ -251,74 +265,9 @@ def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lan
         return None
     except Exception as e:
-        error_msg = str(e)
-        # Check if this is the phonemizer "lines not equal" error
-        if "number of lines in input and output must be equal" in error_msg or "words count mismatch" in error_msg:
-            logger.warning(f"Phonemizer error for lang={lang_code}, trying sentence-by-sentence fallback")
-            # Try processing sentence by sentence
-            sentences = split_into_sentences(text)
-            if len(sentences) > 1:
-                audio_parts = []
-                for sentence in sentences:
-                    try:
-                        # Try with current language
-                        result = generate_single_sentence_audio(sentence, voice, speed, use_gpu, lang_code, pipeline)
-                        if result is not None:
-                            audio_parts.append(result)
-                    except Exception:
-                        # If sentence fails, try with English phonemizer as last resort
-                        try:
-                            if lang_code != 'a' and 'a' in pipelines:
-                                result = generate_single_sentence_audio(sentence, voice, speed, use_gpu, 'a', pipelines['a'])
-                                if result is not None:
-                                    audio_parts.append(result)
-                        except Exception:
-                            logger.warning(f"Skipping problematic sentence: {sentence[:50]}...")
-                            continue
-                if audio_parts:
-                    # Merge the parts
-                    sample_rate = 24000
-                    silence = np.zeros(int(0.05 * sample_rate), dtype=np.float32)
-                    merged = []
-                    for i, part in enumerate(audio_parts):
-                        merged.append(part)
-                        if i < len(audio_parts) - 1:
-                            merged.append(silence)
-                    return np.concatenate(merged) if len(merged) > 1 else merged[0]
-            # If still failing, try with English phonemizer directly
-            if lang_code != 'a' and 'a' in pipelines:
-                logger.warning(f"Falling back to English phonemizer for: {text[:50]}...")
-                return generate_single_sentence_audio(text, voice, speed, use_gpu, 'a', pipelines['a'])
         logger.error(f"Failed to generate audio chunk: {e}")
         return None
-def generate_single_sentence_audio(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str, pipeline):
-    """Generate audio for a single sentence with minimal processing"""
-    text = preprocess_text_for_phonemizer(text)
-    if not text or len(text) < 2:
-        return None
-    pack = pipeline.load_voice(voice)
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps)-1]
-        with torch.no_grad():
-            if use_gpu and True in models:
-                audio = models[True](ps, ref_s, speed)
-            else:
-                audio = models[False](ps, ref_s, speed)
-        return audio.numpy()
-    return None
 async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: bool = None, lang_code: str = 'a'):
     """Generate audio from text using Kokoro TTS with parallel chunking for unlimited text length"""

     return [s.strip() for s in sentences if s.strip()]
 def generate_audio_chunk(text: str, voice: str, speed: float, use_gpu: bool, lang_code: str):
+    """Generate audio for a single text chunk.
+    IMPORTANT: For non-English languages, we use the English phonemizer because
+    the Spanish/French/etc phonemizers have known issues with the 'espeak-ng' backend.
+    The voice model still sounds correct - only phoneme conversion uses English rules.
+    """
     # Preprocess text
     text = preprocess_text_for_phonemizer(text)
         logger.warning("Text too short after preprocessing, skipping")
         return None
+    # ALWAYS use English phonemizer for stability - the voice model handles accents
+    # Languages like Spanish (e), French (f), Italian (i), Portuguese (p) have phonemizer bugs
+    STABLE_LANGUAGES = {'a', 'b'}  # Only American and British English phonemizers are stable
+    if lang_code in STABLE_LANGUAGES:
+        pipeline = pipelines.get(lang_code)
+    else:
+        # Use English phonemizer for all other languages to avoid phonemizer errors
+        pipeline = pipelines.get('a')  # American English is most stable
+        logger.debug(f"Using English phonemizer for lang={lang_code} (stability)")
     if not pipeline:
+        pipeline = pipelines.get('b', list(pipelines.values())[0] if pipelines else None)
         if not pipeline:
+            logger.error("No pipeline available")
             return None
     try:
         return None
     except Exception as e:
         logger.error(f"Failed to generate audio chunk: {e}")
         return None
 async def generate_audio(text: str, voice: str = 'af_heart', speed: float = 1.0, use_gpu: bool = None, lang_code: str = 'a'):
     """Generate audio from text using Kokoro TTS with parallel chunking for unlimited text length"""