Edge_TTS_NGHIA_transcript

Runtime error

App Files Files Community

cnph001 committed on May 13, 2025

Commit

451c102

verified ·

1 Parent(s): 049675e

Update app.py

Browse files

Fix the "no audio from Edge TTS" error

Files changed (1) hide show

app.py +63 -120

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 ##fix overlap, remove silence, leave a tiny bit of silence
 import spaces
 import gradio as gr
@@ -61,136 +62,78 @@ async def get_voices():
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
     """Generate speech audio for one text segment, honouring a leading voice code.

     A segment may start with a two-character voice code (a digit followed by a
     letter, e.g. ``2F``), optionally followed directly by a signed integer that
     is added to the pitch (e.g. ``2F-20``).  The code and the number are removed
     from the text before it is sent to the speech service.

     Args:
         text_segment: Raw text, possibly starting with a voice code.
         default_voice: Voice label of the form ``"<ShortName> - <Locale> (<Gender>)"``;
             its short name is used when no known voice code is present.
         rate: Speaking-rate offset in percent (integer).
         pitch: Pitch offset in Hz (integer).

     Returns:
         Path to an MP3 file with leading/trailing silence stripped, or ``None``
         when no text is left to speak.
     """
     import contextlib
     import os

     # code -> (voice short name, pitch override, rate override); ``None`` keeps
     # the caller's value.  Overrides REPLACE the caller's pitch/rate.
     voice_overrides = {
         "1F": ("en-GB-SoniaNeural", 25, None),
         "2F": ("en-US-JennyNeural", None, None),
         "3F": ("en-HK-YanNeural", None, None),
         # Was resolved from the male "4M" voice (copy-paste slip); the voice
         # declared for this code is en-US-EmmaNeural.
         "4F": ("en-US-EmmaNeural", None, None),
         "1M": ("en-AU-WilliamNeural", None, None),
         "2M": ("en-GB-RyanNeural", None, None),
         "3M": ("en-US-BrianMultilingualNeural", None, None),  # good for reading
         "4M": ("en-GB-ThomasNeural", None, None),
         "1O": ("en-GB-RyanNeural", -20, -10),  # old man
         "1C": ("en-GB-MaisieNeural", None, None),  # child
         "1V": ("vi-VN-HoaiMyNeural", None, None),  # Vietnamese, female
         "2V": ("vi-VN-NamMinhNeural", None, None),  # Vietnamese, male
         # NOTE(review): the previous code declared de-DE-SeraphinaMultilingualNeural
         # and ko-KR-HyunsuMultilingualNeural for 3V/4V but actually resolved the
         # Vietnamese short names below; the effective voices are kept - confirm
         # which was intended.
         "3V": ("vi-VN-HoaiMyNeural", 25, None),
         "4V": ("vi-VN-NamMinhNeural", -20, None),
     }

     current_voice_short = default_voice.split(" - ")[0] if default_voice else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()

     # Only look at the very start of the segment: an unanchored search would
     # treat any "letters+digits" inside the spoken text as a pitch offset and
     # cut characters out of the sentence.
     code_match = re.match(r'(\d[A-Za-z])(-?\d+)?', processed_text)
     if code_match:
         code, pitch_offset = code_match.groups()
         override = voice_overrides.get(code)
         if override is not None:
             current_voice_short, pitch_override, rate_override = override
             if pitch_override is not None:
                 current_pitch = pitch_override
             if rate_override is not None:
                 current_rate = rate_override
         if pitch_offset is not None:
             current_pitch += int(pitch_offset)
         # Unknown codes without a number are ordinary text and stay untouched.
         if override is not None or pitch_offset is not None:
             processed_text = processed_text[code_match.end():].strip()

     if not processed_text:
         return None

     rate_str = f"{current_rate:+d}%"
     pitch_str = f"{current_pitch:+d}Hz"

     def _reserve_mp3_path():
         # NamedTemporaryFile creates the file, unlike the deprecated
         # tempfile.mktemp which only returns a (race-prone) name.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
             return handle.name

     raw_path = _reserve_mp3_path()
     try:
         communicate = edge_tts.Communicate(
             processed_text, current_voice_short, rate=rate_str, pitch=pitch_str
         )
         # The handle is already closed here, so the save can reopen the path.
         await communicate.save(raw_path)
         audio = AudioSegment.from_mp3(raw_path)
         # Strip silence at start and end
         audio = strip_silence(audio, silence_thresh=-40, min_silence_len=100)
     finally:
         # The raw download is only an intermediate; do not leave it behind.
         with contextlib.suppress(OSError):
             os.remove(raw_path)

     stripped_path = _reserve_mp3_path()
     audio.export(stripped_path, format="mp3")
     return stripped_path
 async def process_transcript_line(line, default_voice, rate, pitch):

 ##fix overlap, remove silence, leave a tiny bit of silence
+## Simplified
 import spaces
 import gradio as gr
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
     """Generate speech audio for one text segment, with voice codes, retries and fallback.

     A segment may start with a two-character voice code (a digit followed by a
     letter, e.g. ``2F``), optionally followed directly by a signed integer that
     is added to the pitch (e.g. ``2F-20``).  The code and the number are removed
     from the text before it is sent to the speech service.

     Synthesis is attempted up to three times with a 0.5 s pause between
     attempts; if every attempt fails, a 500 ms silent clip is returned instead
     so the caller always receives an audio file for non-empty text.

     Args:
         text_segment: Raw text, possibly starting with a voice code.
         default_voice: Voice label of the form ``"<ShortName> - <Locale> (<Gender>)"``;
             its short name is used when no known voice code is present.
         rate: Speaking-rate offset in percent (integer).
         pitch: Pitch offset in Hz (integer).

     Returns:
         Path to an MP3 file (speech with silence stripped, or the silent
         fallback), or ``None`` when no text is left to speak.
     """
     import contextlib
     import logging
     import os

     # code -> (voice short name, pitch adjustment, rate adjustment); the
     # adjustments are ADDED to the caller's pitch/rate.
     voice_map = {
         "1F": ("en-GB-SoniaNeural", 25, 0),
         "2F": ("en-US-JennyNeural", 0, 0),
         "3F": ("en-HK-YanNeural", 0, 0),
         "4F": ("en-US-EmmaNeural", 0, 0),
         "1M": ("en-AU-WilliamNeural", 0, 0),
         "2M": ("en-GB-RyanNeural", 0, 0),
         "3M": ("en-US-BrianMultilingualNeural", 0, 0),
         "4M": ("en-GB-ThomasNeural", 0, 0),
         "1O": ("en-GB-RyanNeural", -20, -10),
         "1C": ("en-GB-MaisieNeural", 0, 0),
         "1V": ("vi-VN-HoaiMyNeural", 0, 0),
         "2V": ("vi-VN-NamMinhNeural", 0, 0),
         "3V": ("de-DE-SeraphinaMultilingualNeural", 25, 0),
         "4V": ("ko-KR-HyunsuMultilingualNeural", -20, 0),
     }

     current_voice_short = default_voice.split(" - ")[0] if default_voice else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()

     # Only look at the very start of the segment: an unanchored search would
     # treat any "letters+digits" inside the spoken text as a pitch offset and
     # cut characters out of the sentence.
     code_match = re.match(r'(\d[A-Za-z])(-?\d+)?', processed_text)
     if code_match:
         code, pitch_offset = code_match.groups()
         known_code = code in voice_map
         if known_code:
             current_voice_short, pitch_adj, rate_adj = voice_map[code]
             current_pitch += pitch_adj
             current_rate += rate_adj
         if pitch_offset is not None:
             current_pitch += int(pitch_offset)
         # Unknown codes without a number are ordinary text and stay untouched.
         if known_code or pitch_offset is not None:
             processed_text = processed_text[code_match.end():].strip()

     if not processed_text:
         return None

     rate_str = f"{current_rate:+d}%"
     pitch_str = f"{current_pitch:+d}Hz"

     def _reserve_mp3_path():
         # NamedTemporaryFile creates the file, unlike the deprecated
         # tempfile.mktemp which only returns a (race-prone) name.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
             return handle.name

     max_attempts = 3
     for attempt in range(max_attempts):
         raw_path = _reserve_mp3_path()
         stripped_path = None
         try:
             communicate = edge_tts.Communicate(
                 processed_text, current_voice_short, rate=rate_str, pitch=pitch_str
             )
             # The handle is already closed here, so the save can reopen the path.
             await communicate.save(raw_path)
             audio = AudioSegment.from_mp3(raw_path)
             audio = strip_silence(audio, silence_thresh=-40, min_silence_len=100)
             stripped_path = _reserve_mp3_path()
             audio.export(stripped_path, format="mp3")
             return stripped_path
         except Exception:
             # Best-effort by design (the fallback below keeps the pipeline
             # going), but record why the attempt failed instead of hiding it.
             logging.getLogger(__name__).warning(
                 "edge-tts synthesis attempt %d/%d failed",
                 attempt + 1, max_attempts, exc_info=True,
             )
             if stripped_path is not None:
                 with contextlib.suppress(OSError):
                     os.remove(stripped_path)
             if attempt < max_attempts - 1:
                 await asyncio.sleep(0.5)  # brief wait before retry
         finally:
             # The raw download is only an intermediate; do not leave it behind.
             with contextlib.suppress(OSError):
                 os.remove(raw_path)

     # Final failure: return 500ms of silence
     fallback_path = _reserve_mp3_path()
     AudioSegment.silent(duration=500).export(fallback_path, format="mp3")
     return fallback_path
 async def process_transcript_line(line, default_voice, rate, pitch):