Edge_TTS_NGHIA_transcript

Runtime error

App Files Community

cnph001 committed on May 7, 2025

Commit

0a995d3

verified ·

1 Parent(s): c9c3247

Update app.py

Browse files

Add error handling - skip if text is not valid

Files changed (1) hide show

app.py +42 -48

app.py CHANGED Viewed

@@ -17,11 +17,9 @@ def get_silence(duration_ms=1000):
         duration=duration_ms,
         frame_rate=24000  # 24kHz sampling rate
     )
     # Set audio parameters
     silent_audio = silent_audio.set_channels(1)  # Mono
     silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         # Export with specific bitrate and codec parameters
         silent_audio.export(
@@ -39,8 +37,12 @@ def get_silence(duration_ms=1000):
 # Get all available voices
 async def get_voices():
-    voices = await edge_tts.list_voices()
-    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
@@ -78,7 +80,6 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             detect = 1
             processed_text = processed_text[len(prefix):].strip()
             break
     match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
     if match:
         prefix_pitch = match.group(1)
@@ -88,36 +89,35 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
         elif detect:
             processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
     elif detect:
         processed_text = processed_text[2:].strip()
     if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
-        communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            audio_path = tmp_file.name
-            await communicate.save(audio_path)
-        if target_duration_ms is not None and os.path.exists(audio_path):
-            audio = AudioSegment.from_mp3(audio_path)
-            audio_duration_ms = len(audio)
-            #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
-            if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
-                speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
-                #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
-                if speed_factor > 0:
-                    if speed_factor <1.0:
-                       speed_factor = 1.0
-                    y, sr = librosa.load(audio_path, sr=None)
-                    y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
-                    sf.write(audio_path, y_stretched, sr)
-            else:
-                print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
-        return audio_path
     return None
 async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
@@ -153,7 +153,6 @@ async def process_transcript_line(line, default_voice, rate, pitch, speed_adjust
                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
         return start_time_ms, audio_segments, duration_ms
     return None, None, None
@@ -162,43 +161,38 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
     for line in lines:
         start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             current_time_ms = start_time
             segment_duration = duration / len(audio_paths) if audio_paths else 0
             for path in audio_paths:
-                try:
-                    audio = AudioSegment.from_mp3(path)
-                    combined_line_audio += audio
-                    os.remove(path)
-                except FileNotFoundError:
-                    print(f"Warning: Audio file not found: {path}")
             if combined_line_audio:
                 timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                 max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
         elif audio_paths:
             for path in audio_paths:
-                try:
-                    os.remove(path)
-                except FileNotFoundError:
-                    pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None

         duration=duration_ms,
         frame_rate=24000  # 24kHz sampling rate
     )
     # Set audio parameters
     silent_audio = silent_audio.set_channels(1)  # Mono
     silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         # Export with specific bitrate and codec parameters
         silent_audio.export(
 # Get all available voices
 async def get_voices():
+    try:
+        voices = await edge_tts.list_voices()
+        return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+    except Exception as e:
+        print(f"Error listing voices: {e}")
+        return {}
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
             detect = 1
             processed_text = processed_text[len(prefix):].strip()
             break
     match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
     if match:
         prefix_pitch = match.group(1)
             processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
         elif detect:
             processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
     elif detect:
         processed_text = processed_text[2:].strip()
     if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
+        try:
+            communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+                audio_path = tmp_file.name
+                await communicate.save(audio_path)
+            if target_duration_ms is not None and os.path.exists(audio_path):
+                audio = AudioSegment.from_mp3(audio_path)
+                audio_duration_ms = len(audio)
+                #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
+                if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
+                    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                    #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
+                    if speed_factor > 0:
+                        if speed_factor < 1.0:
+                            speed_factor = 1.0
+                        y, sr = librosa.load(audio_path, sr=None)
+                        y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
+                        sf.write(audio_path, y_stretched, sr)
+                else:
+                    print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
+            return audio_path
+        except Exception as e:
+            print(f"Edge TTS error processing '{processed_text}': {e}")
+            return None
     return None
 async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
         return start_time_ms, audio_segments, duration_ms
     return None, None, None
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
     for line in lines:
         start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             current_time_ms = start_time
             segment_duration = duration / len(audio_paths) if audio_paths else 0
             for path in audio_paths:
+                if path:  # Only process if audio_path is not None (meaning TTS was successful)
+                    try:
+                        audio = AudioSegment.from_mp3(path)
+                        combined_line_audio += audio
+                        os.remove(path)
+                    except FileNotFoundError:
+                        print(f"Warning: Audio file not found: {path}")
             if combined_line_audio:
                 timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                 max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
         elif audio_paths:
             for path in audio_paths:
+                if path:
+                    try:
+                        os.remove(path)
+                    except FileNotFoundError:
+                        pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None