Edge_TTS_NGHIA_transcript

Runtime error

App Files Files Community

cnph001 committed on Apr 27, 2025

Commit

7042e46

verified ·

1 Parent(s): ef4c8b8

Update app.py

Browse files

fixing silence

Files changed (1) hide show

app.py +27 -11

app.py CHANGED Viewed

@@ -5,12 +5,29 @@ import asyncio
 import tempfile
 import os
 import re  # Import the regular expression module
-# Get all available voices
-async def get_voices():
-    voices = await edge_tts.list_voices()
-    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
@@ -27,15 +44,15 @@ async def paragraph_to_speech(text, voice, rate, pitch):
         return None, []  # Return None for audio path and empty list for silence
     audio_segments = []
-    silence_durations = []
     parts = re.split(r'(SS\d+\.?\d*)', text)
     for part in parts:
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
-                silence_durations.append(silence_duration)
-                audio_segments.append(None) # Placeholder for silence
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
@@ -71,8 +88,6 @@ async def paragraph_to_speech(text, voice, rate, pitch):
                 current_pitch = -30
                 current_rate = -20
             else:
-                # Use selected voice, or fallback to default
-                #voice_short_name = (voice or default_voice).split(" - ")[0]
                 current_voice = (voice or default_voice).split(" - ")[0]
                 processed_text=part[:]
             rate_str = f"{current_rate:+d}%"
@@ -83,6 +98,7 @@ async def paragraph_to_speech(text, voice, rate, pitch):
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
             audio_segments.append(None) # Empty string
     return audio_segments, silence_durations

 import tempfile
 import os
 import re  # Import the regular expression module
+import struct
# Function to create a temporary silent MP3 file built from valid silent frames
def create_silent_mp3(duration, temp_dir):
    """Write an MP3 file holding roughly `duration` seconds of silence.

    The file is a sequence of identical MPEG-1 Layer III frames whose side
    information and main data are all zero bits, which decodes to digital
    silence. No encoder library or ffmpeg is required.

    Args:
        duration: Requested silence length in seconds. The result is rounded
            up to a whole number of frames (1152 samples, about 26 ms each);
            zero or negative values still produce a single frame so the file
            is never empty.
        temp_dir: Directory in which the file is created.

    Returns:
        Path of the written file, ``<temp_dir>/silent_<duration>.mp3``.
    """
    # Header fields: MPEG-1, Layer III, no CRC, 128 kbps, 44.1 kHz, no padding.
    frame_header = b'\xff\xfb\x90\x00'
    frame_rate = 44100
    bit_rate = 128000
    samples_per_frame = 1152
    # Layer III frame length in bytes for an unpadded frame (417 here).
    frame_size = 144 * bit_rate // frame_rate
    silent_frame = frame_header + b'\x00' * (frame_size - len(frame_header))

    # Round up so the silence is never shorter than requested.
    num_samples = int(duration * frame_rate)
    num_frames = max(1, (num_samples + samples_per_frame - 1) // samples_per_frame)

    # NOTE(review): the speech segments produced elsewhere may use a different
    # sample rate / channel layout - confirm the merge step decodes each file
    # rather than concatenating raw bytes.
    temp_mp3_path = os.path.join(temp_dir, f"silent_{duration}.mp3")
    with open(temp_mp3_path, 'wb') as f:
        f.write(silent_frame * num_frames)
    return temp_mp3_path
 # Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
         return None, []  # Return None for audio path and empty list for silence
     audio_segments = []
+    temp_dir = tempfile.gettempdir()
     parts = re.split(r'(SS\d+\.?\d*)', text)
     for part in parts:
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
+                silent_mp3_path = create_silent_mp3(silence_duration, temp_dir)
+                audio_segments.append(silent_mp3_path)
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
                 current_pitch = -30
                 current_rate = -20
             else:
                 current_voice = (voice or default_voice).split(" - ")[0]
                 processed_text=part[:]
             rate_str = f"{current_rate:+d}%"
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
+            #pass # Ignore empty parts
             audio_segments.append(None) # Empty string
     return audio_segments, silence_durations