NGHIA_Test_Edge_TTS_transcript_w_timestamp

Sleeping

App Files Community

cnph001 committed on Apr 27, 2025

Commit

552e1db

verified ·

1 Parent(s): f5bf582

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -49

app.py CHANGED Viewed

@@ -4,22 +4,24 @@ import edge_tts
 import asyncio
 import tempfile
 import os
-import re
 import struct
 import wave
-# Define the get_voices function first
-async def get_voices():
-    voices_list = await edge_tts.list_voices()
-    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
-    return voices_dict
 # Function to create a temporary silent WAV file
 def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
-    """Creates a temporary WAV file containing silence."""
-    if duration <= 0:
-        raise ValueError("Duration must be positive.")
     num_frames = int(duration * sample_rate)
     silent_data = b'\x00' * (num_frames * num_channels * sample_width)
@@ -31,18 +33,16 @@ def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sam
         wf.writeframes(silent_data)
     return temp_wav_path
-# Function to process text and generate audio for a single paragraph
 async def paragraph_to_speech(text, voice, rate, pitch):
-    voices = {
-        "voice1F": "en-US-EmmaNeural - en-US (Female)",
-        "voice2F": "en-US-JennyNeural - en-US (Female)",
-        "voice3F": "en-HK-YanNeural - en-HK (Female)",
-        "voice1": "en-AU-WilliamNeural - en-AU (Male)",
-        "voice2": "it-IT-GiuseppeMultilingualNeural - it-IT (Male)",
-        "voice3": "en-US-BrianMultilingualNeural - en-US (Male)",
-        "voice4": "en-GB-MaisieNeural - en-GB (Female)",  # Child
-        "voice5": "en-GB-RyanNeural - en-GB (Male)"  # Old Man
-    }
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
@@ -55,6 +55,7 @@ async def paragraph_to_speech(text, voice, rate, pitch):
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
                 silent_wav_path = create_silent_wav(silence_duration, temp_dir)
                 audio_segments.append(silent_wav_path)
             except ValueError:
@@ -65,50 +66,46 @@ async def paragraph_to_speech(text, voice, rate, pitch):
             current_rate = rate
             current_pitch = pitch
-            # Select voice based on part prefix
             if part.startswith("1F"):
                 processed_text = part[2:]
-                current_voice = voices["voice1F"]
             elif part.startswith("2F"):
                 processed_text = part[2:]
-                current_voice = voices["voice2F"]
             elif part.startswith("3F"):
                 processed_text = part[2:]
-                current_voice = voices["voice3F"]
             elif part.startswith("1M"):
                 processed_text = part[2:]
-                current_voice = voices["voice1"]
             elif part.startswith("2M"):
                 processed_text = part[2:]
-                current_voice = voices["voice2"]
             elif part.startswith("3M"):
                 processed_text = part[2:]
-                current_voice = voices["voice3"]
             elif part.startswith("1C"):
                 processed_text = part[2:]
-                current_voice = voices["voice4"]
             elif part.startswith("1O"):
                 processed_text = part[2:]
-                current_voice = voices["voice5"]
                 current_pitch = -30
                 current_rate = -20
             else:
-                current_voice = (voice or voices["voice1"]).split(" - ")[0]
-                processed_text = part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
-            # Save speech output to temporary file
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 tmp_path = tmp_file.name
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
-            audio_segments.append(None)  # Empty string
-    return audio_segments, []  # Returning empty list for silence times as we are directly creating silent WAV
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
@@ -117,7 +114,7 @@ async def text_to_speech(text, voice, rate, pitch):
     if not voice:
         return None, gr.Warning("Please select a voice.")
-    paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
     final_audio_segments = []
     for paragraph in paragraphs:
@@ -170,14 +167,21 @@ async def text_to_speech(text, voice, rate, pitch):
     return combined_audio_path, None
-# Gradio interface function (wrapper to run async code)
-def tts_interface_sync(text, voice, rate, pitch):
-    return asyncio.run(tts_interface(text, voice, rate, pitch))
-# Gradio interface
 async def create_demo():
-    voices = await get_voices()  # Now this function is defined
-    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
     You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
@@ -186,14 +190,14 @@ async def create_demo():
     """
     demo = gr.Interface(
-        fn=tts_interface_sync,
-        inputs=[
             gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
         ],
-        outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
@@ -208,4 +212,4 @@ async def create_demo():
 # Run the application
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
-    demo.launch()

 import asyncio
 import tempfile
 import os
+import re  # Import the regular expression module
 import struct
 import wave
 # Function to create a temporary silent WAV file
 def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
+    """Creates a temporary WAV file containing silence.
+    Args:
+        duration (float): Duration of silence in seconds.
+        temp_dir (str): Directory to save the temporary file.
+        sample_rate (int): Sample rate of the audio (samples per second).
+        num_channels (int): Number of audio channels (1 for mono, 2 for stereo).
+        sample_width (int): Sample width in bytes (e.g., 2 for 16-bit).
+    Returns:
+        str: Path to the temporary silent WAV file.
+    """
     num_frames = int(duration * sample_rate)
     silent_data = b'\x00' * (num_frames * num_channels * sample_width)
         wf.writeframes(silent_data)
     return temp_wav_path
+# Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
+    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)"  #good for reading
+    voice1F ="en-US-EmmaNeural - en-US (Female)"
+    voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
+    voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
+    voice3F = "en-HK-YanNeural - en-HK (Female)"
+    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
+    voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
+                # Assuming default WAV parameters for silence
                 silent_wav_path = create_silent_wav(silence_duration, temp_dir)
                 audio_segments.append(silent_wav_path)
             except ValueError:
             current_rate = rate
             current_pitch = pitch
             if part.startswith("1F"):
                 processed_text = part[2:]
+                current_voice = voice1F.split(" - ")[0]
             elif part.startswith("2F"):
                 processed_text = part[2:]
+                current_voice = voice2F.split(" - ")[0]
             elif part.startswith("3F"):
                 processed_text = part[2:]
+                current_voice = voice3F.split(" - ")[0]
             elif part.startswith("1M"):
                 processed_text = part[2:]
+                current_voice = voice1.split(" - ")[0]
             elif part.startswith("2M"):
                 processed_text = part[2:]
+                current_voice = voice2.split(" - ")[0]
             elif part.startswith("3M"):
                 processed_text = part[2:]
+                current_voice = voice3.split(" - ")[0]
             elif part.startswith("1C"):
                 processed_text = part[2:]
+                current_voice = voice4.split(" - ")[0]
             elif part.startswith("1O"):
                 processed_text = part[2:]
+                current_voice = voice5.split(" - ")[0]
                 current_pitch = -30
                 current_rate = -20
             else:
+                current_voice = (voice or default_voice).split(" - ")[0]
+                processed_text=part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
             with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 tmp_path = tmp_file.name
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
+            audio_segments.append(None) # Empty string
+    return audio_segments, [] # Returning empty list for silence times as we are directly creating silent WAV
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
     if not voice:
         return None, gr.Warning("Please select a voice.")
+    paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
     final_audio_segments = []
     for paragraph in paragraphs:
     return combined_audio_path, None
+# Gradio interface function
+@spaces.GPU
+def tts_interface(text, voice, rate, pitch):
+    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
+    return audio, warning
+async def get_voices():
+    voices_list = await edge_tts.list_voices()
+    voices_dict = {v["ShortName"]: f"{v['Name']} - {v['LocaleName']} ({v['Gender']})" for v in voices_list}
+    return voices_dict
+# Create Gradio application
 async def create_demo():
+    voices = await get_voices()
+    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
     You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
     """
     demo = gr.Interface(
+        fn=tts_interface,
+        inputs=[
             gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
         ],
+        outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
 # Run the application
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
+    demo.launch()