Edge_TTS_NGHIA_transcript

Runtime error

App Files Files Community

cnph001 committed on Apr 27, 2025

Commit

2928892

verified ·

1 Parent(s): 7042e46

Change to output WAV

Browse files

Files changed (1) hide show

app.py +94 -78

app.py CHANGED Viewed

@@ -4,41 +4,39 @@ import edge_tts
 import asyncio
 import tempfile
 import os
-import re  # Import the regular expression module
 import struct
-# Function to create a temporary silent MP3 file (basic approximation)
-def create_silent_mp3(duration, temp_dir):
-    frame_rate = 44100
-    num_channels = 1
-    sample_width = 2  # bytes (16-bit)
-    num_frames = int(duration * frame_rate)
     silent_data = b'\x00' * (num_frames * num_channels * sample_width)
-    temp_silent_file = os.path.join(temp_dir, f"silent_{duration}.raw")
-    with open(temp_silent_file, 'wb') as f:
-        f.write(silent_data)
-    # This is a very basic way to make it look like an MP3 - it won't be a valid MP3.
-    # For a proper MP3, you'd need an MP3 encoding library or ffmpeg.
-    temp_mp3_path = os.path.join(temp_dir, f"silent_{duration}.mp3")
-    with open(temp_mp3_path, 'wb') as f:
-        f.write(b'\xff\xfb\x90\x00\x00\x00\x00') # Minimal MP3 header (very simplified)
-        f.write(silent_data) # Append raw silence
-    os.remove(temp_silent_file) # Clean up the raw file
-    return temp_mp3_path
-# Text-to-speech function for a single paragraph with SS handling
 async def paragraph_to_speech(text, voice, rate, pitch):
-    voice3 ="en-US-BrianMultilingualNeural - en-US (Male)"  #good for reading
-    voice1F ="en-US-EmmaNeural - en-US (Female)"
-    voice2 = "it-IT-GiuseppeMultilingualNeural - it-IT (Male)"
-    voice2F = "en-US-JennyNeural - en-US (Female)"
-    voice1 = "en-AU-WilliamNeural - en-AU (Male)"
-    voice3F = "en-HK-YanNeural - en-HK (Female)"
-    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  #Child
-    voice5 = "en-GB-RyanNeural - en-GB (Male)" #Old Man
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
@@ -51,8 +49,8 @@ async def paragraph_to_speech(text, voice, rate, pitch):
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
-                silent_mp3_path = create_silent_mp3(silence_duration, temp_dir)
-                audio_segments.append(silent_mp3_path)
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
@@ -61,47 +59,50 @@ async def paragraph_to_speech(text, voice, rate, pitch):
             current_rate = rate
             current_pitch = pitch
             if part.startswith("1F"):
                 processed_text = part[2:]
-                current_voice = voice1F.split(" - ")[0]
             elif part.startswith("2F"):
                 processed_text = part[2:]
-                current_voice = voice2F.split(" - ")[0]
             elif part.startswith("3F"):
                 processed_text = part[2:]
-                current_voice = voice3F.split(" - ")[0]
             elif part.startswith("1M"):
                 processed_text = part[2:]
-                current_voice = voice1.split(" - ")[0]
             elif part.startswith("2M"):
                 processed_text = part[2:]
-                current_voice = voice2.split(" - ")[0]
             elif part.startswith("3M"):
                 processed_text = part[2:]
-                current_voice = voice3.split(" - ")[0]
             elif part.startswith("1C"):
                 processed_text = part[2:]
-                current_voice = voice4.split(" - ")[0]
             elif part.startswith("1O"):
                 processed_text = part[2:]
-                current_voice = voice5.split(" - ")[0]
                 current_pitch = -30
                 current_rate = -20
             else:
-                current_voice = (voice or default_voice).split(" - ")[0]
-                processed_text=part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 tmp_path = tmp_file.name
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
-            #pass # Ignore empty parts
-            audio_segments.append(None) # Empty string
-    return audio_segments, silence_durations
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
@@ -110,16 +111,13 @@ async def text_to_speech(text, voice, rate, pitch):
     if not voice:
         return None, gr.Warning("Please select a voice.")
-    paragraphs = [p.strip() for p in re.split(r'"', text) if p.strip()]
     final_audio_segments = []
     for paragraph in paragraphs:
-        audio_paths, silence_times = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if audio_paths:
-            for i, path in enumerate(audio_paths):
-                final_audio_segments.append(path)
-                if i < len(silence_times):
-                    final_audio_segments.append(silence_times[i])
     if not any(isinstance(item, str) for item in final_audio_segments):
         return None, None  # No actual audio generated
@@ -127,35 +125,53 @@ async def text_to_speech(text, voice, rate, pitch):
     if all(not isinstance(item, str) for item in final_audio_segments):
         return None, "Only silence markers found."
-    combined_audio_path = tempfile.mktemp(suffix=".mp3")
-    with open(combined_audio_path, 'wb') as outfile:
-        for segment in final_audio_segments:
-            if isinstance(segment, str):
                 try:
-                    with open(segment, 'rb') as infile:
-                        outfile.write(infile.read())
-                    os.remove(segment)  # Clean up individual files
                 except FileNotFoundError:
-                    print(f"Warning: Audio file not found: {segment}")
-            elif isinstance(segment, (int, float)):
-                # Basic silence insertion (approximate)
-                silence = b'\x00' * int(segment * 44100 * 2) # Assuming 16-bit mono at 44.1kHz
-                outfile.write(silence)
     return combined_audio_path, None
-# Gradio interface function
-@spaces.GPU
-def tts_interface(text, voice, rate, pitch):
-    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
-    return audio, warning
-# Create Gradio application
-import gradio as gr
 async def create_demo():
     voices = await get_voices()
-    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"  # 👈 Pick one of the available voices
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
     You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
@@ -164,18 +180,18 @@ async def create_demo():
     """
     demo = gr.Interface(
-        fn=tts_interface,
-        inputs=[
             gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
         ],
-        outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph)",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,
@@ -186,4 +202,4 @@ async def create_demo():
 # Run the application
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
-    demo.launch()

 import asyncio
 import tempfile
 import os
+import re
 import struct
+import wave
+# Function to create a temporary silent WAV file
def create_silent_wav(duration, temp_dir, sample_rate=44100, num_channels=1, sample_width=2):
    """Create a uniquely named WAV file containing silence and return its path.

    Args:
        duration: Length of the silence in seconds; must be positive.
        temp_dir: Existing directory in which the file is created.
        sample_rate: Frames per second written to the WAV header.
        num_channels: Number of audio channels.
        sample_width: Bytes per sample (2 means 16-bit PCM).

    Returns:
        Path of the newly created WAV file. The file is not deleted here;
        the caller owns it.

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    if duration <= 0:
        raise ValueError("Duration must be positive.")
    num_frames = int(duration * sample_rate)
    # 8-bit WAV PCM is unsigned, so its zero level is 0x80; wider samples are
    # signed and silent at 0x00.
    silent_byte = b'\x80' if sample_width == 1 else b'\x00'
    silent_data = silent_byte * (num_frames * num_channels * sample_width)
    # A name derived only from the duration collides when the same pause
    # length is requested twice: both requests would share one path, and
    # whoever deletes the file after using it breaks the second use.
    # mkstemp guarantees a fresh file per call.
    fd, temp_wav_path = tempfile.mkstemp(
        prefix=f"silent_{duration}_", suffix=".wav", dir=temp_dir
    )
    os.close(fd)  # wave reopens the file by path below
    with wave.open(temp_wav_path, 'wb') as wf:
        wf.setnchannels(num_channels)
        wf.setframerate(sample_rate)
        wf.setsampwidth(sample_width)
        wf.writeframes(silent_data)
    return temp_wav_path
+# Function to process text and generate audio for a single paragraph
 async def paragraph_to_speech(text, voice, rate, pitch):
+    voices = {
+        "voice1F": "en-US-EmmaNeural - en-US (Female)",
+        "voice2F": "en-US-JennyNeural - en-US (Female)",
+        "voice3F": "en-HK-YanNeural - en-HK (Female)",
+        "voice1": "en-AU-WilliamNeural - en-AU (Male)",
+        "voice2": "it-IT-GiuseppeMultilingualNeural - it-IT (Male)",
+        "voice3": "en-US-BrianMultilingualNeural - en-US (Male)",
+        "voice4": "en-GB-MaisieNeural - en-GB (Female)",  # Child
+        "voice5": "en-GB-RyanNeural - en-GB (Male)"  # Old Man
+    }
     if not text.strip():
         return None, []  # Return None for audio path and empty list for silence
         if re.match(r'SS\d+\.?\d*', part):
             try:
                 silence_duration = float(part[2:])
+                silent_wav_path = create_silent_wav(silence_duration, temp_dir)
+                audio_segments.append(silent_wav_path)
             except ValueError:
                 print(f"Warning: Invalid silence duration format: {part}")
         elif part.strip():
             current_rate = rate
             current_pitch = pitch
+            # Select voice based on part prefix
             if part.startswith("1F"):
                 processed_text = part[2:]
+                current_voice = voices["voice1F"]
             elif part.startswith("2F"):
                 processed_text = part[2:]
+                current_voice = voices["voice2F"]
             elif part.startswith("3F"):
                 processed_text = part[2:]
+                current_voice = voices["voice3F"]
             elif part.startswith("1M"):
                 processed_text = part[2:]
+                current_voice = voices["voice1"]
             elif part.startswith("2M"):
                 processed_text = part[2:]
+                current_voice = voices["voice2"]
             elif part.startswith("3M"):
                 processed_text = part[2:]
+                current_voice = voices["voice3"]
             elif part.startswith("1C"):
                 processed_text = part[2:]
+                current_voice = voices["voice4"]
             elif part.startswith("1O"):
                 processed_text = part[2:]
+                current_voice = voices["voice5"]
                 current_pitch = -30
                 current_rate = -20
             else:
+                current_voice = (voice or voices["voice1"]).split(" - ")[0]
+                processed_text = part[:]
             rate_str = f"{current_rate:+d}%"
             pitch_str = f"{current_pitch:+d}Hz"
             communicate = edge_tts.Communicate(processed_text, current_voice, rate=rate_str, pitch=pitch_str)
+            # Save speech output to temporary file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                 tmp_path = tmp_file.name
                 await communicate.save(tmp_path)
             audio_segments.append(tmp_path)
         else:
+            audio_segments.append(None)  # Empty string
+    return audio_segments, []  # Returning empty list for silence times as we are directly creating silent WAV
 # Main text-to-speech function that processes paragraphs and silence
 async def text_to_speech(text, voice, rate, pitch):
     if not voice:
         return None, gr.Warning("Please select a voice.")
+    paragraphs = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
     final_audio_segments = []
     for paragraph in paragraphs:
+        audio_paths, _ = await paragraph_to_speech(paragraph, voice, rate, pitch)
         if audio_paths:
+            final_audio_segments.extend(audio_paths)
     if not any(isinstance(item, str) for item in final_audio_segments):
         return None, None  # No actual audio generated
     if all(not isinstance(item, str) for item in final_audio_segments):
         return None, "Only silence markers found."
+    combined_audio_path = tempfile.mktemp(suffix=".wav")
+    with wave.open(combined_audio_path, 'w') as outfile:
+        first_audio = True
+        sample_rate = None
+        num_channels = None
+        sample_width = None
+        for segment_path in final_audio_segments:
+            if isinstance(segment_path, str):
                 try:
+                    with wave.open(segment_path, 'rb') as infile:
+                        current_num_channels = infile.getnchannels()
+                        current_sample_rate = infile.getframerate()
+                        current_sample_width = infile.getsampwidth()
+                        frames = infile.readframes(infile.getnframes())
+                        if first_audio:
+                            num_channels = current_num_channels
+                            sample_rate = current_sample_rate
+                            sample_width = current_sample_width
+                            outfile.setnchannels(num_channels)
+                            outfile.setframerate(sample_rate)
+                            outfile.setsampwidth(sample_width)
+                            first_audio = False
+                        elif (current_num_channels != num_channels or
+                              current_sample_rate != sample_rate or
+                              current_sample_width != sample_width):
+                            print(f"Warning: Audio segment {segment_path} has different format. Skipping.")
+                            continue
+                        outfile.writeframes(frames)
+                    os.remove(segment_path)  # Clean up individual files
+                except wave.Error as e:
+                    print(f"Warning: Error reading WAV file {segment_path}: {e}")
                 except FileNotFoundError:
+                    print(f"Warning: Audio file not found: {segment_path}")
     return combined_audio_path, None
+# Gradio interface function (wrapper to run async code)
def tts_interface_sync(text, voice, rate, pitch):
    """Synchronous entry point that runs the async text-to-speech pipeline.

    Drives the ``text_to_speech`` coroutine to completion with
    ``asyncio.run`` and returns its result unchanged.

    Args:
        text: Input text to synthesize.
        voice: Selected voice.
        rate: Speech rate adjustment.
        pitch: Pitch adjustment.

    Returns:
        Whatever ``text_to_speech`` returns for these arguments.
    """
    # The coroutine defined in this file is text_to_speech; the previously
    # referenced tts_interface no longer exists and raised NameError.
    return asyncio.run(text_to_speech(text, voice, rate, pitch))
+# Gradio interface
 async def create_demo():
     voices = await get_voices()
+    default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Yan, 1M:AU_Will, 2M:IT_Guiseppe,3M:US_Brian,  1C: Childvoice, 1O = OldMan
     You can insert silence using the marker 'SS' followed by the duration in seconds (e.g., 'SS1.2' for a 1.2-second pause).
     """
     demo = gr.Interface(
+        fn=tts_interface_sync,
+        inputs=[
             gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines. Use 'SS[duration]' for silence."),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
         ],
+        outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="Voicecloning.be Text-to-Speech with Silence Insertion (Paragraph by Paragraph) - WAV Output",
         description=description,
         article="Process text paragraph by paragraph for smoother output and insert silence markers.",
         analytics_enabled=False,
 # Run the application
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
+    demo.launch()