hivecorp committed on
Commit
eceecf3
·
verified ·
1 Parent(s): c53e8a0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -188
app.py CHANGED
@@ -1,189 +1,83 @@
1
  import gradio as gr
2
- import edge_tts
3
- import asyncio
4
- import tempfile
5
- import os
6
- import re
7
- from pydub import AudioSegment # Required for audio duration, needs ffmpeg installed
8
-
9
# Retrieve the Edge TTS voice catalogue.
async def get_voices():
    """Return a dict mapping human-readable voice labels to API short names."""
    catalogue = await edge_tts.list_voices()
    labels = {}
    for entry in catalogue:
        # Label shown in the dropdown: "ShortName - Locale (Gender)".
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labels[label] = entry['ShortName']
    return labels
15
-
16
# Text-to-speech function
async def text_to_speech(text, voice, rate, pitch):
    """
    Synthesize *text* with Edge TTS.

    Returns a (audio_path, original_text, warning) triple; on invalid
    input the first two items are None and the warning string explains
    the problem. An empty warning string means success.
    """
    # Guard clauses: report problems as plain strings for the UI.
    if not text.strip():
        return None, None, "Please enter text to convert."
    if not voice:
        return None, None, "Please select a voice."

    # The dropdown label is "ShortName - Locale (Gender)"; the API wants
    # only the short name.
    short_name = voice.split(" - ")[0]

    # Edge TTS expects signed percent / Hz strings, e.g. "+5%", "-3Hz".
    communicate = edge_tts.Communicate(
        text,
        short_name,
        rate=f"{rate:+d}%",
        pitch=f"{pitch:+d}Hz",
    )

    # Persist the synthesized speech to a temporary MP3 file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as handle:
        output_path = handle.name
        await communicate.save(output_path)

    return output_path, text, ""
45
-
46
def format_time(ms):
    """Format a duration in milliseconds as an SRT timestamp (HH:MM:SS,mmm)."""
    remaining = ms
    fields = []
    # Peel off hours, minutes, seconds in turn.
    for unit in (3_600_000, 60_000, 1_000):
        fields.append(int(remaining / unit))
        remaining %= unit
    hours, minutes, seconds = fields
    # What is left after the seconds step is the millisecond remainder.
    return f"{hours:02}:{minutes:02}:{seconds:02},{int(remaining):03}"
57
-
58
def generate_srt(text_input, audio_filepath):
    """
    Write an SRT file whose cue timings are estimated proportionally to
    each text segment's character count over the total audio duration.

    This is an estimation only — no waveform analysis is performed.
    Requires ffmpeg (via pydub) to read the audio duration.

    Returns the path of the written .srt file, or None when input is
    missing, the audio cannot be read, or no usable segments exist.
    """
    if not text_input or not audio_filepath:
        return None

    try:
        # pydub returns clip length in milliseconds via len().
        total_ms = len(AudioSegment.from_file(audio_filepath))
    except Exception as e:
        print(f"Error getting audio duration with pydub: {e}. SRT generation requires ffmpeg.")
        return None

    # Split on sentence punctuation / newlines, keeping the delimiter
    # attached to its segment.
    raw_pieces = re.findall(r'[^.!?,\n]+[.!?,\n]*', text_input)
    segments = [piece.strip() for piece in raw_pieces if piece.strip()]
    if not segments:
        return None

    total_chars = sum(len(piece) for piece in segments)
    if total_chars == 0:  # defensive: avoid division by zero
        return None

    cue_lines = []
    cursor = 0
    final = len(segments) - 1
    for idx, piece in enumerate(segments):
        # Each segment's share of the audio is proportional to its
        # share of the total character count.
        share = (len(piece) / total_chars) * total_ms
        begin = cursor
        # Force the last cue to end exactly at the audio's end.
        finish = total_ms if idx == final else cursor + share
        cue_lines.extend([
            str(idx + 1),
            f"{format_time(begin)} --> {format_time(finish)}",
            piece,
            "",  # blank line terminates each SRT cue block
        ])
        cursor = finish

    # Place the .srt next to the audio file, sharing its base name.
    srt_path = f"{os.path.splitext(audio_filepath)[0]}.srt"
    with open(srt_path, "w", encoding="utf-8") as out:
        out.write("\n".join(cue_lines))

    return srt_path
120
-
121
# Gradio entry point: wraps the async TTS call and SRT generation.
def tts_interface(text, voice, rate, pitch):
    """
    Synthesize speech for *text*, then build its companion SRT file.

    Returns (audio_path, srt_path, warning_text); srt_path is None when
    synthesis failed or subtitle generation was not possible.
    """
    # Gradio callbacks are synchronous, so drive the coroutine here.
    audio_path, source_text, warning = asyncio.run(
        text_to_speech(text, voice, rate, pitch)
    )

    # Subtitles are only meaningful when audio was produced.
    srt_path = generate_srt(source_text, audio_path) if audio_path else None

    return audio_path, srt_path, warning
135
-
136
# Create Gradio application
async def create_demo():
    """
    Asynchronously creates and configures the Gradio interface.

    Async because the voice list is fetched from the Edge TTS service
    before the UI is built, so the dropdown is populated at startup.

    Returns:
        The configured (not yet launched) gr.Interface.
    """
    voices = await get_voices()  # Fetch voices when the app starts

    # Markdown shown above the interface; includes a promotional link.
    description = """
    Convert text to speech using Microsoft Edge TTS. Adjust speech rate and pitch: 0 is default, positive values increase, negative values decrease.

    ✨ **New Feature: Generate SRT Subtitles (Estimated Timings)!** ✨

    Automatically generates an SRT (SubRip Subtitle) file from your input text.
    **Important Note on Timings:** The SRT timings are *estimated* based on the length of each text segment relative to the total audio duration. This feature *does not* perform advanced audio waveform analysis for precise pause detection or word-level synchronization. For perfectly synchronized subtitles, dedicated forced-alignment tools are typically required.

    🎥 **Exciting News: Introducing our Text-to-Video Converter!** 🎥

    Take your content creation to the next level with our cutting-edge Text-to-Video Converter!
    Transform your words into stunning, professional-quality videos in just a few clicks.

    ✨ Features:
    • Convert text to engaging videos with customizable visuals
    • Choose from 40+ languages and 300+ voices
    • Perfect for creating audiobooks, storytelling, and language learning materials
    • Ideal for educators, content creators, and language enthusiasts

    Ready to revolutionize your content? [Click here to try our Text-to-Video Converter now!](https://text2video.wingetgui.com/)
    """

    demo = gr.Interface(
        fn=tts_interface,  # The function that processes inputs and returns outputs
        inputs=[
            gr.Textbox(label="Input Text", lines=5, placeholder="Enter your text here to convert to speech and generate SRT..."),
            # Empty first choice forces an explicit voice selection.
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value="", type="value"),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated SRT Subtitle", type="filepath", file_count="single", visible=True),  # Output for the SRT file
            gr.Markdown(label="Warning")  # tts_interface returns a plain warning string
        ],
        title="Edge TTS Text-to-Speech with SRT Generator",
        description=description,
        article="Experience the power of Edge TTS for text-to-speech conversion, and explore our advanced Text-to-Video Converter for even more creative possibilities!",
        analytics_enabled=False,
        flagging_mode='never'  # Changed from allow_flagging=False
    )
    return demo
185
-
186
def _main():
    """Build the demo (fetching the voice list first) and start serving."""
    demo = asyncio.run(create_demo())
    demo.launch()


# Run the application only when executed as a script.
if __name__ == "__main__":
    _main()
 
1
  import gradio as gr
2
+ from pydub import AudioSegment, silence
3
+ import nltk
4
+ import srt
5
+ import io
6
+ import datetime
7
+
8
+ nltk.download('punkt')
9
+
10
def process_audio_and_script(audio_file, script_text):
    """
    Build SRT subtitle text for *audio_file* from *script_text*.

    Sentences (split with NLTK) are lined up with silence gaps detected
    in the audio; sentences beyond the last detected gap get a fixed
    default duration. Timing is an estimate, not forced alignment.

    Returns the composed SRT document as a string.
    """
    # Load audio; pydub needs ffmpeg for most formats.
    audio = AudioSegment.from_file(audio_file)

    # Anything 16 dB below the clip's average loudness counts as silence.
    silence_thresh = audio.dBFS - 16
    silences = silence.detect_silence(audio, min_silence_len=400, silence_thresh=silence_thresh)

    # Convert [start_ms, stop_ms] pairs to (start_s, stop_s) in seconds.
    silences = [(start / 1000, stop / 1000) for start, stop in silences]

    # Segment script based on punctuation.
    sentences = nltk.tokenize.sent_tokenize(script_text)

    # Distribute timing across sentences based on silence gaps.
    subtitles = []
    last_time = 0.0
    for i, sentence in enumerate(sentences):
        if i < len(silences):
            start = last_time
            end = silences[i][0]
            last_time = silences[i][1]
        else:
            start = last_time
            end = start + 2.5  # default length if not enough silences
            # BUGFIX: advance the cursor, otherwise every sentence past
            # the last silence gets the identical 2.5 s window.
            last_time = end

        # BUGFIX: a leading silence (gap starting before any speech) or
        # overlapping gaps can yield end <= start, an invalid SRT cue.
        # Clamp to a small positive duration and keep last_time monotonic.
        if end <= start:
            end = start + 0.5
        if last_time < end:
            last_time = end

        subtitle = srt.Subtitle(index=i + 1,
                                start=datetime.timedelta(seconds=start),
                                end=datetime.timedelta(seconds=end),
                                content=sentence)
        subtitles.append(subtitle)

    srt_file = srt.compose(subtitles)
    return srt_file
42
+
43
def download_srt(audio_file, script_text):
    """
    Gradio callback: generate subtitles and return a real .srt file path.

    BUGFIX: gr.File outputs expect a filesystem path (or file object),
    not a (filename, data) tuple, so the composed SRT text is written to
    a temporary file whose path is returned.
    """
    import tempfile  # local import keeps this fix self-contained

    srt_data = process_audio_and_script(audio_file, script_text)
    with tempfile.NamedTemporaryFile(mode="w", suffix=".srt",
                                     delete=False, encoding="utf-8") as handle:
        handle.write(srt_data)
        return handle.name
46
+
47
# Interface
with gr.Blocks() as demo:
    gr.Markdown("### 🎙️ Audio to Timed Subtitle (SRT) Generator with Waveform")
    with gr.Row():
        # BUGFIX: gr.Audio accepts type="filepath" or "numpy"; "file" is
        # not a valid value. "filepath" hands callbacks a plain path string.
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        script_input = gr.Textbox(lines=10, label="Paste Script/Text with Punctuation")

    srt_output = gr.File(label="Download SRT")

    def waveform_html(audio_file):
        """Return an HTML snippet rendering *audio_file* with wavesurfer.js."""
        # BUGFIX: literal JS braces inside an f-string must be doubled
        # ({{ }}); the original single braces were parsed as format
        # fields and broke the template at runtime.
        return f"""
        <div id="waveform"></div>
        <script src="https://unpkg.com/wavesurfer.js"></script>
        <script>
            var wavesurfer = WaveSurfer.create({{
                container: '#waveform',
                waveColor: '#999',
                progressColor: '#333',
                height: 100
            }});
            wavesurfer.load("{audio_file}");
        </script>
        """

    waveform = gr.HTML()

    with gr.Row():
        gen_btn = gr.Button("Generate SRT")
        gen_btn.click(fn=download_srt,
                      inputs=[audio_input, script_input],
                      outputs=srt_output)

    # BUGFIX: with type="filepath" the change event delivers the path
    # string (or None when the audio is cleared), not a dict with a
    # "name" key.
    audio_input.change(fn=lambda path: waveform_html(path) if path else "",
                       inputs=audio_input,
                       outputs=waveform)

demo.launch()