Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -202,7 +202,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
|
| 202 |
timed_audio_segments = []
|
| 203 |
max_end_time_ms = 0
|
| 204 |
previous_end_time_ms = 0
|
| 205 |
-
next_start_time_ms = None
|
|
|
|
| 206 |
|
| 207 |
for i, line in enumerate(lines):
|
| 208 |
start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
|
|
@@ -220,56 +221,13 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
|
|
| 220 |
current_audio_duration = len(combined_line_audio)
|
| 221 |
intended_start_time = start_time
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
next_h, next_m, next_s, next_ms = next_line_match.groups()
|
| 228 |
-
next_start_time_ms = (
|
| 229 |
-
int(next_h) * 3600000 +
|
| 230 |
-
int(next_m) * 60000 +
|
| 231 |
-
int(next_s) * 1000 +
|
| 232 |
-
int(next_ms)
|
| 233 |
-
)
|
| 234 |
-
else:
|
| 235 |
-
next_start_time_ms = None
|
| 236 |
-
else:
|
| 237 |
-
next_start_time_ms = None
|
| 238 |
-
|
| 239 |
-
# Combine audio segments if current audio is longer than the time difference
|
| 240 |
-
while next_start_time_ms and current_audio_duration > (next_start_time_ms - start_time):
|
| 241 |
-
if i + 1 < len(lines):
|
| 242 |
-
next_start_time, next_audio_paths = await process_transcript_line(lines[i + 1], voice, rate, pitch)
|
| 243 |
-
if next_start_time is not None and next_audio_paths:
|
| 244 |
-
for next_path in next_audio_paths:
|
| 245 |
-
try:
|
| 246 |
-
next_audio = AudioSegment.from_mp3(next_path)
|
| 247 |
-
combined_line_audio += next_audio
|
| 248 |
-
os.remove(next_path)
|
| 249 |
-
except FileNotFoundError:
|
| 250 |
-
print(f"Warning: Audio file not found: {next_path}")
|
| 251 |
-
current_audio_duration = len(combined_line_audio)
|
| 252 |
-
i += 1 # Move to the next line
|
| 253 |
-
if i + 1 < len(lines):
|
| 254 |
-
next_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+.*', lines[i + 1])
|
| 255 |
-
if next_line_match:
|
| 256 |
-
next_h, next_m, next_s, next_ms = next_line_match.groups()
|
| 257 |
-
next_start_time_ms = (
|
| 258 |
-
int(next_h) * 3600000 +
|
| 259 |
-
int(next_m) * 60000 +
|
| 260 |
-
int(next_s) * 1000 +
|
| 261 |
-
int(next_ms)
|
| 262 |
-
)
|
| 263 |
-
else:
|
| 264 |
-
next_start_time_ms = None
|
| 265 |
-
else:
|
| 266 |
-
next_start_time_ms = None
|
| 267 |
-
else:
|
| 268 |
-
break # Exit the loop if there are no more processable lines
|
| 269 |
-
else:
|
| 270 |
-
break
|
| 271 |
|
| 272 |
timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
|
|
|
|
| 273 |
previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
|
| 274 |
max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
|
| 275 |
elif audio_paths:
|
|
@@ -300,7 +258,7 @@ async def create_demo():
|
|
| 300 |
default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
|
| 301 |
description = """
|
| 302 |
Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
|
| 303 |
-
Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "
|
| 304 |
Example:
|
| 305 |
```
|
| 306 |
00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
|
|
|
|
| 202 |
timed_audio_segments = []
|
| 203 |
max_end_time_ms = 0
|
| 204 |
previous_end_time_ms = 0
|
| 205 |
+
next_start_time_ms = None # Keep track of the start time of the *next* segment
|
| 206 |
+
previous_start_time_ms = 0
|
| 207 |
|
| 208 |
for i, line in enumerate(lines):
|
| 209 |
start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
|
|
|
|
| 221 |
current_audio_duration = len(combined_line_audio)
|
| 222 |
intended_start_time = start_time
|
| 223 |
|
| 224 |
+
if i > 0:
|
| 225 |
+
time_difference = start_time - previous_start_time_ms
|
| 226 |
+
if current_audio_duration > time_difference:
|
| 227 |
+
intended_start_time = previous_end_time_ms
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
|
| 229 |
timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
|
| 230 |
+
previous_start_time_ms = start_time
|
| 231 |
previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
|
| 232 |
max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
|
| 233 |
elif audio_paths:
|
|
|
|
| 258 |
default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
|
| 259 |
description = """
|
| 260 |
Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
|
| 261 |
+
Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "1F Different Voice"
|
| 262 |
Example:
|
| 263 |
```
|
| 264 |
00:00:00,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
|