Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -39,6 +39,7 @@ class Segment:
|
|
| 39 |
end_time: int = 0
|
| 40 |
duration: int = 0
|
| 41 |
audio: Optional[AudioSegment] = None
|
|
|
|
| 42 |
|
| 43 |
class TextProcessor:
|
| 44 |
def __init__(self, words_per_line: int, lines_per_segment: int):
|
|
@@ -182,13 +183,18 @@ class TextProcessor:
|
|
| 182 |
return lines
|
| 183 |
|
| 184 |
async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
|
| 185 |
-
"""Process a
|
| 186 |
audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
|
| 187 |
try:
|
| 188 |
-
|
|
|
|
|
|
|
| 189 |
await tts.save(audio_file)
|
| 190 |
|
| 191 |
segment.audio = AudioSegment.from_file(audio_file)
|
|
|
|
|
|
|
|
|
|
| 192 |
segment.duration = len(segment.audio)
|
| 193 |
|
| 194 |
return segment
|
|
@@ -197,46 +203,50 @@ async def process_segment_with_timing(segment: Segment, voice: str, rate: str, p
|
|
| 197 |
os.remove(audio_file)
|
| 198 |
|
| 199 |
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
|
| 200 |
-
# Initialize text processor and split text
|
| 201 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 202 |
segments = processor.split_into_segments(text)
|
| 203 |
|
| 204 |
-
# Process
|
| 205 |
-
|
| 206 |
-
process_segment_with_timing(segment, voice, rate, pitch)
|
| 207 |
-
for segment in segments
|
| 208 |
-
]
|
| 209 |
-
processed_segments = await asyncio.gather(*tasks)
|
| 210 |
-
|
| 211 |
-
# Calculate timing for each segment
|
| 212 |
current_time = 0
|
| 213 |
final_audio = AudioSegment.empty()
|
| 214 |
srt_content = ""
|
| 215 |
|
| 216 |
-
for segment in
|
| 217 |
-
#
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
# Add to SRT
|
| 222 |
srt_content += (
|
| 223 |
-
f"{
|
| 224 |
-
f"{format_time_ms(
|
| 225 |
-
f"{
|
| 226 |
)
|
| 227 |
|
| 228 |
-
# Add to final audio
|
| 229 |
-
final_audio
|
| 230 |
|
| 231 |
-
# Update timing
|
| 232 |
-
current_time =
|
|
|
|
| 233 |
|
| 234 |
-
# Export
|
| 235 |
unique_id = uuid.uuid4()
|
| 236 |
audio_path = f"final_audio_{unique_id}.mp3"
|
| 237 |
srt_path = f"final_subtitles_{unique_id}.srt"
|
| 238 |
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
with open(srt_path, "w", encoding='utf-8') as f:
|
| 241 |
f.write(srt_content)
|
| 242 |
|
|
|
|
| 39 |
end_time: int = 0
|
| 40 |
duration: int = 0
|
| 41 |
audio: Optional[AudioSegment] = None
|
| 42 |
+
lines: List[str] = None # Add lines field for display purposes only
|
| 43 |
|
| 44 |
class TextProcessor:
|
| 45 |
def __init__(self, words_per_line: int, lines_per_segment: int):
|
|
|
|
| 183 |
return lines
|
| 184 |
|
| 185 |
async def process_segment_with_timing(segment: Segment, voice: str, rate: str, pitch: str) -> Segment:
|
| 186 |
+
"""Process a complete segment as a single TTS unit"""
|
| 187 |
audio_file = f"temp_segment_{segment.id}_{uuid.uuid4()}.wav"
|
| 188 |
try:
|
| 189 |
+
# Process the entire segment text as one unit, replacing newlines with spaces
|
| 190 |
+
segment_text = ' '.join(segment.text.split('\n'))
|
| 191 |
+
tts = edge_tts.Communicate(segment_text, voice, rate=rate, pitch=pitch)
|
| 192 |
await tts.save(audio_file)
|
| 193 |
|
| 194 |
segment.audio = AudioSegment.from_file(audio_file)
|
| 195 |
+
# Add small silence at start and end for natural spacing
|
| 196 |
+
silence = AudioSegment.silent(duration=50)
|
| 197 |
+
segment.audio = silence + segment.audio + silence
|
| 198 |
segment.duration = len(segment.audio)
|
| 199 |
|
| 200 |
return segment
|
|
|
|
| 203 |
os.remove(audio_file)
|
| 204 |
|
| 205 |
async def generate_accurate_srt(text: str, voice: str, rate: str, pitch: str, words_per_line: int, lines_per_segment: int) -> Tuple[str, str]:
|
|
|
|
| 206 |
processor = TextProcessor(words_per_line, lines_per_segment)
|
| 207 |
segments = processor.split_into_segments(text)
|
| 208 |
|
| 209 |
+
# Process segments sequentially for better timing control
|
| 210 |
+
processed_segments = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
current_time = 0
|
| 212 |
final_audio = AudioSegment.empty()
|
| 213 |
srt_content = ""
|
| 214 |
|
| 215 |
+
for segment in segments:
|
| 216 |
+
# Process segment
|
| 217 |
+
processed_segment = await process_segment_with_timing(segment, voice, rate, pitch)
|
| 218 |
+
|
| 219 |
+
# Calculate precise timing
|
| 220 |
+
processed_segment.start_time = current_time
|
| 221 |
+
processed_segment.end_time = current_time + processed_segment.duration
|
| 222 |
|
| 223 |
+
# Add to SRT with precise timing
|
| 224 |
srt_content += (
|
| 225 |
+
f"{processed_segment.id}\n"
|
| 226 |
+
f"{format_time_ms(processed_segment.start_time)} --> {format_time_ms(processed_segment.end_time)}\n"
|
| 227 |
+
f"{processed_segment.text}\n\n"
|
| 228 |
)
|
| 229 |
|
| 230 |
+
# Add to final audio with precise positioning
|
| 231 |
+
final_audio = final_audio.append(processed_segment.audio, crossfade=0)
|
| 232 |
|
| 233 |
+
# Update timing with precise gap
|
| 234 |
+
current_time = processed_segment.end_time
|
| 235 |
+
processed_segments.append(processed_segment)
|
| 236 |
|
| 237 |
+
# Export with high precision
|
| 238 |
unique_id = uuid.uuid4()
|
| 239 |
audio_path = f"final_audio_{unique_id}.mp3"
|
| 240 |
srt_path = f"final_subtitles_{unique_id}.srt"
|
| 241 |
|
| 242 |
+
# Export with high quality settings for precise timing
|
| 243 |
+
final_audio.export(
|
| 244 |
+
audio_path,
|
| 245 |
+
format="mp3",
|
| 246 |
+
bitrate="320k",
|
| 247 |
+
parameters=["-ar", "48000", "-ac", "2"]
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
with open(srt_path, "w", encoding='utf-8') as f:
|
| 251 |
f.write(srt_content)
|
| 252 |
|