Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -178,6 +178,12 @@ def split_audio_by_pause(audio, sr, pause_threshold, top_db=30, energy_threshold
|
|
| 178 |
filtered_intervals.append((start, end))
|
| 179 |
return filtered_intervals
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
# -------------------------------
|
| 183 |
# Main Transcription Function
|
|
@@ -186,6 +192,7 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 186 |
start_time = time.time()
|
| 187 |
final_result = ""
|
| 188 |
debug_log = []
|
|
|
|
| 189 |
|
| 190 |
try:
|
| 191 |
# If vocal extraction is enabled, process the file first
|
|
@@ -240,7 +247,13 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 240 |
for word in segment["words"]:
|
| 241 |
adjusted_start = word['start'] + seg_start/sr
|
| 242 |
adjusted_end = word['end'] + seg_start/sr
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
else:
|
| 245 |
# Process the entire audio without splitting
|
| 246 |
transcript = model.transcribe(audio, batch_size=batch_size, language=language)
|
|
@@ -249,7 +262,24 @@ def transcribe(audio_file, model_size="base", debug=False, pause_threshold=0.0,
|
|
| 249 |
)
|
| 250 |
for segment in aligned["segments"]:
|
| 251 |
for word in segment["words"]:
|
| 252 |
-
final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
debug_log.append(f"Language used: {language}")
|
| 255 |
debug_log.append(f"Batch size: {batch_size}")
|
|
|
|
| 178 |
filtered_intervals.append((start, end))
|
| 179 |
return filtered_intervals
|
| 180 |
|
| 181 |
+
def seconds_to_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp: ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond, then decomposed into
    hours, minutes, seconds and milliseconds. SubRip uses a comma (not a
    dot) before the millisecond field.
    """
    total_ms = int(round(seconds * 1000))
    # Peel components off from the smallest unit upward (floor semantics,
    # same as chained divmod).
    ms = total_ms % 1000
    total_sec = total_ms // 1000
    sec = total_sec % 60
    total_min = total_sec // 60
    minutes = total_min % 60
    hours = total_min // 60
    return f"{hours:02d}:{minutes:02d}:{sec:02d},{ms:03d}"
|
| 187 |
|
| 188 |
# -------------------------------
|
| 189 |
# Main Transcription Function
|
|
|
|
| 192 |
start_time = time.time()
|
| 193 |
final_result = ""
|
| 194 |
debug_log = []
|
| 195 |
+
srt_entries = []
|
| 196 |
|
| 197 |
try:
|
| 198 |
# If vocal extraction is enabled, process the file first
|
|
|
|
| 247 |
for word in segment["words"]:
|
| 248 |
adjusted_start = word['start'] + seg_start/sr
|
| 249 |
adjusted_end = word['end'] + seg_start/sr
|
| 250 |
+
|
| 251 |
+
srt_entries.append({
|
| 252 |
+
'start': adjusted_start,
|
| 253 |
+
'end': adjusted_end,
|
| 254 |
+
'word': word['word'].strip()
|
| 255 |
+
})
|
| 256 |
+
#final_result += f"[{adjusted_start:5.2f}s-{adjusted_end:5.2f}s] {word['word']}\n"
|
| 257 |
else:
|
| 258 |
# Process the entire audio without splitting
|
| 259 |
transcript = model.transcribe(audio, batch_size=batch_size, language=language)
|
|
|
|
| 262 |
)
|
| 263 |
for segment in aligned["segments"]:
|
| 264 |
for word in segment["words"]:
|
| 265 |
+
#final_result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
|
| 266 |
+
srt_entries.append({
|
| 267 |
+
'start': word['start'],
|
| 268 |
+
'end': word['end'],
|
| 269 |
+
'word': word['word'].strip()
|
| 270 |
+
})
|
| 271 |
+
|
| 272 |
+
srt_content = []
|
| 273 |
+
for idx, entry in enumerate(srt_entries, start=1):
|
| 274 |
+
start_time_srt = seconds_to_srt_time(entry['start'])
|
| 275 |
+
end_time_srt = seconds_to_srt_time(entry['end'])
|
| 276 |
+
srt_content.append(
|
| 277 |
+
f"{idx}\n"
|
| 278 |
+
f"{start_time_srt} --> {end_time_srt}\n"
|
| 279 |
+
f"{entry['word']}\n"
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
final_result = "\n".join(srt_content)
|
| 283 |
|
| 284 |
debug_log.append(f"Language used: {language}")
|
| 285 |
debug_log.append(f"Batch size: {batch_size}")
|