killian31
committed on
Commit
·
baafc0a
1
Parent(s):
272fe46
feat: use subtitles file
Browse files
app.py
CHANGED
|
@@ -1,63 +1,87 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import whisper
|
| 4 |
-
from moviepy.editor import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from moviepy.video.VideoClip import TextClip
|
| 6 |
|
| 7 |
|
| 8 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
# Transcribe audio
|
| 10 |
-
progress(0.
|
| 11 |
result = model.transcribe(audio_path, language=language)
|
| 12 |
progress(0.30, "Audio transcribed!")
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
if
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
)
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
)
|
| 39 |
-
|
| 40 |
-
current_time = segment["end"]
|
| 41 |
-
progress(min(0.3 + running_progress, 0.7), "Generating video frames...")
|
| 42 |
-
|
| 43 |
-
if lag > 0:
|
| 44 |
-
clips.insert(0, ColorClip((1280, 720), color=(0, 0, 0)).set_duration(lag))
|
| 45 |
-
progress(0.7, "Video frames generated!")
|
| 46 |
-
|
| 47 |
-
# Concatenate clips and set audio
|
| 48 |
-
progress(0.75, "Concatenating video clips...")
|
| 49 |
-
video = concatenate_videoclips(clips, method="compose")
|
| 50 |
-
|
| 51 |
-
# Add audio to the video
|
| 52 |
-
progress(0.85, "Adding audio to video...")
|
| 53 |
-
video = video.set_audio(AudioFileClip(audio_path))
|
| 54 |
-
|
| 55 |
-
# Export video to a buffer
|
| 56 |
-
progress(0.90, "Exporting video...")
|
| 57 |
-
output_path = "./transcribed_video.mp4"
|
| 58 |
-
video.write_videofile(output_path, fps=6, codec="libx264", audio_codec="aac")
|
| 59 |
-
progress(1.0, "Video exported!")
|
| 60 |
-
return output_path
|
| 61 |
|
| 62 |
|
| 63 |
if __name__ == "__main__":
|
|
@@ -69,8 +93,12 @@ if __name__ == "__main__":
|
|
| 69 |
fn=generate_video,
|
| 70 |
inputs=[
|
| 71 |
gr.Audio(
|
| 72 |
-
sources=["upload", "microphone"],
|
|
|
|
|
|
|
| 73 |
),
|
|
|
|
|
|
|
| 74 |
gr.Dropdown(
|
| 75 |
["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
|
| 76 |
label="Language",
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import whisper
|
| 4 |
+
from moviepy.editor import (
|
| 5 |
+
AudioFileClip,
|
| 6 |
+
ColorClip,
|
| 7 |
+
CompositeVideoClip,
|
| 8 |
+
VideoFileClip,
|
| 9 |
+
concatenate_videoclips,
|
| 10 |
+
)
|
| 11 |
from moviepy.video.VideoClip import TextClip
|
| 12 |
|
| 13 |
|
| 14 |
+
def _format_srt_timestamp(seconds):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def generate_srt_file(transcription_result, srt_file_path, lag=0):
    """Write a Whisper transcription result to an SRT subtitle file.

    Args:
        transcription_result: dict containing a ``"segments"`` list; each
            segment provides ``"start"`` / ``"end"`` times in seconds and
            the transcribed ``"text"``.
        srt_file_path: destination path for the ``.srt`` file.
        lag: offset in seconds added to every timestamp (used to compensate
            for padding prepended/appended to the output video).
    """
    # utf-8 explicitly: transcribed text is frequently non-ASCII and the
    # platform default encoding (e.g. cp1252 on Windows) would raise here.
    with open(srt_file_path, "w", encoding="utf-8") as file:
        # SRT cues are 1-indexed.
        for i, segment in enumerate(transcription_result["segments"], start=1):
            start_srt = _format_srt_timestamp(segment["start"] + lag)
            end_srt = _format_srt_timestamp(segment["end"] + lag)
            file.write(f"{i}\n{start_srt} --> {end_srt}\n{segment['text']}\n\n")
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def generate_video(
    audio_path, video_path, input, language, lag, progress=gr.Progress(track_tqdm=True)
):
    """Transcribe an audio file (or a video's audio track) and return a video
    plus a matching SRT subtitle file.

    Args:
        audio_path: path to the input audio file (used when ``input == "Audio"``).
        video_path: path to the input video file (used when ``input == "Video"``).
        input: file-type selector from the UI, ``"Video"`` or ``"Audio"``.
        language: language code forwarded to the Whisper model.
        lag: delay in seconds applied to the subtitles; the output video is
            padded so it remains at least as long as the shifted subtitles.
        progress: gradio progress reporter (default tracks tqdm output).

    Returns:
        Tuple ``(video file path, SRT file path)`` for the gradio outputs.
    """
    # For video input, extract the audio track to a temp file first so the
    # same transcription path works for both input types.
    progress(0.0, "Checking input...")
    if input == "Video":
        progress(0.0, "Extracting audio from video...")
        audio_path = "./temp_audio.wav"
        video = VideoFileClip(video_path)
        video.audio.write_audiofile(audio_path)
        video.close()
        progress(0.1, "Audio extracted!")

    # Transcribe audio with the module-level Whisper model.
    progress(0.1, "Transcribing audio...")
    result = model.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")

    # Generate the SRT subtitle file, with every cue shifted by `lag`.
    progress(0.30, "Generating SRT file...")
    srt_file_path = "./temp.srt"
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")

    if input == "Video":
        if lag == 0:
            # No shift: the original video can be returned untouched.
            return video_path, srt_file_path
        # Extend the original video with `lag` seconds of black at the end so
        # the shifted subtitles still fit inside the video's duration.
        video = VideoFileClip(video_path)
        black_screen = ColorClip(
            size=video.size, color=(0, 0, 0), duration=lag
        ).set_fps(1)
        final_video = concatenate_videoclips([video, black_screen])
        output_video_path = "./transcribed_video.mp4"
        final_video.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
        # Release the file handles moviepy keeps open (previously leaked).
        final_video.close()
        video.close()
        return output_video_path, srt_file_path

    # Audio input: synthesize a black 1280x720 clip carrying the audio track,
    # extended by `lag` so the shifted subtitles fit.
    output_video_path = "./transcribed_video.mp4"
    audio_clip = AudioFileClip(audio_path)
    duration = audio_clip.duration + lag
    video_clip = ColorClip(
        size=(1280, 720), color=(0, 0, 0), duration=duration
    ).set_fps(
        1
    )  # low fps keeps encoding cheap for a static black frame
    video_clip = video_clip.set_audio(audio_clip)
    video_clip.write_videofile(
        output_video_path, codec="libx264", audio_codec="aac"
    )
    # Release the file handles moviepy keeps open (previously leaked).
    video_clip.close()
    audio_clip.close()
    return output_video_path, srt_file_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
|
| 87 |
if __name__ == "__main__":
|
|
|
|
| 93 |
fn=generate_video,
|
| 94 |
inputs=[
|
| 95 |
gr.Audio(
|
| 96 |
+
sources=["upload", "microphone"],
|
| 97 |
+
type="filepath",
|
| 98 |
+
label="Audio File",
|
| 99 |
),
|
| 100 |
+
gr.Video(label="Or Video File", sources=["upload", "webcam"]),
|
| 101 |
+
gr.Dropdown(["Video", "Audio"], label="File Type", value="Audio"),
|
| 102 |
gr.Dropdown(
|
| 103 |
["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
|
| 104 |
label="Language",
|