import gradio as gr
import torch
import whisper
from moviepy.editor import (
    AudioFileClip,
    ColorClip,
    VideoFileClip,
    concatenate_videoclips,
)


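# Write Whisper's transcription segments to an SRT subtitle file,
# optionally delaying every cue by `lag` seconds.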
def generate_srt_file(transcription_result, srt_file_path, lag=0):
    with open(srt_file_path, "w", encoding="utf-8") as file:
        for i, segment in enumerate(transcription_result["segments"], start=1):
            # Shift each cue by the requested lag (in seconds).
            start_time = segment["start"] + lag
            end_time = segment["end"] + lag
            text = segment["text"]

            # SRT timestamps use the HH:MM:SS,mmm format.
            start_srt = f"{int(start_time // 3600):02d}:{int((start_time % 3600) // 60):02d}:{int(start_time % 60):02d},{int((start_time % 1) * 1000):03d}"
            end_srt = f"{int(end_time // 3600):02d}:{int((end_time % 3600) // 60):02d}:{int(end_time % 60):02d},{int((end_time % 1) * 1000):03d}"

            # Cue index, time range, text, then a blank separator line.
            file.write(f"{i}\n{start_srt} --> {end_srt}\n{text}\n\n")


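# Transcribe the uploaded audio (or a video's audio track) with Whisper,
# write the SRT file, and return a playable video plus the SRT path.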
def generate_video(
    audio_path, video_path, file_type, language, lag, progress=gr.Progress(track_tqdm=True)
):
    progress(0.0, "Checking input...")
    if file_type == "Video":
        # Extract the audio track from the uploaded video so Whisper can transcribe it.
        progress(0.0, "Extracting audio from video...")
        audio_path = "./temp_audio.wav"
        video = VideoFileClip(video_path)
        video.audio.write_audiofile(audio_path)
        video.close()
        progress(0.1, "Audio extracted!")

    # Transcribe with the module-level Whisper model loaded in the __main__ block.
    progress(0.1, "Transcribing audio...")
    result = model.transcribe(audio_path, language=language)
    progress(0.30, "Audio transcribed!")

    progress(0.30, "Generating SRT file...")
    srt_file_path = "./temp.srt"
    generate_srt_file(result, srt_file_path, lag=lag)
    progress(0.40, "SRT file generated!")

    if file_type == "Video":
        if lag == 0:
            # No lag: return the original video alongside the SRT file.
            return video_path, srt_file_path
        else:
            # Pad the video with a black clip so it covers the lag-shifted subtitles.
            video = VideoFileClip(video_path)
            black_screen = ColorClip(
                size=video.size, color=(0, 0, 0), duration=lag
            ).set_fps(1)
            final_video = concatenate_videoclips([video, black_screen])
            output_video_path = "./transcribed_video.mp4"
            final_video.write_videofile(
                output_video_path, codec="libx264", audio_codec="aac"
            )
            return output_video_path, srt_file_path
    else:
        # Audio input: render a black 1280x720 clip that carries the audio track.
        output_video_path = "./transcribed_video.mp4"
        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration + lag
        video_clip = ColorClip(
            size=(1280, 720), color=(0, 0, 0), duration=duration
        ).set_fps(1)
        video_clip = video_clip.set_audio(audio_clip)
        video_clip.write_videofile(
            output_video_path, codec="libx264", audio_codec="aac"
        )
        return output_video_path, srt_file_path


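# Load the Whisper model once at startup, then build and launch the Gradio UI.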
if __name__ == "__main__":
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("base", device=DEVICE)

    iface = gr.Interface(
        fn=generate_video,
        inputs=[
            gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio File",
            ),
            gr.Video(label="Or Video File", sources=["upload", "webcam"]),
            gr.Dropdown(["Video", "Audio"], label="File Type", value="Audio"),
            gr.Dropdown(
                ["en", "es", "fr", "de", "it", "nl", "ru", "no", "zh"],
                label="Language",
                value="en",
            ),
            gr.Slider(
                minimum=0,
                maximum=10,
                step=1,
                value=0,
                label="Lag (seconds): delay the subtitles by this many seconds.",
            ),
        ],
        # generate_video returns (video path, SRT path), so expose two outputs.
        outputs=[
            gr.Video(label="Play Video", show_download_button=True),
            gr.File(label="SRT File"),
        ],
        title="Audio Transcription Video Generator",
        description="Upload an audio or video file, choose the file type, and select the language for transcription.",
    )

    iface.launch()