import gradio as gr
import numpy as np
import torch
import whisper
from moviepy.editor import AudioFileClip, CompositeVideoClip, TextClip
|
|
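# Run Whisper on the GPU when available; it falls back to CPU otherwise.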
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
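# "base" is a small, fast checkpoint; larger ones ("small", "medium") transcribe more accurately.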
model = whisper.load_model("base", device=DEVICE)
|
|
def generate_video(audio_path, language):
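    # Transcribe the audio; each entry in result["segments"] carries its text
    # plus start/end timestamps. Passing language=None lets Whisper auto-detect.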
    result = model.transcribe(audio_path, language=language)
|
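    # Render one full-frame subtitle card per segment, timed with
    # set_start/set_duration so each caption matches its audio.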
    clips = []
    for segment in result["segments"]:
        text_clip = (
            TextClip(
                segment["text"],
                fontsize=24,
                font="Arial",
                color="white",
                bg_color="black",
                size=(1280, 720),
            )
            .set_duration(segment["end"] - segment["start"])
            .set_start(segment["start"])
        )
        clips.append(text_clip)
|
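    # Composite the timed clips instead of concatenating them:
    # concatenate_videoclips ignores per-clip start times, so captions would
    # drift out of sync whenever there are silent gaps between segments.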
    audio = AudioFileClip(audio_path)
    video = CompositeVideoClip(clips, size=(1280, 720)).set_duration(audio.duration)
    video = video.set_audio(audio)
|
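    # The frames are static text, so a low fps keeps encoding fast and files small.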
    output_path = "./transcribed_video.mp4"
    video.write_videofile(output_path, fps=6, codec="libx264", audio_codec="aac")
|
    return output_path
|
|
if __name__ == "__main__":
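    # Report whether the checkpoint is multilingual and how many parameters it has.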
    print(
        f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
        f"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters."
    )
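    # Expose the generator through a simple Gradio UI: audio + language in, captioned video out.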
    iface = gr.Interface(
        fn=generate_video,
        inputs=[
            gr.Audio(sources=["upload", "microphone"], type="filepath"),
            gr.Dropdown(
                ["en", "es", "fr", "de", "it", "nl", "ru", "zh"],
                label="Language",
            ),
        ],
        outputs=gr.Video(label="Play Video", show_download_button=True),
        title="Audio Transcription Video Generator",
        description="Upload your audio file and select the language for transcription.",
    )
|
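    # launch() serves the app locally; pass share=True for a temporary public URL.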
    iface.launch()