import os

import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Pick dtype/device once: fp16 on GPU for speed, fp32 on CPU for compatibility.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model_id = "openai/whisper-large-v3"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=400,  # Adjusted to a lower value
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)


def transcribe_video(video_path):
    """Transcribe the audio track of a video file using Whisper.

    Args:
        video_path: Path to the uploaded video file (any container format
            moviepy/ffmpeg can read, not only MP4).

    Returns:
        The transcribed text as a string, or an error message string if
        extraction or transcription fails.
    """
    audio_path = None
    try:
        # Derive the temp audio path from the real extension instead of
        # str.replace(".mp4", ...), which was a silent no-op for any other
        # container and left ffmpeg writing MP3 data under a video suffix.
        base, _ext = os.path.splitext(video_path)
        audio_path = base + ".mp3"

        # Context manager ensures the clip's ffmpeg readers are closed
        # even if audio extraction fails.
        with VideoFileClip(video_path) as video:
            if video.audio is None:
                return "The video has no audio track."
            video.audio.write_audiofile(audio_path, logger=None)

        result = pipe(audio_path)
        # The pipeline returns a dict (text + chunk timestamps); the Gradio
        # "text" output expects a plain string, so unwrap the transcription.
        return result["text"]
    except Exception as e:  # surface failures in the UI instead of crashing
        return f"An error occurred: {e}"
    finally:
        # Remove the extracted audio so repeated uploads don't accumulate
        # files on disk.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)


# Gradio UI: one video input, one text output.
iface = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Video(label="Upload Video"),
    outputs="text",
    title="Video Transcription with Whisper",
    description="Upload a video to transcribe its audio content.",
)

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without starting a server.
if __name__ == "__main__":
    iface.launch()