# video-extractor / app.py
# Gradio app: transcribe a video's audio track with Whisper large-v3.
# (revision cda531f — "change to old pipeline", by leeboykt)
import os

import gradio as gr
import torch
from moviepy.editor import VideoFileClip
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# ---------------------------------------------------------------------------
# Model setup: load Whisper large-v3 once at import time and wrap it in a
# transformers ASR pipeline that is shared by every request.
# ---------------------------------------------------------------------------
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16  # half precision on GPU
else:
    device = "cpu"
    torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=400,  # keep generation bounded per chunk
    chunk_length_s=30,  # Whisper's native 30-second window
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
def transcribe_video(video_path):
    """Transcribe the audio track of a video file using the Whisper pipeline.

    Args:
        video_path: Path to the uploaded video file (any container moviepy
            can read — not just MP4).

    Returns:
        The transcribed text, or an error-message string if extraction or
        transcription fails.
    """
    audio_path = None
    try:
        # Derive the audio path from the actual extension instead of
        # assuming the upload is an .mp4 (str.replace was a no-op otherwise).
        base, _ext = os.path.splitext(video_path)
        audio_path = base + ".mp3"

        # Extract the audio track; always close the clip so the file
        # handle (and ffmpeg subprocess) is released.
        video = VideoFileClip(video_path)
        try:
            video.audio.write_audiofile(audio_path)
        finally:
            video.close()

        # Run ASR. The pipeline returns a dict; the Gradio output is
        # "text", so return only the transcript string.
        result = pipe(audio_path)
        return result["text"]
    except Exception as e:
        # Surface the failure in the UI rather than crashing the app.
        return f"An error occurred: {e}"
    finally:
        # Clean up the temporary audio file.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
# Wire the transcription function into a minimal Gradio UI:
# one video input, one text output.
iface = gr.Interface(
    fn=transcribe_video,
    inputs=gr.Video(label="Upload Video"),
    outputs="text",
    description="Upload a video to transcribe its audio content.",
    title="Video Transcription with Whisper",
)

# Start the web server (blocking call).
iface.launch()