# whisper_UI / app.py
# ZipeiZhang
# update youtube download function
# cc41a01
import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import pipeline
import tempfile
import os
import requests
from pytube import YouTube
import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import tempfile
import os
import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import tempfile
import os
import yt_dlp as youtube_dl
# Hub id of the fine-tuned Whisper checkpoint; every loader below uses it.
MODEL_ID = "Zipei-KTH/whisper_3"

# Standalone model and processor handles. Nothing else in the visible part of
# this file references them, but they are kept as module-level names.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    language="chinese",
    task="transcribe",
)

# Pipeline object that transcribe() calls with a path to an audio file.
pipe = pipeline(model=MODEL_ID)
def download_video(url):
    """Download the media behind ``url`` with yt-dlp and return its local path.

    The download is written into a freshly created, private temporary
    directory (``tempfile.mkdtemp``) rather than to a name produced by the
    deprecated and race-prone ``tempfile.mktemp``.

    Args:
        url: Address handed to ``YoutubeDL.extract_info``. The options request
            the ``bestaudio/best`` format with ``noplaylist`` enabled.

    Returns:
        The path reported by ``YoutubeDL.prepare_filename`` for the downloaded
        file, or ``None`` if setting up or running the download raised. On
        failure the temporary directory (including any partial download) is
        removed. On success the caller owns the returned file; the directory
        that contains it is not removed by this function.
    """
    import shutil  # local import: only needed to clean up after a failure

    download_dir = tempfile.mkdtemp(prefix='whisper_dl_')
    ydl_opts = {
        'format': 'bestaudio/best',
        # yt-dlp substitutes %(ext)s with the extension of the chosen format.
        'outtmpl': os.path.join(download_dir, 'media.%(ext)s'),
        'noplaylist': True,
        'verbose': True,
    }
    try:
        # Constructing YoutubeDL is inside the try block so that a setup
        # failure is reported the same way as a download failure.
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)
    except Exception as e:
        # Best-effort contract: report the problem and signal it with None.
        print(f"Error downloading video: {e}")
        shutil.rmtree(download_dir, ignore_errors=True)
        return None
from moviepy.editor import AudioFileClip
def transcribe(audio_file=None, video_file=None, video_url=None, max_audio_length='60'):
    """Transcribe speech from a video URL, an audio file, or a video file.

    Exactly one source is used, chosen in this order: ``video_url`` (fetched
    with ``download_video``), then ``audio_file``, then ``video_file``.

    Args:
        audio_file: Path to an audio file, or ``None``.
        video_file: Path to a video file, or ``None``.
        video_url: URL of a video to download, or an empty value.
        max_audio_length: Maximum number of seconds to keep, as a number or
            numeric string; a falsy value falls back to ``'60'``. The limit is
            only applied to ``.mp4`` inputs, whose audio track is extracted
            to a temporary wav file first. Other files are passed to the
            pipeline unchanged.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        ValueError: If no input was supplied, if ``max_audio_length`` is not
            numeric, or if an ``.mp4`` input has no audio track.
        RuntimeError: If ``video_url`` was given but the download failed.
    """
    # A Gradio textbox left blank yields an empty string.
    if not max_audio_length:
        max_audio_length = '60'  # Default maximum length in seconds
    max_audio_length = float(max_audio_length)

    downloaded = bool(video_url)
    if downloaded:
        file_path = download_video(video_url)
        if file_path is None:
            # download_video() signals failure with None; fail clearly here
            # instead of crashing later on None.endswith().
            raise RuntimeError(f"Could not download video from URL: {video_url}")
    else:
        file_path = audio_file if audio_file is not None else video_file
        if not file_path:
            raise ValueError(
                "No input provided: upload an audio file, upload a video "
                "file, or enter a video URL."
            )

    try:
        if file_path.lower().endswith('.mp4'):
            # TemporaryDirectory removes the extracted wav even when moviepy
            # or the pipeline raises.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_audio_path = os.path.join(temp_dir, 'temp_audio.wav')
                with VideoFileClip(file_path) as video:
                    if video.audio is None:
                        raise ValueError("The video file has no audio track to transcribe.")
                    clip = video
                    # Keep only the first max_audio_length seconds.
                    if video.duration > max_audio_length:
                        clip = video.subclip(0, max_audio_length)
                    clip.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')
                # The video is closed before the (potentially slow) inference.
                text = pipe(temp_audio_path)["text"]
        else:
            text = pipe(file_path)["text"]
    finally:
        # Only files this function downloaded are deleted; uploads are not.
        if downloaded:
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass
    return text
# Gradio front end.
# The widgets are listed in the same order as transcribe()'s parameters:
# audio_file, video_file, video_url, max_audio_length.
_input_widgets = [
    gr.Audio(type="filepath", label="Upload audio file"),
    gr.Video(label="Upload .mp4 video file"),
    gr.Textbox(label="Or enter a video URL"),
    gr.Textbox(label="enter the maximum length"),
]

_description = "Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model. Supports audio, .mp4 video files, and video URLs.(not working for youtube URL on huggingface space due to packages difference)"

iface = gr.Interface(
    fn=transcribe,
    inputs=_input_widgets,
    outputs="text",
    title="Whisper Small Chinese",
    description=_description,
)

# Start serving the interface as soon as the module is executed.
iface.launch(share=True)