"""Gradio demo that transcribes Chinese speech with a fine-tuned Whisper model.

Input can be an uploaded audio file, an uploaded .mp4 video, or a video URL
that is downloaded with yt-dlp before transcription.
"""

import os
import shutil
import tempfile

import gradio as gr
import requests
import yt_dlp as youtube_dl
from moviepy.editor import AudioFileClip, VideoFileClip
from pytube import YouTube
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

# Maximum number of seconds kept from an .mp4 when the user leaves the field empty.
DEFAULT_MAX_AUDIO_LENGTH = 60.0

# Load your Whisper model
# NOTE(review): `model` and `processor` are not used by anything below (only
# `pipe` is); they are kept so the module-level names stay available, but they
# look like a redundant second load of the same checkpoint -- confirm and drop.
model = WhisperForConditionalGeneration.from_pretrained("Zipei-KTH/whisper_3")
processor = WhisperProcessor.from_pretrained("Zipei-KTH/whisper_3", language="chinese", task="transcribe")
pipe = pipeline(model="Zipei-KTH/whisper_3")


def download_video(url):
    """Download the best available audio stream of *url* to a temporary file.

    Args:
        url: Address of the video to fetch with yt-dlp.

    Returns:
        Path of the downloaded file, located alone inside a freshly created
        temporary directory, or ``None`` if the download failed.
    """
    # mkdtemp() actually creates a private directory; the previous mktemp()
    # only returned an unused *name*, which is deprecated and race-prone.
    download_dir = tempfile.mkdtemp()
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(download_dir, 'download.%(ext)s'),
        'noplaylist': True,
        'verbose': True,
    }
    try:
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            return ydl.prepare_filename(info)
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller decide.
        print(f"Error downloading video: {e}")
        shutil.rmtree(download_dir, ignore_errors=True)
        return None


def _parse_max_audio_length(value):
    """Convert the user-supplied length limit to a positive number of seconds."""
    if value is None or not str(value).strip():
        return DEFAULT_MAX_AUDIO_LENGTH
    try:
        seconds = float(value)
    except (TypeError, ValueError):
        raise gr.Error("The maximum length must be a number of seconds.") from None
    # `not seconds > 0` also rejects NaN, which compares False to everything.
    if not seconds > 0:
        raise gr.Error("The maximum length must be greater than zero.")
    return seconds


def _transcribe_video(video_path, max_seconds):
    """Extract at most *max_seconds* of audio from a video and transcribe it."""
    # TemporaryDirectory removes the WAV file even if extraction or the
    # model call raises.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_audio_path = os.path.join(temp_dir, 'temp_audio.wav')
        with VideoFileClip(video_path) as video:
            clip = video
            # Truncate the clip if it's longer than the requested maximum.
            if video.duration > max_seconds:
                clip = video.subclip(0, max_seconds)
            if clip.audio is None:
                raise gr.Error("The video has no audio track to transcribe.")
            clip.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')
        return pipe(temp_audio_path)["text"]


def transcribe(audio_file=None, video_file=None, video_url=None, max_audio_length='60'):
    """Transcribe speech from a URL, an audio file, or a video file.

    Exactly one source is used, chosen in this order of priority:
    ``video_url``, then ``audio_file``, then ``video_file``.

    Args:
        audio_file: Path to an uploaded audio file, or ``None``.
        video_file: Path to an uploaded video file, or ``None``.
        video_url: URL of a video to download, or an empty value.
        max_audio_length: Maximum number of seconds to keep; empty or ``None``
            means 60. The limit is applied only to ``.mp4`` inputs.

    Returns:
        The transcribed text.

    Raises:
        gr.Error: If no input was given, the download failed, the length
            limit is not a positive number, or the video has no audio.
    """
    max_seconds = _parse_max_audio_length(max_audio_length)

    video_url = video_url.strip() if isinstance(video_url, str) else video_url
    downloaded = False
    if video_url:
        file_path = download_video(video_url)
        if file_path is None:
            raise gr.Error("Could not download the video from the given URL.")
        downloaded = True
    else:
        file_path = audio_file if audio_file is not None else video_file

    if not file_path:
        raise gr.Error("Please provide an audio file, a video file, or a video URL.")

    try:
        if file_path.lower().endswith('.mp4'):
            return _transcribe_video(file_path, max_seconds)
        return pipe(file_path)["text"]
    finally:
        if downloaded:
            # download_video() stores the file alone in its own temporary
            # directory, so removing that directory removes the download.
            shutil.rmtree(os.path.dirname(file_path), ignore_errors=True)


# Define the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Upload audio file"),
        gr.Video(label="Upload .mp4 video file"),
        gr.Textbox(label="Or enter a video URL"),
        gr.Textbox(label="enter the maximum length"),
    ],
    outputs="text",
    title="Whisper Small Chinese",
    description="Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model. Supports audio, .mp4 video files, and video URLs.(not working for youtube URL on huggingface space due to packages difference)",
)

iface.launch(share=True)