# Source: Hugging Face Space "Zipei-KTH/whisper_UI" (status: Sleeping; file size: 3,259 bytes)

import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import pipeline
import tempfile
import os
import requests
from pytube import YouTube
import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import tempfile
import os

import gradio as gr
from moviepy.editor import VideoFileClip
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import tempfile
import os
import yt_dlp as youtube_dl



# Hub checkpoint of the fine-tuned Whisper model; every loader below uses it.
MODEL_ID = "Zipei-KTH/whisper_3"

# Stand-alone model and processor handles (processor set up for Chinese
# transcription).
# NOTE(review): neither is referenced elsewhere in this part of the file;
# `pipe` below loads the checkpoint on its own -- confirm they are still needed.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    language="chinese",
    task="transcribe",
)

# Inference pipeline that `transcribe` calls with a path to an audio file.
pipe = pipeline(model=MODEL_ID)

def download_video(url):
    """Download the best available audio stream of ``url`` to a temporary file.

    The download happens inside a private temporary directory and the finished
    file is then moved onto a name reserved with ``tempfile.mkstemp``. This
    avoids the race inherent in ``tempfile.mktemp`` and guarantees that partial
    downloads are deleted when anything goes wrong.

    Args:
        url: Address of the media to fetch; passed to yt-dlp unchanged.

    Returns:
        Path of the downloaded file, a single file placed directly in the
        system temp directory that the caller is responsible for deleting, or
        ``None`` if the download failed.
    """
    try:
        with tempfile.TemporaryDirectory() as work_dir:
            ydl_opts = {
                'format': 'bestaudio/best',
                # The extension is only known once yt-dlp has picked a format.
                'outtmpl': os.path.join(work_dir, 'download.%(ext)s'),
                'noplaylist': True,
                'verbose': True,  # keep yt-dlp's debug output in the logs
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
                downloaded_path = ydl.prepare_filename(info)

            # Reserve a unique name with the same extension, then move the
            # download over the placeholder so it outlives `work_dir`.
            suffix = os.path.splitext(downloaded_path)[1]
            fd, final_path = tempfile.mkstemp(suffix=suffix)
            os.close(fd)
            try:
                os.replace(downloaded_path, final_path)
            except OSError:
                os.remove(final_path)  # do not leave the empty placeholder behind
                raise
            return final_path
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller decide what a missing download means.
        print(f"Error downloading video: {e}")
        return None


from moviepy.editor import AudioFileClip

def _parse_max_audio_length(max_audio_length):
    """Return ``max_audio_length`` as a positive number of seconds (default 60)."""
    if not max_audio_length:
        return 60.0  # Default maximum length in seconds
    try:
        seconds = float(max_audio_length)
    except (TypeError, ValueError):
        raise ValueError(
            f"Maximum length must be a number of seconds, got {max_audio_length!r}"
        ) from None
    # `not (x > 0)` also rejects NaN, which would silently disable truncation.
    if not seconds > 0:
        raise ValueError(
            f"Maximum length must be greater than zero, got {max_audio_length!r}"
        )
    return seconds


def transcribe(audio_file=None, video_file=None, video_url=None, max_audio_length='60'):
    """Transcribe the first ``max_audio_length`` seconds of the given media.

    Exactly one source is used, in this order of precedence: ``video_url``,
    then ``audio_file``, then ``video_file``. The audio track of the chosen
    source is written to a temporary WAV file, truncated to the maximum
    length, and passed to the module-level ``pipe``. The limit is applied to
    every input, not only to files whose name ends in ``.mp4``.

    Args:
        audio_file: Path to an audio file, or ``None``.
        video_file: Path to a video file, or ``None``.
        video_url: URL handed to ``download_video``; empty or ``None`` to skip.
        max_audio_length: Maximum number of seconds to transcribe, as a number
            or numeric string. Empty or ``None`` means 60 seconds.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        ValueError: If no input was supplied or ``max_audio_length`` is not a
            positive number.
        RuntimeError: If ``video_url`` was given but could not be downloaded.
    """
    max_seconds = _parse_max_audio_length(max_audio_length)

    if isinstance(video_url, str):
        video_url = video_url.strip()
    downloaded = bool(video_url)

    if downloaded:
        file_path = download_video(video_url)
        if file_path is None:
            raise RuntimeError(f"Could not download media from {video_url!r}")
    else:
        file_path = audio_file if audio_file is not None else video_file
        if not file_path:
            raise ValueError(
                "No input provided: upload an audio or video file, or enter a video URL"
            )

    try:
        # The directory and the WAV inside it are removed even if audio
        # extraction or inference raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_audio_path = os.path.join(temp_dir, 'temp_audio.wav')
            # AudioFileClip reads the audio track of audio and video files
            # alike, so one code path handles every input type.
            with AudioFileClip(file_path) as clip:
                if clip.duration > max_seconds:
                    # Keep only the first max_seconds seconds.
                    clip = clip.subclip(0, max_seconds)
                clip.write_audiofile(temp_audio_path, codec='pcm_s16le')
            return pipe(temp_audio_path)["text"]
    finally:
        # Only files this function downloaded are deleted; uploads are not.
        if downloaded and os.path.exists(file_path):
            os.remove(file_path)


# Gradio front end. The widgets are listed in the same order as the
# parameters of `transcribe`: audio file, video file, video URL, max length.
audio_input = gr.Audio(type="filepath", label="Upload audio file")
video_input = gr.Video(label="Upload .mp4 video file")
url_input = gr.Textbox(label="Or enter a video URL")
max_length_input = gr.Textbox(label="enter the maximum length")

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, video_input, url_input, max_length_input],
    outputs="text",
    title="Whisper Small Chinese",
    description="Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model. Supports audio, .mp4 video files, and video URLs.(not working for youtube URL on huggingface space due to packages difference)",
)

# Start the web server as soon as the module is executed, with a share link requested.
iface.launch(share=True)