Spaces:

Zipei-KTH
/

whisper_UI

Sleeping

whisper_UI / app.py

ZipeiZhang

update youtube download function

cc41a01 about 2 years ago

3.26 kB

	import gradio as gr
	from moviepy.editor import VideoFileClip
	from transformers import pipeline
	import tempfile
	import os
	import requests
	from pytube import YouTube
	import gradio as gr
	from moviepy.editor import VideoFileClip
	from transformers import WhisperForConditionalGeneration, WhisperProcessor
	from transformers import pipeline
	import tempfile
	import os

	import gradio as gr
	from moviepy.editor import VideoFileClip
	from transformers import WhisperForConditionalGeneration, WhisperProcessor
	from transformers import pipeline
	import tempfile
	import os
	import yt_dlp as youtube_dl



	# Hugging Face Hub checkpoint used by every loader below.
	_WHISPER_CHECKPOINT = "Zipei-KTH/whisper_3"

	# Stand-alone model and processor objects; the processor is configured for
	# Chinese transcription.
	# NOTE(review): neither `model` nor `processor` is referenced again in the
	# visible code -- `pipe` below is built from the checkpoint name, not from
	# these objects. Confirm whether they are still needed.
	model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
	processor = WhisperProcessor.from_pretrained(
	    _WHISPER_CHECKPOINT,
	    language="chinese",
	    task="transcribe",
	)

	# Inference pipeline that `transcribe` calls with a media file path.
	pipe = pipeline(model=_WHISPER_CHECKPOINT)

	def download_video(url):
	    """Download the best available audio stream of *url* to a temporary file.

	    The download is written into a private temporary directory and then
	    moved onto a path created with ``tempfile.mkstemp``; this replaces the
	    deprecated, race-prone ``tempfile.mktemp``. The private directory is
	    removed before returning, so the caller only has to delete the single
	    returned file.

	    Args:
	        url: Address of the video to fetch with yt-dlp.

	    Returns:
	        Path of the downloaded media file, or ``None`` when the download
	        failed (the error is printed).
	    """
	    try:
	        with tempfile.TemporaryDirectory() as download_dir:
	            ydl_opts = {
	                'format': 'bestaudio/best',
	                # The extension is only known once yt-dlp has picked a format.
	                'outtmpl': os.path.join(download_dir, 'download.%(ext)s'),
	                'noplaylist': True,
	                'verbose': True,  # keep yt-dlp's detailed log output
	            }
	            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
	                info = ydl.extract_info(url, download=True)
	                downloaded = ydl.prepare_filename(info)

	            # Reserve the final name atomically, keeping the real extension
	            # so callers can still dispatch on the file suffix.
	            extension = os.path.splitext(downloaded)[1]
	            fd, video_file = tempfile.mkstemp(suffix=extension)
	            os.close(fd)
	            try:
	                os.replace(downloaded, video_file)
	            except OSError:
	                # Do not leave the empty placeholder behind.
	                os.remove(video_file)
	                raise
	            return video_file
	    except Exception as e:
	        # Best-effort boundary: callers expect None on any failure.
	        print(f"Error downloading video: {e}")
	        return None


	from moviepy.editor import AudioFileClip

	def _parse_max_audio_length(max_audio_length, default=60.0):
	    """Turn the UI's max-length value into a positive number of seconds."""
	    if isinstance(max_audio_length, str):
	        max_audio_length = max_audio_length.strip()
	    if not max_audio_length:
	        # Empty textbox / None / 0: fall back to the default limit.
	        return default
	    try:
	        seconds = float(max_audio_length)
	    except (TypeError, ValueError) as err:
	        raise ValueError(
	            f"Maximum length must be a number of seconds, got {max_audio_length!r}"
	        ) from err
	    if not seconds > 0:  # also rejects NaN
	        raise ValueError(
	            f"Maximum length must be greater than zero, got {max_audio_length!r}"
	        )
	    return seconds


	def _transcribe_video(video_path, max_seconds):
	    """Extract (at most *max_seconds* of) a video's audio and transcribe it."""
	    with VideoFileClip(video_path) as clip:
	        # Keep only the first max_seconds when the clip is longer than that.
	        segment = clip.subclip(0, max_seconds) if clip.duration > max_seconds else clip
	        if segment.audio is None:
	            raise ValueError("The video has no audio track to transcribe")

	        # The directory and the WAV inside it are removed even if the
	        # pipeline raises.
	        with tempfile.TemporaryDirectory() as temp_dir:
	            temp_audio_path = os.path.join(temp_dir, 'temp_audio.wav')
	            segment.audio.write_audiofile(temp_audio_path, codec='pcm_s16le')
	            return pipe(temp_audio_path)["text"]


	def _remove_quietly(path):
	    """Delete *path*, ignoring the case where it is already gone or locked."""
	    try:
	        os.remove(path)
	    except OSError:
	        pass


	def transcribe(audio_file=None, video_file=None, video_url=None, max_audio_length='60'):
	    """Transcribe an uploaded audio file, an uploaded video, or a video URL.

	    Input priority is ``video_url``, then ``audio_file``, then
	    ``video_file``. Paths ending in ``.mp4`` (case-insensitive) have their
	    audio track extracted and cut to ``max_audio_length`` seconds before
	    transcription; every other file is handed to the pipeline unchanged.

	    Args:
	        audio_file: Path of an uploaded audio file, or None.
	        video_file: Path of an uploaded video file, or None.
	        video_url: URL to download with ``download_video``; empty/None to skip.
	        max_audio_length: Maximum number of seconds to keep from an .mp4
	            input. Blank or None means 60 seconds.

	    Returns:
	        The transcribed text.

	    Raises:
	        ValueError: If no input was supplied, the download failed, the video
	            has no audio track, or ``max_audio_length`` is not a positive
	            number.
	    """
	    max_seconds = _parse_max_audio_length(max_audio_length)

	    if video_url:
	        file_path = download_video(video_url)
	        if file_path is None:
	            # download_video reports failures by returning None.
	            raise ValueError(f"Could not download video from {video_url!r}")
	    else:
	        file_path = audio_file if audio_file is not None else video_file
	        if file_path is None:
	            raise ValueError("Provide an audio file, a video file, or a video URL")

	    try:
	        if file_path.lower().endswith('.mp4'):
	            return _transcribe_video(file_path, max_seconds)
	        return pipe(file_path)["text"]
	    finally:
	        # Only files this function downloaded are deleted; uploads are left
	        # alone, as before.
	        if video_url:
	            _remove_quietly(file_path)


	# Rest of your Gradio interface code


	# Input widgets, listed in the positional order of `transcribe`'s parameters:
	# audio_file, video_file, video_url, max_audio_length.
	_input_widgets = [
	    gr.Audio(type="filepath", label="Upload audio file"),
	    gr.Video(label="Upload .mp4 video file"),
	    gr.Textbox(label="Or enter a video URL"),
	    gr.Textbox(label="enter the maximum length"),
	]

	# Blurb shown under the title of the demo page.
	_DESCRIPTION = "Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model. Supports audio, .mp4 video files, and video URLs.(not working for youtube URL on huggingface space due to packages difference)"

	# Wire the widgets to `transcribe`; the returned text is shown as plain text.
	iface = gr.Interface(
	    fn=transcribe,
	    inputs=_input_widgets,
	    outputs="text",
	    title="Whisper Small Chinese",
	    description=_DESCRIPTION,
	)

	# Start the web app; share=True additionally requests a public Gradio link.
	iface.launch(share=True)