# NOTE: the three lines below are residue from the Hugging Face web page this
# file was copied from (author avatar caption, commit message, commit hash).
# They are kept as comments so the file remains valid Python.
# Jamshaid-Saleem's picture
# Update app.py
# c3b9318 verified
from transformers import pipeline
import yt_dlp
import torchaudio
import torch
import numpy as np
def download_youtube_audio(video_url, output_basename="audio"):
    """Download the audio track of a YouTube video as an MP3 file.

    Uses yt-dlp to fetch the best available audio stream and FFmpeg
    (via yt-dlp's postprocessor) to re-encode it to MP3 at 192 kbps.

    Args:
        video_url: Full URL of the YouTube video to download.
        output_basename: Base name (without extension) for the saved file.
            Defaults to "audio", preserving the original behavior.

    Returns:
        Path to the downloaded MP3 file (e.g. "audio.mp3").

    Note:
        Requires FFmpeg to be installed and on PATH for the MP3 conversion.
        An existing file with the same name will be overwritten by yt-dlp's
        default behavior only if the download proceeds; no cleanup is done here.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # %(ext)s is replaced by yt-dlp; after postprocessing this is .mp3
        'outtmpl': f'{output_basename}.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    return f"{output_basename}.mp3"
def transcribe_audio(audio_path):
    """Transcribe an audio file using the Hugging Face Whisper base model.

    Loads the audio with torchaudio, downmixes stereo to mono, resamples to
    16 kHz (Whisper's expected rate), and runs the ASR pipeline with
    timestamping enabled (required for audio longer than 30 seconds).

    Args:
        audio_path: Path to an audio file readable by torchaudio (e.g. MP3).

    Returns:
        The transcribed text as a single string.
    """
    # Build the (expensive) Whisper pipeline once and cache it on the
    # function object, so repeated calls don't re-download/re-load the model.
    if not hasattr(transcribe_audio, "_transcriber"):
        transcribe_audio._transcriber = pipeline(
            model="openai/whisper-base", task="automatic-speech-recognition"
        )
    transcriber = transcribe_audio._transcriber

    # Load audio as a (channels, samples) float tensor plus its sample rate.
    audio, sample_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono by averaging channels.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Whisper models require 16 kHz input.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio = resampler(audio)

    # Flatten to a 1-D NumPy array and pass the sampling rate explicitly so
    # the pipeline does not have to assume the array's rate.
    audio_np = audio.squeeze(0).numpy()
    transcription = transcriber(
        {"raw": audio_np, "sampling_rate": 16000}, return_timestamps=True
    )
    return transcription["text"]
def main():
    """Download a sample YouTube video's audio and print its transcription."""
    video_url = "https://www.youtube.com/watch?v=CSycuOBzQsk"
    audio_path = download_youtube_audio(video_url)
    transcription = transcribe_audio(audio_path)
    print("Transcription:\n", transcription)


# Guard the demo run so importing this module doesn't trigger a network
# download and model load as a side effect.
if __name__ == "__main__":
    main()