"""Download the audio track of a YouTube video and transcribe it with Whisper."""

import numpy as np
import torch
import torchaudio
import yt_dlp
from transformers import pipeline


def download_youtube_audio(video_url: str) -> str:
    """Download the audio from a YouTube video using yt-dlp.

    The best available audio stream is fetched and converted to a 192 kbps
    MP3 by yt-dlp's ``FFmpegExtractAudio`` post-processor (which relies on
    ffmpeg being available to yt-dlp).

    Args:
        video_url: URL of the video whose audio should be downloaded.

    Returns:
        The path of the extracted file, always ``"audio.mp3"`` relative to
        the current working directory. An existing file with that name is
        reused as the output target by yt-dlp.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # The post-processor swaps the extension for "mp3", so the final
        # file is saved as audio.mp3.
        'outtmpl': 'audio.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    return "audio.mp3"


def transcribe_audio(audio_path: str) -> str:
    """Transcribe an audio file using the Hugging Face Whisper pipeline.

    The audio is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz and passed to the ``openai/whisper-base`` speech-recognition
    pipeline.

    Args:
        audio_path: Path to an audio file that ``torchaudio.load`` can read.

    Returns:
        The transcribed text (the ``"text"`` entry of the pipeline output).
    """
    # Load the Whisper pipeline (the model is loaded on every call).
    transcriber = pipeline(
        model="openai/whisper-base",
        task="automatic-speech-recognition",
    )

    # torchaudio returns a (channels, samples) tensor plus the sample rate.
    audio, sample_rate = torchaudio.load(audio_path)

    # Convert stereo (or any multi-channel audio) to mono by averaging.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Resample audio to 16 kHz (required by Whisper models).
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=16000,
        )
        audio = resampler(audio)

    # Drop the channel axis so the pipeline receives a flat 1-D NumPy array.
    audio_np = audio.squeeze(0).numpy()

    # return_timestamps=True enables the pipeline's long-form handling, so
    # recordings longer than Whisper's 30-second window can be transcribed.
    transcription = transcriber(audio_np, return_timestamps=True)
    return transcription["text"]


# Test the application. Guarded so that importing this module for its
# functions does not trigger a download and a transcription run.
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=CSycuOBzQsk"
    audio_path = download_youtube_audio(video_url)
    transcription = transcribe_audio(audio_path)
    print("Transcription:\n", transcription)