Spaces:
Build error
Build error
| from transformers import pipeline | |
| import yt_dlp | |
| import torchaudio | |
| import torch | |
| import numpy as np | |
def download_youtube_audio(video_url):
    """Download the audio track of a YouTube video as an MP3 file using yt-dlp.

    Args:
        video_url: URL of the video whose audio should be fetched.

    Returns:
        The path of the extracted audio file, ``"audio.mp3"``, relative to
        the current working directory.

    Any error raised by yt-dlp during download or post-processing
    propagates to the caller; nothing is caught here.
    """
    # Keep the output name and codec in one place so the returned path cannot
    # drift from what the post-processor actually writes.
    output_stem = "audio"
    codec = "mp3"

    ydl_opts = {
        'format': 'bestaudio/best',
        # When the URL refers to both a video and a playlist (for example a
        # watch URL with a ``list=`` parameter), download only the video;
        # otherwise every playlist entry would target the same output name.
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': codec,
            'preferredquality': '192',
        }],
        # yt-dlp fills in %(ext)s; after extraction the file is audio.mp3.
        'outtmpl': f'{output_stem}.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    return f"{output_stem}.{codec}"
def transcribe_audio(audio_path):
    """Return the text that Whisper transcribes from the file at ``audio_path``.

    The ``openai/whisper-base`` speech-recognition pipeline is created first;
    the audio is then decoded with torchaudio, averaged down to one channel
    if it has several, resampled to 16 kHz when its rate differs, and handed
    to the pipeline as a one-dimensional NumPy array.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the pipeline's result.
    """
    target_rate = 16000  # rate the waveform is converted to before inference

    asr = pipeline(model="openai/whisper-base", task="automatic-speech-recognition")

    waveform, source_rate = torchaudio.load(audio_path)

    # More than one channel: collapse to mono by averaging across channels,
    # keeping the channel axis so the shape stays (1, samples).
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if source_rate != target_rate:
        to_target_rate = torchaudio.transforms.Resample(
            orig_freq=source_rate,
            new_freq=target_rate,
        )
        waveform = to_target_rate(waveform)

    # Drop the channel axis and hand the pipeline a flat NumPy array.
    samples = waveform.squeeze(0).numpy()

    # return_timestamps=True is passed through unchanged from the original call.
    result = asr(samples, return_timestamps=True)
    return result["text"]
# Demo run executed at module level: download the audio of a sample video,
# transcribe it, and print the resulting text.
video_url = "https://www.youtube.com/watch?v=CSycuOBzQsk"
# Path of the MP3 file written by the download step.
audio_path = download_youtube_audio(video_url)
# Text returned by the transcription step for that file.
transcription = transcribe_audio(audio_path)
print("Transcription:\n", transcription)