# NOTE: the three lines below are residue from the Hugging Face web page this
# file was copied from (author avatar caption, commit message, commit hash).
# They are kept as comments so the file remains valid Python.
# Jamshaid-Saleem's picture
# Update app.py
# c3b9318 verified
from transformers import pipeline
import yt_dlp
import torchaudio
import torch
import numpy as np
def download_youtube_audio(video_url, output_basename="audio"):
    """Download the audio track of a YouTube video as an MP3 file.

    Uses yt-dlp to fetch the best available audio stream and FFmpeg
    (via yt-dlp's postprocessor) to re-encode it to MP3 at 192 kbps.

    Args:
        video_url: Full URL of the YouTube video to download.
        output_basename: Base name (without extension) for the saved file.
            Defaults to "audio", preserving the original behavior.

    Returns:
        Path to the downloaded MP3 file (e.g. "audio.mp3").

    Note:
        Requires FFmpeg to be installed and on PATH for the MP3 conversion.
        An existing file with the same name will be overwritten by yt-dlp's
        default behavior only if the download proceeds; no cleanup is done here.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # %(ext)s is replaced by yt-dlp; after postprocessing this is .mp3
        'outtmpl': f'{output_basename}.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    return f"{output_basename}.mp3"
def transcribe_audio(audio_path):
    """Transcribe an audio file using the Hugging Face Whisper base model.

    Loads the audio with torchaudio, downmixes stereo to mono, resamples to
    16 kHz (Whisper's expected rate), and runs the ASR pipeline with
    timestamping enabled (required for audio longer than 30 seconds).

    Args:
        audio_path: Path to an audio file readable by torchaudio (e.g. MP3).

    Returns:
        The transcribed text as a single string.
    """
    # Build the (expensive) Whisper pipeline once and cache it on the
    # function object, so repeated calls don't re-download/re-load the model.
    if not hasattr(transcribe_audio, "_transcriber"):
        transcribe_audio._transcriber = pipeline(
            model="openai/whisper-base", task="automatic-speech-recognition"
        )
    transcriber = transcribe_audio._transcriber

    # Load audio as a (channels, samples) float tensor plus its sample rate.
    audio, sample_rate = torchaudio.load(audio_path)

    # Downmix multi-channel audio to mono by averaging channels.
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # Whisper models require 16 kHz input.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        audio = resampler(audio)

    # Flatten to a 1-D NumPy array and pass the sampling rate explicitly so
    # the pipeline does not have to assume the array's rate.
    audio_np = audio.squeeze(0).numpy()
    transcription = transcriber(
        {"raw": audio_np, "sampling_rate": 16000}, return_timestamps=True
    )
    return transcription["text"]
def main():
    """Download a sample YouTube video's audio and print its transcription."""
    video_url = "https://www.youtube.com/watch?v=CSycuOBzQsk"
    audio_path = download_youtube_audio(video_url)
    transcription = transcribe_audio(audio_path)
    print("Transcription:\n", transcription)


# Guard the demo run so importing this module doesn't trigger a network
# download and model load as a side effect.
if __name__ == "__main__":
    main()