from transformers import pipeline
import yt_dlp
import torchaudio
import torch
import numpy as np

def download_youtube_audio(video_url, output_stem="audio"):
    """Download a video's audio track with yt-dlp and convert it to MP3.

    Args:
        video_url: URL of the video to fetch; handed to yt-dlp unchanged.
        output_stem: Output file name without its extension. Defaults to
            ``"audio"``, which produces ``audio.mp3``.

    Returns:
        The path of the converted file, i.e. ``f"{output_stem}.mp3"``.

    Note:
        The ``FFmpegExtractAudio`` post-processor is what performs the MP3
        conversion, so FFmpeg presumably has to be installed — confirm for
        the deployment environment.
    """
    audio_codec = 'mp3'
    # yt-dlp output templates use %-style fields, so a literal '%' in the
    # stem must be doubled or it would be parsed as a template field.
    escaped_stem = output_stem.replace('%', '%%')
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': audio_codec,
            'preferredquality': '192',
        }],
        # %(ext)s is the downloaded container's extension; the post-processor
        # above then re-encodes the file to <stem>.<audio_codec>.
        'outtmpl': f'{escaped_stem}.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])
    # Build the returned path from the same stem and codec used above so the
    # two cannot drift apart.
    return f"{output_stem}.{audio_codec}"

def transcribe_audio(audio_path, model_name="openai/whisper-base"):
    """Transcribe an audio file with a Hugging Face speech-recognition pipeline.

    The file is loaded with torchaudio, mixed down to a single channel,
    resampled to the rate the model's feature extractor expects, and passed
    to the pipeline as a 1-D NumPy array.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_name: Hugging Face model id to load. Defaults to
            ``"openai/whisper-base"``.

    Returns:
        The ``"text"`` entry of the pipeline result.
    """
    # Read the audio before loading the model so that an unreadable path
    # fails immediately instead of after the (slow) model load.
    waveform, sample_rate = torchaudio.load(audio_path)

    # Average the channels into one when the file has more than one.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    transcriber = pipeline(model=model_name, task="automatic-speech-recognition")

    # Take the target rate from the loaded model rather than hard-coding it,
    # so a non-default model_name is resampled correctly as well.
    # NOTE(review): the pipeline is documented to treat a bare array as
    # already being at this rate — confirm against the installed version.
    target_rate = transcriber.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # Drop the channel axis to get a flat array of samples.
    samples = waveform.squeeze(0).numpy()

    # Timestamps are requested as in the original call; only the text is used.
    result = transcriber(samples, return_timestamps=True)
    return result["text"]

# Demo run: download the audio of one fixed video, transcribe it, and print
# the resulting text.
# NOTE(review): these statements are not behind an `if __name__ == "__main__":`
# guard, so they also execute whenever this module is imported.
video_url = "https://www.youtube.com/watch?v=CSycuOBzQsk"
# Path of the MP3 written by download_youtube_audio.
audio_path = download_youtube_audio(video_url)
# Transcribed text returned by transcribe_audio.
transcription = transcribe_audio(audio_path)
print("Transcription:\n", transcription)