import gc
import os
import tempfile

import gradio as gr
import torch
import yt_dlp
# AutoTokenizer / AutoModelForSeq2SeqLM are currently unused but kept to
# avoid breaking any external code that imports them from this module.
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Models are loaded once at import time so every request reuses them.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# chunk_length_s enables long-form transcription: Whisper natively handles
# only 30-second windows, so longer audio must be chunked by the pipeline
# (without this, audio beyond ~30s is silently dropped).
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    chunk_length_s=30,
)


def download_audio(url: str, temp_dir: str) -> str:
    """Download the best audio stream of *url* into *temp_dir* as MP3.

    Returns the path of the extracted ``.mp3`` file. Requires ffmpeg on
    PATH for the FFmpegExtractAudio post-processor.
    """
    output_path = os.path.join(temp_dir, "audio.%(ext)s")
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_path,
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    # The post-processor rewrites the extension to .mp3 regardless of the
    # container format originally downloaded.
    return output_path.replace('%(ext)s', 'mp3')


def process_video(url: str) -> str:
    """Transcribe the audio of a video URL and return a short summary.

    Returns an explanatory string (not an exception) for too-short
    transcripts so the UI always has something to display.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = download_audio(url, tmpdir)
        transcription_result = asr_pipeline(audio_path)
        text = transcription_result['text']
    # Temp audio is deleted here; only the transcript is kept in memory.
    if len(text.strip()) < 50:
        return "Transcription too short or unclear"
    # Free ASR activations before the summarization forward pass.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # truncation=True keeps transcripts longer than BART's 1024-token
    # context from raising an indexing error inside the model.
    summary_result = summarizer(
        text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary_result[0]['summary_text']


def main(url: str) -> str:
    """Gradio entry point: summarize the video at *url*, never raising.

    Any download/transcription/summarization failure is converted to a
    message string so the web UI shows the error instead of a 500.
    """
    try:
        return process_video(url)
    except Exception as e:
        return f"Error processing video: {e}"


iface = gr.Interface(
    fn=main,
    inputs="text",
    outputs="text",
    title="YouTube Audio Summarizer",
)

# Guard the launch so importing this module (e.g. from tests or another
# app) does not start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()