# YouTube caption summarizer — yt-dlp captions + DistilBART summary + Gradio UI.
# (Original header named this file YouTubeSummerizer.py.)
import requests
import webvtt
from yt_dlp import YoutubeDL
import gradio as gr
import torch
from transformers import pipeline

# ---------------- Summarizer ----------------
# NOTE: Uses CPU by default. If you hit RAM errors, restart and it will
# stream-load weights.
text_summary = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.bfloat16,
)


def summarize_text(text: str) -> str:
    """Summarize *text*, chunking very long caption transcripts first.

    The transcript is split into ~350-word chunks, each chunk is
    summarized independently, and if the merged partial summaries are
    still long (> 320 words) a second summarization pass condenses them.

    Returns the summary string ("" for empty input).
    """
    words = text.split()
    chunks, step = [], 350
    for i in range(0, len(words), step):
        chunks.append(" ".join(words[i:i + step]))
    partial = []
    for c in chunks:
        out = text_summary(c, max_length=180, min_length=60, do_sample=False)
        partial.append(out[0]["summary_text"])
    merged = " ".join(partial)
    if len(merged.split()) > 320:
        return text_summary(
            merged, max_length=200, min_length=80, do_sample=False
        )[0]["summary_text"]
    return merged


# ---------------- Helpers to get captions via yt-dlp ----------------
def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
    """Return a WebVTT caption URL from a yt-dlp info dict, or None.

    Preference order: manual subtitles before automatic captions, the
    *preferred* language codes before any other language. Within a
    language's track list, a track with ext == "vtt" is preferred
    (yt-dlp lists several formats — vtt, srv1, json3, ... — and the
    downstream webvtt parser only understands WebVTT); falls back to
    the first track if no vtt entry exists.
    """
    def _best_url(tracks):
        # Prefer the vtt-format track so webvtt.from_string can parse it.
        for t in tracks:
            if t.get("ext") == "vtt":
                return t["url"]
        return tracks[0]["url"] if tracks else None

    subs = info.get("subtitles") or {}
    autos = info.get("automatic_captions") or {}
    # Prefer manual → auto, preferred languages first.
    for d in (subs, autos):
        for code in preferred:
            url = _best_url(d.get(code) or [])
            if url:
                return url
    # Fall back to any language at all.
    for d in (subs, autos):
        for tracks in d.values():
            url = _best_url(tracks)
            if url:
                return url
    return None


def _fetch_caption_text(video_url: str) -> str | None:
    """Fetch and flatten the caption text for *video_url*, or None.

    Tries multiple ways to obtain a captions VTT URL without spamming
    errors, in order: Edge -> Firefox -> Chrome (Default / Profile 1)
    -> no cookies. The first attempt that yields parseable, non-empty
    caption text wins.
    """
    base = {"skip_download": True, "quiet": True}
    attempts = [
        {**base, "cookiesfrombrowser": ("edge",)},
        {**base, "cookiesfrombrowser": ("firefox",)},
        {**base, "cookiesfrombrowser": ("chrome", "Default")},
        {**base, "cookiesfrombrowser": ("chrome", "Profile 1")},
        dict(base),  # no cookies
    ]
    for opts in attempts:
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)
            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue
            r = requests.get(vtt_url, timeout=20)
            r.raise_for_status()
            # Parse WebVTT text in-memory.
            # NOTE(review): webvtt.from_string needs webvtt-py >= 0.5 —
            # confirm the installed version.
            lines = []
            for cue in webvtt.from_string(r.text):
                t = cue.text.strip().replace("\n", " ")
                if t:
                    lines.append(t)
            result = " ".join(lines).strip()
            if result:
                return result
        except Exception:
            # Deliberate best-effort: any failure (cookie store missing,
            # network error, parse error) falls through to the next attempt.
            continue
    return None


# ---------------- Main function used by Gradio ----------------
def get_youtube_transcript(video_url: str) -> str:
    """Fetch captions for *video_url* and return a summary.

    Returns a human-readable error string (never raises) when captions
    are unavailable or the summarizer fails, so the Gradio UI always has
    something to display.
    """
    text = _fetch_caption_text(video_url)
    if not text:
        return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
    try:
        return summarize_text(text)
    except Exception as e:
        return f"Summarizer error: {e}"


# ---------------- Gradio UI ----------------
gr.close_all()
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[gr.Textbox(label="Input YouTube Url to summarize", lines=1, placeholder="https://www.youtube.com/watch?v=...")],
    outputs=[gr.Textbox(label="Summarized text", lines=10)],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them.",
)

if __name__ == "__main__":
    demo.launch()