Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YouTubeSummerizer.py — full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
import webvtt
|
| 5 |
+
from yt_dlp import YoutubeDL
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import pipeline
|
| 9 |
+
|
| 10 |
+
# ---------------- Summarizer ----------------
# NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights.
# DistilBART (CNN/DailyMail checkpoint); bfloat16 keeps the weight footprint small.
text_summary = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.bfloat16,
)
|
| 17 |
+
|
| 18 |
+
def summarize_text(text: str) -> str:
    """Summarize *text* with the module-level DistilBART pipeline.

    Long inputs are split into ~350-word chunks, each chunk is summarized
    independently, and the partial summaries are joined. If the joined
    result is still long (> 320 words) it gets one more summarization pass.
    """
    words = text.split()
    step = 350
    # Summarize each ~350-word slice independently.
    partial = [
        text_summary(
            " ".join(words[start:start + step]),
            max_length=180,
            min_length=60,
            do_sample=False,
        )[0]["summary_text"]
        for start in range(0, len(words), step)
    ]
    merged = " ".join(partial)
    if len(merged.split()) <= 320:
        return merged
    # Second pass: compress the concatenated partial summaries.
    return text_summary(merged, max_length=200, min_length=80, do_sample=False)[0]["summary_text"]
|
| 32 |
+
|
| 33 |
+
# ---------------- Helpers to get captions via yt-dlp ----------------
|
| 34 |
+
def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
|
| 35 |
+
subs = info.get("subtitles") or {}
|
| 36 |
+
autos = info.get("automatic_captions") or {}
|
| 37 |
+
# Prefer manual → auto → any
|
| 38 |
+
for d in (subs, autos):
|
| 39 |
+
for code in preferred:
|
| 40 |
+
if code in d and d[code]:
|
| 41 |
+
return d[code][0]["url"]
|
| 42 |
+
for d in (subs, autos):
|
| 43 |
+
for tracks in d.values():
|
| 44 |
+
if tracks:
|
| 45 |
+
return tracks[0]["url"]
|
| 46 |
+
return None
|
| 47 |
+
|
| 48 |
+
def _fetch_caption_text(video_url: str) -> str | None:
    """
    Try multiple ways to fetch a captions VTT URL, without spamming errors.
    Order: Edge -> Firefox -> Chrome (Default/Profile 1) -> no cookies.
    """
    base_opts = {"skip_download": True, "quiet": True}
    cookie_sources = [
        ("edge",),
        ("firefox",),
        ("chrome", "Default"),
        ("chrome", "Profile 1"),
        None,  # final attempt: no browser cookies at all
    ]
    for source in cookie_sources:
        opts = dict(base_opts)
        if source is not None:
            opts["cookiesfrombrowser"] = source
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)

            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue

            resp = requests.get(vtt_url, timeout=20)
            resp.raise_for_status()

            # Parse WebVTT in-memory: one whitespace-joined line per cue.
            cues = [
                cue.text.strip().replace("\n", " ")
                for cue in webvtt.from_string(resp.text)
                if cue.text.strip()
            ]
            transcript = " ".join(cues).strip()
            if transcript:
                return transcript
        except Exception:
            # Any attempt may fail (locked browser profile, blocked endpoint,
            # network error); swallow and try the next cookie source.
            continue
    return None
|
| 84 |
+
|
| 85 |
+
# ---------------- Main function used by Gradio ----------------
def get_youtube_transcript(video_url: str) -> str:
    """Fetch captions for *video_url* and return their summary.

    Never raises: every failure mode is reported as a human-readable string
    so the Gradio output textbox always has something to display.
    """
    captions = _fetch_caption_text(video_url)
    if captions:
        try:
            return summarize_text(captions)
        except Exception as exc:
            return f"Summarizer error: {exc}"
    return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
|
| 94 |
+
|
| 95 |
+
# ---------------- Gradio UI ----------------
# Close any Gradio servers left over from a previous launch (helps on reloads).
gr.close_all()
# Single text-in / text-out interface wrapping get_youtube_transcript.
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[gr.Textbox(label="Input YouTube Url to summarize", lines=1, placeholder="https://www.youtube.com/watch?v=...")],
    outputs=[gr.Textbox(label="Summarized text", lines=10)],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them."
)

# Launch the UI only when executed as a script.
if __name__ == "__main__":
    demo.launch()
|