File size: 3,939 Bytes
1b8009f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# YouTubeSummerizer.py  — full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)

import requests
import webvtt
from yt_dlp import YoutubeDL
import gradio as gr
import torch
from transformers import pipeline

# ---------------- Summarizer ----------------
# NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights.
# Checkpoint used for summarization; kept in one place so it is easy to swap.
_SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"

# Module-level summarization pipeline, built once at import time and reused
# for every call made by summarize_text(). Weights are loaded in bfloat16.
text_summary = pipeline(
    task="summarization",
    model=_SUMMARIZER_MODEL,
    torch_dtype=torch.bfloat16,
)

def summarize_text(text: str, summarizer=None) -> str:
    """Summarize text of any length by chunking it for the summarization model.

    The text is split into chunks of at most 350 whitespace-separated words,
    each chunk is summarized separately, and the partial summaries are joined
    with spaces. While the joined text is still longer than one chunk it is
    chunked and summarized again, instead of being handed to the model in a
    single oversized call. If the final joined text is longer than 320 words
    it is condensed once more in a single call.

    Args:
        text: Raw text to summarize (for example a caption transcript).
        summarizer: Optional callable used instead of the module-level
            ``text_summary`` pipeline. It is called as
            ``summarizer(chunk, max_length=..., min_length=..., do_sample=False,
            truncation=True)`` and must return a list whose first item is a
            dict with a ``"summary_text"`` key.

    Returns:
        The summary, or an empty string when ``text`` contains no words.
    """
    run = text_summary if summarizer is None else summarizer
    chunk_words = 350

    def _condense(tokens):
        # Summarize consecutive chunk_words-sized slices and join the results.
        pieces = []
        for start in range(0, len(tokens), chunk_words):
            chunk = " ".join(tokens[start:start + chunk_words])
            # truncation=True is a backstop so an unusually token-dense chunk
            # is clipped by the tokenizer rather than overflowing the model.
            result = run(
                chunk,
                max_length=180,
                min_length=60,
                do_sample=False,
                truncation=True,
            )
            pieces.append(result[0]["summary_text"])
        return " ".join(pieces)

    words = text.split()
    if not words:
        return ""

    merged = _condense(words)

    # Many chunks produce many partial summaries; keep reducing until the
    # joined text fits in one chunk. Stop if a round fails to shrink the text
    # so a summarizer that does not compress cannot loop forever.
    while len(merged.split()) > chunk_words:
        shorter = _condense(merged.split())
        if len(shorter.split()) >= len(merged.split()):
            break
        merged = shorter

    if len(merged.split()) > 320:
        final = run(
            merged,
            max_length=200,
            min_length=80,
            do_sample=False,
            truncation=True,
        )
        return final[0]["summary_text"]
    return merged

# ---------------- Helpers to get captions via yt-dlp ----------------
def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
    subs = info.get("subtitles") or {}
    autos = info.get("automatic_captions") or {}
    # Prefer manual → auto → any
    for d in (subs, autos):
        for code in preferred:
            if code in d and d[code]:
                return d[code][0]["url"]
    for d in (subs, autos):
        for tracks in d.values():
            if tracks:
                return tracks[0]["url"]
    return None

def _fetch_caption_text(video_url: str) -> str | None:
    """Download a video's captions and return them as one plain-text string.

    Several yt-dlp configurations are tried in order, stopping at the first
    one that yields non-empty caption text:
    Edge cookies -> Firefox cookies -> Chrome ("Default", then "Profile 1")
    cookies -> no cookies.

    This is best effort: any exception raised by an attempt (cookie loading,
    extraction, HTTP, or WebVTT parsing) is logged at debug level and the next
    attempt is tried.

    Args:
        video_url: URL of the video to fetch captions for.

    Returns:
        The caption text with cues joined by single spaces, or ``None`` when
        every attempt failed or produced no text.
    """
    import logging  # local import keeps this function self-contained

    log = logging.getLogger(__name__)

    base_opts = {"skip_download": True, "quiet": True}
    browser_specs = [
        ("edge",),
        ("firefox",),
        ("chrome", "Default"),
        ("chrome", "Profile 1"),
    ]
    attempts = [{**base_opts, "cookiesfrombrowser": spec} for spec in browser_specs]
    attempts.append(dict(base_opts))  # last resort: no cookies

    for opts in attempts:
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)

            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue

            r = requests.get(vtt_url, timeout=20)
            r.raise_for_status()

            # Parse the WebVTT payload in memory. Auto-generated tracks tend
            # to repeat the previous line in each rolling cue, so consecutive
            # duplicate lines are collapsed to avoid feeding the summarizer
            # the same sentence several times.
            lines = []
            for cue in webvtt.from_string(r.text):
                for raw_line in cue.text.splitlines():
                    line = raw_line.strip()
                    if line and (not lines or lines[-1] != line):
                        lines.append(line)
            result = " ".join(lines).strip()
            if result:
                return result
        except Exception as exc:
            # Deliberately broad: each attempt is optional, but leave a trace
            # so failures can be diagnosed by enabling debug logging.
            log.debug(
                "Caption attempt failed (cookiesfrombrowser=%s): %s",
                opts.get("cookiesfrombrowser"),
                exc,
            )
            continue
    return None

# ---------------- Main function used by Gradio ----------------
def get_youtube_transcript(video_url: str) -> str:
    """Fetch a video's captions and return a summary (Gradio callback).

    Args:
        video_url: URL typed into the UI. Surrounding whitespace is ignored.

    Returns:
        The summary text on success. Otherwise a human-readable message:
        a prompt when no URL was entered, a notice when no captions could be
        fetched, or ``"Summarizer error: ..."`` when summarization raised.
    """
    url = (video_url or "").strip()
    if not url:
        # An empty textbox would otherwise run every download attempt and
        # then report a misleading "no captions" message.
        return "Please enter a YouTube URL."

    text = _fetch_caption_text(url)
    if not text:
        return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
    try:
        return summarize_text(text)
    except Exception as e:
        # Show the failure in the output box instead of crashing the UI.
        return f"Summarizer error: {e}"

# ---------------- Gradio UI ----------------
# Close any Gradio apps left running before building a new one.
gr.close_all()

# Single-line input for the video URL.
_url_input = gr.Textbox(
    label="Input YouTube Url to summarize",
    lines=1,
    placeholder="https://www.youtube.com/watch?v=...",
)

# Multi-line output that shows the summary (or an error message).
_summary_output = gr.Textbox(label="Summarized text", lines=10)

# Wire the callback to the input and output widgets.
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[_url_input],
    outputs=[_summary_output],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them.",
)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()