Sahibhim committed on
Commit
1b8009f
·
verified ·
1 Parent(s): 5e1dd99

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # YouTubeSummerizer.py — full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)
2
+
3
+ import requests
4
+ import webvtt
5
+ from yt_dlp import YoutubeDL
6
+ import gradio as gr
7
+ import torch
8
+ from transformers import pipeline
9
+
10
+ # ---------------- Summarizer ----------------
11
+ # NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights.
12
# Shared summarization pipeline, built once at import time and reused by
# every request (DistilBART CNN checkpoint, weights loaded as bfloat16).
text_summary = pipeline(
    task="summarization",
    torch_dtype=torch.bfloat16,
    model="sshleifer/distilbart-cnn-12-6",
)
17
+
18
def summarize_text(text: str) -> str:
    """Summarize caption text of any length with the module-level pipeline.

    The text is split into fixed-size word chunks, each chunk is summarized
    on its own, and the partial summaries are joined. If the joined text is
    still too long to hand to the model in one pass, it is chunked and
    summarized again (bounded number of rounds). A final condensing pass is
    applied when the result is longer than 320 words.

    Args:
        text: Raw transcript text.

    Returns:
        The summary, or an empty string when ``text`` contains no words.
    """
    step = 350  # words per chunk
    # Heuristic word budget for feeding text to the model in a single call.
    # The model has a bounded input window; `truncation=True` below is the
    # hard backstop if this estimate is ever too generous.
    single_pass_limit = 2 * step
    max_rounds = 5  # safety cap so the reduction loop always terminates

    def _summarize_chunks(words):
        # Summarize each `step`-word slice independently and join the results.
        pieces = []
        for i in range(0, len(words), step):
            out = text_summary(
                " ".join(words[i:i + step]),
                max_length=180,
                min_length=60,
                do_sample=False,
                truncation=True,
            )
            pieces.append(out[0]["summary_text"])
        return " ".join(pieces)

    words = text.split()
    if not words:
        # Nothing to summarize; avoid calling the model at all.
        return ""

    merged = _summarize_chunks(words)

    # Very long transcripts yield many partial summaries; passing them all to
    # the model at once would overflow its input, so reduce hierarchically.
    for _ in range(max_rounds):
        merged_words = merged.split()
        if len(merged_words) <= single_pass_limit:
            break
        merged = _summarize_chunks(merged_words)

    if len(merged.split()) > 320:
        final = text_summary(
            merged,
            max_length=200,
            min_length=80,
            do_sample=False,
            truncation=True,
        )
        return final[0]["summary_text"]
    return merged
32
+
33
+ # ---------------- Helpers to get captions via yt-dlp ----------------
34
+ def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
35
+ subs = info.get("subtitles") or {}
36
+ autos = info.get("automatic_captions") or {}
37
+ # Prefer manual → auto → any
38
+ for d in (subs, autos):
39
+ for code in preferred:
40
+ if code in d and d[code]:
41
+ return d[code][0]["url"]
42
+ for d in (subs, autos):
43
+ for tracks in d.values():
44
+ if tracks:
45
+ return tracks[0]["url"]
46
+ return None
47
+
48
def _fetch_caption_text(video_url: str) -> str | None:
    """Download and flatten the captions of a video into plain text.

    Several yt-dlp configurations are tried in order: browser cookies from
    Edge, Firefox, Chrome ("Default" then "Profile 1"), and finally no
    cookies. The first attempt that yields non-empty caption text wins.
    Failures are logged at debug level and the next attempt is tried, so the
    function is best-effort and never raises for a failed attempt.

    Args:
        video_url: URL of the video to inspect.

    Returns:
        The caption text as a single space-joined string, or ``None`` if no
        attempt produced any text.
    """
    import logging  # function-scope import keeps this block self-contained

    log = logging.getLogger(__name__)

    attempts = [
        {"skip_download": True, "quiet": True, "cookiesfrombrowser": ("edge",)},
        {"skip_download": True, "quiet": True, "cookiesfrombrowser": ("firefox",)},
        {"skip_download": True, "quiet": True, "cookiesfrombrowser": ("chrome", "Default")},
        {"skip_download": True, "quiet": True, "cookiesfrombrowser": ("chrome", "Profile 1")},
        {"skip_download": True, "quiet": True},  # no cookies
    ]
    for opts in attempts:
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)

            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue

            r = requests.get(vtt_url, timeout=20)
            r.raise_for_status()

            # Parse WebVTT text in-memory. Auto-generated captions repeat the
            # previous line inside each new cue, so consecutive duplicate
            # lines are dropped to keep the transcript from doubling up.
            lines = []
            for cue in webvtt.from_string(r.text):
                for raw_line in cue.text.splitlines():
                    line = raw_line.strip()
                    if line and (not lines or lines[-1] != line):
                        lines.append(line)
            result = " ".join(lines)
            if result:
                return result
        except Exception as exc:
            # Deliberately best-effort: a missing browser profile, a blocked
            # endpoint or an unparsable file just moves on to the next attempt.
            log.debug(
                "Caption fetch attempt failed (cookiesfrombrowser=%s): %s",
                opts.get("cookiesfrombrowser"),
                exc,
            )
            continue
    return None
84
+
85
+ # ---------------- Main function used by Gradio ----------------
86
def get_youtube_transcript(video_url: str) -> str:
    """Gradio callback: fetch a video's captions and return their summary.

    Args:
        video_url: The URL typed into the input textbox.

    Returns:
        The summary text on success; otherwise a human-readable message
        explaining that no URL was given, that captions could not be
        fetched, or that the summarizer failed.
    """
    # An empty box would otherwise trigger every download attempt and then
    # report a misleading "no captions" message.
    if not video_url or not video_url.strip():
        return "Please enter a YouTube URL."

    text = _fetch_caption_text(video_url.strip())
    if not text:
        return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
    try:
        return summarize_text(text)
    except Exception as e:
        # Top-level UI boundary: show the failure instead of crashing the app.
        return f"Summarizer error: {e}"
94
+
95
+ # ---------------- Gradio UI ----------------
96
# Shut down any Gradio servers left over from a previous run before
# building the interface.
gr.close_all()

# Single-textbox-in, single-textbox-out interface around the callback.
demo = gr.Interface(
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them.",
    fn=get_youtube_transcript,
    inputs=[
        gr.Textbox(
            label="Input YouTube Url to summarize",
            placeholder="https://www.youtube.com/watch?v=...",
            lines=1,
        )
    ],
    outputs=[
        gr.Textbox(
            label="Summarized text",
            lines=10,
        )
    ],
)

# Launch the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()