rocky250 committed on
Commit
f0f0ba5
·
verified ·
1 Parent(s): 44bafbe

Create fetcher.py

Browse files
Files changed (1) hide show
  1. fetcher.py +231 -0
fetcher.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ fetcher.py — All YouTube Data API v3 + transcript fetching logic.
3
+ Completely decoupled from the UI layer.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import time
9
+ from typing import Optional, Tuple, List, Dict, Any
10
+
11
+ import pandas as pd
12
+ from googleapiclient.discovery import build
13
+ from googleapiclient.errors import HttpError
14
+ from youtube_transcript_api import (
15
+ YouTubeTranscriptApi,
16
+ NoTranscriptFound,
17
+ TranscriptsDisabled,
18
+ VideoUnavailable,
19
+ )
20
+
21
+
22
+ # ── ID Extraction ──────────────────────────────────────────────────────────────
23
+
24
def extract_video_id(url_or_id: str) -> Optional[str]:
    """Extract an 11-character YouTube video ID from a URL or a raw ID.

    Recognised forms are a ``v=<id>`` query parameter, ``youtu.be/<id>``,
    ``embed/<id>``, ``shorts/<id>``, ``live/<id>`` and a bare 11-character ID.

    Args:
        url_or_id: A YouTube URL or a raw video ID. Surrounding whitespace
            is ignored.

    Returns:
        The video ID, or ``None`` when the input is not a string or no ID
        can be found in it.
    """
    if not isinstance(url_or_id, str):
        return None
    candidate = url_or_id.strip()
    patterns = (
        # The look-behind stops parameters whose name merely ends in "v"
        # (``dev=``, ``rev=`` ...) from being mistaken for the video parameter.
        r"(?:(?<![A-Za-z0-9_])v=|youtu\.be/|embed/|shorts/|live/)([A-Za-z0-9_-]{11})",
        r"^([A-Za-z0-9_-]{11})$",
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    return None
36
+
37
+
38
+ # ── YouTube API Client ─────────────────────────────────────────────────────────
39
+
40
def get_yt_client(api_key: str):
    """Build a YouTube Data API v3 service object for ``api_key``.

    The key is passed as ``developerKey`` and discovery caching is switched
    off with ``cache_discovery=False``.
    """
    client = build(
        "youtube",
        "v3",
        developerKey=api_key,
        cache_discovery=False,
    )
    return client
42
+
43
+
44
+ # ── Video Metadata ─────────────────────────────────────────────────────────────
45
+
46
def fetch_video_metadata(video_id: str, api_key: str) -> Tuple[Optional[Dict], Optional[str]]:
    """Fetch snippet, statistics and duration for a single video.

    Args:
        video_id: ID of the video to look up.
        api_key: YouTube Data API key used to build the client.

    Returns:
        ``(metadata, None)`` on success, or ``(None, error_message)`` when
        the video is not found or the request fails. ``metadata`` has the
        keys: video_id, title, channel_title, description, tags,
        published_at, duration, view_count, like_count, comment_count,
        thumbnail_url. Counts missing from the response are reported as 0.
    """
    try:
        yt = get_yt_client(api_key)
        resp = yt.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_id,
        ).execute()

        items = resp.get("items")
        if not items:
            return None, "Video not found or unavailable."

        item = items[0]
        snippet = item.get("snippet", {})
        stats = item.get("statistics", {})
        content = item.get("contentDetails", {})

        # Parse ISO 8601 duration e.g. PT4M13S → "4m 13s"
        duration_str = _parse_duration(content.get("duration", "PT0S"))

        # Prefer maxres, then high (the original order); fall back to the
        # remaining sizes so the URL is not empty when only smaller
        # thumbnails are present.
        thumbnails = snippet.get("thumbnails") or {}
        thumbnail_url = ""
        for size in ("maxres", "high", "standard", "medium", "default"):
            url = (thumbnails.get(size) or {}).get("url")
            if url:
                thumbnail_url = url
                break

        metadata = {
            "video_id": video_id,
            "title": snippet.get("title", "N/A"),
            "channel_title": snippet.get("channelTitle", "N/A"),
            "description": snippet.get("description", ""),
            "tags": snippet.get("tags", []),
            # Keep only the date portion (first 10 characters).
            "published_at": snippet.get("publishedAt", "")[:10],
            "duration": duration_str,
            "view_count": int(stats.get("viewCount", 0)),
            "like_count": int(stats.get("likeCount", 0)),
            "comment_count": int(stats.get("commentCount", 0)),
            "thumbnail_url": thumbnail_url,
        }
        return metadata, None

    except HttpError as e:
        return None, f"YouTube API HTTP error {e.resp.status}: {e._get_reason()}"
    except Exception as e:
        # Boundary of the fetch layer: report the failure as a string
        # instead of raising.
        return None, f"Unexpected error: {e}"
95
+
96
+
97
+ def _parse_duration(iso: str) -> str:
98
+ """Convert PT4M13S β†’ '4m 13s'"""
99
+ m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso)
100
+ if not m:
101
+ return "N/A"
102
+ h, mn, s = m.group(1), m.group(2), m.group(3)
103
+ parts = []
104
+ if h: parts.append(f"{h}h")
105
+ if mn: parts.append(f"{mn}m")
106
+ if s: parts.append(f"{s}s")
107
+ return " ".join(parts) or "0s"
108
+
109
+
110
+ # ── Transcript ─────────────────────────────────────────────────────────────────
111
+
112
def _fetch_any_language_transcript(video_id: str) -> Tuple[str, str]:
    """Return the first transcript, in any language, that can be fetched."""
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        for transcript in transcript_list:
            try:
                segments = transcript.fetch()
                text = " ".join(s["text"] for s in segments)
                return text, f"✅ Transcript fetched (lang: {transcript.language_code}, {len(text.split())} words)"
            except Exception:
                # Best effort: one failing language must not stop the search.
                continue
        return "", "⚠️ No usable transcript found for any language."
    except Exception as e:
        return "", f"⚠️ Transcript listing failed: {e}"


def fetch_transcript(video_id: str) -> Tuple[str, str]:
    """Fetch the transcript of a video as a single string.

    English is tried first; when no English transcript is found, the
    available transcripts are tried in listing order.

    Args:
        video_id: ID of the video whose transcript is wanted.

    Returns:
        ``(transcript_text, status_message)``. ``transcript_text`` is an
        empty string whenever no transcript could be fetched; the status
        message says why.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
        text = " ".join(s["text"] for s in segments)
        return text, f"✅ English transcript fetched ({len(segments)} segments, {len(text.split())} words)"

    except NoTranscriptFound:
        return _fetch_any_language_transcript(video_id)
    except TranscriptsDisabled:
        return "", "⚠️ Transcripts are disabled for this video."
    except VideoUnavailable:
        return "", "❌ Video is unavailable."
    except Exception as e:
        return "", f"⚠️ Transcript error: {e}"
143
+
144
+
145
+ # ── Comments ───────────────────────────────────────────────────────────────────
146
+
147
def _is_comments_disabled_error(error, message: str) -> bool:
    """Tell whether an API error reports that the video's comments are off."""
    import json  # function-scope import keeps this block self-contained

    reasons = []
    try:
        payload = json.loads(error.content)
        reasons = [detail.get("reason", "") for detail in payload["error"].get("errors", [])]
    except (AttributeError, KeyError, TypeError, ValueError):
        # Body missing or not in the expected JSON shape; fall back to the message.
        pass
    if "commentsDisabled" in reasons:
        return True
    return "commentsDisabled" in message or "disabled comments" in message.lower()


def fetch_comments(
    video_id: str,
    api_key: str,
    max_comments: int = 200,
) -> Tuple[pd.DataFrame, str]:
    """Fetch top-level comment threads for a video, most relevant first.

    Args:
        video_id: ID of the video whose comments are wanted.
        api_key: YouTube Data API key used to build the client.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        ``(DataFrame, status_msg)``. On success the frame has the columns
        author, text, likes, published_at. On any failure, or when there
        are no comments, the frame is empty and the status message says why.
    """
    if max_comments <= 0:
        # Nothing was asked for; skip building a client and calling the API.
        return pd.DataFrame(), "⚠️ No comments found."

    try:
        yt = get_yt_client(api_key)
        comments = []
        next_page = None

        while len(comments) < max_comments:
            kwargs = dict(
                part="snippet",
                videoId=video_id,
                # Request at most 100 per page and never more than still needed.
                maxResults=min(100, max_comments - len(comments)),
                order="relevance",
                textFormat="plainText",
            )
            if next_page:
                kwargs["pageToken"] = next_page

            resp = yt.commentThreads().list(**kwargs).execute()

            for item in resp.get("items", []):
                top = item["snippet"]["topLevelComment"]["snippet"]
                comments.append({
                    "author": top.get("authorDisplayName", "Anonymous"),
                    "text": top.get("textDisplay", ""),
                    "likes": int(top.get("likeCount", 0)),
                    "published_at": top.get("publishedAt", "")[:10],
                })

            next_page = resp.get("nextPageToken")
            if not next_page:
                break

        if not comments:
            return pd.DataFrame(), "⚠️ No comments found."

        df = pd.DataFrame(comments)
        return df, f"✅ Fetched {len(df)} comments"

    except HttpError as e:
        reason = e._get_reason() or ""
        # A 403 can also mean quota exhaustion or a forbidden key, so only
        # report "disabled" when the error itself says so.
        if _is_comments_disabled_error(e, reason):
            return pd.DataFrame(), "⚠️ Comments are disabled for this video."
        return pd.DataFrame(), f"❌ API error {e.resp.status}: {reason}"
    except Exception as e:
        return pd.DataFrame(), f"❌ Comments error: {e}"
201
+
202
+
203
+ # ── Search by keyword (for uploaded files) ────────────────────────────────────
204
+
205
def search_videos_by_title(query: str, api_key: str, max_results: int = 5) -> List[Dict]:
    """Search YouTube for videos matching a title/keyword query.

    Used when the user uploads a video file and it has to be found on YouTube.

    Args:
        query: Title or keywords to search for.
        api_key: YouTube Data API key used to build the client.
        max_results: Maximum number of results wanted; values above 50 are
            reduced to 50.

    Returns:
        A list of dicts with the keys video_id, title, channel_title,
        thumbnail_url, published_at. The list is empty when
        ``max_results`` is not positive or the search fails.
    """
    import logging  # function-scope import keeps this block self-contained

    if max_results <= 0:
        return []

    try:
        yt = get_yt_client(api_key)
        resp = yt.search().list(
            part="snippet",
            q=query,
            type="video",
            # Cap at 50; see the search.list reference for the allowed range.
            maxResults=min(max_results, 50),
        ).execute()

        results = []
        for item in resp.get("items", []):
            video_id = (item.get("id") or {}).get("videoId")
            if not video_id:
                # Skip a malformed entry instead of discarding every result.
                continue
            snippet = item.get("snippet") or {}
            thumbnails = snippet.get("thumbnails") or {}
            results.append({
                "video_id": video_id,
                "title": snippet.get("title", ""),
                "channel_title": snippet.get("channelTitle", ""),
                "thumbnail_url": (thumbnails.get("default") or {}).get("url", ""),
                "published_at": snippet.get("publishedAt", "")[:10],
            })
        return results

    except Exception as exc:
        # Best effort by design: callers get an empty list, but the cause
        # is logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            "YouTube search failed for query %r: %s", query, exc
        )
        return []