"""
fetcher.py — YouTube Data API v3 helpers
"""
import re
import requests
import pandas as pd
# Video ID extraction
def extract_video_id(url_or_id: str) -> str | None:
"""Return an 11-char YouTube video ID, or None if not found."""
patterns = [
r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/shorts/)([a-zA-Z0-9_-]{11})",
r"^([a-zA-Z0-9_-]{11})$",
]
for pattern in patterns:
m = re.search(pattern, url_or_id)
if m:
return m.group(1)
return None
# Duration parser
def _parse_duration(iso: str) -> str:
    """Convert an ISO 8601 duration into ``H:MM:SS`` or ``M:SS`` text.

    Week and day components (e.g. ``P1DT2H3M4S``) are folded into the hour
    count rather than being rejected.

    Args:
        iso: Duration such as ``"PT1H2M3S"``. A falsy value is treated as
            zero length.

    Returns:
        ``"H:MM:SS"`` when the duration is at least one hour, otherwise
        ``"M:SS"``. Input that does not start with a duration designator
        yields ``"0:00"``.
    """
    m = re.match(
        r"P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        iso or "PT0S",
    )
    if not m:
        return "0:00"
    weeks, days, hours, minutes, seconds = (int(g or 0) for g in m.groups())
    # The display format has no day field, so express everything in hours.
    hours += 24 * (7 * weeks + days)
    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes}:{seconds:02d}"
# Metadata
def fetch_video_metadata(video_id: str, api_key: str) -> tuple[dict | None, str | None]:
    """Fetch snippet, statistics and duration for one video.

    Calls the ``videos`` endpoint of the YouTube Data API v3 with a
    15-second timeout.

    Args:
        video_id: ID of the video to look up.
        api_key: API key sent as the ``key`` query parameter.

    Returns:
        ``(meta, None)`` on success, where ``meta`` has the keys ``title``,
        ``description``, ``channel_title``, ``published_at``, ``tags``,
        ``thumbnail_url``, ``view_count``, ``like_count``, ``comment_count``
        and ``duration``; ``(None, error_message)`` on any failure. This
        function does not raise for request or parsing errors.
    """
    try:
        resp = requests.get(
            "https://www.googleapis.com/youtube/v3/videos",
            params={
                "id": video_id,
                "key": api_key,
                "part": "snippet,statistics,contentDetails",
            },
            timeout=15,
        )
        try:
            data = resp.json()
        except ValueError:
            # A non-JSON body (e.g. an HTML error page) would otherwise
            # surface as an opaque decoder message.
            return None, (
                f"YouTube API returned a non-JSON response (HTTP {resp.status_code})."
            )
        if "error" in data:
            return None, data["error"].get("message", "YouTube API error")
        items = data.get("items", [])
        if not items:
            return None, "Video not found — check the ID or URL."

        item = items[0]
        sn = item.get("snippet", {})
        st = item.get("statistics", {})
        cd = item.get("contentDetails", {})
        thumbs = sn.get("thumbnails", {})
        meta = {
            "title": sn.get("title", "Unknown"),
            "description": sn.get("description", ""),
            "channel_title": sn.get("channelTitle", "Unknown"),
            # Keep only the leading YYYY-MM-DD part of the timestamp.
            "published_at": sn.get("publishedAt", "")[:10],
            "tags": sn.get("tags", []),
            # Prefer the "high" thumbnail, fall back to "medium".
            "thumbnail_url": (
                thumbs.get("high", {}).get("url", "")
                or thumbs.get("medium", {}).get("url", "")
            ),
            "view_count": int(st.get("viewCount", 0)),
            "like_count": int(st.get("likeCount", 0)),
            "comment_count": int(st.get("commentCount", 0)),
            "duration": _parse_duration(cd.get("duration", "PT0S")),
        }
        return meta, None
    except requests.exceptions.Timeout:
        return None, "Request timed out. Check your internet connection."
    except Exception as exc:
        message = str(exc)
        # Request error text can embed the request URL, whose query string
        # carries the API key; never hand that back to the caller.
        if api_key:
            message = message.replace(api_key, "***")
        return None, message
# Transcript
def fetch_transcript(video_id: str) -> tuple[str, str]:
    """Fetch a video's transcript as one space-joined string.

    Args:
        video_id: ID of the video whose transcript is requested.

    Returns:
        ``(text, status_message)``. On any failure, including the
        ``youtube_transcript_api`` package being unavailable, ``text`` is
        ``""`` and the status carries the first 80 characters of the error.
    """
    try:
        # Imported inside the try so a missing package degrades to a
        # status message instead of breaking module import.
        from youtube_transcript_api import YouTubeTranscriptApi

        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            segments = YouTubeTranscriptApi.get_transcript(video_id)
            pieces = [seg["text"] for seg in segments]
        else:
            # NOTE(review): newer releases of the library are understood to
            # replace the static helper with an instance ``fetch`` method
            # yielding snippet objects with a ``.text`` attribute; confirm
            # against the installed version.
            fetched = YouTubeTranscriptApi().fetch(video_id)
            pieces = [snippet.text for snippet in fetched]

        text = " ".join(pieces)
        return text, f" Transcript: {len(text.split())} words"
    except Exception as exc:
        short = str(exc)[:80]
        return "", f" Transcript unavailable: {short}"
# Comments
def fetch_comments(
    video_id: str,
    api_key: str,
    max_comments: int = 150,
) -> tuple[pd.DataFrame, str]:
    """Fetch top-level comments for a video, ordered by relevance.

    Pages through the ``commentThreads`` endpoint until ``max_comments``
    rows are collected, the API reports no further page, or the API
    returns an error.

    Args:
        video_id: ID of the video whose comments are requested.
        api_key: API key sent as the ``key`` query parameter.
        max_comments: Upper bound on the number of rows returned.

    Returns:
        ``(df, status_message)``. ``df`` has the columns ``author``,
        ``text``, ``likes`` and ``published_at``, or is an empty DataFrame
        when nothing was fetched. An API error is reported in the status
        message; rows fetched before the error are still returned.
    """
    rows = []
    next_token = None
    api_error = None
    comments_disabled = False
    try:
        while len(rows) < max_comments:
            params = {
                "videoId": video_id,
                "key": api_key,
                "part": "snippet",
                # Never ask for more than 100 per page, nor more than needed.
                "maxResults": min(100, max_comments - len(rows)),
                "order": "relevance",
            }
            if next_token:
                params["pageToken"] = next_token
            resp = requests.get(
                "https://www.googleapis.com/youtube/v3/commentThreads",
                params=params,
                timeout=15,
            )
            data = resp.json()
            if "error" in data:
                error = data["error"]
                api_error = str(error.get("message", "Comment API error"))
                # NOTE(review): "commentsDisabled" is the reason code the API
                # reference lists for videos with comments turned off; confirm.
                comments_disabled = any(
                    detail.get("reason") == "commentsDisabled"
                    for detail in error.get("errors", [])
                )
                break

            items = data.get("items", [])
            for item in items:
                c = (
                    item.get("snippet", {})
                    .get("topLevelComment", {})
                    .get("snippet", {})
                )
                if not c:
                    # Skip a malformed thread instead of discarding every
                    # row collected so far.
                    continue
                rows.append({
                    "author": c.get("authorDisplayName", ""),
                    "text": c.get("textDisplay", ""),
                    "likes": int(c.get("likeCount", 0)),
                    "published_at": c.get("publishedAt", "")[:10],
                })

            next_token = data.get("nextPageToken")
            if not next_token or not items:
                break

        if not rows:
            if api_error and not comments_disabled:
                # Quota, key or other API failures must not be reported
                # as "comments disabled".
                return pd.DataFrame(), f" Comments error: {api_error[:80]}"
            return pd.DataFrame(), " No comments fetched (comments may be disabled)"

        df = pd.DataFrame(rows)
        if api_error:
            return df, (
                f" Comments: {len(df)} fetched (stopped early: {api_error[:80]})"
            )
        return df, f" Comments: {len(df)} fetched"
    except requests.exceptions.Timeout:
        return pd.DataFrame(), " Comments request timed out"
    except Exception as exc:
        return pd.DataFrame(), f" Comments error: {str(exc)[:80]}"
# Search by keyword
def search_videos_by_title(
    keyword: str,
    api_key: str,
    max_results: int = 5,
) -> list[dict]:
    """Search YouTube for videos matching *keyword*.

    This is a best-effort lookup: any failure yields an empty list, and the
    reason is logged at WARNING level instead of being discarded.

    Args:
        keyword: Free-text search query.
        api_key: API key sent as the ``key`` query parameter.
        max_results: Maximum number of videos wanted. Values of zero or
            less return ``[]`` without a request; larger values are capped
            at 50 per request.

    Returns:
        A list of dicts with the keys ``video_id``, ``title``,
        ``channel_title``, ``published_at`` and ``thumbnail_url``. Items
        without a video ID are omitted.
    """
    import logging

    logger = logging.getLogger(__name__)

    if max_results <= 0:
        # Nothing was asked for; skip the network call.
        return []
    try:
        resp = requests.get(
            "https://www.googleapis.com/youtube/v3/search",
            params={
                "q": keyword,
                "key": api_key,
                "part": "snippet",
                "type": "video",
                # NOTE(review): 50 is the documented page-size maximum for
                # search.list; confirm against the API reference.
                "maxResults": min(max_results, 50),
            },
            timeout=15,
        )
        data = resp.json()
        if "error" in data:
            logger.warning(
                "YouTube search failed: %s",
                data["error"].get("message", "YouTube API error"),
            )
            return []

        results = []
        for item in data.get("items", []):
            vid_id = item.get("id", {}).get("videoId", "")
            if not vid_id:
                continue
            sn = item.get("snippet", {})
            results.append({
                "video_id": vid_id,
                "title": sn.get("title", ""),
                "channel_title": sn.get("channelTitle", ""),
                "published_at": sn.get("publishedAt", "")[:10],
                "thumbnail_url": sn.get("thumbnails", {}).get("medium", {}).get("url", ""),
            })
        return results
    except Exception as exc:
        detail = str(exc)
        # Request error text can embed the URL with the API key; redact it
        # before it reaches the log.
        if api_key:
            detail = detail.replace(api_key, "***")
        logger.warning("YouTube search failed: %s", detail)
        return []