MHMisinfo

Sleeping

App Files Files Community

rocky250 committed on Apr 19

Commit

f0f0ba5

verified ·

1 Parent(s): 44bafbe

Create fetcher.py

Browse files

Files changed (1) hide show

fetcher.py +231 -0

fetcher.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+fetcher.py — All YouTube Data API v3 + transcript fetching logic.
+Completely decoupled from the UI layer.
+"""
+import os
+import re
+import time
+from typing import Optional, Tuple, List, Dict, Any
+import pandas as pd
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from youtube_transcript_api import (
+    YouTubeTranscriptApi,
+    NoTranscriptFound,
+    TranscriptsDisabled,
+    VideoUnavailable,
+)
+# ── ID Extraction ──────────────────────────────────────────────────────────────
+def extract_video_id(url_or_id: str) -> Optional[str]:
+    """Extract YouTube video ID from any URL format or a raw ID."""
+    url_or_id = url_or_id.strip()
+    patterns = [
+        r"(?:v=|youtu\.be/|embed/|shorts/|live/)([A-Za-z0-9_-]{11})",
+        r"^([A-Za-z0-9_-]{11})$",
+    ]
+    for p in patterns:
+        m = re.search(p, url_or_id)
+        if m:
+            return m.group(1)
+    return None
+# ── YouTube API Client ─────────────────────────────────────────────────────────
+def get_yt_client(api_key: str):
+    return build("youtube", "v3", developerKey=api_key, cache_discovery=False)
+# ── Video Metadata ─────────────────────────────────────────────────────────────
+def fetch_video_metadata(video_id: str, api_key: str) -> Tuple[Optional[Dict], Optional[str]]:
+    """
+    Returns (metadata_dict, error_string).
+    metadata_dict keys: title, channel_title, description, tags, duration,
+                        published_at, view_count, like_count, comment_count,
+                        thumbnail_url, video_id
+    """
+    try:
+        yt = get_yt_client(api_key)
+        resp = yt.videos().list(
+            part="snippet,contentDetails,statistics",
+            id=video_id,
+        ).execute()
+        if not resp.get("items"):
+            return None, "Video not found or unavailable."
+        item = resp["items"][0]
+        snippet = item.get("snippet", {})
+        stats = item.get("statistics", {})
+        content = item.get("contentDetails", {})
+        # Parse ISO 8601 duration e.g. PT4M13S → "4m 13s"
+        raw_dur = content.get("duration", "PT0S")
+        duration_str = _parse_duration(raw_dur)
+        metadata = {
+            "video_id":      video_id,
+            "title":         snippet.get("title", "N/A"),
+            "channel_title": snippet.get("channelTitle", "N/A"),
+            "description":   snippet.get("description", ""),
+            "tags":          snippet.get("tags", []),
+            "published_at":  snippet.get("publishedAt", "")[:10],
+            "duration":      duration_str,
+            "view_count":    int(stats.get("viewCount", 0)),
+            "like_count":    int(stats.get("likeCount", 0)),
+            "comment_count": int(stats.get("commentCount", 0)),
+            "thumbnail_url": (
+                snippet.get("thumbnails", {})
+                       .get("maxres", snippet.get("thumbnails", {}).get("high", {}))
+                       .get("url", "")
+            ),
+        }
+        return metadata, None
+    except HttpError as e:
+        return None, f"YouTube API HTTP error {e.resp.status}: {e._get_reason()}"
+    except Exception as e:
+        return None, f"Unexpected error: {e}"
+def _parse_duration(iso: str) -> str:
+    """Convert PT4M13S → '4m 13s'"""
+    m = re.match(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso)
+    if not m:
+        return "N/A"
+    h, mn, s = m.group(1), m.group(2), m.group(3)
+    parts = []
+    if h:  parts.append(f"{h}h")
+    if mn: parts.append(f"{mn}m")
+    if s:  parts.append(f"{s}s")
+    return " ".join(parts) or "0s"
+# ── Transcript ─────────────────────────────────────────────────────────────────
+def fetch_transcript(video_id: str) -> Tuple[str, str]:
+    """
+    Returns (transcript_text, status_message).
+    Tries English first, then any available language.
+    """
+    try:
+        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
+        text = " ".join(s["text"] for s in segments)
+        return text, f"✅ English transcript fetched ({len(segments)} segments, {len(text.split())} words)"
+    except NoTranscriptFound:
+        # Try any available
+        try:
+            tl = YouTubeTranscriptApi.list_transcripts(video_id)
+            for t in tl:
+                try:
+                    segments = t.fetch()
+                    text = " ".join(s["text"] for s in segments)
+                    return text, f"✅ Transcript fetched (lang: {t.language_code}, {len(text.split())} words)"
+                except Exception:
+                    continue
+            return "", "⚠️ No usable transcript found for any language."
+        except Exception as e:
+            return "", f"⚠️ Transcript listing failed: {e}"
+    except TranscriptsDisabled:
+        return "", "⚠️ Transcripts are disabled for this video."
+    except VideoUnavailable:
+        return "", "❌ Video is unavailable."
+    except Exception as e:
+        return "", f"⚠️ Transcript error: {e}"
+# ── Comments ───────────────────────────────────────────────────────────────────
+def fetch_comments(
+    video_id: str,
+    api_key: str,
+    max_comments: int = 200,
+) -> Tuple[pd.DataFrame, str]:
+    """
+    Fetch top-level comment threads.
+    Returns (DataFrame with cols: author, text, likes, published_at), status_msg.
+    Handles disabled comments gracefully.
+    """
+    try:
+        yt = get_yt_client(api_key)
+        comments = []
+        next_page = None
+        while len(comments) < max_comments:
+            kwargs = dict(
+                part="snippet",
+                videoId=video_id,
+                maxResults=min(100, max_comments - len(comments)),
+                order="relevance",
+                textFormat="plainText",
+            )
+            if next_page:
+                kwargs["pageToken"] = next_page
+            resp = yt.commentThreads().list(**kwargs).execute()
+            for item in resp.get("items", []):
+                top = item["snippet"]["topLevelComment"]["snippet"]
+                comments.append({
+                    "author":       top.get("authorDisplayName", "Anonymous"),
+                    "text":         top.get("textDisplay", ""),
+                    "likes":        int(top.get("likeCount", 0)),
+                    "published_at": top.get("publishedAt", "")[:10],
+                })
+            next_page = resp.get("nextPageToken")
+            if not next_page:
+                break
+        if not comments:
+            return pd.DataFrame(), "⚠️ No comments found."
+        df = pd.DataFrame(comments)
+        return df, f"✅ Fetched {len(df)} comments"
+    except HttpError as e:
+        reason = e._get_reason()
+        if "commentsDisabled" in reason or e.resp.status == 403:
+            return pd.DataFrame(), "⚠️ Comments are disabled for this video."
+        return pd.DataFrame(), f"❌ API error {e.resp.status}: {reason}"
+    except Exception as e:
+        return pd.DataFrame(), f"❌ Comments error: {e}"
+# ── Search by keyword (for uploaded files) ────────────────────────────────────
+def search_videos_by_title(query: str, api_key: str, max_results: int = 5) -> List[Dict]:
+    """
+    Search YouTube for videos matching a title/keyword query.
+    Used when user uploads a video file and we need to find it on YouTube.
+    """
+    try:
+        yt = get_yt_client(api_key)
+        resp = yt.search().list(
+            part="snippet",
+            q=query,
+            type="video",
+            maxResults=max_results,
+        ).execute()
+        results = []
+        for item in resp.get("items", []):
+            results.append({
+                "video_id":      item["id"]["videoId"],
+                "title":         item["snippet"]["title"],
+                "channel_title": item["snippet"]["channelTitle"],
+                "thumbnail_url": item["snippet"]["thumbnails"]["default"]["url"],
+                "published_at":  item["snippet"]["publishedAt"][:10],
+            })
+        return results
+    except Exception:
+        return []