Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import re | |
| from typing import Optional, List | |
| from bs4 import BeautifulSoup | |
| import requests | |
| from langchain_core.tools import tool | |
# ------------------ CONFIG ----------------------
# Seconds before an outgoing HTTP request is abandoned.
DEFAULT_TIMEOUT = 20
HEADERS = {
    # Helps avoid consent/anti-bot interstitials on some sites
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}
# Public Invidious mirrors, tried in order until one yields captions.
# Trailing slashes are tolerated: helpers rstrip("/") before joining paths.
INVIDIOUS_POOL = [
    "https://yewtu.be",
    "https://inv.nadeko.net",
    "https://invidious.tiekoetter.com/",
    "https://invidious.f5.si",
    "https://invidious.nerdvpn.de",
]
| # ------------------ HELPERS ---------------------- | |
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Return the raw HTML body of *url*.

    Uses the module-wide browser-like HEADERS; raises
    requests.HTTPError on any 4xx/5xx response.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return response.text
def _http_json(base: str, path: str, **params):
    """GET ``base + path`` and return the parsed JSON payload.

    Raises FileNotFoundError on 404/410 (treated as "no captions" by
    callers) and requests.HTTPError on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(
        url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT
    )
    if response.status_code in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if response.status_code >= 400:
        raise requests.HTTPError(f"{response.status_code} at {base}{path}")
    return response.json()
def _http_text(base: str, path: str, **params) -> str:
    """GET ``base + path`` and return the response body as text.

    Raises FileNotFoundError on 404/410 (treated as "no captions" by
    callers) and requests.HTTPError on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(
        url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT
    )
    if response.status_code in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if response.status_code >= 400:
        raise requests.HTTPError(f"{response.status_code} at {base}{path}")
    return response.text
| def _vtt_to_text(vtt: str) -> str: | |
| lines = [] | |
| for line in vtt.splitlines(): | |
| if not line or line.startswith("WEBVTT"): | |
| continue | |
| if re.match(r"\d{2}:\d{2}:\d{2}\.\d{3} --> ", line) or line.isdigit(): | |
| continue | |
| lines.append(line) | |
| return re.sub(r"<[^>]+>", "", "\n".join(lines)).strip() | |
| # ------------------ YOUTUBE FETCHERS ---------------------- | |
def extract_video_id(video_url: str) -> str:
    """Return the 11-character YouTube video ID found in *video_url*.

    Accepts watch?v=, youtu.be/, shorts/ and embed/ URLs, a generic
    ``/<id>`` path, or a bare 11-character ID.  Raises ValueError when
    no ID can be located.
    """
    candidate = (video_url or "").strip()

    # A bare video ID needs no URL parsing at all.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate

    # Specific URL shapes first; a generic /<id> path is the last resort.
    id_patterns = (
        r"[?&]v=([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"youtube\.com/embed/([A-Za-z0-9_-]{11})",
        r"youtube\.com/shorts/([A-Za-z0-9_-]{11})",
        r"/([A-Za-z0-9_-]{11})(?:\?|$)",
    )
    for pattern in id_patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    raise ValueError(
        "Unable to extract a valid YouTube video ID from the provided input."
    )
def get_youtube_title_description(video_url: str) -> str:
    """
    get_youtube title and description from youtube url (ex: https://www.youtube.com/watch?v=1htKBjuUWec)
    Extract YouTube title + description from Open Graph meta tags.
    Falls back to standard <meta name="title"/"description"> if needed.
    """
    soup = BeautifulSoup(_fetch_html(video_url), "html.parser")

    def _meta_content(og_property: str, fallback_name: str):
        # Prefer the Open Graph tag; fall back to the plain <meta name=...>.
        tag = soup.find("meta", property=og_property) or soup.find(
            "meta", attrs={"name": fallback_name}
        )
        return tag.get("content") if tag else None

    title = _meta_content("og:title", "title") or "No title found"
    description = (
        _meta_content("og:description", "description") or "No description found"
    )
    return f"Title: {title}\nDescription: {description}"
def get_youtube_transcript_tool(
    video_id: str, langs: Optional[List[str]] = None
) -> str:
    """
    Tool: return YouTube transcript text for a video ID.

    Tries each Invidious mirror in INVIDIOUS_POOL until one returns a
    usable caption track, preferring the languages in *langs* (defaults
    to English then French variants).  The chosen track's WebVTT payload
    is converted to plain text via _vtt_to_text.

    Raises RuntimeError when every mirror fails, chaining the last
    underlying error message into the exception text.
    """
    langs = langs or ["en", "en-US", "fr", "fr-FR"]
    last_err = None
    for base in INVIDIOUS_POOL:
        try:
            caps = _http_json(base, f"/api/v1/captions/{video_id}")
            # Some mirrors return {"captions": [...]} while others return
            # the track list directly; normalise to a list either way.
            tracks = caps.get("captions", caps) if isinstance(caps, dict) else caps
            if not isinstance(tracks, list) or not tracks:
                continue
            pick = None
            # Walk the preferred languages in priority order; a track
            # matches on exact languageCode or on a label starting with
            # the language's base code (e.g. "en" matches "English").
            for lang in langs:
                pick = next(
                    (
                        c
                        for c in tracks
                        if (c.get("languageCode", "").lower() == lang.lower())
                        or (
                            c.get("label", "")
                            .lower()
                            .startswith(lang.split("-")[0].lower())
                        )
                    ),
                    None,
                )
                if pick:
                    break
            # No preferred language found: fall back to the first track.
            pick = pick or tracks[0]
            # Tracks usually carry a ready-made caption URL; otherwise
            # re-query the captions endpoint with label/lang parameters.
            vtt = (
                _http_text(base, pick["url"])
                if pick.get("url")
                else _http_text(
                    base,
                    f"/api/v1/captions/{video_id}",
                    label=pick.get("label") or pick.get("languageCode") or "English",
                    lang=pick.get("languageCode") or "en",
                )
            )
            text = _vtt_to_text(vtt)
            if text:
                return text
        except Exception as e:
            # Best-effort failover: remember the error and try the next
            # mirror rather than aborting on a single bad instance.
            last_err = e
            continue
    raise RuntimeError(f"No captions available. Last error: {last_err}")