#!/usr/bin/env python3
"""LangChain tools for reading YouTube metadata and transcripts.

Two tools are exposed:

* ``get_youtube_title_description`` scrapes the title/description meta tags
  from a video page.
* ``get_youtube_transcript`` downloads captions through a pool of public
  Invidious instances and converts the WebVTT payload to plain text.
"""
from __future__ import annotations

import html
import re
from typing import List, Optional

import requests
from bs4 import BeautifulSoup
from langchain_core.tools import tool

# ------------------ CONFIG ----------------------
DEFAULT_TIMEOUT = 20
HEADERS = {
    # Helps avoid consent/anti-bot interstitials on some sites
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}
INVIDIOUS_POOL = [
    "https://yewtu.be",
    "https://inv.nadeko.net",
    "https://invidious.tiekoetter.com/",
    "https://invidious.f5.si",
    "https://invidious.nerdvpn.de",
]

# A bare YouTube video ID: exactly 11 URL-safe base64 characters.
_VIDEO_ID_RE = re.compile(r"[A-Za-z0-9_-]{11}")
# Known URL shapes, tried in order before the generic fallback.
_VIDEO_ID_URL_PATTERNS = [
    re.compile(r"[?&]v=([A-Za-z0-9_-]{11})"),
    re.compile(r"youtu\.be/([A-Za-z0-9_-]{11})"),
    re.compile(r"youtube\.com/embed/([A-Za-z0-9_-]{11})"),
    re.compile(r"youtube\.com/shorts/([A-Za-z0-9_-]{11})"),
]
# Fallback: an 11-character final path segment.
_VIDEO_ID_TAIL_RE = re.compile(r"/([A-Za-z0-9_-]{11})(?:\?|$)")

# WebVTT cue timing line; the hours component is optional in the format.
_VTT_TIMING_RE = re.compile(r"(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}\s+-->\s")
_VTT_TAG_RE = re.compile(r"<[^>]+>")
# Keywords that open a block carrying no caption text.
_VTT_NON_CUE_BLOCKS = ("WEBVTT", "NOTE", "STYLE", "REGION")


# ------------------ HELPERS ----------------------
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Download raw HTML for a URL or raise on HTTP errors."""
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.text


def _http_get(base: str, path: str, **params) -> requests.Response:
    """GET ``path`` on the instance ``base`` and validate the status code.

    ``path`` is normally instance-relative; an absolute http(s) URL is used
    as-is so a caption track that advertises a full URL still works.

    Raises:
        FileNotFoundError: on 404/410 (the instance has no such captions).
        requests.HTTPError: on any other status >= 400.
    """
    if path.startswith(("http://", "https://")):
        url = path
    else:
        url = base.rstrip("/") + "/" + path.lstrip("/")
    r = requests.get(url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
    if r.status_code in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if r.status_code >= 400:
        raise requests.HTTPError(f"{r.status_code} at {base}{path}")
    return r


def _http_json(base: str, path: str, **params):
    """Return the decoded JSON body of ``base`` + ``path``."""
    return _http_get(base, path, **params).json()


def _http_text(base: str, path: str, **params) -> str:
    """Return the text body of ``base`` + ``path``."""
    return _http_get(base, path, **params).text


def _starts_non_cue_block(line: str) -> bool:
    """Tell whether ``line`` opens a header, NOTE, STYLE or REGION block."""
    return any(
        line == keyword or line.startswith((keyword + " ", keyword + "\t"))
        for keyword in _VTT_NON_CUE_BLOCKS
    )


def _vtt_to_text(vtt: str) -> str:
    """Reduce a WebVTT document to its caption text, one cue line per line.

    Dropped: the ``WEBVTT`` header block (including metadata such as
    ``Kind:``/``Language:``), NOTE/STYLE/REGION blocks, cue timing lines,
    purely numeric lines (cue identifiers), blank lines and inline markup
    tags. HTML entities in the remaining text are decoded.
    """
    kept: List[str] = []
    block_start = True  # the next non-blank line opens a new block
    skipping = False  # currently inside a block that holds no caption text
    for raw in vtt.lstrip("\ufeff").splitlines():
        line = raw.strip()
        if not line:
            # A blank line terminates whatever block we were in.
            block_start, skipping = True, False
            continue
        opens_block, block_start = block_start, False
        if _VTT_TIMING_RE.match(line):
            # A timing line always introduces cue text, even if the header
            # was not followed by the blank line the format asks for.
            skipping = False
            continue
        if opens_block and _starts_non_cue_block(line):
            skipping = True
            continue
        if skipping or line.isdigit():
            continue
        kept.append(line)
    # Strip tags before decoding entities so "&lt;b&gt;" survives as "<b>".
    return html.unescape(_VTT_TAG_RE.sub("", "\n".join(kept))).strip()


def _track_language(track: dict) -> str:
    """Return the track's language code, or "" when it has none.

    Both ``languageCode`` and ``language_code`` are read; which spelling an
    instance emits is not visible from this file, so both are accepted.
    """
    return str(track.get("languageCode") or track.get("language_code") or "")


def _pick_caption_track(tracks: List[dict], langs: List[str]) -> dict:
    """Choose the track for the first matching entry of ``langs``.

    A track matches when its language code equals the requested language
    (case-insensitive) or its label starts with the primary subtag
    (``"en"`` matches ``"English"``). Falls back to the first track.
    """
    for lang in langs:
        wanted = lang.lower()
        prefix = wanted.split("-")[0]
        for track in tracks:
            label = str(track.get("label") or "").lower()
            if _track_language(track).lower() == wanted or label.startswith(prefix):
                return track
    return tracks[0]


# ------------------ YOUTUBE FETCHERS ----------------------
def extract_video_id(video_url: str) -> str:
    """
    Extract a YouTube video ID from any URL format
    (watch?v=, youtu.be/, shorts/, embed/, or raw ID).

    Raises:
        ValueError: if no 11-character video ID can be found.
    """
    url = (video_url or "").strip()
    if _VIDEO_ID_RE.fullmatch(url):
        return url
    for pattern in _VIDEO_ID_URL_PATTERNS:
        m = pattern.search(url)
        if m:
            return m.group(1)
    m = _VIDEO_ID_TAIL_RE.search(url)
    if m:
        return m.group(1)
    raise ValueError(
        "Unable to extract a valid YouTube video ID from the provided input."
    )


@tool("get_youtube_title_description")
def get_youtube_title_description(video_url: str) -> str:
    """
    Get the YouTube title and description from a YouTube URL
    (ex: https://www.youtube.com/watch?v=1htKBjuUWec).
    Extracts them from the Open Graph meta tags and falls back to the
    standard ``title`` / ``description`` meta tags if needed.
    A bare 11-character video ID is also accepted.
    """
    target = (video_url or "").strip()
    if _VIDEO_ID_RE.fullmatch(target):
        # A bare ID has no scheme and cannot be fetched; expand it.
        target = f"https://www.youtube.com/watch?v={target}"
    page = _fetch_html(target)
    soup = BeautifulSoup(page, "html.parser")
    title_tag = soup.find("meta", property="og:title") or soup.find(
        "meta", attrs={"name": "title"}
    )
    desc_tag = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    title = (title_tag.get("content") if title_tag else None) or "No title found"
    description = (
        desc_tag.get("content") if desc_tag else None
    ) or "No description found"
    return f"Title: {title}\nDescription: {description}"


@tool("get_youtube_transcript")
def get_youtube_transcript_tool(
    video_id: str, langs: Optional[List[str]] = None
) -> str:
    """
    Tool: return YouTube transcript text for a video ID.
    A full YouTube URL is accepted as well. ``langs`` lists preferred
    caption languages in priority order (default: en, en-US, fr, fr-FR).
    """
    try:
        video_id = extract_video_id(video_id)
    except ValueError:
        # Not recognisable as an ID or URL: pass it through unchanged, as
        # before, and let the instances decide.
        video_id = (video_id or "").strip()
    langs = langs or ["en", "en-US", "fr", "fr-FR"]
    last_err = None
    for base in INVIDIOUS_POOL:
        try:
            caps = _http_json(base, f"/api/v1/captions/{video_id}")
            tracks = caps.get("captions", caps) if isinstance(caps, dict) else caps
            if isinstance(tracks, list):
                tracks = [t for t in tracks if isinstance(t, dict)]
            if not isinstance(tracks, list) or not tracks:
                last_err = LookupError(f"No caption tracks listed on {base}")
                continue
            pick = _pick_caption_track(tracks, langs)
            if pick.get("url"):
                vtt = _http_text(base, pick["url"])
            else:
                language = _track_language(pick)
                vtt = _http_text(
                    base,
                    f"/api/v1/captions/{video_id}",
                    label=pick.get("label") or language or "English",
                    lang=language or "en",
                )
            text = _vtt_to_text(vtt)
            if text:
                return text
            last_err = LookupError(f"Empty caption track on {base}")
        except Exception as e:
            # Deliberately broad: any failure on one public instance
            # (network, HTTP, malformed payload) just moves on to the next.
            last_err = e
            continue
    raise RuntimeError(f"No captions available. Last error: {last_err}") from last_err