# gaia-agents-langgraph / agents / youtube_transcript_tool.py
# (last commit a30fc90 — "chore: remove useless file and env variables")
#!/usr/bin/env python3
from __future__ import annotations
import re
from typing import Optional, List
from bs4 import BeautifulSoup
import requests
from langchain_core.tools import tool
# ------------------ CONFIG ----------------------
# Per-request HTTP timeout in seconds, shared by every fetch helper below.
DEFAULT_TIMEOUT = 20
HEADERS = {
    # Helps avoid consent/anti-bot interstitials on some sites
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}
# Public Invidious instances tried in order until one serves captions.
# Trailing slashes are tolerated: the HTTP helpers rstrip("/") before joining.
INVIDIOUS_POOL = [
    "https://yewtu.be",
    "https://inv.nadeko.net",
    "https://invidious.tiekoetter.com/",
    "https://invidious.f5.si",
    "https://invidious.nerdvpn.de",
]
# ------------------ HELPERS ----------------------
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Fetch *url* and return its raw HTML body.

    Sends the module-wide browser-like headers; raises
    ``requests.HTTPError`` for any 4xx/5xx response.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return response.text
def _http_json(base: str, path: str, **params):
    """GET ``base + path`` from an Invidious instance and decode the JSON body.

    Raises:
        FileNotFoundError: on 404/410 — interpreted by callers as
            "this instance has no captions for the video".
        requests.HTTPError: on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
    status = response.status_code
    if status in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if status >= 400:
        raise requests.HTTPError(f"{status} at {base}{path}")
    return response.json()
def _http_text(base: str, path: str, **params) -> str:
    """GET ``base + path`` from an Invidious instance and return the body as text.

    Raises:
        FileNotFoundError: on 404/410 — interpreted by callers as
            "this instance has no captions for the video".
        requests.HTTPError: on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT)
    status = response.status_code
    if status in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if status >= 400:
        raise requests.HTTPError(f"{status} at {base}{path}")
    return response.text
def _vtt_to_text(vtt: str) -> str:
lines = []
for line in vtt.splitlines():
if not line or line.startswith("WEBVTT"):
continue
if re.match(r"\d{2}:\d{2}:\d{2}\.\d{3} --> ", line) or line.isdigit():
continue
lines.append(line)
return re.sub(r"<[^>]+>", "", "\n".join(lines)).strip()
# ------------------ YOUTUBE FETCHERS ----------------------
def extract_video_id(video_url: str) -> str:
    """Return the 11-character YouTube video ID embedded in *video_url*.

    Accepts watch?v= URLs, youtu.be short links, embed/ and shorts/ URLs,
    a bare 11-character ID, or — as a last resort — any URL whose path
    ends in an 11-character ID segment.

    Raises:
        ValueError: if no plausible video ID can be found.
    """
    candidate = (video_url or "").strip()
    # A bare 11-character ID needs no URL parsing at all.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate
    id_patterns = (
        r"[?&]v=([A-Za-z0-9_-]{11})",               # watch?v=ID / &v=ID
        r"youtu\.be/([A-Za-z0-9_-]{11})",           # short links
        r"youtube\.com/embed/([A-Za-z0-9_-]{11})",  # embedded players
        r"youtube\.com/shorts/([A-Za-z0-9_-]{11})", # shorts
        r"/([A-Za-z0-9_-]{11})(?:\?|$)",            # fallback: ID as path tail
    )
    for pattern in id_patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    raise ValueError(
        "Unable to extract a valid YouTube video ID from the provided input."
    )
@tool("get_youtube_title_description")
def get_youtube_title_description(video_url: str) -> str:
    """
    get_youtube title and description from youtube url (ex: https://www.youtube.com/watch?v=1htKBjuUWec)
    Extract YouTube title + description from Open Graph meta tags.
    Falls back to standard <meta name="title"/"description"> if needed.
    """
    # NOTE: docstring above doubles as the LangChain tool description — left verbatim.
    page = BeautifulSoup(_fetch_html(video_url), "html.parser")

    def _meta_content(og_property: str, plain_name: str):
        # Prefer the Open Graph tag; fall back to the plain <meta name=...> one.
        tag = page.find("meta", property=og_property) or page.find(
            "meta", attrs={"name": plain_name}
        )
        return tag.get("content") if tag else None

    title = _meta_content("og:title", "title") or "No title found"
    description = _meta_content("og:description", "description") or "No description found"
    return f"Title: {title}\nDescription: {description}"
@tool("get_youtube_transcript")
def get_youtube_transcript_tool(
    video_id: str, langs: Optional[List[str]] = None
) -> str:
    """
    Tool: return YouTube transcript text for a video ID.
    """
    # NOTE: docstring above doubles as the LangChain tool description — left verbatim.
    # Tries each Invidious mirror in turn; first non-empty transcript wins.
    preferred = langs or ["en", "en-US", "fr", "fr-FR"]
    last_err = None
    for mirror in INVIDIOUS_POOL:
        try:
            payload = _http_json(mirror, f"/api/v1/captions/{video_id}")
            # Some instances wrap the track list in {"captions": [...]}.
            if isinstance(payload, dict):
                tracks = payload.get("captions", payload)
            else:
                tracks = payload
            if not isinstance(tracks, list) or not tracks:
                continue
            # Pick the first track matching a preferred language, by exact
            # languageCode or by label prefix (e.g. "English (auto)").
            chosen = None
            for lang in preferred:
                code = lang.lower()
                label_prefix = lang.split("-")[0].lower()
                for track in tracks:
                    if (
                        track.get("languageCode", "").lower() == code
                        or track.get("label", "").lower().startswith(label_prefix)
                    ):
                        chosen = track
                        break
                if chosen:
                    break
            chosen = chosen or tracks[0]
            # Fetch the VTT either via the track's own URL or the captions API.
            if chosen.get("url"):
                vtt = _http_text(mirror, chosen["url"])
            else:
                vtt = _http_text(
                    mirror,
                    f"/api/v1/captions/{video_id}",
                    label=chosen.get("label") or chosen.get("languageCode") or "English",
                    lang=chosen.get("languageCode") or "en",
                )
            transcript = _vtt_to_text(vtt)
            if transcript:
                return transcript
        except Exception as err:
            # Remember the failure and move on to the next mirror.
            last_err = err
            continue
    raise RuntimeError(f"No captions available. Last error: {last_err}")