Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import re | |
| from typing import Optional, List | |
| from bs4 import BeautifulSoup | |
| import requests | |
| from langchain_core.tools import tool | |
# ------------------ CONFIG ----------------------
# Seconds before an outgoing HTTP request is abandoned.
DEFAULT_TIMEOUT = 20
HEADERS = {
    # Helps avoid consent/anti-bot interstitials on some sites
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}
# Public Invidious mirrors, tried in order until one yields captions.
# Trailing slashes are tolerated: helpers rstrip("/") before joining paths.
INVIDIOUS_POOL = [
    "https://yewtu.be",
    "https://inv.nadeko.net",
    "https://invidious.tiekoetter.com/",
    "https://invidious.f5.si",
    "https://invidious.nerdvpn.de",
]
| # ------------------ HELPERS ---------------------- | |
def _fetch_html(url: str, *, timeout: int = DEFAULT_TIMEOUT) -> str:
    """Return the raw HTML body of *url*.

    Uses the module-wide browser-like HEADERS; raises
    requests.HTTPError on any 4xx/5xx response.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return response.text
def _http_json(base: str, path: str, **params):
    """GET ``base + path`` and return the parsed JSON payload.

    Raises FileNotFoundError on 404/410 (treated as "no captions" by
    callers) and requests.HTTPError on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(
        url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT
    )
    if response.status_code in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if response.status_code >= 400:
        raise requests.HTTPError(f"{response.status_code} at {base}{path}")
    return response.json()
def _http_text(base: str, path: str, **params) -> str:
    """GET ``base + path`` and return the response body as text.

    Raises FileNotFoundError on 404/410 (treated as "no captions" by
    callers) and requests.HTTPError on any other 4xx/5xx status.
    """
    url = base.rstrip("/") + path
    response = requests.get(
        url, params=params, headers=HEADERS, timeout=DEFAULT_TIMEOUT
    )
    if response.status_code in (404, 410):
        raise FileNotFoundError(f"No captions on {base}")
    if response.status_code >= 400:
        raise requests.HTTPError(f"{response.status_code} at {base}{path}")
    return response.text
| def _vtt_to_text(vtt: str) -> str: | |
| lines = [] | |
| for line in vtt.splitlines(): | |
| if not line or line.startswith("WEBVTT"): | |
| continue | |
| if re.match(r"\d{2}:\d{2}:\d{2}\.\d{3} --> ", line) or line.isdigit(): | |
| continue | |
| lines.append(line) | |
| return re.sub(r"<[^>]+>", "", "\n".join(lines)).strip() | |
| # ------------------ YOUTUBE FETCHERS ---------------------- | |
def extract_video_id(video_url: str) -> str:
    """Return the 11-character YouTube video ID found in *video_url*.

    Accepts watch?v=, youtu.be/, shorts/ and embed/ URLs, a generic
    ``/<id>`` path, or a bare 11-character ID.  Raises ValueError when
    no ID can be located.
    """
    candidate = (video_url or "").strip()

    # A bare video ID needs no URL parsing at all.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate

    # Specific URL shapes first; a generic /<id> path is the last resort.
    id_patterns = (
        r"[?&]v=([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"youtube\.com/embed/([A-Za-z0-9_-]{11})",
        r"youtube\.com/shorts/([A-Za-z0-9_-]{11})",
        r"/([A-Za-z0-9_-]{11})(?:\?|$)",
    )
    for pattern in id_patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)

    raise ValueError(
        "Unable to extract a valid YouTube video ID from the provided input."
    )
def get_youtube_title_description(video_url: str) -> str:
    """
    get_youtube title and description from youtube url (ex: https://www.youtube.com/watch?v=1htKBjuUWec)
    Extract YouTube title + description from Open Graph meta tags.
    Falls back to standard <meta name="title"/"description"> if needed.
    """
    soup = BeautifulSoup(_fetch_html(video_url), "html.parser")

    def _meta_content(og_property: str, fallback_name: str):
        # Prefer the Open Graph tag; fall back to the plain <meta name=...>.
        tag = soup.find("meta", property=og_property) or soup.find(
            "meta", attrs={"name": fallback_name}
        )
        return tag.get("content") if tag else None

    title = _meta_content("og:title", "title") or "No title found"
    description = (
        _meta_content("og:description", "description") or "No description found"
    )
    return f"Title: {title}\nDescription: {description}"
def get_youtube_transcript_tool(
    video_id: str, langs: Optional[List[str]] = None
) -> str:
    """
    Tool: return YouTube transcript text for a video ID.

    Tries each Invidious mirror in INVIDIOUS_POOL until one returns a
    usable caption track, preferring the languages in *langs* (defaults
    to English then French variants).  The chosen track's WebVTT payload
    is converted to plain text via _vtt_to_text.

    Raises RuntimeError when every mirror fails, chaining the last
    underlying error message into the exception text.
    """
    langs = langs or ["en", "en-US", "fr", "fr-FR"]
    last_err = None
    for base in INVIDIOUS_POOL:
        try:
            caps = _http_json(base, f"/api/v1/captions/{video_id}")
            # Some mirrors return {"captions": [...]} while others return
            # the track list directly; normalise to a list either way.
            tracks = caps.get("captions", caps) if isinstance(caps, dict) else caps
            if not isinstance(tracks, list) or not tracks:
                continue
            pick = None
            # Walk the preferred languages in priority order; a track
            # matches on exact languageCode or on a label starting with
            # the language's base code (e.g. "en" matches "English").
            for lang in langs:
                pick = next(
                    (
                        c
                        for c in tracks
                        if (c.get("languageCode", "").lower() == lang.lower())
                        or (
                            c.get("label", "")
                            .lower()
                            .startswith(lang.split("-")[0].lower())
                        )
                    ),
                    None,
                )
                if pick:
                    break
            # No preferred language found: fall back to the first track.
            pick = pick or tracks[0]
            # Tracks usually carry a ready-made caption URL; otherwise
            # re-query the captions endpoint with label/lang parameters.
            vtt = (
                _http_text(base, pick["url"])
                if pick.get("url")
                else _http_text(
                    base,
                    f"/api/v1/captions/{video_id}",
                    label=pick.get("label") or pick.get("languageCode") or "English",
                    lang=pick.get("languageCode") or "en",
                )
            )
            text = _vtt_to_text(vtt)
            if text:
                return text
        except Exception as e:
            # Best-effort failover: remember the error and try the next
            # mirror rather than aborting on a single bad instance.
            last_err = e
            continue
    raise RuntimeError(f"No captions available. Last error: {last_err}")