LangGraph_GAIA / tools /youtube_transcript.py
BiGuan's picture
Update tools/youtube_transcript.py
97b4da5 verified
Raw
History Blame Contribute Delete
1.44 kB
import logging
import re

import requests
from bs4 import BeautifulSoup
# Finds the 11-character video ID inside the common YouTube URL shapes
# (watch?v=..., youtu.be/..., /shorts/..., /embed/..., /live/...).
_VIDEO_ID_IN_URL = re.compile(
    r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([0-9A-Za-z_-]{11})"
)
# Recognises input that is nothing but a video ID.
_BARE_VIDEO_ID = re.compile(r"[0-9A-Za-z_-]{11}")

_logger = logging.getLogger(__name__)


def _fetch_transcript_text(video_id: str) -> str:
    """Return the transcript of *video_id* joined into a single string."""
    from youtube_transcript_api import YouTubeTranscriptApi

    # Support both API shapes: the class-level ``get_transcript`` (a list of
    # dicts) and the instance-level ``fetch`` (an iterable of snippet objects),
    # whichever the installed release provides.
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        chunks = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(chunk["text"] for chunk in chunks)
    fetched = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(snippet.text for snippet in fetched)


def youtube_transcript(url: str) -> str:
    """Return the transcript of a YouTube video, or its description as a fallback.

    The function is best-effort and never raises: every failure is reported
    through the returned message instead.

    Args:
        url: A YouTube URL (watch, youtu.be, shorts, embed or live form, with
            or without a scheme) or a bare 11-character video ID.

    Returns:
        One of:
        * ``"Transcript:\\n<text>"`` - transcript, truncated to 8000 characters;
        * ``"Video description: <text>"`` - page description, truncated to
          4000 characters, used when no transcript could be obtained;
        * ``"Video has no transcript or description."`` - page fetched but it
          carries no description;
        * ``"Video information unavailable."`` - input unusable or the page
          could not be fetched.
    """
    target = (url or "").strip()

    match = _VIDEO_ID_IN_URL.search(target)
    if match:
        video_id = match.group(1)
    elif _BARE_VIDEO_ID.fullmatch(target):
        video_id = target
    else:
        video_id = None

    is_http_url = target.lower().startswith(("http://", "https://"))
    if video_id is None and not is_http_url:
        # Neither a recognisable video ID nor a fetchable URL: nothing to try,
        # so avoid pointless network calls.
        return "Video information unavailable."

    # First choice: the transcript (only possible with a valid video ID).
    if video_id is not None:
        try:
            transcript = _fetch_transcript_text(video_id)
        except Exception as exc:  # best-effort: fall through to the description
            _logger.debug("Transcript lookup failed for %s: %s", video_id, exc)
        else:
            if transcript.strip():
                return f"Transcript:\n{transcript[:8000]}"

    # Fallback: the description from the video page. A bare ID or a
    # scheme-less URL cannot be fetched as-is, so build a canonical watch URL.
    if is_http_url:
        page_url = target
    else:
        page_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = requests.get(page_url, headers=headers, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        desc_meta = soup.find("meta", {"name": "description"})
        if desc_meta and desc_meta.get("content"):
            description = desc_meta["content"]
            return f"Video description: {description[:4000]}"
        return "Video has no transcript or description."
    except Exception as exc:  # tool boundary: report failure as a message
        _logger.debug("Description lookup failed for %s: %s", page_url, exc)
        return "Video information unavailable."