| """ |
| tools/youtube_transcript.py —— 工具⑦:抓取 YouTube 视频字幕 |
| |
| 有的题目给一个 YouTube 视频链接,问"视频里某人说了什么/出现了什么数字"。 |
| 这个工具不去真的"看"视频(那太慢),而是直接抓取视频自带的字幕文字, |
| 把整段台词拼成一段文字交给大模型,从中找答案。 |
| """ |
|
|
| import re |
|
|
| from langchain_core.tools import tool |
|
|
|
|
@tool
def youtube_transcript(url: str) -> str:
    """Return the spoken transcript of a YouTube video given its URL (or video id). Use it
    to answer questions about what is said in a video."""
    # NOTE: the docstring above is intentionally left short and unchanged --
    # langchain's @tool decorator exposes it to the model as the tool
    # description, so implementation notes live in comments instead.
    #
    # Contract: returns all caption snippets joined with single spaces. Any
    # failure (missing package, no captions, bad id, network error) is
    # reported as a "Could not fetch transcript ..." string, never raised,
    # so the calling agent always gets text back.

    # Extract the 11-character video id from the common URL shapes:
    # watch?v=, youtu.be/, /shorts/, /embed/, /live/ and the legacy /v/ path.
    # Without /live/ and /v/ the whole URL used to be passed on as the "id".
    match = re.search(
        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/|/v/)([0-9A-Za-z_-]{11})",
        url,
    )
    # No recognizable URL shape: assume the caller passed a bare video id.
    video_id = match.group(1) if match else url.strip()
    try:
        # Imported lazily so a missing dependency is reported through the
        # same error string as every other failure.
        from youtube_transcript_api import YouTubeTranscriptApi

        # The library has two API generations; probe for the older one first.
        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Older API: class-level call returning dicts with a "text" key.
            chunks = YouTubeTranscriptApi.get_transcript(video_id)
            return " ".join(c["text"] for c in chunks)

        # Newer API: instance-level fetch() yielding objects with a .text attribute.
        fetched = YouTubeTranscriptApi().fetch(video_id)
        return " ".join(snippet.text for snippet in fetched)
    except Exception as e:
        # Deliberate best-effort boundary: hand the reason back as text.
        return f"Could not fetch transcript for '{url}': {e}"
|
|