Vladyslav Khaitov committed on
Commit
a670fa5
·
1 Parent(s): 74bcd34

Add youtube video info + transcript extraction tool

Browse files
smolagents_agent.py CHANGED
@@ -5,7 +5,7 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
5
  VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
6
  FinalAnswerTool, GradioUI)
7
 
8
- from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool
9
  from tools import (
10
  ArchiveSearchTool,
11
  FinderTool,
@@ -74,6 +74,7 @@ def create_agent():
74
  document_inspection_tool = TextFileInspectorTool(model, text_limit)
75
  image_inspection_tool = ImageInspectorTool(model)
76
  audio_inspection_tool = AudioInspectorTool(model)
 
77
  python_interpreter = PythonInterpreterTool()
78
  final_answer = FinalAnswerTool()
79
  # TODO:
@@ -89,12 +90,14 @@ def create_agent():
89
  document_inspection_tool,
90
  image_inspection_tool,
91
  audio_inspection_tool,
 
92
  python_interpreter,
93
  final_answer
94
  ],
95
  add_base_tools=False,
96
  max_steps=20,
97
  verbosity_level=2,
 
98
  # grammar=None,
99
  # planning_interval=None,
100
  # name=None,
 
5
  VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
6
  FinalAnswerTool, GradioUI)
7
 
8
+ from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool, YouTubeVideoInspectorTool
9
  from tools import (
10
  ArchiveSearchTool,
11
  FinderTool,
 
74
  document_inspection_tool = TextFileInspectorTool(model, text_limit)
75
  image_inspection_tool = ImageInspectorTool(model)
76
  audio_inspection_tool = AudioInspectorTool(model)
77
+ youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
78
  python_interpreter = PythonInterpreterTool()
79
  final_answer = FinalAnswerTool()
80
  # TODO:
 
90
  document_inspection_tool,
91
  image_inspection_tool,
92
  audio_inspection_tool,
93
+ youtube_video_inspection_tool,
94
  python_interpreter,
95
  final_answer
96
  ],
97
  add_base_tools=False,
98
  max_steps=20,
99
  verbosity_level=2,
100
+ additional_authorized_imports=['numpy', 'pandas']
101
  # grammar=None,
102
  # planning_interval=None,
103
  # name=None,
tools/__init__.py CHANGED
@@ -10,3 +10,4 @@ from .text_web_browser import (
10
  )
11
  from .image_inspector_tool import ImageInspectorTool
12
  from .audio_inspector_tool import AudioInspectorTool
 
 
10
  )
11
  from .image_inspector_tool import ImageInspectorTool
12
  from .audio_inspector_tool import AudioInspectorTool
13
+ from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
tools/mdconvert.py CHANGED
@@ -311,7 +311,7 @@ class YouTubeConverter(DocumentConverter):
311
  video_id = str(params["v"][0])
312
  try:
313
  # Must be a single transcript.
314
- transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
315
  # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
316
  # Alternative formatting:
317
  transcript_text = SRTFormatter().format_transcript(transcript)
 
311
  video_id = str(params["v"][0])
312
  try:
313
  # Must be a single transcript.
314
+ transcript = YouTubeTranscriptApi.fetch(video_id) # type: ignore
315
  # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
316
  # Alternative formatting:
317
  transcript_text = SRTFormatter().format_transcript(transcript)
tools/youtube_video_inspector_tool.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from urllib.parse import urlparse, parse_qs
2
+
3
+ from smolagents import Tool
4
+ from smolagents.models import Model, ChatMessage
5
+
6
+ from .mdconvert import YouTubeConverter
7
+
8
+ import requests
9
+ import tempfile
10
+ import os
11
+
12
+
13
class YouTubeVideoInspectorTool(Tool):
    """Agent tool that fetches a YouTube watch page and inspects its content.

    The page HTML is downloaded, written to a temporary file and handed to
    ``YouTubeConverter``; the converted text is either returned as-is or,
    when a question is supplied, passed to ``model`` together with the
    question.
    """

    name = "inspect_youtube_video"
    description = """
A tool to inspect YouTube videos by URL. It extracts the video title, metadata, description, and transcript (if available), and can answer questions about the video content. Use this tool for YouTube video URLs only. It does not handle playlists or non-YouTube URLs.
"""

    inputs = {
        "youtube_url": {
            "description": "The URL of the YouTube video to inspect. Must be a direct YouTube video URL (https://www.youtube.com/watch?v=...).",
            "type": "string",
        },
        "question": {
            "description": "[Optional]: Your question about the video. If not provided, returns the extracted video content and transcript.",
            "type": "string",
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self, model: Model, text_limit: int = 100000, request_timeout: float = 30.0):
        """Create the tool.

        Args:
            model: Chat model called with the prompt messages when a question
                is asked.
            text_limit: Maximum number of characters of converted text that is
                included in the prompt sent to ``model``.
            request_timeout: Seconds to wait for the YouTube page download
                before giving up.
        """
        super().__init__()
        self.model = model
        self.text_limit = text_limit
        self.request_timeout = request_timeout
        self.youtube_converter = YouTubeConverter()

    def extract_youtube_video_id(self, url: str) -> str | None:
        """Return the video id found in ``url``, or ``None`` if there is none.

        Recognised shapes are ``youtube.com/watch?v=<id>``,
        ``youtube.com/embed/<id>`` (also on ``www.`` and ``m.`` hosts) and
        ``youtu.be/<id>``. Anything else, an empty id, or an id containing
        characters other than ASCII letters, digits, ``-`` and ``_`` yields
        ``None``.
        """
        parsed = urlparse(url)
        # ``hostname`` (unlike ``netloc``) is lower-cased and has no port.
        host = parsed.hostname or ""

        candidate = None
        if host in ("www.youtube.com", "youtube.com", "m.youtube.com"):
            if parsed.path == "/watch":
                candidate = parse_qs(parsed.query).get("v", [None])[0]
            elif parsed.path.startswith("/embed/"):
                candidate = parsed.path[len("/embed/"):].split("/")[0]
        elif host == "youtu.be":
            # Keep only the first path segment so "youtu.be/<id>/" still works.
            candidate = parsed.path.lstrip("/").split("/")[0]

        if not candidate:
            return None
        # The id is later used in a file name and a URL; reject anything that
        # is not a plain token so input such as "../x" cannot escape the
        # temporary directory.
        if not (candidate.isascii() and candidate.replace("-", "").replace("_", "").isalnum()):
            return None
        return candidate

    def forward(self, youtube_url: str, question: str | None = None) -> str:
        """Inspect a YouTube video and optionally answer a question about it.

        Args:
            youtube_url: Watch, ``youtu.be`` or embed URL of the video.
            question: Optional question about the video.

        Returns:
            The converted page text when ``question`` is empty, otherwise the
            model's reply as a string.

        Raises:
            Exception: If the URL is not a supported YouTube video URL, the
                page cannot be downloaded, or the converter returns ``None``.
        """
        from smolagents.models import MessageRole

        video_id = self.extract_youtube_video_id(youtube_url)
        if not video_id:
            raise Exception("This tool only supports direct YouTube video URLs (watch, youtu.be, or embed links).")
        canonical_url = f"https://www.youtube.com/watch?v={video_id}"

        # Fetch before creating any files. A timeout prevents the agent from
        # hanging forever, and the status check avoids converting an error page.
        try:
            html_response = requests.get(canonical_url, timeout=self.request_timeout)
            html_response.raise_for_status()
        except requests.RequestException as err:
            raise Exception(f"Failed to download the YouTube page for video '{video_id}': {err}") from err

        # The converter reads from a local path, so stage the HTML in a
        # temporary directory that is removed once conversion is done.
        with tempfile.TemporaryDirectory() as tmp_dir:
            html_path = os.path.join(tmp_dir, f"{video_id}.html")
            with open(html_path, "w", encoding="utf-8") as html_file:
                html_file.write(html_response.text)
            result = self.youtube_converter.convert(
                local_path=html_path, file_extension=".html", url=canonical_url
            )

        if result is None:
            raise Exception("Failed to extract video data. Ensure the URL is a valid YouTube video and try again.")

        if not question:
            return result.text_content

        messages = [
            ChatMessage(
                role=MessageRole.SYSTEM,
                content=[
                    {
                        "type": "text",
                        "text": "You will have to write a short caption for this YouTube video, then answer this question: " + question,
                    }
                ],
            ),
            ChatMessage(
                role=MessageRole.USER,
                content=[
                    {
                        "type": "text",
                        "text": "Here is the complete video transcript and metadata as markdown text:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            ),
            ChatMessage(
                role=MessageRole.USER,
                content=[
                    {
                        "type": "text",
                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the video and question asked.' " + question,
                    }
                ],
            ),
        ]
        # ``content`` may be a string or a list of parts; either way the tool
        # is declared to return a string.
        return str(self.model(messages).content)