unit4_test

Sleeping

App Files Files Community

Vladyslav Khaitov committed on Jul 8, 2025

Commit

a670fa5

1 Parent(s): 74bcd34

Add youtube video info + transcript extraction tool

Browse files

Files changed (4) hide show

smolagents_agent.py +4 -1
tools/__init__.py +1 -0
tools/mdconvert.py +1 -1
tools/youtube_video_inspector_tool.py +108 -0

smolagents_agent.py CHANGED Viewed

@@ -5,7 +5,7 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
                         VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
                         FinalAnswerTool, GradioUI)
-from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool
 from tools import (
     ArchiveSearchTool,
     FinderTool,
@@ -74,6 +74,7 @@ def create_agent():
     document_inspection_tool = TextFileInspectorTool(model, text_limit)
     image_inspection_tool = ImageInspectorTool(model)
     audio_inspection_tool = AudioInspectorTool(model)
     python_interpreter = PythonInterpreterTool()
     final_answer = FinalAnswerTool()
     # TODO:
@@ -89,12 +90,14 @@ def create_agent():
             document_inspection_tool,
             image_inspection_tool,
             audio_inspection_tool,
             python_interpreter,
             final_answer
         ],
         add_base_tools=False,
         max_steps=20,
         verbosity_level=2,
         # grammar=None,
         # planning_interval=None,
         # name=None,

                         VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
                         FinalAnswerTool, GradioUI)
+from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool, YouTubeVideoInspectorTool
 from tools import (
     ArchiveSearchTool,
     FinderTool,
     document_inspection_tool = TextFileInspectorTool(model, text_limit)
     image_inspection_tool = ImageInspectorTool(model)
     audio_inspection_tool = AudioInspectorTool(model)
+    youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
     python_interpreter = PythonInterpreterTool()
     final_answer = FinalAnswerTool()
     # TODO:
             document_inspection_tool,
             image_inspection_tool,
             audio_inspection_tool,
+            youtube_video_inspection_tool,
             python_interpreter,
             final_answer
         ],
         add_base_tools=False,
         max_steps=20,
         verbosity_level=2,
+        additional_authorized_imports=['numpy', 'pandas']
         # grammar=None,
         # planning_interval=None,
         # name=None,

tools/__init__.py CHANGED Viewed

@@ -10,3 +10,4 @@ from .text_web_browser import (
 )
 from .image_inspector_tool import ImageInspectorTool
 from .audio_inspector_tool import AudioInspectorTool

 )
 from .image_inspector_tool import ImageInspectorTool
 from .audio_inspector_tool import AudioInspectorTool
+from .youtube_video_inspector_tool import YouTubeVideoInspectorTool

tools/mdconvert.py CHANGED Viewed

@@ -311,7 +311,7 @@ class YouTubeConverter(DocumentConverter):
             video_id = str(params["v"][0])
             try:
                 # Must be a single transcript.
-                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
                 # transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                 # Alternative formatting:
                 transcript_text = SRTFormatter().format_transcript(transcript)

             video_id = str(params["v"][0])
             try:
                 # Must be a single transcript.
+                transcript = YouTubeTranscriptApi.fetch(video_id)  # type: ignore
                 # transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                 # Alternative formatting:
                 transcript_text = SRTFormatter().format_transcript(transcript)

tools/youtube_video_inspector_tool.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from urllib.parse import urlparse, parse_qs
+from smolagents import Tool
+from smolagents.models import Model, ChatMessage
+from .mdconvert import YouTubeConverter
+import requests
+import tempfile
+import os
+class YouTubeVideoInspectorTool(Tool):
+    name = "inspect_youtube_video"
+    description = """
+A tool to inspect YouTube videos by URL. It extracts the video title, metadata, description, and transcript (if available), and can answer questions about the video content. Use this tool for YouTube video URLs only. It does not handle playlists or non-YouTube URLs.
+"""
+    inputs = {
+        "youtube_url": {
+            "description": "The URL of the YouTube video to inspect. Must be a direct YouTube video URL (https://www.youtube.com/watch?v=...).",
+            "type": "string",
+        },
+        "question": {
+            "description": "[Optional]: Your question about the video. If not provided, returns the extracted video content and transcript.",
+            "type": "string",
+            "nullable": True,
+        },
+    }
+    output_type = "string"
+    def __init__(self, model: Model, text_limit: int = 100000):
+        super().__init__()
+        self.model = model
+        self.text_limit = text_limit
+        self.youtube_converter = YouTubeConverter()
+    def extract_youtube_video_id(self, url: str) -> str | None:
+        parsed = urlparse(url)
+        if parsed.netloc in ["www.youtube.com", "youtube.com", "m.youtube.com"]:
+            if parsed.path == "/watch":
+                qs = parse_qs(parsed.query)
+                return qs.get("v", [None])[0]
+            elif parsed.path.startswith("/embed/"):
+                return parsed.path.split("/embed/")[1].split("/")[0]
+        elif parsed.netloc == "youtu.be":
+            return parsed.path.lstrip("/")
+        return None
+    def forward(self, youtube_url: str, question: str | None = None) -> str:
+        from smolagents.models import MessageRole
+        video_id = self.extract_youtube_video_id(youtube_url)
+        if not video_id:
+            raise Exception("This tool only supports direct YouTube video URLs (watch, youtu.be, or embed links).")
+        canonical_url = f"https://www.youtube.com/watch?v={video_id}"
+        # Download the HTML page of the YouTube video into a temporary directory
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            html_response = requests.get(canonical_url)
+            html_filename = f"{video_id}.html"
+            html_path = os.path.join(tmp_dir, html_filename)
+            with open(html_path, "w", encoding="utf-8") as html_file:
+                html_file.write(html_response.text)
+            # Use the temporary HTML file for conversion
+            result = self.youtube_converter.convert(local_path=html_path, file_extension='.html', url=canonical_url)
+            if result is None:
+                raise Exception("Failed to extract video data. Ensure the URL is a valid YouTube video and try again.")
+        if not question:
+            return result.text_content
+        messages = [
+            ChatMessage(
+                role=MessageRole.SYSTEM,
+                content=[
+                    {
+                        "type": "text",
+                        "text": "You will have to write a short caption for this YouTube video, then answer this question: " + question,
+                    }
+                ],
+            ),
+            ChatMessage(
+                role=MessageRole.USER,
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Here is the complete video transcript and metadata as markdown text:\n### "
+                        + str(result.title)
+                        + "\n\n"
+                        + result.text_content[: self.text_limit],
+                    }
+                ],
+            ),
+            ChatMessage(
+                role=MessageRole.USER,
+                content=[
+                    {
+                        "type": "text",
+                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the video and question asked.' " + question,
+                    }
+                ],
+            ),
+        ]
+        output = self.model(messages).content
+        if isinstance(output, list):
+            output = str(output)
+        return str(output)