Spaces:
Sleeping
Sleeping
Vladyslav Khaitov
committed on
Commit
·
a670fa5
1
Parent(s):
74bcd34
Add youtube video info + transcript extraction tool
Browse files- smolagents_agent.py +4 -1
- tools/__init__.py +1 -0
- tools/mdconvert.py +1 -1
- tools/youtube_video_inspector_tool.py +108 -0
smolagents_agent.py
CHANGED
|
@@ -5,7 +5,7 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
|
|
| 5 |
VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
|
| 6 |
FinalAnswerTool, GradioUI)
|
| 7 |
|
| 8 |
-
from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool
|
| 9 |
from tools import (
|
| 10 |
ArchiveSearchTool,
|
| 11 |
FinderTool,
|
|
@@ -74,6 +74,7 @@ def create_agent():
|
|
| 74 |
document_inspection_tool = TextFileInspectorTool(model, text_limit)
|
| 75 |
image_inspection_tool = ImageInspectorTool(model)
|
| 76 |
audio_inspection_tool = AudioInspectorTool(model)
|
|
|
|
| 77 |
python_interpreter = PythonInterpreterTool()
|
| 78 |
final_answer = FinalAnswerTool()
|
| 79 |
# TODO:
|
|
@@ -89,12 +90,14 @@ def create_agent():
|
|
| 89 |
document_inspection_tool,
|
| 90 |
image_inspection_tool,
|
| 91 |
audio_inspection_tool,
|
|
|
|
| 92 |
python_interpreter,
|
| 93 |
final_answer
|
| 94 |
],
|
| 95 |
add_base_tools=False,
|
| 96 |
max_steps=20,
|
| 97 |
verbosity_level=2,
|
|
|
|
| 98 |
# grammar=None,
|
| 99 |
# planning_interval=None,
|
| 100 |
# name=None,
|
|
|
|
| 5 |
VisitWebpageTool, GoogleSearchTool, DuckDuckGoSearchTool, PythonInterpreterTool, \
|
| 6 |
FinalAnswerTool, GradioUI)
|
| 7 |
|
| 8 |
+
from tools import TextFileInspectorTool, ImageInspectorTool, AudioInspectorTool, YouTubeVideoInspectorTool
|
| 9 |
from tools import (
|
| 10 |
ArchiveSearchTool,
|
| 11 |
FinderTool,
|
|
|
|
| 74 |
document_inspection_tool = TextFileInspectorTool(model, text_limit)
|
| 75 |
image_inspection_tool = ImageInspectorTool(model)
|
| 76 |
audio_inspection_tool = AudioInspectorTool(model)
|
| 77 |
+
youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
|
| 78 |
python_interpreter = PythonInterpreterTool()
|
| 79 |
final_answer = FinalAnswerTool()
|
| 80 |
# TODO:
|
|
|
|
| 90 |
document_inspection_tool,
|
| 91 |
image_inspection_tool,
|
| 92 |
audio_inspection_tool,
|
| 93 |
+
youtube_video_inspection_tool,
|
| 94 |
python_interpreter,
|
| 95 |
final_answer
|
| 96 |
],
|
| 97 |
add_base_tools=False,
|
| 98 |
max_steps=20,
|
| 99 |
verbosity_level=2,
|
| 100 |
+
additional_authorized_imports=['numpy', 'pandas']
|
| 101 |
# grammar=None,
|
| 102 |
# planning_interval=None,
|
| 103 |
# name=None,
|
tools/__init__.py
CHANGED
|
@@ -10,3 +10,4 @@ from .text_web_browser import (
|
|
| 10 |
)
|
| 11 |
from .image_inspector_tool import ImageInspectorTool
|
| 12 |
from .audio_inspector_tool import AudioInspectorTool
|
|
|
|
|
|
| 10 |
)
|
| 11 |
from .image_inspector_tool import ImageInspectorTool
|
| 12 |
from .audio_inspector_tool import AudioInspectorTool
|
| 13 |
+
from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
|
tools/mdconvert.py
CHANGED
|
@@ -311,7 +311,7 @@ class YouTubeConverter(DocumentConverter):
|
|
| 311 |
video_id = str(params["v"][0])
|
| 312 |
try:
|
| 313 |
# Must be a single transcript.
|
| 314 |
-
transcript = YouTubeTranscriptApi.
|
| 315 |
# transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
| 316 |
# Alternative formatting:
|
| 317 |
transcript_text = SRTFormatter().format_transcript(transcript)
|
|
|
|
| 311 |
video_id = str(params["v"][0])
|
| 312 |
try:
|
| 313 |
# Must be a single transcript.
|
| 314 |
+
transcript = YouTubeTranscriptApi.fetch(video_id) # type: ignore
|
| 315 |
# transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
| 316 |
# Alternative formatting:
|
| 317 |
transcript_text = SRTFormatter().format_transcript(transcript)
|
tools/youtube_video_inspector_tool.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from urllib.parse import urlparse, parse_qs
|
| 2 |
+
|
| 3 |
+
from smolagents import Tool
|
| 4 |
+
from smolagents.models import Model, ChatMessage
|
| 5 |
+
|
| 6 |
+
from .mdconvert import YouTubeConverter
|
| 7 |
+
|
| 8 |
+
import requests
|
| 9 |
+
import tempfile
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class YouTubeVideoInspectorTool(Tool):
    """Agent tool that inspects a single YouTube video.

    The tool downloads the video's watch page, hands it to ``YouTubeConverter``
    to obtain a text rendering (``text_content``) and a title, and then either
    returns that text directly or asks ``model`` to answer a question about it.
    """

    name = "inspect_youtube_video"
    description = """
A tool to inspect YouTube videos by URL. It extracts the video title, metadata, description, and transcript (if available), and can answer questions about the video content. Use this tool for YouTube video URLs only. It does not handle playlists or non-YouTube URLs.
"""

    inputs = {
        "youtube_url": {
            "description": "The URL of the YouTube video to inspect. Must be a direct YouTube video URL (https://www.youtube.com/watch?v=...).",
            "type": "string",
        },
        "question": {
            "description": "[Optional]: Your question about the video. If not provided, returns the extracted video content and transcript.",
            "type": "string",
            "nullable": True,
        },
    }
    output_type = "string"

    def __init__(self, model: Model, text_limit: int = 100000):
        """Store the model and limits used by :meth:`forward`.

        Args:
            model: Chat model called with the extracted text when a question
                is supplied.
            text_limit: Maximum number of characters of extracted text that
                is forwarded to the model.
        """
        super().__init__()
        self.model = model
        self.text_limit = text_limit
        self.youtube_converter = YouTubeConverter()

    def extract_youtube_video_id(self, url: str) -> str | None:
        """Return the video id found in ``url``, or ``None`` if unsupported.

        Supported shapes are ``youtube.com/watch?v=<id>``,
        ``youtube.com/embed/<id>`` (also on the ``www.`` and ``m.`` hosts) and
        ``youtu.be/<id>``. An empty id is reported as ``None``.
        """
        parsed = urlparse(url)
        # ``hostname`` is lower-cased and excludes any port, unlike ``netloc``,
        # so "WWW.YouTube.com" and "youtube.com:443" are recognised as well.
        host = parsed.hostname or ""

        video_id = None
        if host in ("www.youtube.com", "youtube.com", "m.youtube.com"):
            if parsed.path == "/watch":
                video_id = parse_qs(parsed.query).get("v", [None])[0]
            elif parsed.path.startswith("/embed/"):
                video_id = parsed.path.split("/embed/")[1].split("/")[0]
        elif host == "youtu.be":
            # Keep only the first path segment so "youtu.be/<id>/extra" does
            # not leak a slash into the id.
            video_id = parsed.path.lstrip("/").split("/")[0]

        # Normalise "" (e.g. "https://youtu.be/") to None.
        return video_id or None

    def forward(self, youtube_url: str, question: str | None = None) -> str:
        """Extract the video's text and optionally answer ``question`` about it.

        Args:
            youtube_url: A watch, ``youtu.be`` or embed URL of a single video.
            question: Optional question. When empty or ``None`` the extracted
                text is returned as-is.

        Returns:
            The converter's ``text_content`` when no question is given,
            otherwise the model's reply converted to ``str``.

        Raises:
            ValueError: If no video id can be extracted from ``youtube_url``.
            RuntimeError: If the watch page cannot be downloaded or the
                converter returns ``None``.
        """
        from smolagents.models import MessageRole

        video_id = self.extract_youtube_video_id(youtube_url)
        if not video_id:
            raise ValueError("This tool only supports direct YouTube video URLs (watch, youtu.be, or embed links).")
        canonical_url = f"https://www.youtube.com/watch?v={video_id}"

        try:
            # A timeout keeps an agent step from hanging on a stalled
            # connection; the status check avoids parsing an error page.
            html_response = requests.get(canonical_url, timeout=30)
            html_response.raise_for_status()
        except requests.RequestException as err:
            raise RuntimeError(f"Failed to download the YouTube page {canonical_url}: {err}") from err

        # The converter reads from a local file, so stage the HTML in a
        # temporary directory that is removed once conversion has finished.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Fixed file name: the id comes from caller-supplied input and must
            # not be able to steer the write outside of ``tmp_dir``.
            html_path = os.path.join(tmp_dir, "video.html")
            with open(html_path, "w", encoding="utf-8") as html_file:
                html_file.write(html_response.text)
            result = self.youtube_converter.convert(
                local_path=html_path, file_extension=".html", url=canonical_url
            )

        if result is None:
            raise RuntimeError("Failed to extract video data. Ensure the URL is a valid YouTube video and try again.")

        if not question:
            return result.text_content

        messages = [
            ChatMessage(
                role=MessageRole.SYSTEM,
                content=[
                    {
                        "type": "text",
                        "text": "You will have to write a short caption for this YouTube video, then answer this question: " + question,
                    }
                ],
            ),
            ChatMessage(
                role=MessageRole.USER,
                content=[
                    {
                        "type": "text",
                        "text": "Here is the complete video transcript and metadata as markdown text:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            ),
            ChatMessage(
                role=MessageRole.USER,
                content=[
                    {
                        "type": "text",
                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the video and question asked.' " + question,
                    }
                ],
            ),
        ]
        # ``content`` may be a plain string or a structured list; both are
        # returned in their ``str`` form.
        return str(self.model(messages).content)
|