unit4_test

Sleeping

App Files Files Community

Vladyslav Khaitov committed on Jul 26, 2025

Commit

4933f00

1 Parent(s): 6211332

Add new YouTube tools, change audio tool to audio transcriber, improve system prompt

Browse files

Files changed (6) hide show

app.py +12 -7
requirements.txt +3 -1
smolagents_agent.py +12 -5
tools/__init__.py +3 -2
tools/audio_inspector_tool.py +9 -22
tools/yt_inspector_tool.py +169 -0

app.py CHANGED Viewed

@@ -23,16 +23,21 @@ class BasicAgent:
         # return fixed_answer
         # using https://huggingface.co/spaces/gaia-benchmark/leaderboard
         system_message = """
-You are a general AI assistant. I will ask you a question.
 Report your thoughts, and provide the answer.
-The answer should be a number OR as few words as possible OR a comma (with space) separated list of numbers and/or strings.
-If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise or write the digits in plain text.
-If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
-If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
-Do NOT use regular expressions is not absolutely necessary.
         """.strip()
         agent = create_agent()
-        answer = agent.run(system_message + '\n\nQuestion:\n' + question)
         return answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):

         # return fixed_answer
         # using https://huggingface.co/spaces/gaia-benchmark/leaderboard
         system_message = """
+You are a general AI assistant. I will ask you one question.
 Report your thoughts, and provide the answer.
+- The answer should be a number OR as few words as possible OR a comma (with space) separated list of numbers and/or strings.
+- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise or write the digits in plain text.
+- If you are asked for a string, don't use articles, neither abbreviations nor shortened versions (e.g. for cities), and write the digits in plain text unless specified otherwise.
+- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+- Pay special attention to dates in question. If needed use the Wayback Machine to search for appropriate archived pages.
+- Do NOT use regex / regular expressions.
+Question:
         """.strip()
         agent = create_agent()
+        answer = agent.run(system_message + '\n' + question)
         return answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):

requirements.txt CHANGED Viewed

@@ -28,4 +28,6 @@ torch
 # opentelemetry-sdk
 # opentelemetry-exporter-otlp
 # openinference-instrumentation-smolagents
-# langfuse #==

 # opentelemetry-sdk
 # opentelemetry-exporter-otlp
 # openinference-instrumentation-smolagents
+# langfuse #==
+yt-dlp
+opencv-python-headless

smolagents_agent.py CHANGED Viewed

@@ -6,7 +6,10 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
                         PythonInterpreterTool, \
                         FinalAnswerTool, GradioUI)
-from tools import TextFileInspectorTool, ImageInspectorTool, VisualQATool, AudioInspectorTool, YouTubeVideoInspectorTool
 from tools import (
     ArchiveSearchTool,
     FinderTool,
@@ -97,8 +100,10 @@ def create_agent():
     ]
     image_inspection_tool = ImageInspectorTool(model)
     # visual_qa_tool = VisualQATool(model)
-    audio_inspection_tool = AudioInspectorTool(model)
-    youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
     python_interpreter = PythonInterpreterTool()
     final_answer = FinalAnswerTool()
     # TODO:
@@ -114,8 +119,10 @@ def create_agent():
             document_inspection_tool,
             image_inspection_tool,
             # visual_qa_tool,
-            audio_inspection_tool,
-            youtube_video_inspection_tool,
             python_interpreter,
             final_answer
         ],

                         PythonInterpreterTool, \
                         FinalAnswerTool, GradioUI)
+from tools import (TextFileInspectorTool, ImageInspectorTool,
+                   # VisualQATool, YouTubeVideoInspectorTool
+                   YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool,
+                   AudioTranscriberTool)
 from tools import (
     ArchiveSearchTool,
     FinderTool,
     ]
     image_inspection_tool = ImageInspectorTool(model)
     # visual_qa_tool = VisualQATool(model)
+    audio_transcriber_tool = AudioTranscriberTool(model)
+    # youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
+    youtube_visual_inspection_tool = YouTubeVisualInspectorTool(model)
+    youtube_audio_transcriber_tool = YouTubeAudioTranscriberTool(model)
     python_interpreter = PythonInterpreterTool()
     final_answer = FinalAnswerTool()
     # TODO:
             document_inspection_tool,
             image_inspection_tool,
             # visual_qa_tool,
+            audio_transcriber_tool,
+            # youtube_video_inspection_tool,
+            youtube_visual_inspection_tool,
+            youtube_audio_transcriber_tool,
             python_interpreter,
             final_answer
         ],

tools/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ from .text_web_browser import (
     VisitTool,
 )
 from .image_inspector_tool import ImageInspectorTool
-from .audio_inspector_tool import AudioInspectorTool
 from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
-from .visual_qa_tool import VisualQATool

     VisitTool,
 )
 from .image_inspector_tool import ImageInspectorTool
+from .audio_inspector_tool import AudioTranscriberTool
 from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
+from .yt_inspector_tool import YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool
+# from .visual_qa_tool import VisualQATool

tools/audio_inspector_tool.py CHANGED Viewed

@@ -9,22 +9,17 @@ from smolagents import Tool
 from smolagents.models import Model, ChatMessage
-class AudioInspectorTool(Tool):
-    name = "inspect_audio"
-    description = """A tool that can answer questions about attached audio files. Use this tool when you need to analyze or describe audio content.
-This tool handles various audio formats and can provide detailed descriptions or answer specific questions about audio content.
 """
     inputs = {
         "audio_path": {
-            "description": "The path to the audio file on which to answer the question. This should be a local path to downloaded audio.",
             "type": "string",
         },
-        "question": {
-            "description": "[Optional]: The question to answer about the audio. If not provided, will generate a detailed description.",
-            "type": "string",
-            "nullable": True,
-        },
     }
     output_type = "string"
@@ -32,14 +27,9 @@ This tool handles various audio formats and can provide detailed descriptions or
         super().__init__()
         self.model = model
-    def forward(self, audio_path: str, question: str | None = None) -> str:
         if not isinstance(audio_path, str):
-            raise Exception("You should provide at least `audio_path` string argument to this tool!")
-        add_note = False
-        if not question:
-            add_note = True
-            question = "Transcribe this audio."
         with open(audio_path, "rb") as audio_file:
             base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
@@ -51,7 +41,7 @@ This tool handles various audio formats and can provide detailed descriptions or
                 content = [
                     {
                         "type": "text",
-                        "text": question,
                     },
                     {
                         "type": "input_audio",
@@ -70,9 +60,6 @@ This tool handles various audio formats and can provide detailed descriptions or
                 # Handle case where content is a list of dicts
                 output = str(output)
         except Exception as e:
-            raise Exception("Response format unexpected: " + str(e))
-        if add_note:
-            output = f"You did not provide a particular question, so here is a detailed description of the audio: {output}"
         return str(output)

 from smolagents.models import Model, ChatMessage
+class AudioTranscriberTool(Tool):
+    name = "transcribe_audio"
+    description = """A tool that transcribes audio files to text. Use this tool when you need to convert speech or audio content into written text.
+This tool handles various audio formats and provides accurate transcriptions of audio content.
 """
     inputs = {
         "audio_path": {
+            "description": "The path to the audio file to transcribe. This should be a local path to downloaded audio.",
             "type": "string",
         },
     }
     output_type = "string"
         super().__init__()
         self.model = model
+    def forward(self, audio_path: str) -> str:
         if not isinstance(audio_path, str):
+            raise Exception("You should provide the `audio_path` string argument to this tool!")
         with open(audio_path, "rb") as audio_file:
             base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
                 content = [
                     {
                         "type": "text",
+                        "text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
                     },
                     {
                         "type": "input_audio",
                 # Handle case where content is a list of dicts
                 output = str(output)
         except Exception as e:
+            raise Exception("Transcription failed: " + str(e))
         return str(output)

tools/yt_inspector_tool.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import base64
+import platform
+from smolagents import Tool
+from smolagents.models import Model, ChatMessage
+import yt_dlp
+import tempfile
+import os
+import cv2
+class YouTubeVisualInspectorTool(Tool):
+    """Answer a question about the visual content of a YouTube video.
+
+    The video is downloaded with yt-dlp into a temporary directory, every
+    ``frame_step``-th frame is JPEG-encoded with OpenCV, and the frames are
+    sent to ``model`` together with the question as base64 ``image_url`` parts.
+    """
+    name = "youtube_visual_inspector"
+    description = """A tool that downloads a YouTube video, extracts frames, and answers a question based on the video content. Use this tool to ask questions about the visual content of a YouTube video."""
+    inputs = {
+        "youtube_url": {
+            "description": "The URL of the YouTube video to analyze.",
+            "type": "string",
+        },
+        "question": {
+            "description": "The question to answer about the video.",
+            "type": "string",
+        },
+    }
+    output_type = "string"
+    def __init__(self, model: Model, frame_step: int = 25):
+        """Store the model and the frame sampling stride.
+
+        Args:
+            model: Chat model that is called with the question and the frames.
+            frame_step: Keep one frame out of every ``frame_step`` decoded
+                frames. Defaults to 25, the previously hard-coded value.
+
+        Raises:
+            ValueError: If ``frame_step`` is smaller than 1.
+        """
+        super().__init__()
+        if frame_step < 1:
+            raise ValueError("`frame_step` must be a positive integer.")
+        self.model = model
+        self.frame_step = frame_step
+    def forward(self, youtube_url: str, question: str) -> str:
+        """Download the video, sample frames and ask the model the question.
+
+        Args:
+            youtube_url: URL of the video to download.
+            question: Question sent to the model alongside the frames.
+
+        Returns:
+            The model's answer converted to ``str``.
+
+        Raises:
+            Exception: If an argument is not a string, the downloaded video
+                cannot be opened, no frame can be extracted, or the model
+                call fails.
+        """
+        if not isinstance(youtube_url, str) or not isinstance(question, str):
+            raise Exception("You should provide both `youtube_url` and `question` string arguments to this tool!")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            ydl_opts = {
+                'format': 'mp4',
+                'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
+                'quiet': True,
+                'noplaylist': True,
+            }
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(youtube_url, download=True)
+                video_path = ydl.prepare_filename(info)
+            if not video_path.endswith('.mp4'):
+                # The predicted filename is not an mp4; fall back to the first
+                # mp4 that actually exists in the download directory.
+                for f in os.listdir(tmpdir):
+                    if f.endswith('.mp4'):
+                        video_path = os.path.join(tmpdir, f)
+                        break
+            vidcap = cv2.VideoCapture(video_path)
+            if not vidcap.isOpened():
+                # Without this check a failed open yields zero frames and the
+                # model would answer from the question text alone.
+                vidcap.release()
+                raise Exception("Could not open the downloaded video: " + video_path)
+            frames = []
+            try:
+                count = 0
+                success, image = vidcap.read()
+                while success:
+                    if count % self.frame_step == 0:
+                        encoded, buffer = cv2.imencode('.jpg', image)
+                        if encoded:
+                            frame_b64 = base64.b64encode(buffer.tobytes()).decode('utf-8')
+                            frames.append(frame_b64)
+                    success, image = vidcap.read()
+                    count += 1
+            finally:
+                # Release the capture even if decoding or encoding raises.
+                vidcap.release()
+            if not frames:
+                raise Exception("No frames could be extracted from the downloaded video.")
+            # One text part with the question followed by one image part per frame.
+            messages = [
+                ChatMessage(
+                    role="user",
+                    content=[
+                        {"type": "text", "text": question},
+                        *[
+                            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}}
+                            for frame in frames
+                        ]
+                    ]
+                )
+            ]
+            try:
+                output = self.model(messages).content
+                if isinstance(output, list):
+                    # Content may come back as a list of parts; stringify it.
+                    output = str(output)
+            except Exception as e:
+                raise Exception("Video QA failed: " + str(e)) from e
+            return str(output)
+class YouTubeAudioTranscriberTool(Tool):
+    """Transcribe the audio track of a YouTube video.
+
+    The best available audio stream is downloaded with yt-dlp, converted to
+    mp3 by the ``FFmpegExtractAudio`` post-processor, base64-encoded and sent
+    to ``model`` as an ``input_audio`` part with a transcription instruction.
+    """
+    name = "youtube_audio_transcriber"
+    description = """A tool that downloads audio from a YouTube video and transcribes it to text. Use this tool when you need to convert speech or audio content from YouTube videos into written text.
+This tool handles various audio formats and provides accurate transcriptions of audio content from YouTube videos."""
+    inputs = {
+        "youtube_url": {
+            "description": "The URL of the YouTube video to download audio from and transcribe.",
+            "type": "string",
+        },
+    }
+    output_type = "string"
+    def __init__(self, model: Model):
+        """Store the chat model used for transcription."""
+        super().__init__()
+        self.model = model
+    def forward(self, youtube_url: str) -> str:
+        """Download the video's audio and return the model's transcription.
+
+        Args:
+            youtube_url: URL of the video whose audio should be transcribed.
+
+        Returns:
+            The model's transcription converted to ``str``.
+
+        Raises:
+            FileNotFoundError: If no audio file is present after the download.
+            Exception: If the argument is not a string or the model call fails.
+        """
+        if not isinstance(youtube_url, str):
+            raise Exception("You should provide the `youtube_url` string argument to this tool!")
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Download audio only and let ffmpeg re-encode it to mp3.
+            ydl_opts = {
+                'format': 'bestaudio/best',
+                'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
+                'quiet': True,
+                'noplaylist': True,
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'mp3',
+                    'preferredquality': '192',
+                }],
+            }
+            # Only pin the Homebrew ffmpeg when it really exists; otherwise
+            # let yt-dlp find ffmpeg on PATH (Intel Macs, other installs).
+            homebrew_ffmpeg = '/opt/homebrew/bin/ffmpeg'
+            if platform.system() == "Darwin" and os.path.isfile(homebrew_ffmpeg):
+                ydl_opts['ffmpeg_location'] = homebrew_ffmpeg
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(youtube_url, download=True)
+                audio_path = ydl.prepare_filename(info)
+            if not audio_path.endswith('.mp3'):
+                # The predicted filename does not end in .mp3, so look for the
+                # mp3 that the post-processor is configured to write.
+                for f in os.listdir(tmpdir):
+                    if f.endswith('.mp3'):
+                        audio_path = os.path.join(tmpdir, f)
+                        break
+            if not os.path.isfile(audio_path):
+                raise FileNotFoundError("Downloaded audio file not found: " + audio_path)
+            # Read and encode the audio file
+            with open(audio_path, "rb") as audio_file:
+                base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
+            # Named audio_format so the builtin `format` is not shadowed.
+            audio_format = os.path.splitext(audio_path)[1].lstrip(".")
+            messages = [
+                ChatMessage(
+                    role="user",
+                    content = [
+                        {
+                            "type": "text",
+                            "text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
+                        },
+                        {
+                            "type": "input_audio",
+                            "input_audio": {
+                                "data": base64_audio,
+                                "format": audio_format
+                            }
+                        }
+                    ]
+                )
+            ]
+            try:
+                output = self.model(messages).content
+                if isinstance(output, list):
+                    # Handle case where content is a list of dicts
+                    output = str(output)
+            except Exception as e:
+                raise Exception("Transcription failed: " + str(e)) from e
+            return str(output)