Spaces:
Sleeping
Sleeping
Vladyslav Khaitov
committed on
Commit
·
4933f00
1
Parent(s):
6211332
Add new YouTube tools, change audio tool to audio transcriber, improve system prompt
Browse files- app.py +12 -7
- requirements.txt +3 -1
- smolagents_agent.py +12 -5
- tools/__init__.py +3 -2
- tools/audio_inspector_tool.py +9 -22
- tools/yt_inspector_tool.py +169 -0
app.py
CHANGED
|
@@ -23,16 +23,21 @@ class BasicAgent:
|
|
| 23 |
# return fixed_answer
|
| 24 |
# using https://huggingface.co/spaces/gaia-benchmark/leaderboard
|
| 25 |
system_message = """
|
| 26 |
-
You are a general AI assistant. I will ask you
|
| 27 |
Report your thoughts, and provide the answer.
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
If you are asked for a
|
| 31 |
-
If you are asked for a
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
""".strip()
|
| 34 |
agent = create_agent()
|
| 35 |
-
answer = agent.run(system_message + '\n
|
| 36 |
return answer
|
| 37 |
|
| 38 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
|
|
| 23 |
# return fixed_answer
|
| 24 |
# using https://huggingface.co/spaces/gaia-benchmark/leaderboard
|
| 25 |
system_message = """
|
| 26 |
+
You are a general AI assistant. I will ask you one question.
|
| 27 |
Report your thoughts, and provide the answer.
|
| 28 |
+
|
| 29 |
+
- The answer should be a number OR as few words as possible OR a comma (with space) separated list of numbers and/or strings.
|
| 30 |
+
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise or write the digits in plain text.
|
| 31 |
+
- If you are asked for a string, don't use articles, neither abbreviations nor shortened versions (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
| 32 |
+
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
| 33 |
+
|
| 34 |
+
- Pay special attention to dates in question. If needed use the Wayback Machine to search for appropriate archived pages.
|
| 35 |
+
- Do NOT use regex / regular expressions.
|
| 36 |
+
|
| 37 |
+
Question:
|
| 38 |
""".strip()
|
| 39 |
agent = create_agent()
|
| 40 |
+
answer = agent.run(system_message + '\n' + question)
|
| 41 |
return answer
|
| 42 |
|
| 43 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
requirements.txt
CHANGED
|
@@ -28,4 +28,6 @@ torch
|
|
| 28 |
# opentelemetry-sdk
|
| 29 |
# opentelemetry-exporter-otlp
|
| 30 |
# openinference-instrumentation-smolagents
|
| 31 |
-
# langfuse #==
|
|
|
|
|
|
|
|
|
| 28 |
# opentelemetry-sdk
|
| 29 |
# opentelemetry-exporter-otlp
|
| 30 |
# openinference-instrumentation-smolagents
|
| 31 |
+
# langfuse #==
|
| 32 |
+
yt-dlp
|
| 33 |
+
opencv-python-headless
|
smolagents_agent.py
CHANGED
|
@@ -6,7 +6,10 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
|
|
| 6 |
PythonInterpreterTool, \
|
| 7 |
FinalAnswerTool, GradioUI)
|
| 8 |
|
| 9 |
-
from tools import TextFileInspectorTool, ImageInspectorTool,
|
|
|
|
|
|
|
|
|
|
| 10 |
from tools import (
|
| 11 |
ArchiveSearchTool,
|
| 12 |
FinderTool,
|
|
@@ -97,8 +100,10 @@ def create_agent():
|
|
| 97 |
]
|
| 98 |
image_inspection_tool = ImageInspectorTool(model)
|
| 99 |
# visual_qa_tool = VisualQATool(model)
|
| 100 |
-
|
| 101 |
-
youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
|
|
|
|
|
|
|
| 102 |
python_interpreter = PythonInterpreterTool()
|
| 103 |
final_answer = FinalAnswerTool()
|
| 104 |
# TODO:
|
|
@@ -114,8 +119,10 @@ def create_agent():
|
|
| 114 |
document_inspection_tool,
|
| 115 |
image_inspection_tool,
|
| 116 |
# visual_qa_tool,
|
| 117 |
-
|
| 118 |
-
youtube_video_inspection_tool,
|
|
|
|
|
|
|
| 119 |
python_interpreter,
|
| 120 |
final_answer
|
| 121 |
],
|
|
|
|
| 6 |
PythonInterpreterTool, \
|
| 7 |
FinalAnswerTool, GradioUI)
|
| 8 |
|
| 9 |
+
from tools import (TextFileInspectorTool, ImageInspectorTool,
|
| 10 |
+
# VisualQATool, YouTubeVideoInspectorTool
|
| 11 |
+
YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool,
|
| 12 |
+
AudioTranscriberTool)
|
| 13 |
from tools import (
|
| 14 |
ArchiveSearchTool,
|
| 15 |
FinderTool,
|
|
|
|
| 100 |
]
|
| 101 |
image_inspection_tool = ImageInspectorTool(model)
|
| 102 |
# visual_qa_tool = VisualQATool(model)
|
| 103 |
+
audio_transcriber_tool = AudioTranscriberTool(model)
|
| 104 |
+
# youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
|
| 105 |
+
youtube_visual_inspection_tool = YouTubeVisualInspectorTool(model)
|
| 106 |
+
youtube_audio_transcriber_tool = YouTubeAudioTranscriberTool(model)
|
| 107 |
python_interpreter = PythonInterpreterTool()
|
| 108 |
final_answer = FinalAnswerTool()
|
| 109 |
# TODO:
|
|
|
|
| 119 |
document_inspection_tool,
|
| 120 |
image_inspection_tool,
|
| 121 |
# visual_qa_tool,
|
| 122 |
+
audio_transcriber_tool,
|
| 123 |
+
# youtube_video_inspection_tool,
|
| 124 |
+
youtube_visual_inspection_tool,
|
| 125 |
+
youtube_audio_transcriber_tool,
|
| 126 |
python_interpreter,
|
| 127 |
final_answer
|
| 128 |
],
|
tools/__init__.py
CHANGED
|
@@ -9,6 +9,7 @@ from .text_web_browser import (
|
|
| 9 |
VisitTool,
|
| 10 |
)
|
| 11 |
from .image_inspector_tool import ImageInspectorTool
|
| 12 |
-
from .audio_inspector_tool import
|
| 13 |
from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
|
| 14 |
-
from .
|
|
|
|
|
|
| 9 |
VisitTool,
|
| 10 |
)
|
| 11 |
from .image_inspector_tool import ImageInspectorTool
|
| 12 |
+
from .audio_inspector_tool import AudioTranscriberTool
|
| 13 |
from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
|
| 14 |
+
from .yt_inspector_tool import YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool
|
| 15 |
+
# from .visual_qa_tool import VisualQATool
|
tools/audio_inspector_tool.py
CHANGED
|
@@ -9,22 +9,17 @@ from smolagents import Tool
|
|
| 9 |
from smolagents.models import Model, ChatMessage
|
| 10 |
|
| 11 |
|
| 12 |
-
class
|
| 13 |
-
name = "
|
| 14 |
-
description = """A tool that
|
| 15 |
-
This tool handles various audio formats and
|
| 16 |
"""
|
| 17 |
|
| 18 |
inputs = {
|
| 19 |
"audio_path": {
|
| 20 |
-
"description": "The path to the audio file
|
| 21 |
"type": "string",
|
| 22 |
},
|
| 23 |
-
"question": {
|
| 24 |
-
"description": "[Optional]: The question to answer about the audio. If not provided, will generate a detailed description.",
|
| 25 |
-
"type": "string",
|
| 26 |
-
"nullable": True,
|
| 27 |
-
},
|
| 28 |
}
|
| 29 |
output_type = "string"
|
| 30 |
|
|
@@ -32,14 +27,9 @@ This tool handles various audio formats and can provide detailed descriptions or
|
|
| 32 |
super().__init__()
|
| 33 |
self.model = model
|
| 34 |
|
| 35 |
-
def forward(self, audio_path: str
|
| 36 |
if not isinstance(audio_path, str):
|
| 37 |
-
raise Exception("You should provide
|
| 38 |
-
|
| 39 |
-
add_note = False
|
| 40 |
-
if not question:
|
| 41 |
-
add_note = True
|
| 42 |
-
question = "Transcribe this audio."
|
| 43 |
|
| 44 |
with open(audio_path, "rb") as audio_file:
|
| 45 |
base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
|
|
@@ -51,7 +41,7 @@ This tool handles various audio formats and can provide detailed descriptions or
|
|
| 51 |
content = [
|
| 52 |
{
|
| 53 |
"type": "text",
|
| 54 |
-
"text":
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"type": "input_audio",
|
|
@@ -70,9 +60,6 @@ This tool handles various audio formats and can provide detailed descriptions or
|
|
| 70 |
# Handle case where content is a list of dicts
|
| 71 |
output = str(output)
|
| 72 |
except Exception as e:
|
| 73 |
-
raise Exception("
|
| 74 |
-
|
| 75 |
-
if add_note:
|
| 76 |
-
output = f"You did not provide a particular question, so here is a detailed description of the audio: {output}"
|
| 77 |
|
| 78 |
return str(output)
|
|
|
|
| 9 |
from smolagents.models import Model, ChatMessage
|
| 10 |
|
| 11 |
|
| 12 |
+
class AudioTranscriberTool(Tool):
|
| 13 |
+
name = "transcribe_audio"
|
| 14 |
+
description = """A tool that transcribes audio files to text. Use this tool when you need to convert speech or audio content into written text.
|
| 15 |
+
This tool handles various audio formats and provides accurate transcriptions of audio content.
|
| 16 |
"""
|
| 17 |
|
| 18 |
inputs = {
|
| 19 |
"audio_path": {
|
| 20 |
+
"description": "The path to the audio file to transcribe. This should be a local path to downloaded audio.",
|
| 21 |
"type": "string",
|
| 22 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
}
|
| 24 |
output_type = "string"
|
| 25 |
|
|
|
|
| 27 |
super().__init__()
|
| 28 |
self.model = model
|
| 29 |
|
| 30 |
+
def forward(self, audio_path: str) -> str:
|
| 31 |
if not isinstance(audio_path, str):
|
| 32 |
+
raise Exception("You should provide the `audio_path` string argument to this tool!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
with open(audio_path, "rb") as audio_file:
|
| 35 |
base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
|
|
|
|
| 41 |
content = [
|
| 42 |
{
|
| 43 |
"type": "text",
|
| 44 |
+
"text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"type": "input_audio",
|
|
|
|
| 60 |
# Handle case where content is a list of dicts
|
| 61 |
output = str(output)
|
| 62 |
except Exception as e:
|
| 63 |
+
raise Exception("Transcription failed: " + str(e))
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
return str(output)
|
tools/yt_inspector_tool.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import platform
|
| 3 |
+
|
| 4 |
+
from smolagents import Tool
|
| 5 |
+
from smolagents.models import Model, ChatMessage
|
| 6 |
+
|
| 7 |
+
import yt_dlp
|
| 8 |
+
import tempfile
|
| 9 |
+
import os
|
| 10 |
+
import cv2
|
| 11 |
+
|
| 12 |
+
class YouTubeVisualInspectorTool(Tool):
    """Answer a question about a YouTube video from frames sampled out of it.

    The video is downloaded with yt-dlp into a temporary directory, frames are
    sampled with OpenCV, JPEG-encoded, base64-encoded and sent to the model
    together with the question as ``image_url`` parts of one user message.
    """

    name = "youtube_visual_inspector"
    description = """A tool that downloads a YouTube video, extracts frames, and answers a question based on the video content. Use this tool to ask questions about the visual content of a YouTube video."""

    inputs = {
        "youtube_url": {
            "description": "The URL of the YouTube video to analyze.",
            "type": "string",
        },
        "question": {
            "description": "The question to answer about the video.",
            "type": "string",
        },
    }
    output_type = "string"

    def __init__(self, model: Model, frame_step: int = 25, max_frames: "int | None" = None):
        """Store the model and the frame sampling settings.

        Args:
            model: Callable model; it is invoked with a list of ``ChatMessage``
                and its result's ``.content`` is returned as the answer.
            frame_step: Keep one frame out of every ``frame_step`` decoded
                frames. Defaults to 25, the previously hard-coded value.
            max_frames: Optional upper bound on the number of frames sent to
                the model. ``None`` (default) keeps every sampled frame.

        Raises:
            ValueError: If ``frame_step`` or ``max_frames`` is smaller than 1.
        """
        super().__init__()
        if frame_step < 1:
            raise ValueError("`frame_step` must be a positive integer.")
        if max_frames is not None and max_frames < 1:
            raise ValueError("`max_frames` must be a positive integer or None.")
        self.model = model
        self.frame_step = frame_step
        self.max_frames = max_frames

    def forward(self, youtube_url: str, question: str) -> str:
        """Download the video, sample frames and ask the model the question.

        Args:
            youtube_url: URL of the video to download.
            question: Question sent to the model alongside the frames.

        Returns:
            The model's answer converted to ``str``.

        Raises:
            Exception: If an argument is not a string, the video cannot be
                downloaded or decoded, no frame can be extracted, or the model
                call fails.
        """
        if not isinstance(youtube_url, str) or not isinstance(question, str):
            raise Exception("You should provide both `youtube_url` and `question` string arguments to this tool!")

        # Frames are fully materialised in memory before the temporary
        # directory (and the downloaded file in it) is removed.
        with tempfile.TemporaryDirectory() as tmpdir:
            video_path = self._download_video(youtube_url, tmpdir)
            frames = self._sample_frames(video_path)

        # Without this guard the model would be asked about a video it never
        # saw and would answer from the question text alone.
        if not frames:
            raise Exception("Video QA failed: no frames could be extracted from the video.")

        messages = [
            ChatMessage(
                role="user",
                content=[
                    {"type": "text", "text": question},
                    *[
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}}
                        for frame in frames
                    ],
                ],
            )
        ]
        try:
            output = self.model(messages).content
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception("Video QA failed: " + str(e)) from e

        # `str()` also covers the case where the content is a list of parts.
        return str(output)

    def _download_video(self, youtube_url: str, tmpdir: str) -> str:
        """Download the video as mp4 into ``tmpdir`` and return the file path."""
        ydl_opts = {
            'format': 'mp4',
            'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
            'quiet': True,
            'noplaylist': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_url, download=True)
            video_path = ydl.prepare_filename(info)

        # The predicted filename may not match what ended up on disk; fall
        # back to the first mp4 found in the download directory.
        if not video_path.endswith('.mp4') or not os.path.isfile(video_path):
            for f in os.listdir(tmpdir):
                if f.endswith('.mp4'):
                    video_path = os.path.join(tmpdir, f)
                    break
            else:
                raise Exception("Video QA failed: the downloaded video file could not be found.")
        return video_path

    def _sample_frames(self, video_path: str) -> list:
        """Return base64-encoded JPEGs of every ``frame_step``-th frame."""
        vidcap = cv2.VideoCapture(video_path)
        frames = []
        try:
            if not vidcap.isOpened():
                raise Exception("Video QA failed: could not open the downloaded video.")
            count = 0
            while True:
                success, image = vidcap.read()
                if not success:
                    break
                if count % self.frame_step == 0:
                    encoded, buffer = cv2.imencode('.jpg', image)
                    # Skip frames that fail to encode instead of sending
                    # a corrupt payload to the model.
                    if encoded:
                        frames.append(base64.b64encode(buffer.tobytes()).decode('utf-8'))
                        if self.max_frames is not None and len(frames) >= self.max_frames:
                            break
                count += 1
        finally:
            # Always free the decoder handle, even when decoding raises.
            vidcap.release()
        return frames
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class YouTubeAudioTranscriberTool(Tool):
    """Download the audio track of a YouTube video and have the model transcribe it.

    The audio is fetched with yt-dlp, converted to mp3 by its
    ``FFmpegExtractAudio`` post-processor, base64-encoded and sent to the
    model as an ``input_audio`` part of one user message.
    """

    name = "youtube_audio_transcriber"
    description = """A tool that downloads audio from a YouTube video and transcribes it to text. Use this tool when you need to convert speech or audio content from YouTube videos into written text.
This tool handles various audio formats and provides accurate transcriptions of audio content from YouTube videos."""

    inputs = {
        "youtube_url": {
            "description": "The URL of the YouTube video to download audio from and transcribe.",
            "type": "string",
        },
    }
    output_type = "string"

    def __init__(self, model: Model):
        """Store the model; it is invoked with a list of ``ChatMessage``."""
        super().__init__()
        self.model = model

    def forward(self, youtube_url: str) -> str:
        """Download the audio of ``youtube_url`` and return its transcription.

        Args:
            youtube_url: URL of the video whose audio should be transcribed.

        Returns:
            The model's response content converted to ``str``.

        Raises:
            Exception: If the argument is not a string, the audio cannot be
                downloaded or located, or the model call fails.
        """
        if not isinstance(youtube_url, str):
            raise Exception("You should provide the `youtube_url` string argument to this tool!")

        with tempfile.TemporaryDirectory() as tmpdir:
            # Download audio only
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
                'quiet': True,
                'noplaylist': True,
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
            }
            # Only pin the Homebrew ffmpeg when it is actually installed
            # there; pointing yt-dlp at a missing binary would break macOS
            # setups whose ffmpeg lives elsewhere on PATH.
            homebrew_ffmpeg = '/opt/homebrew/bin/ffmpeg'
            if platform.system() == "Darwin" and os.path.exists(homebrew_ffmpeg):
                ydl_opts['ffmpeg_location'] = homebrew_ffmpeg

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(youtube_url, download=True)
                audio_path = ydl.prepare_filename(info)

            # `prepare_filename` reports the pre-conversion name; look for
            # the mp3 produced by the post-processor when it does not match.
            if not audio_path.endswith('.mp3') or not os.path.isfile(audio_path):
                for f in os.listdir(tmpdir):
                    if f.endswith('.mp3'):
                        audio_path = os.path.join(tmpdir, f)
                        break
                else:
                    raise Exception("Transcription failed: the downloaded audio file could not be found.")

            # Read and encode the audio file
            with open(audio_path, "rb") as audio_file:
                base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
            # Named `audio_format` to avoid shadowing the `format` builtin.
            audio_format = audio_path.split(".")[-1]

        messages = [
            ChatMessage(
                role="user",
                content=[
                    {
                        "type": "text",
                        "text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
                    },
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": base64_audio,
                            "format": audio_format,
                        },
                    },
                ],
            )
        ]

        try:
            output = self.model(messages).content
        except Exception as e:
            # Chain the cause so the underlying traceback is not lost.
            raise Exception("Transcription failed: " + str(e)) from e

        # `str()` also covers the case where the content is a list of dicts.
        return str(output)
|
| 169 |
+
|