Vladyslav Khaitov committed on
Commit
4933f00
·
1 Parent(s): 6211332

Add new YouTube tools, change audio tool to audio transcriber, improve system prompt

Browse files
app.py CHANGED
@@ -23,16 +23,21 @@ class BasicAgent:
23
  # return fixed_answer
24
  # using https://huggingface.co/spaces/gaia-benchmark/leaderboard
25
  system_message = """
26
- You are a general AI assistant. I will ask you a question.
27
  Report your thoughts, and provide the answer.
28
- The answer should be a number OR as few words as possible OR a comma (with space) separated list of numbers and/or strings.
29
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise or write the digits in plain text.
30
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
31
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
32
- Do NOT use regular expressions is not absolutely necessary.
 
 
 
 
 
33
  """.strip()
34
  agent = create_agent()
35
- answer = agent.run(system_message + '\n\nQuestion:\n' + question)
36
  return answer
37
 
38
  def run_and_submit_all( profile: gr.OAuthProfile | None):
 
23
  # return fixed_answer
24
  # using https://huggingface.co/spaces/gaia-benchmark/leaderboard
25
  system_message = """
26
+ You are a general AI assistant. I will ask you one question.
27
  Report your thoughts, and provide the answer.
28
+
29
+ - The answer should be a number OR as few words as possible OR a comma (with space) separated list of numbers and/or strings.
30
+ - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise or write the digits in plain text.
31
+ - If you are asked for a string, don't use articles, neither abbreviations nor shortened versions (e.g. for cities), and write the digits in plain text unless specified otherwise.
32
+ - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
33
+
34
+ - Pay special attention to dates in question. If needed use the Wayback Machine to search for appropriate archived pages.
35
+ - Do NOT use regex / regular expressions.
36
+
37
+ Question:
38
  """.strip()
39
  agent = create_agent()
40
+ answer = agent.run(system_message + '\n' + question)
41
  return answer
42
 
43
  def run_and_submit_all( profile: gr.OAuthProfile | None):
requirements.txt CHANGED
@@ -28,4 +28,6 @@ torch
28
  # opentelemetry-sdk
29
  # opentelemetry-exporter-otlp
30
  # openinference-instrumentation-smolagents
31
- # langfuse #==
 
 
 
28
  # opentelemetry-sdk
29
  # opentelemetry-exporter-otlp
30
  # openinference-instrumentation-smolagents
31
+ # langfuse #==
32
+ yt-dlp
33
+ opencv-python-headless
smolagents_agent.py CHANGED
@@ -6,7 +6,10 @@ from smolagents import (CodeAgent, InferenceClientModel, load_tool, tool,
6
  PythonInterpreterTool, \
7
  FinalAnswerTool, GradioUI)
8
 
9
- from tools import TextFileInspectorTool, ImageInspectorTool, VisualQATool, AudioInspectorTool, YouTubeVideoInspectorTool
 
 
 
10
  from tools import (
11
  ArchiveSearchTool,
12
  FinderTool,
@@ -97,8 +100,10 @@ def create_agent():
97
  ]
98
  image_inspection_tool = ImageInspectorTool(model)
99
  # visual_qa_tool = VisualQATool(model)
100
- audio_inspection_tool = AudioInspectorTool(model)
101
- youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
 
 
102
  python_interpreter = PythonInterpreterTool()
103
  final_answer = FinalAnswerTool()
104
  # TODO:
@@ -114,8 +119,10 @@ def create_agent():
114
  document_inspection_tool,
115
  image_inspection_tool,
116
  # visual_qa_tool,
117
- audio_inspection_tool,
118
- youtube_video_inspection_tool,
 
 
119
  python_interpreter,
120
  final_answer
121
  ],
 
6
  PythonInterpreterTool, \
7
  FinalAnswerTool, GradioUI)
8
 
9
+ from tools import (TextFileInspectorTool, ImageInspectorTool,
10
+ # VisualQATool, YouTubeVideoInspectorTool
11
+ YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool,
12
+ AudioTranscriberTool)
13
  from tools import (
14
  ArchiveSearchTool,
15
  FinderTool,
 
100
  ]
101
  image_inspection_tool = ImageInspectorTool(model)
102
  # visual_qa_tool = VisualQATool(model)
103
+ audio_transcriber_tool = AudioTranscriberTool(model)
104
+ # youtube_video_inspection_tool = YouTubeVideoInspectorTool(model, text_limit)
105
+ youtube_visual_inspection_tool = YouTubeVisualInspectorTool(model)
106
+ youtube_audio_transcriber_tool = YouTubeAudioTranscriberTool(model)
107
  python_interpreter = PythonInterpreterTool()
108
  final_answer = FinalAnswerTool()
109
  # TODO:
 
119
  document_inspection_tool,
120
  image_inspection_tool,
121
  # visual_qa_tool,
122
+ audio_transcriber_tool,
123
+ # youtube_video_inspection_tool,
124
+ youtube_visual_inspection_tool,
125
+ youtube_audio_transcriber_tool,
126
  python_interpreter,
127
  final_answer
128
  ],
tools/__init__.py CHANGED
@@ -9,6 +9,7 @@ from .text_web_browser import (
9
  VisitTool,
10
  )
11
  from .image_inspector_tool import ImageInspectorTool
12
- from .audio_inspector_tool import AudioInspectorTool
13
  from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
14
- from .visual_qa_tool import VisualQATool
 
 
9
  VisitTool,
10
  )
11
  from .image_inspector_tool import ImageInspectorTool
12
+ from .audio_inspector_tool import AudioTranscriberTool
13
  from .youtube_video_inspector_tool import YouTubeVideoInspectorTool
14
+ from .yt_inspector_tool import YouTubeVisualInspectorTool, YouTubeAudioTranscriberTool
15
+ # from .visual_qa_tool import VisualQATool
tools/audio_inspector_tool.py CHANGED
@@ -9,22 +9,17 @@ from smolagents import Tool
9
  from smolagents.models import Model, ChatMessage
10
 
11
 
12
- class AudioInspectorTool(Tool):
13
- name = "inspect_audio"
14
- description = """A tool that can answer questions about attached audio files. Use this tool when you need to analyze or describe audio content.
15
- This tool handles various audio formats and can provide detailed descriptions or answer specific questions about audio content.
16
  """
17
 
18
  inputs = {
19
  "audio_path": {
20
- "description": "The path to the audio file on which to answer the question. This should be a local path to downloaded audio.",
21
  "type": "string",
22
  },
23
- "question": {
24
- "description": "[Optional]: The question to answer about the audio. If not provided, will generate a detailed description.",
25
- "type": "string",
26
- "nullable": True,
27
- },
28
  }
29
  output_type = "string"
30
 
@@ -32,14 +27,9 @@ This tool handles various audio formats and can provide detailed descriptions or
32
  super().__init__()
33
  self.model = model
34
 
35
- def forward(self, audio_path: str, question: str | None = None) -> str:
36
  if not isinstance(audio_path, str):
37
- raise Exception("You should provide at least `audio_path` string argument to this tool!")
38
-
39
- add_note = False
40
- if not question:
41
- add_note = True
42
- question = "Transcribe this audio."
43
 
44
  with open(audio_path, "rb") as audio_file:
45
  base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
@@ -51,7 +41,7 @@ This tool handles various audio formats and can provide detailed descriptions or
51
  content = [
52
  {
53
  "type": "text",
54
- "text": question,
55
  },
56
  {
57
  "type": "input_audio",
@@ -70,9 +60,6 @@ This tool handles various audio formats and can provide detailed descriptions or
70
  # Handle case where content is a list of dicts
71
  output = str(output)
72
  except Exception as e:
73
- raise Exception("Response format unexpected: " + str(e))
74
-
75
- if add_note:
76
- output = f"You did not provide a particular question, so here is a detailed description of the audio: {output}"
77
 
78
  return str(output)
 
9
  from smolagents.models import Model, ChatMessage
10
 
11
 
12
+ class AudioTranscriberTool(Tool):
13
+ name = "transcribe_audio"
14
+ description = """A tool that transcribes audio files to text. Use this tool when you need to convert speech or audio content into written text.
15
+ This tool handles various audio formats and provides accurate transcriptions of audio content.
16
  """
17
 
18
  inputs = {
19
  "audio_path": {
20
+ "description": "The path to the audio file to transcribe. This should be a local path to downloaded audio.",
21
  "type": "string",
22
  },
 
 
 
 
 
23
  }
24
  output_type = "string"
25
 
 
27
  super().__init__()
28
  self.model = model
29
 
30
+ def forward(self, audio_path: str) -> str:
31
  if not isinstance(audio_path, str):
32
+ raise Exception("You should provide the `audio_path` string argument to this tool!")
 
 
 
 
 
33
 
34
  with open(audio_path, "rb") as audio_file:
35
  base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
 
41
  content = [
42
  {
43
  "type": "text",
44
+ "text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
45
  },
46
  {
47
  "type": "input_audio",
 
60
  # Handle case where content is a list of dicts
61
  output = str(output)
62
  except Exception as e:
63
+ raise Exception("Transcription failed: " + str(e))
 
 
 
64
 
65
  return str(output)
tools/yt_inspector_tool.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import platform
3
+
4
+ from smolagents import Tool
5
+ from smolagents.models import Model, ChatMessage
6
+
7
+ import yt_dlp
8
+ import tempfile
9
+ import os
10
+ import cv2
11
+
12
+ class YouTubeVisualInspectorTool(Tool):
13
+ name = "youtube_visual_inspector"
14
+ description = """A tool that downloads a YouTube video, extracts frames, and answers a question based on the video content. Use this tool to ask questions about the visual content of a YouTube video."""
15
+
16
+ inputs = {
17
+ "youtube_url": {
18
+ "description": "The URL of the YouTube video to analyze.",
19
+ "type": "string",
20
+ },
21
+ "question": {
22
+ "description": "The question to answer about the video.",
23
+ "type": "string",
24
+ },
25
+ }
26
+ output_type = "string"
27
+
28
+ def __init__(self, model: Model):
29
+ super().__init__()
30
+ self.model = model
31
+
32
+ def forward(self, youtube_url: str, question: str) -> str:
33
+ if not isinstance(youtube_url, str) or not isinstance(question, str):
34
+ raise Exception("You should provide both `youtube_url` and `question` string arguments to this tool!")
35
+
36
+ with tempfile.TemporaryDirectory() as tmpdir:
37
+ ydl_opts = {
38
+ 'format': 'mp4',
39
+ 'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
40
+ 'quiet': True,
41
+ 'noplaylist': True,
42
+ }
43
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
44
+ info = ydl.extract_info(youtube_url, download=True)
45
+ video_path = ydl.prepare_filename(info)
46
+ if not video_path.endswith('.mp4'):
47
+ for f in os.listdir(tmpdir):
48
+ if f.endswith('.mp4'):
49
+ video_path = os.path.join(tmpdir, f)
50
+ break
51
+
52
+ # Extract every 25th frame using OpenCV
53
+ vidcap = cv2.VideoCapture(video_path)
54
+ frames = []
55
+ count = 0
56
+ success, image = vidcap.read()
57
+ while success:
58
+ if count % 25 == 0:
59
+ _, buffer = cv2.imencode('.jpg', image)
60
+ frame_b64 = base64.b64encode(buffer.tobytes()).decode('utf-8')
61
+ frames.append(frame_b64)
62
+ success, image = vidcap.read()
63
+ count += 1
64
+ vidcap.release()
65
+
66
+ # Compose the message as per the provided example
67
+ messages = [
68
+ ChatMessage(
69
+ role="user",
70
+ content=[
71
+ {"type": "text", "text": question},
72
+ *[
73
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}"}}
74
+ for frame in frames
75
+ ]
76
+ ]
77
+ )
78
+ ]
79
+ try:
80
+ output = self.model(messages).content
81
+ if isinstance(output, list):
82
+ output = str(output)
83
+ except Exception as e:
84
+ raise Exception("Video QA failed: " + str(e))
85
+
86
+ return str(output)
87
+
88
+
89
+ class YouTubeAudioTranscriberTool(Tool):
90
+ name = "youtube_audio_transcriber"
91
+ description = """A tool that downloads audio from a YouTube video and transcribes it to text. Use this tool when you need to convert speech or audio content from YouTube videos into written text.
92
+ This tool handles various audio formats and provides accurate transcriptions of audio content from YouTube videos."""
93
+
94
+ inputs = {
95
+ "youtube_url": {
96
+ "description": "The URL of the YouTube video to download audio from and transcribe.",
97
+ "type": "string",
98
+ },
99
+ }
100
+ output_type = "string"
101
+
102
+ def __init__(self, model: Model):
103
+ super().__init__()
104
+ self.model = model
105
+
106
+ def forward(self, youtube_url: str) -> str:
107
+ if not isinstance(youtube_url, str):
108
+ raise Exception("You should provide the `youtube_url` string argument to this tool!")
109
+
110
+ with tempfile.TemporaryDirectory() as tmpdir:
111
+ # Download audio only
112
+ ydl_opts = {
113
+ 'format': 'bestaudio/best',
114
+ 'outtmpl': os.path.join(tmpdir, '%(id)s.%(ext)s'),
115
+ 'quiet': True,
116
+ 'noplaylist': True,
117
+ 'postprocessors': [{
118
+ 'key': 'FFmpegExtractAudio',
119
+ 'preferredcodec': 'mp3',
120
+ 'preferredquality': '192',
121
+ }],
122
+ }
123
+ if platform.system() == "Darwin":
124
+ ydl_opts['ffmpeg_location'] = '/opt/homebrew/bin/ffmpeg'
125
+
126
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
127
+ info = ydl.extract_info(youtube_url, download=True)
128
+ audio_path = ydl.prepare_filename(info)
129
+ # Convert to mp3 if not already
130
+ if not audio_path.endswith('.mp3'):
131
+ for f in os.listdir(tmpdir):
132
+ if f.endswith('.mp3'):
133
+ audio_path = os.path.join(tmpdir, f)
134
+ break
135
+
136
+ # Read and encode the audio file
137
+ with open(audio_path, "rb") as audio_file:
138
+ base64_audio = base64.b64encode(audio_file.read()).decode('utf-8')
139
+ format = audio_path.split(".")[-1]
140
+
141
+ messages = [
142
+ ChatMessage(
143
+ role="user",
144
+ content = [
145
+ {
146
+ "type": "text",
147
+ "text": "Please transcribe this audio file accurately. Provide only the transcribed text without any additional commentary or formatting.",
148
+ },
149
+ {
150
+ "type": "input_audio",
151
+ "input_audio": {
152
+ "data": base64_audio,
153
+ "format": format
154
+ }
155
+ }
156
+ ]
157
+ )
158
+ ]
159
+
160
+ try:
161
+ output = self.model(messages).content
162
+ if isinstance(output, list):
163
+ # Handle case where content is a list of dicts
164
+ output = str(output)
165
+ except Exception as e:
166
+ raise Exception("Transcription failed: " + str(e))
167
+
168
+ return str(output)
169
+