thivy committed on
Commit 00259b9 · 1 Parent(s): 845294b

add youtube tool

.DS_Store ADDED
Binary file (6.15 kB)
 
.gitignore CHANGED
@@ -1 +1,2 @@
-.env
+.env
+.venv
__pycache__/agents.cpython-312.pyc ADDED
Binary file (4.12 kB)
 
__pycache__/tools.cpython-312.pyc ADDED
Binary file (15.2 kB)
 
agents.py CHANGED
@@ -1,4 +1,4 @@
-from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools
+from tools import general_tools, file_agent_tools, data_agent_tools, math_agent_tools, analyze_video_tools, youtube_transcript_tools
 from langgraph.prebuilt import create_react_agent
 from langgraph.checkpoint.memory import MemorySaver
 from langchain_openai import ChatOpenAI
@@ -41,7 +41,25 @@ data_agent = create_react_agent(
     prompt="You process data. Use tools to filter and extract data."
 )
 
-prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
+# Create video analysis agents
+video_agent = create_react_agent(
+    model=llm,
+    tools=analyze_video_tools(),
+    name="video_analyzer",
+    prompt="""You analyze visual content in videos. Use tools to detect and track objects.
+The object_detection tool is a general object detection model. Use it for general cases.
+The analyze_video_content tool uses both the object detection model and a vision LLM to analyze frames given a question.
+Use it for more difficult questions."""
+)
+
+transcript_agent = create_react_agent(
+    model=llm,
+    tools=youtube_transcript_tools(),
+    name="transcript_analyzer",
+    prompt="You analyze audio/speech content in videos. Use tools to get transcripts."
+)
+
+excel_prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
 Do not do calculations or file reading yourself, use the tools.
 Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
@@ -49,10 +67,25 @@ If you are asked for a number, don't use comma to write your number neither use
 If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
 If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
+
+video_analyzer_prompt = """You coordinate video_analyzer and transcript_analyzer to answer questions about YouTube videos.
+Use video_analyzer for visual questions (objects, people, actions). Use transcript_analyzer for audio questions (what people say).
+Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
+YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
+If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
+If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+"""
 # Supervisor
 excel_supervisor = create_supervisor(
     [file_agent, math_agent, data_agent],
     model=llm,
-    prompt=prompt
+    prompt=excel_prompt
 ).compile()
 
+# Video supervisor
+video_supervisor = create_supervisor(
+    [video_agent, transcript_agent],
+    model=llm,
+    prompt=video_analyzer_prompt
+).compile()
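
The compiled video_supervisor is consumed by qa_graph.py below, which hands it a chat-style messages payload. A minimal standalone sketch of that call, mirroring ask_question_youtube in qa_graph.py (assumes OPENAI_API_KEY is set; the question is one of the test items added in this commit):

# Sketch: invoke the video supervisor directly, as qa_graph.py does.
from agents import video_supervisor

result = video_supervisor.invoke({
    "messages": [
        {"role": "user",
         "content": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, "
                    "what is the highest number of bird species to be on camera simultaneously?"}
    ]
})
print(result["messages"][-1].content)  # the supervisor's reply ends with "FINAL ANSWER: ..."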
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
+from qa_graph import build_graph
 
 # (Keep Constants as is)
 # --- Constants ---
@@ -11,13 +12,18 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
 # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
 class BasicAgent:
+    """A langgraph agent."""
     def __init__(self):
         print("BasicAgent initialized.")
+        self.graph = build_graph()
+
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
+        # Run the routing graph; the final state carries the answer
+        result = self.graph.invoke({"question": question, "decision": "",
+                                    "answer": ""})
+        answer = result["answer"]
+        return answer[14:]  # strip the leading "FINAL ANSWER: "
 
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
@@ -80,7 +86,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
+            submitted_answer = agent(item)
             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
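
One wrinkle in this diff: run_and_submit_all now hands the raw item dict to BasicAgent, while router_node in qa_graph.py reads state["question"].file_name as an attribute. A hedged sketch of the wrapping the call path appears to need, reusing the Question dataclass defined in qa_graph.py:

# Sketch only (assumes OPENAI_API_KEY is set): wrap the raw API item in the
# Question dataclass before it reaches the routing graph, since router_node
# does attribute access (state["question"].file_name), which a plain dict lacks.
from qa_graph import Question, build_graph

graph = build_graph()
item = {"task_id": "demo-0", "question": "What is the capital of France?",
        "Level": "1", "file_name": ""}  # demo payload; real items come from the scoring API
state = graph.invoke({"question": Question(**item), "decision": "", "answer": ""})
print(state["answer"])  # ends with "FINAL ANSWER: ..."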
qa_graph.py CHANGED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from langgraph.graph import START, StateGraph, END
 from typing import TypedDict
-from agents import general_agent, excel_supervisor
+from agents import general_agent, excel_supervisor, video_supervisor
 import os
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
@@ -75,6 +75,17 @@ def ask_question_with_file(question: Question, thread_id: str = "default") -> str:
 
     return ask_question(enhanced_question, thread_id)
 
+def ask_question_youtube(question: Question) -> str:
+    """Ask the video supervisor a question about a YouTube video."""
+    q = question.question
+    result = video_supervisor.invoke({
+        "messages": [
+            {"role": "user", "content": q}
+        ]
+    })
+    print(result)
+    return result["messages"][-1].content
+
 test = [
     # {
     #     "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
@@ -88,12 +99,36 @@ test = [
     #     "Level": "1",
     #     "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
     # },
-    {
-        "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
-        "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+    # {
+    #     "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
+    #     "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
+    #     "Level": "1",
+    #     "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
+    # },
+    # {
+    #     "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
+    #     "question": "What is the final numeric output from the attached Python code?",
+    #     "Level": "1",
+    #     "file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py"
+    # },
+    # {
+    #     "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
+    #     "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
+    #     "Level": "1",
+    #     "file_name": ""
+    # },
+    {
+        "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
+        "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
         "Level": "1",
-        "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
-    }
+        "file_name": ""
+    },
+    {
+        "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
+        "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
+        "Level": "1",
+        "file_name": ""
+    },
 ]
 
 questions = [Question(**item) for item in test]
@@ -133,10 +168,22 @@ def ask_question_with_file_node(state: State) -> dict:
     # Return dict to update state
     return {"answer": answer}
 
+def ask_question_youtube_node(state: State) -> dict:
+    """Node function for YouTube video questions."""
+    question_obj = state["question"]
+
+    # Call your existing function
+    answer = ask_question_youtube(question_obj)
+
+    # Return dict to update state
+    return {"answer": answer}
+
 def router_node(state: State):
     """Router node - returns dict to update state"""
     if state["question"].file_name:
         decision = "query_with_file"
+    elif "youtube.com" in state["question"].question or "youtu.be" in state["question"].question:
+        decision = "youtube"
     else:
         decision = "query"
 
@@ -146,33 +193,39 @@ def router_function(state: State):
     """Routing function - returns string to choose path"""
     return state["decision"]
 
-# Graph
-builder = StateGraph(State)
-
-# Use the NODE functions (not the original functions)
-builder.add_node("query_with_file", ask_question_with_file_node)
-builder.add_node("query", ask_question_node)
-builder.add_node("router", router_node)
-
-# Define edges
-builder.add_edge(START, "router")
-builder.add_conditional_edges(
-    "router",
-    router_function,
-    {
-        "query_with_file": "query_with_file",
-        "query": "query",
-    },
-)
-builder.add_edge("query_with_file", END)
-builder.add_edge("query", END)
-
-react_graph = builder.compile()
+def build_graph():
+    # Graph
+    builder = StateGraph(State)
+
+    # Use the NODE functions (not the original functions)
+    builder.add_node("query_with_file", ask_question_with_file_node)
+    builder.add_node("query", ask_question_node)
+    builder.add_node("youtube", ask_question_youtube_node)
+    builder.add_node("router", router_node)
+
+    # Define edges
+    builder.add_edge(START, "router")
+    builder.add_conditional_edges(
+        "router",
+        router_function,
+        {
+            "query_with_file": "query_with_file",
+            "query": "query",
+            "youtube": "youtube",
+        },
+    )
+    builder.add_edge("query_with_file", END)
+    builder.add_edge("query", END)
+    builder.add_edge("youtube", END)
+
+    react_graph = builder.compile()
+    return react_graph
 
 if __name__ == "__main__":
     for i, question in enumerate(questions):
         print(f"\n{i}. {question.question}")
 
+        react_graph = build_graph()
         # Invoke the graph and capture the result
         result = react_graph.invoke({
            "question": question,
requirements.txt CHANGED
@@ -1,2 +1,13 @@
 gradio
-requests
+requests
+langgraph
+langgraph-supervisor
+langchain
+langchain_community
+langchain_openai
+duckduckgo-search
+wikipedia
+arxiv
+openpyxl
+ultralytics
+youtube-transcript-api
test.ipynb ADDED
The diff for this file is too large to render.
 
tools.py CHANGED
@@ -16,6 +16,11 @@ import os
 from huggingface_hub import InferenceClient
 import json
 import requests
+from youtube_transcript_api import YouTubeTranscriptApi
+from ultralytics import YOLO
+import cv2
+
+import re
 
 from dotenv import load_dotenv
 load_dotenv()
@@ -114,28 +119,6 @@ def transcribe_audio(file_path: str, question: str) -> str:
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
 
-#### Excel supervisor agent
-
-
-def general_tools():
-    tools = [
-        DuckDuckGoSearchRun(),
-        WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
-        ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
-        analyze_image,
-        read_python_file,
-        transcribe_audio,
-    ]
-    return tools
-
-
-# Simple file tools
-@tool
-def read_excel(file_path: str) -> str:
-    """Read any Excel file and return as JSON."""
-    df = pd.read_excel(file_path)
-    return json.dumps(df.to_dict(orient='records'))
-
 # Simple math tools
 @tool
 def add(a: float, b: float) -> float:
@@ -173,6 +156,192 @@ def filter_rows(data: str, exclude_words: list) -> str:
         filtered.append(row)
     return json.dumps(filtered)
 
+@tool
+def read_excel(file_path: str) -> str:
+    """Read any Excel file and return as JSON."""
+    df = pd.read_excel(file_path)
+    return json.dumps(df.to_dict(orient='records'))
+
+@tool
+def object_detection(video_url: str) -> str:
+    """Analyze objects and visual content in a YouTube video."""
+    try:
+        model = YOLO("yolo11n.pt")  # Load an official Detect model
+        results = model.track(video_url)
+
+        # Track objects across frames
+        frame_objects = []
+        for i, result in enumerate(results):
+            if result.boxes is not None:
+                objects_in_frame = []
+                for j in range(len(result.boxes)):
+                    class_name = result.names[int(result.boxes.cls[j].item())]
+                    confidence = float(result.boxes.conf[j].item())
+                    if confidence > 0.5:  # Only high confidence detections
+                        objects_in_frame.append(class_name)
+
+                frame_objects.append({
+                    "frame": i,
+                    "objects": objects_in_frame,
+                    "unique_objects": list(set(objects_in_frame))
+                })
+
+        return json.dumps(frame_objects, indent=2)
+
+    except Exception as e:
+        return f"Error analyzing video: {str(e)}"
+
+@tool
+def get_youtube_transcript(video_url: str) -> str:
+    """Get transcript from a YouTube video."""
+    try:
+        # Extract video ID
+        video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', video_url)
+        if not video_id_match:
+            return "Error: Could not extract video ID"
+
+        video_id = video_id_match.group(1)
+        transcript = YouTubeTranscriptApi.get_transcript(video_id)
+
+        # Format with timestamps
+        formatted_transcript = []
+        for entry in transcript:
+            formatted_transcript.append({
+                "start": entry['start'],
+                "duration": entry['duration'],
+                "text": entry['text']
+            })
+
+        return json.dumps(formatted_transcript, indent=2)
+
+    except Exception as e:
+        return f"Error getting transcript: {str(e)}"
+
+# @tool
+def analyze_video_content(video_url: str, question: str = "", max_vision_frames: int = 1) -> str:
+    """Analyze video content using YOLO for object detection and vision LLM for detailed analysis."""
+    try:
+        model = YOLO("yolo11n.pt")
+        results = model.track(video_url)
+
+        # Step 1: YOLO analysis for all frames
+        frame_objects = []
+        frames_with_content = []
+
+        for i, result in enumerate(results):
+            frame_data = {
+                "frame": i,
+                "objects": [],
+                "unique_objects": [],
+                "object_counts": {}
+            }
+
+            if result.boxes is not None:
+                objects_in_frame = []
+                for j in range(len(result.boxes)):
+                    class_name = result.names[int(result.boxes.cls[j].item())]
+                    confidence = float(result.boxes.conf[j].item())
+                    if confidence > 0.5:
+                        objects_in_frame.append(class_name)
+
+                # Count objects
+                for obj in objects_in_frame:
+                    frame_data["object_counts"][obj] = frame_data["object_counts"].get(obj, 0) + 1
+
+                frame_data["objects"] = objects_in_frame
+                frame_data["unique_objects"] = list(set(objects_in_frame))
+
+                # Store frame for potential vision analysis
+                if objects_in_frame:  # Only store frames with detected objects
+                    frames_with_content.append({
+                        "frame_index": i,
+                        "objects": objects_in_frame,
+                        "object_counts": frame_data["object_counts"],
+                        "total_objects": len(objects_in_frame),
+                        "image": result.orig_img
+                    })
+
+            frame_objects.append(frame_data)
+
+        # Step 2: If there's a specific question, use vision LLM on selected frames
+        detailed_analyses = []
+        if question.strip():
+            # Sort frames by total objects and select top frames
+            frames_with_content.sort(key=lambda x: x["total_objects"], reverse=True)
+            selected_frames = frames_with_content[:max_vision_frames]
+
+            for frame_data in selected_frames:
+                try:
+                    # Encode frame directly to base64
+                    _, buffer = cv2.imencode('.jpg', frame_data["image"])
+                    image_bytes = buffer.tobytes()
+                    image_base64 = base64.b64encode(image_bytes).decode("utf-8")
+
+                    message = [
+                        HumanMessage(
+                            content=[
+                                {"type": "text", "text": question},
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
+                                }
+                            ]
+                        )
+                    ]
+
+                    vision_response = vision_llm.invoke(message)
+
+                    detailed_analyses.append({
+                        "frame_index": frame_data["frame_index"],
+                        "yolo_objects": frame_data["objects"],
+                        "yolo_counts": frame_data["object_counts"],
+                        "vision_analysis": vision_response.content
+                    })
+
+                except Exception as vision_error:
+                    detailed_analyses.append({
+                        "frame_index": frame_data["frame_index"],
+                        "yolo_objects": frame_data["objects"],
+                        "yolo_counts": frame_data["object_counts"],
+                        "vision_analysis": f"Vision analysis failed: {str(vision_error)}"
+                    })
+
+        # Combine results
+        result_data = {
+            "video_url": video_url,
+            "question": question,
+            "total_frames": len(frame_objects),
+            "yolo_analysis": frame_objects,
+            "frames_with_objects": len(frames_with_content)
+        }
+
+        if detailed_analyses:
+            result_data["detailed_vision_analysis"] = detailed_analyses
+            result_data["vision_frames_analyzed"] = len(detailed_analyses)
+
+        return json.dumps(result_data, indent=2)
+
+    except Exception as e:
+        return f"Error analyzing video content: {str(e)}"
+
+def general_tools():
+    tools = [
+        DuckDuckGoSearchRun(),
+        WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()),
+        ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
+        analyze_image,
+        read_python_file,
+        transcribe_audio,
+    ]
+    return tools
+
+def analyze_video_tools():
+    tools = [object_detection, analyze_video_content]
+    return tools
+
+def youtube_transcript_tools():
+    tools = [get_youtube_transcript]
+    return tools
+
 def file_agent_tools():
     tools = [read_excel]
     return tools
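
The ID-extraction regex in get_youtube_transcript accepts both full watch URLs and short youtu.be links (the short form below is illustrative; the test items in this commit use full URLs). A quick sketch of the same pattern:

# Sketch: the same video-ID regex used by get_youtube_transcript.
import re

pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
for url in ["https://www.youtube.com/watch?v=1htKBjuUWec",  # test item URL
            "https://youtu.be/L1vXCYZAYYM"]:                # illustrative short form
    match = re.search(pattern, url)
    print(match.group(1) if match else "no match")
# prints: 1htKBjuUWec, then L1vXCYZAYYM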