.DS_Store DELETED
Binary file (6.15 kB)
 
.gitattributes CHANGED
@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- *.mp3 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
.gitignore DELETED
@@ -1,2 +0,0 @@
1
- .env
2
- .venv
 
 
 
__pycache__/agents.cpython-312.pyc DELETED
Binary file (4.12 kB)
 
__pycache__/tools.cpython-312.pyc DELETED
Binary file (15.5 kB)
 
agents.py DELETED
@@ -1,138 +0,0 @@
1
- from tools import (
2
- general_tools,
3
- file_agent_tools,
4
- data_agent_tools,
5
- math_agent_tools,
6
- analyze_video_tools,
7
- youtube_transcript_tools,
8
- google_search,
9
- wiki_search,
10
- arxiv_search
11
- )
12
- from langgraph.prebuilt import create_react_agent
13
- from langgraph.checkpoint.memory import MemorySaver
14
- from langchain_openai import ChatOpenAI
15
- from langgraph_supervisor import create_supervisor
16
-
17
-
18
- llm = ChatOpenAI(model="o4-mini")
19
-
20
- memory = MemorySaver()
21
-
22
- with open("system_prompt.txt", "r") as f:
23
- prompt = f.read()
24
-
25
- general_agent = create_react_agent(
26
- model=llm,
27
- tools=general_tools(),
28
- checkpointer=memory,
29
- prompt=prompt
30
- )
31
-
32
- # Create agents
33
- file_agent = create_react_agent(
34
- model=llm,
35
- tools=file_agent_tools(),
36
- name="file_reader",
37
- prompt="You read files. Use tools to read files."
38
- )
39
-
40
- math_agent = create_react_agent(
41
- model=llm,
42
- tools=math_agent_tools(),
43
- name="calculator",
44
- prompt="You do math. Use tools for all calculations."
45
- )
46
-
47
- data_agent = create_react_agent(
48
- model=llm,
49
- tools=data_agent_tools(),
50
- name="data_processor",
51
- prompt="You process data. Use tools to filter and extract data."
52
- )
53
-
54
- # Create video analysis agents
55
- video_agent = create_react_agent(
56
- model=llm,
57
- tools=analyze_video_tools(),
58
- name="video_analyzer",
59
- prompt="""You analyze visual content in videos. Use tools to detect and track objects.
60
- The object_detection tool is a general object detection model. Use this for general cases.
61
- The analyze_video_content uses both the object detection model and a vision llm to analyze frames with content given a question.
62
- Use this for more difficult questions."""
63
- )
64
-
65
- transcript_agent = create_react_agent(
66
- model=llm,
67
- tools=youtube_transcript_tools(),
68
- name="transcript_analyzer",
69
- prompt="You analyze audio/speech content in videos. Use tools to get transcripts."
70
- )
71
-
72
- wiki_agent = create_react_agent(
73
- model=llm,
74
- tools=[wiki_search],
75
- name="wiki_analyst",
76
- prompt="You search information from wikipedia."
77
- )
78
-
79
- google_agent = create_react_agent(
80
- model=llm,
81
- tools=[google_search],
82
- name="google_search_analyst",
83
- prompt="You search information from google search."
84
- )
85
-
86
- arxiv_agent = create_react_agent(
87
- model=llm,
88
- tools=[arxiv_search],
89
- name="arxiv_analyst",
90
- prompt="You search information from arxiv."
91
- )
92
-
93
- excel_prompt = """You are a supervisor. You coordinate file_reader, calculator, and data_processor to solve problems step by step.
94
- Do not do calculations or file reading yourself, use the tools.
95
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
96
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
97
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
98
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
99
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
100
- """
101
-
102
- video_analyzer_prompt = """You coordinate video_analyzer and transcript_analyzer to answer questions about YouTube videos.
103
- Use video_analyzer for visual questions (objects, people, actions). Use transcript_analyzer for audio questions (what people say).
104
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
105
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
106
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
107
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
108
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
109
- """
110
-
111
- search_analyzer_prompt = """You coordinate different search agents to answer questions.
112
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
113
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
114
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
115
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
116
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
117
- """
118
-
119
- # Supervisor
120
- excel_supervisor = create_supervisor(
121
- [file_agent, math_agent, data_agent],
122
- model=llm,
123
- prompt=excel_prompt
124
- ).compile()
125
-
126
- # Video supervisor
127
- video_supervisor = create_supervisor(
128
- [video_agent, transcript_agent],
129
- model=llm,
130
- prompt=video_analyzer_prompt
131
- ).compile()
132
-
133
- # search supervisor
134
- search_supervisor = create_supervisor(
135
- [wiki_agent, google_agent, arxiv_agent],
136
- model=llm,
137
- prompt=search_analyzer_prompt
138
- ).compile()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -3,7 +3,6 @@ import gradio as gr
3
  import requests
4
  import inspect
5
  import pandas as pd
6
- from qa_graph import build_graph, Question, extract_final_answer
7
 
8
  # (Keep Constants as is)
9
  # --- Constants ---
@@ -12,20 +11,13 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
  # --- Basic Agent Definition ---
13
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
14
  class BasicAgent:
15
- """A langgraph agent."""
16
  def __init__(self):
17
  print("BasicAgent initialized.")
18
- self.graph = build_graph()
19
-
20
- def __call__(self, question: Question) -> str:
21
- print(f"Agent received question (first 50 chars): {question.question[:50]}...")
22
- # Wrap the question in a HumanMessage from langchain_core
23
- messages = self.graph.invoke({"question": question, "decision": "",
24
- "answer": ""})
25
- answer = messages['answer']
26
- answer = extract_final_answer(answer)[1]
27
- print(answer)
28
- return answer
29
 
30
  def run_and_submit_all( profile: gr.OAuthProfile | None):
31
  """
@@ -81,15 +73,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
81
  results_log = []
82
  answers_payload = []
83
  print(f"Running agent on {len(questions_data)} questions...")
84
- questions = [Question(**item) for item in questions_data]
85
- for item in questions:
86
- task_id = item.task_id
87
- question_text = item.question
88
  if not task_id or question_text is None:
89
  print(f"Skipping item with missing task_id or question: {item}")
90
  continue
91
  try:
92
- submitted_answer = agent(item)
93
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
94
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
95
  except Exception as e:
 
3
  import requests
4
  import inspect
5
  import pandas as pd
 
6
 
7
  # (Keep Constants as is)
8
  # --- Constants ---
 
11
  # --- Basic Agent Definition ---
12
  # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
  class BasicAgent:
 
14
  def __init__(self):
15
  print("BasicAgent initialized.")
16
+ def __call__(self, question: str) -> str:
17
+ print(f"Agent received question (first 50 chars): {question[:50]}...")
18
+ fixed_answer = "This is a default answer."
19
+ print(f"Agent returning fixed answer: {fixed_answer}")
20
+ return fixed_answer
 
 
 
 
 
 
21
 
22
  def run_and_submit_all( profile: gr.OAuthProfile | None):
23
  """
 
73
  results_log = []
74
  answers_payload = []
75
  print(f"Running agent on {len(questions_data)} questions...")
76
+ for item in questions_data:
77
+ task_id = item.get("task_id")
78
+ question_text = item.get("question")
 
79
  if not task_id or question_text is None:
80
  print(f"Skipping item with missing task_id or question: {item}")
81
  continue
82
  try:
83
+ submitted_answer = agent(question_text)
84
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
86
  except Exception as e:
files/1f975693-876d-457b-a649-393859e79bf3.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
3
- size 280868
 
 
 
 
files/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx DELETED
Binary file (5.29 kB)
 
files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
3
- size 179304
 
 
 
 
files/cca530fc-4052-43b2-b130-b30968d8aa44.png DELETED
Binary file (63.1 kB)
 
files/f918266a-b3e0-4914-865d-4faa564f1aef.py DELETED
@@ -1,35 +0,0 @@
1
- from random import randint
2
- import time
3
-
4
- class UhOh(Exception):
5
- pass
6
-
7
- class Hmm:
8
- def __init__(self):
9
- self.value = randint(-100, 100)
10
-
11
- def Yeah(self):
12
- if self.value == 0:
13
- return True
14
- else:
15
- raise UhOh()
16
-
17
- def Okay():
18
- while True:
19
- yield Hmm()
20
-
21
- def keep_trying(go, first_try=True):
22
- maybe = next(go)
23
- try:
24
- if maybe.Yeah():
25
- return maybe.value
26
- except UhOh:
27
- if first_try:
28
- print("Working...")
29
- print("Please wait patiently...")
30
- time.sleep(0.1)
31
- return keep_trying(go, first_try=False)
32
-
33
- if __name__ == "__main__":
34
- go = Okay()
35
- print(f"{keep_trying(go)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
qa_graph.py DELETED
@@ -1,225 +0,0 @@
1
- from dataclasses import dataclass
2
- from langgraph.graph import START, StateGraph, END
3
- from typing import TypedDict
4
- from agents import general_agent, excel_supervisor, video_supervisor
5
- import os
6
- from typing import List
7
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
8
- os.environ["OPENAI_API_KEY"] = str(OPENAI_API_KEY)
9
-
10
- @dataclass
11
- class Question:
12
- task_id: str
13
- question: str
14
- Level: str
15
- file_name: str
16
- local_file_path: str|None = None
17
-
18
- def get_file_type(file_path: str) -> str:
19
- """Determine file type from extension."""
20
- if not file_path:
21
- return "none"
22
-
23
- file_path = file_path.lower()
24
-
25
- if file_path.endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
26
- return "image"
27
- elif file_path.endswith(('.xlsx', '.xls', '.csv')):
28
- return "excel"
29
- elif file_path.endswith('.py'):
30
- return "python"
31
- elif file_path.endswith(('.mp3', '.wav', '.m4a', '.ogg')):
32
- return "audio"
33
- else:
34
- return "unknown"
35
-
36
- def answer_qery(question: str, thread_id: str = "default") -> str:
37
- """Ask the agent a question."""
38
- config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 50}
39
-
40
- try:
41
- result = video_supervisor.invoke({
42
- "messages": [
43
- {"role": "user", "content": question}
44
- ]
45
- })
46
- return result["messages"][-1].content
47
- except Exception as e:
48
- return f"Error: {str(e)}"
49
-
50
- def ask_question(question: str, thread_id: str = "default") -> str:
51
- """Ask the agent a question."""
52
- config = {"configurable": {"thread_id": thread_id}, "recursion_limit": 100}
53
-
54
- try:
55
- response = general_agent.invoke(
56
- {"messages": [{"role": "user", "content": question}]},
57
- config=config
58
- )
59
- return response["messages"][-1].content
60
- except Exception as e:
61
- return f"Error: {str(e)}"
62
-
63
- def ask_question_with_file(question: Question, thread_id: str = "default") -> str:
64
- """Ask the agent a question, with optional file analysis."""
65
- q = question.question
66
- root_file = "./files"
67
- file_path = root_file + "/" + question.file_name
68
- if not question.file_name:
69
- return ask_question(q, thread_id)
70
-
71
- file_type = get_file_type(file_path)
72
-
73
- # Create enhanced question with file guidance
74
- if file_type == "image":
75
- enhanced_question = f"{q}\n\nThere is an image file at '{file_path}'. Use the analyze_image tool to examine it."
76
- elif file_type == "excel":
77
- enhanced_question = f"{q}\n\nFile path: {file_path}"
78
- result = excel_supervisor.invoke({
79
- "messages": [
80
- {"role": "user", "content": enhanced_question}
81
- ]
82
- })
83
- return result["messages"][-1].content
84
- elif file_type == "python":
85
- enhanced_question = f"{q}\n\nThere is a Python file at '{file_path}'. Use the read_python_file tool to examine it."
86
- elif file_type == "audio":
87
- enhanced_question = f"{q}\n\nThere is an audio file at '{file_path}'. Use the transcribe_audio tool to process it."
88
- else:
89
- enhanced_question = f"{q}\n\nThere is a file at '{file_path}' but I'm not sure what type it is."
90
-
91
- return ask_question(enhanced_question, thread_id)
92
-
93
- def ask_question_youtube(question: Question) -> str:
94
- """Ask the agent a question, with optional file analysis."""
95
- q = question.question
96
- result = video_supervisor.invoke({
97
- "messages": [
98
- {"role": "user", "content": q}
99
- ]
100
- })
101
- return result["messages"][-1].content
102
-
103
- # State
104
- class State(TypedDict):
105
- question: Question
106
- decision: str
107
- answer: str
108
-
109
- # NODE FUNCTIONS - These are the ones that work with LangGraph
110
- def ask_question_node(state: State) -> dict:
111
- """Node function for questions without files."""
112
- question_obj = state["question"]
113
- thread_id = f"test_{question_obj.task_id}"
114
-
115
- # Call your existing function
116
- answer = answer_qery(question_obj.question, thread_id)
117
-
118
- # Return dict to update state
119
- return {"answer": answer}
120
-
121
- def ask_question_with_file_node(state: State) -> dict:
122
- """Node function for questions with files."""
123
- question_obj = state["question"]
124
- thread_id = f"test_{question_obj.task_id}"
125
-
126
- # Call your existing function
127
- answer = ask_question_with_file(question_obj, thread_id)
128
-
129
- # Return dict to update state
130
- return {"answer": answer}
131
-
132
- def ask_question_youtube_node(state: State) -> dict:
133
- """Node function for questions with files."""
134
- question_obj = state["question"]
135
-
136
- # Call your existing function
137
- answer = ask_question_youtube(question_obj)
138
-
139
- # Return dict to update state
140
- return {"answer": answer}
141
-
142
- def router_node(state: State):
143
- """Router node - returns dict to update state"""
144
- if state["question"].file_name:
145
- decision = "query_with_file"
146
- elif "youtube.com" in state["question"].question or "youtu.be" in state["question"].question:
147
- decision = "youtube"
148
- else:
149
- decision = "query"
150
-
151
- return {"decision": decision}
152
-
153
- def router_function(state: State):
154
- """Routing function - returns string to choose path"""
155
- return state["decision"]
156
-
157
- def build_graph():
158
- # Graph
159
- builder = StateGraph(State)
160
-
161
- # Use the NODE functions (not the original functions)
162
- builder.add_node("query_with_file", ask_question_with_file_node)
163
- builder.add_node("query", ask_question_node)
164
- builder.add_node("youtube", ask_question_youtube_node)
165
- builder.add_node("router", router_node)
166
-
167
- # Define edges
168
- builder.add_edge(START, "router")
169
- builder.add_conditional_edges(
170
- "router",
171
- router_function,
172
- {
173
- "query_with_file": "query_with_file",
174
- "query": "query",
175
- "youtube": "youtube",
176
- },
177
- )
178
- builder.add_edge("query_with_file", END)
179
- builder.add_edge("query", END)
180
- builder.add_edge("youtube", END)
181
-
182
- react_graph = builder.compile()
183
- return react_graph
184
- def extract_final_answer(text: str) -> str|List[str]:
185
- """Extract the final answer from a string containing 'FINAL ANSWER: answer'"""
186
-
187
- # Method 1: Simple string split (most common case)
188
- if "FINAL ANSWER:" in text:
189
- # Split on "FINAL ANSWER:" and take the part after it
190
- parts = text.split("FINAL ANSWER:", 1) # Split only on first occurrence
191
- return parts
192
- else:
193
- return "FINAL ANSWER: unknown"
194
-
195
- if __name__ == "__main__":
196
- test = [
197
- {
198
- "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
199
- "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
200
- "Level": "1",
201
- "file_name": ""
202
- },
203
- {
204
- "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
205
- "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
206
- "Level": "1",
207
- "file_name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"
208
- }
209
- ]
210
-
211
- questions = [Question(**item) for item in test]
212
- for i, question in enumerate(questions):
213
- print(f"\n{i}. {question.question}")
214
-
215
- react_graph = build_graph()
216
- # Invoke the graph and capture the result
217
- result = react_graph.invoke({
218
- "question": question,
219
- "decision": "",
220
- "answer": ""
221
- })
222
- answer = result['answer']
223
- print(answer)
224
- answer = extract_final_answer(answer)[1]
225
- print(answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,15 +1,2 @@
1
  gradio
2
- requests
3
- langgraph
4
- langgraph-supervisor
5
- langchain
6
- langchain_community
7
- langchain_openai
8
- duckduckgo-search
9
- wikipedia
10
- arxiv
11
- openpyxl
12
- ultralytics
13
- youtube-transcript-api
14
- google-api-python-client
15
- langchain-google-community
 
1
  gradio
2
+ requests
 
 
 
 
 
 
 
 
 
 
 
 
 
system_prompt.txt DELETED
@@ -1,7 +0,0 @@
1
- You are a general AI assistant.
2
- I will ask you a question.
3
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
4
- YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
5
- If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
6
- If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
7
- If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 
 
 
 
 
 
 
 
tools.py DELETED
@@ -1,391 +0,0 @@
1
- from langchain_core.messages import HumanMessage
2
- from langchain_core.tools import tool
3
- from langchain_community.tools import (
4
- DuckDuckGoSearchRun,
5
- WikipediaQueryRun,
6
- ArxivQueryRun
7
- )
8
- from langchain_google_community.search import (
9
- GoogleSearchAPIWrapper,
10
- GoogleSearchRun
11
- )
12
- from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
13
- from langchain_openai import ChatOpenAI
14
-
15
- import base64
16
- import pandas as pd
17
- import os
18
-
19
- import os
20
- from huggingface_hub import InferenceClient
21
- import json
22
- import requests
23
- from youtube_transcript_api import YouTubeTranscriptApi
24
- from ultralytics import YOLO
25
- import cv2
26
-
27
- import re
28
-
29
- from dotenv import load_dotenv
30
- load_dotenv()
31
- HF_TOKEN = os.getenv("HF_TOKEN")
32
- GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
33
- GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
34
- client = InferenceClient(
35
- provider="hf-inference",
36
- api_key=HF_TOKEN,
37
- )
38
-
39
- llm = ChatOpenAI(model="o4-mini")
40
- vision_llm = ChatOpenAI(model="gpt-4o")
41
-
42
- @tool
43
- def analyze_image(img_path: str, question: str) -> str:
44
- """Analyze an image and answer a question about it."""
45
- try:
46
- with open(img_path, "rb") as image_file:
47
- image_bytes = image_file.read()
48
-
49
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
50
-
51
- message = [
52
- HumanMessage(
53
- content=[
54
- {"type": "text", "text": question},
55
- {
56
- "type": "image_url",
57
- "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
58
- }
59
- ]
60
- )
61
- ]
62
-
63
- response = vision_llm.invoke(message)
64
- return response.content
65
-
66
- except Exception as e:
67
- return f"Error analyzing image: {str(e)}"
68
-
69
- @tool
70
- def read_excel_file(file_path: str, question: str) -> str:
71
- """Read and analyze an Excel file to answer a question."""
72
- try:
73
- # Read Excel file
74
- df = pd.read_excel(file_path)
75
-
76
- df_dict = df.to_dict(orient='records')
77
- info = json.dumps(df_dict)
78
- return info
79
-
80
- except Exception as e:
81
- return f"Error reading Excel file: {str(e)}"
82
-
83
- @tool
84
- def read_python_file(file_path: str, question: str) -> str:
85
- """Read and analyze a Python file to answer a question."""
86
- try:
87
- with open(file_path, 'r', encoding='utf-8') as f:
88
- code_content = f.read()
89
-
90
- prompt = f"""Here is Python code from a file:
91
-
92
- ```python
93
- {code_content}
94
- ```
95
-
96
- Question: {question}
97
-
98
- Please analyze the code and answer the question."""
99
-
100
- response = llm.invoke([HumanMessage(content=prompt)])
101
- return response.content
102
-
103
- except Exception as e:
104
- return f"Error reading Python file: {str(e)}"
105
-
106
- @tool
107
- def transcribe_audio(file_path: str, question: str) -> str:
108
- """Transcribe audio file."""
109
- try:
110
- headers = {
111
- "Authorization": f"Bearer {HF_TOKEN}",
112
- "Content-Type": "audio/mpeg" # Add this line for MP3 files
113
- }
114
- API_URL = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3"
115
-
116
- def query(filename):
117
- with open(filename, "rb") as f:
118
- data = f.read()
119
- response = requests.request("POST", API_URL, headers=headers, data=data)
120
- return json.loads(response.content.decode("utf-8"))
121
-
122
- data = query(file_path)
123
- return data
124
-
125
- except Exception as e:
126
- return f"Error transcribing audio: {str(e)}"
127
-
128
- # Simple math tools
129
- @tool
130
- def add(a: float, b: float) -> float:
131
- """Add two numbers."""
132
- return a + b
133
-
134
- @tool
135
- def sum_list(numbers: list) -> float:
136
- """Sum a list of numbers."""
137
- return sum(numbers)
138
-
139
- # Simple data tools
140
- @tool
141
- def extract_values(data: str, column: str) -> list:
142
- """Extract all values from a column in JSON data."""
143
- parsed = json.loads(data)
144
- values = []
145
- for row in parsed:
146
- for key, value in row.items():
147
- if column.lower() in key.lower():
148
- try:
149
- values.append(float(value))
150
- except:
151
- pass
152
- return values
153
-
154
- @tool
155
- def filter_rows(data: str, exclude_words: list) -> str:
156
- """Remove rows containing any of the exclude words."""
157
- parsed = json.loads(data)
158
- filtered = []
159
- for row in parsed:
160
- row_text = " ".join(str(v).lower() for v in row.values())
161
- if not any(word.lower() in row_text for word in exclude_words):
162
- filtered.append(row)
163
- return json.dumps(filtered)
164
-
165
- @tool
166
- def read_excel(file_path: str) -> str:
167
- """Read any Excel file and return as JSON."""
168
- df = pd.read_excel(file_path)
169
- return json.dumps(df.to_dict(orient='records'))
170
-
171
- @tool
172
- def object_detection(video_url: str) -> str:
173
- """Analyze objects and visual content in a YouTube video."""
174
- try:
175
- model = YOLO("yolo11n.pt") # Load an official Detect model
176
- results = model.track(video_url)
177
-
178
- # Track objects across frames
179
- frame_objects = []
180
- for i, result in enumerate(results):
181
- if result.boxes is not None:
182
- objects_in_frame = []
183
- for j in range(len(result.boxes)):
184
- class_name = result.names[int(result.boxes.cls[j].item())]
185
- confidence = float(result.boxes.conf[j].item())
186
- if confidence > 0.5: # Only high confidence detections
187
- objects_in_frame.append(class_name)
188
-
189
- frame_objects.append({
190
- "frame": i,
191
- "objects": objects_in_frame,
192
- "unique_objects": list(set(objects_in_frame))
193
- })
194
-
195
- return json.dumps(frame_objects, indent=2)
196
-
197
- except Exception as e:
198
- return f"Error analyzing video: {str(e)}"
199
-
200
- @tool
201
- def get_youtube_transcript(video_url: str) -> str:
202
- """Get transcript from a YouTube video."""
203
- try:
204
- # Extract video ID
205
- video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', video_url)
206
- if not video_id_match:
207
- return "Error: Could not extract video ID"
208
-
209
- video_id = video_id_match.group(1)
210
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
211
-
212
- # Format with timestamps
213
- formatted_transcript = []
214
- for entry in transcript:
215
- formatted_transcript.append({
216
- "start": entry['start'],
217
- "duration": entry['duration'],
218
- "text": entry['text']
219
- })
220
-
221
- return json.dumps(formatted_transcript, indent=2)
222
-
223
- except Exception as e:
224
- return f"Error getting transcript: {str(e)}"
225
-
226
- # @tool
227
- def analyze_video_content(video_url: str, question: str = "", max_vision_frames: int = 1) -> str:
228
- """Analyze video content using YOLO for object detection and vision LLM for detailed analysis."""
229
- try:
230
- model = YOLO("yolo11n.pt")
231
- results = model.track(video_url)
232
-
233
- # Step 1: YOLO analysis for all frames
234
- frame_objects = []
235
- frames_with_content = []
236
-
237
- for i, result in enumerate(results):
238
- frame_data = {
239
- "frame": i,
240
- "objects": [],
241
- "unique_objects": [],
242
- "object_counts": {}
243
- }
244
-
245
- if result.boxes is not None:
246
- objects_in_frame = []
247
- for j in range(len(result.boxes)):
248
- class_name = result.names[int(result.boxes.cls[j].item())]
249
- confidence = float(result.boxes.conf[j].item())
250
- if confidence > 0.5:
251
- objects_in_frame.append(class_name)
252
-
253
- # Count objects
254
- for obj in objects_in_frame:
255
- frame_data["object_counts"][obj] = frame_data["object_counts"].get(obj, 0) + 1
256
-
257
- frame_data["objects"] = objects_in_frame
258
- frame_data["unique_objects"] = list(set(objects_in_frame))
259
-
260
- # Store frame for potential vision analysis
261
- if objects_in_frame: # Only store frames with detected objects
262
- frames_with_content.append({
263
- "frame_index": i,
264
- "objects": objects_in_frame,
265
- "object_counts": frame_data["object_counts"],
266
- "total_objects": len(objects_in_frame),
267
- "image": result.orig_img
268
- })
269
-
270
- frame_objects.append(frame_data)
271
-
272
- # Step 2: If there's a specific question, use vision LLM on selected frames
273
- detailed_analyses = []
274
- if question.strip():
275
- # Sort frames by total objects and select top frames
276
- frames_with_content.sort(key=lambda x: x["total_objects"], reverse=True)
277
- selected_frames = frames_with_content[:max_vision_frames]
278
-
279
- for frame_data in selected_frames:
280
- try:
281
- # Encode frame directly to base64
282
- _, buffer = cv2.imencode('.jpg', frame_data["image"])
283
- image_bytes = buffer.tobytes()
284
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
285
-
286
- message = [
287
- HumanMessage(
288
- content=[
289
- {"type": "text", "text": question},
290
- {
291
- "type": "image_url",
292
- "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
293
- }
294
- ]
295
- )
296
- ]
297
-
298
- vision_response = vision_llm.invoke(message)
299
-
300
- detailed_analyses.append({
301
- "frame_index": frame_data["frame_index"],
302
- "yolo_objects": frame_data["objects"],
303
- "yolo_counts": frame_data["object_counts"],
304
- "vision_analysis": vision_response.content
305
- })
306
-
307
- except Exception as vision_error:
308
- detailed_analyses.append({
309
- "frame_index": frame_data["frame_index"],
310
- "yolo_objects": frame_data["objects"],
311
- "yolo_counts": frame_data["object_counts"],
312
- "vision_analysis": f"Vision analysis failed: {str(vision_error)}"
313
- })
314
-
315
- # Combine results
316
- result_data = {
317
- "video_url": video_url,
318
- "question": question,
319
- "total_frames": len(frame_objects),
320
- "yolo_analysis": frame_objects,
321
- "frames_with_objects": len(frames_with_content)
322
- }
323
-
324
- if detailed_analyses:
325
- result_data["detailed_vision_analysis"] = detailed_analyses
326
- result_data["vision_frames_analyzed"] = len(detailed_analyses)
327
-
328
- return json.dumps(result_data, indent=2)
329
-
330
- except Exception as e:
331
- return f"Error analyzing video content: {str(e)}"
332
- @tool
333
- def google_search():
334
- """Google search tool"""
335
- api_wrapper = GoogleSearchAPIWrapper(
336
- google_api_key=GOOGLE_API_KEY,
337
- google_cse_id=GOOGLE_CSE_ID,
338
- k=10, # Number of results
339
- siterestrict=False # Site restrictions
340
- )
341
- google_search = GoogleSearchRun(api_wrapper=api_wrapper)
342
- return google_search
343
-
344
- @tool
345
- def wiki_search():
346
- """Google search tool"""
347
- api_wrapper = WikipediaAPIWrapper()
348
- search = WikipediaQueryRun(api_wrapper=api_wrapper)
349
- return search
350
-
351
- @tool
352
- def arxiv_search():
353
- """Google search tool"""
354
- api_wrapper = ArxivAPIWrapper()
355
- search = ArxivQueryRun(api_wrapper=api_wrapper)
356
- return search
357
- def general_tools():
358
- tools = [
359
- analyze_image,
360
- read_python_file,
361
- transcribe_audio,
362
- ]
363
- return tools
364
-
365
- def analyze_video_tools():
366
- tools = [object_detection, analyze_video_content]
367
- return tools
368
-
369
- def youtube_transcript_tools():
370
- tools = [get_youtube_transcript]
371
- return tools
372
-
373
- def file_agent_tools():
374
- tools = [read_excel]
375
- return tools
376
-
377
- def math_agent_tools():
378
- tools = [add, sum_list]
379
- return tools
380
-
381
- def data_agent_tools():
382
- tools = [extract_values, filter_rows]
383
- return tools
384
-
385
- def search_agen_tools():
386
- tools = [
387
- google_search,
388
- ArxivQueryRun(api_wrapper=ArxivAPIWrapper()),
389
- WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
390
- ]
391
- return tools
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
yolo11n.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ebbc80d4a7680d14987a577cd21342b65ecfd94632bd9a8da63ae6417644ee1
3
- size 5613764