silasyl committed on
Commit
ecbc0b3
·
1 Parent(s): 2705160

Initial commit with LFS-tracked files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,15 @@
1
  ---
2
  title: Template Final Assignment
3
- emoji: 💻
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.25.2
8
  app_file: app.py
9
  pinned: false
 
 
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Template Final Assignment
3
+ emoji: 🕵🏻‍♂️
4
+ colorFrom: indigo
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.25.2
8
  app_file: app.py
9
  pinned: false
10
+ hf_oauth: true
11
+ # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
12
+ hf_oauth_expiration_minutes: 480
13
  ---
14
 
15
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import inspect
4
+ import json
5
+ import numpy as np
6
+ import pandas as pd
7
+ import requests
8
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, Tool, VisitWebpageTool
9
+ from tools import WikipediaSummaryTool, WikipediaPageTool, YouTubeVisionAnalyzer, YouTubeTranscriptTool, AudioFileTranscriptTool, PythonFileReader, ExcelFileLoader
10
+ from vision_llm import call_vision_llm
11
+ from final_answer_llm import check_final_answer
12
+
13
+
14
+ # (Keep Constants as is)
15
+ # --- Constants ---
16
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
+
18
+ # --- Basic Agent Definition ---
19
class BasicAgent:
    """Question-answering agent backed by a smolagents ``CodeAgent``.

    The agent is given web-search, Wikipedia, YouTube, audio, Python-file,
    Excel and vision tools, and post-processes every answer with
    ``check_final_answer``.
    """

    def __init__(self, api_url):
        """Build the LLM client and the tool-equipped agent.

        Args:
            api_url: Base URL of the scoring API. It is forwarded to the agent
                as the download location for files attached to a question.

        Raises:
            RuntimeError: If the ``OPENAI_API_KEY`` environment variable is
                missing or empty.
        """
        # OpenAI credentials. Fail with an explicit message instead of the
        # opaque TypeError that assigning None into os.environ would raise.
        openai_api_key = os.getenv("OPENAI_API_KEY")
        if not openai_api_key:
            raise RuntimeError("OPENAI_API_KEY environment variable is not set.")
        self.api_url = api_url

        # Initialize the LLM
        model = OpenAIServerModel(
            api_key=openai_api_key,
            model_id="gpt-4o-mini",
            temperature=0,
        )
        self.model = model

        # Create main agent with all the tools
        self.main_agent = CodeAgent(
            tools=[
                DuckDuckGoSearchTool(),
                VisitWebpageTool(),
                WikipediaSummaryTool(),
                WikipediaPageTool(),
                YouTubeVisionAnalyzer(),
                YouTubeTranscriptTool(),
                AudioFileTranscriptTool(),
                PythonFileReader(),
                ExcelFileLoader(),
                call_vision_llm,
            ],
            model=model,
            max_steps=15,
            planning_interval=5,
            additional_authorized_imports=[
                "pandas",
                "json",
                "numpy",
            ],
        )
        print("BasicAgent initialized.")

    @staticmethod
    def _build_prompt(question, task_id, file_name, api_url):
        """Return the prompt for the agent, adding file metadata when needed.

        A question counts as having an attachment only when ``file_name`` is a
        non-empty value; a missing key (``None``) is treated like ``""``.
        """
        if not file_name:
            return question
        # Add metadata for file download
        return f"User query:\n{question}\n\nfile_id:\n{task_id}\n\nfile_url:\n{api_url}"

    def __call__(self, question_data: dict) -> str:
        """Answer one question.

        Args:
            question_data: Mapping with the keys ``task_id``, ``question`` and
                ``file_name`` (``file_name`` is empty when nothing is attached).

        Returns:
            The agent's answer after post-processing by ``check_final_answer``.
        """
        task_id = question_data.get("task_id")
        # Tolerate a missing question so the log line below cannot crash.
        question = question_data.get("question") or ""
        file_name = question_data.get("file_name")

        print(f"Agent received question (first 50 chars): {question[:50]}...")

        question = self._build_prompt(question, task_id, file_name, self.api_url)

        response = self.main_agent.run(question)

        final_response = check_final_answer(question, response)

        print(f"Agent returning response: {final_response}")
        return final_response
77
+
78
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetch the questions, run the BasicAgent on them and report the results.

    Submission to the scoring endpoint is currently disabled (the request code
    is kept below as a comment), so the function only runs the agent and
    returns what it answered.

    Args:
        profile: OAuth profile of the logged-in Hugging Face user, or None
            when nobody is logged in.

    Returns:
        A ``(status_message, results)`` tuple, where ``results`` is a pandas
        DataFrame of the questions and answers, or None when the run stopped
        before the agent was executed.
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code

    if not profile:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None
    username = f"{profile.username}"
    print(f"User logged in: {username}")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent (modify this part to create your agent)
    try:
        agent = BasicAgent(api_url)
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # When the app runs as a Hugging Face Space, this link points to the
    # codebase (useful for others, so please keep it public).
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        # requests' JSONDecodeError is itself a RequestException, so a separate
        # handler placed after this one could never run. Log the decode
        # diagnostics here, then fall back to the backup file either way.
        if isinstance(e, requests.exceptions.JSONDecodeError):
            print(f"Error decoding JSON response from questions endpoint: {e}")
            print(f"Response text: {response.text[:500]}")
        else:
            print(f"Error fetching questions: {e}")
        print("Trying to load questions from backup JSON...")
        try:
            with open("questions_data.json", "r", encoding="utf-8") as f:
                questions_data = json.load(f)
            print(f"Loaded {len(questions_data)} questions.")
        except Exception as json_e:
            print(f"Failed to load backup questions: {json_e}")
            return f"Error fetching from API and backup failed: {json_e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run your Agent
    results_log = []
    answers_payload = []
    # Debug subset: only the questions with attached files. Remove this later.
    # Indices beyond the end of the list are skipped instead of raising IndexError.
    debug_indices = (3, 9, 11, 18)
    questions_data = [questions_data[i] for i in debug_indices if i < len(questions_data)]
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(item)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # One failing question must not abort the whole run.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    # NOTE: submission is disabled while debugging; re-enable the block below
    # to post the answers to the scoring endpoint.
    # try:
    #     response = requests.post(submit_url, json=submission_data, timeout=60)
    #     response.raise_for_status()
    #     result_data = response.json()
    #     final_status = (
    #         f"Submission Successful!\n"
    #         f"User: {result_data.get('username')}\n"
    #         f"Overall Score: {result_data.get('score', 'N/A')}% "
    #         f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
    #         f"Message: {result_data.get('message', 'No message received.')}"
    #     )
    #     print("Submission successful.")
    #     results_df = pd.DataFrame(results_log)
    #     return final_status, results_df
    # except requests.exceptions.HTTPError as e:
    #     error_detail = f"Server responded with status {e.response.status_code}."
    #     try:
    #         error_json = e.response.json()
    #         error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
    #     except requests.exceptions.JSONDecodeError:
    #         error_detail += f" Response: {e.response.text[:500]}"
    #     status_message = f"Submission Failed: {error_detail}"
    #     print(status_message)
    #     results_df = pd.DataFrame(results_log)
    #     return status_message, results_df
    # except requests.exceptions.Timeout:
    #     status_message = "Submission Failed: The request timed out."
    #     print(status_message)
    #     results_df = pd.DataFrame(results_log)
    #     return status_message, results_df
    # except requests.exceptions.RequestException as e:
    #     status_message = f"Submission Failed: Network error - {e}"
    #     print(status_message)
    #     results_df = pd.DataFrame(results_log)
    #     return status_message, results_df
    # except Exception as e:
    #     status_message = f"An unexpected error occurred during submission: {e}"
    #     print(status_message)
    #     results_df = pd.DataFrame(results_log)
    #     return status_message, results_df

    # Return the collected answers so the results table is not left empty.
    return "finished", pd.DataFrame(results_log)
206
+
207
+
208
+ # --- Build Gradio Interface using Blocks ---
209
# Help text rendered at the top of the page.
_INSTRUCTIONS_MARKDOWN = """
**Instructions:**

1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

---
**Disclaimers:**
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
"""

# Page layout: title, instructions, login button, run button, then the two
# output widgets that run_and_submit_all fills in.
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(_INSTRUCTIONS_MARKDOWN)

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(
        label="Run Status / Submission Result",
        lines=5,
        interactive=False,
    )
    # Removed max_rows=10 from DataFrame constructor
    results_table = gr.DataFrame(
        label="Questions and Agent Answers",
        wrap=True,
    )

    # No explicit inputs are declared for the click handler.
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
238
+
239
if __name__ == "__main__":
    # Script entry point: print diagnostic information, then start the UI.
    banner_title = " App Starting "
    print("tests 3 (chess png), 9 (mp3 recipe), 11 (py), 18 (xlsx)")
    print("\n" + "-"*30 + banner_title + "-"*30)

    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if not space_host_startup:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
    else:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")

    if not space_id_startup:
        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
    else:
        # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")

    # Closing rule, as wide as the opening banner.
    print("-"*(60 + len(banner_title)) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)
files/1f975693-876d-457b-a649-393859e79bf3.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:200f767e732b49efef5c05d128903ee4d2c34e66fdce7f5593ac123b2e637673
3
+ size 280868
files/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx ADDED
Binary file (5.29 kB). View file
 
files/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b218c951c1f888f0bbe6f46c080f57afc7c9348fffc7ba4da35749ff1e2ac40f
3
+ size 179304
files/cca530fc-4052-43b2-b130-b30968d8aa44.png ADDED
files/f918266a-b3e0-4914-865d-4faa564f1aef.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from random import randint
2
+ import time
3
+
4
class UhOh(Exception):
    """Raised by ``Hmm.Yeah`` when the held value is not zero."""


class Hmm:
    """Holds one random integer drawn from the inclusive range [-100, 100]."""

    def __init__(self):
        self.value = randint(-100, 100)

    def Yeah(self):
        """Return True when the value is zero, otherwise raise ``UhOh``."""
        if self.value != 0:
            raise UhOh()
        return True


def Okay():
    """Yield an endless stream of fresh ``Hmm`` instances."""
    while True:
        candidate = Hmm()
        yield candidate


def keep_trying(go, first_try=True):
    """Pull candidates from ``go`` until one is accepted and return its value.

    The progress messages are printed only on the first rejected attempt;
    every rejection pauses for 0.1 seconds before retrying.
    """
    candidate = next(go)
    try:
        if candidate.Yeah():
            return candidate.value
    except UhOh:
        if first_try:
            print("Working...")
            print("Please wait patiently...")
        time.sleep(0.1)
        return keep_trying(go, first_try=False)


if __name__ == "__main__":
    print(f"{keep_trying(Okay())}")
final_answer_llm.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from smolagents import OpenAIServerModel
3
+
4
+
5
# System prompt for the answer-shortening model.
system_text = """
Your task is to make the given answer as concise as possible **without changing its meaning** or introducing any new information.
Only shorten it by removing redundancy or unnecessary phrasing. You must preserve the original answer's facts and structure. Do not generate new content.
Only output the revised answer. Do not include explanations or formatting like "Answer:" or "Final Answer".
"""


def check_final_answer(question: str, answer: str, model=None) -> str:
    """
    Pass the question and answer to a LLM, to make it proper for GAIA comparison.

    The revised answer is stripped of surrounding whitespace. When the model
    returns no usable text, the original answer is returned (as a string)
    instead of None or an empty string.

    Args:
        question: Question.
        answer: Original final answer.
        model: Optional callable that takes the chat messages and returns an
            object with a ``content`` attribute. Defaults to a new
            ``OpenAIServerModel`` using ``gpt-4o-mini``; pass an existing model
            to avoid building a client on every call.
    """
    if model is None:
        model = OpenAIServerModel(
            api_key=os.getenv("OPENAI_API_KEY"),
            model_id="gpt-4o-mini",
            temperature=0,
        )

    messages = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": system_text,
                }
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Question:\n{question}\n\nOriginal Answer:\n{answer}",
                }
            ],
        },
    ]
    revised = model(messages).content

    # Never hand back None or blank text: keep the agent's own answer instead.
    if not isinstance(revised, str) or not revised.strip():
        return str(answer)
    return revised.strip()
questions_data.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
4
+ "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
5
+ "Level": "1",
6
+ "file_name": ""
7
+ },
8
+ {
9
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
10
+ "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
11
+ "Level": "1",
12
+ "file_name": ""
13
+ },
14
+ {
15
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
16
+ "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
17
+ "Level": "1",
18
+ "file_name": ""
19
+ },
20
+ {
21
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
22
+ "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
23
+ "Level": "1",
24
+ "file_name": "cca530fc-4052-43b2-b130-b30968d8aa44.png"
25
+ },
26
+ {
27
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
28
+ "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
29
+ "Level": "1",
30
+ "file_name": ""
31
+ },
32
+ {
33
+ "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
34
+ "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
35
+ "Level": "1",
36
+ "file_name": ""
37
+ },
38
+ {
39
+ "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
40
+ "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
41
+ "Level": "1",
42
+ "file_name": ""
43
+ },
44
+ {
45
+ "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
46
+ "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
47
+ "Level": "1",
48
+ "file_name": ""
49
+ },
50
+ {
51
+ "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
52
+ "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
53
+ "Level": "1",
54
+ "file_name": ""
55
+ },
56
+ {
57
+ "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
58
+ "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
59
+ "Level": "1",
60
+ "file_name": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3"
61
+ },
62
+ {
63
+ "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
64
+ "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
65
+ "Level": "1",
66
+ "file_name": ""
67
+ },
68
+ {
69
+ "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
70
+ "question": "What is the final numeric output from the attached Python code?",
71
+ "Level": "1",
72
+ "file_name": "f918266a-b3e0-4914-865d-4faa564f1aef.py"
73
+ },
74
+ {
75
+ "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
76
+ "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
77
+ "Level": "1",
78
+ "file_name": ""
79
+ },
80
+ {
81
+ "task_id": "1f975693-876d-457b-a649-393859e79bf3",
82
+ "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
83
+ "Level": "1",
84
+ "file_name": "1f975693-876d-457b-a649-393859e79bf3.mp3"
85
+ },
86
+ {
87
+ "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
88
+ "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
89
+ "Level": "1",
90
+ "file_name": ""
91
+ },
92
+ {
93
+ "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
94
+ "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
95
+ "Level": "1",
96
+ "file_name": ""
97
+ },
98
+ {
99
+ "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
100
+ "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
101
+ "Level": "1",
102
+ "file_name": ""
103
+ },
104
+ {
105
+ "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
106
+ "question": "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
107
+ "Level": "1",
108
+ "file_name": ""
109
+ },
110
+ {
111
+ "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
112
+ "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
113
+ "Level": "1",
114
+ "file_name": "7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx"
115
+ },
116
+ {
117
+ "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
118
+ "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
119
+ "Level": "1",
120
+ "file_name": ""
121
+ }
122
+ ]
requirements.txt ADDED
Binary file (4.21 kB). View file
 
tools.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import cv2
3
+ import io
4
+ import os
5
+ import requests
6
+ import whisper
7
+ import wikipedia
8
+ import yt_dlp
9
+ from dotenv import load_dotenv
10
+ from PIL import Image
11
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, Tool, VisitWebpageTool
12
+ from youtube_transcript_api import YouTubeTranscriptApi
13
+
14
+
15
# Load variables from a .env file into the process environment.
load_dotenv()

# OpenAI credentials used by the tools in this module.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is not None:
    # Assigning None into os.environ raises TypeError, which would make this
    # module impossible to import whenever the key is not configured.
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
20
+
21
+
22
def get_file_content(file_id: str, url: str = None):
    """Return the bytes of the local file whose name starts with ``file_id``.

    This simulates a download: files are read from the local ``files`` folder
    instead of being fetched, and the extension is ignored when matching.

    Args:
        file_id: Identifier that the file name must start with.
        url: Unused; kept so the signature matches a real download helper.

    Returns:
        The file content as bytes, or None when ``file_id`` is empty or no
        file matches.

    Raises:
        FileNotFoundError: If the ``files`` folder does not exist.
    """
    folder_path = "files"
    # An empty prefix would match every file and return an arbitrary one.
    if not file_id:
        return None
    # Sort so the result is deterministic if several files share the prefix.
    for filename in sorted(os.listdir(folder_path)):
        if filename.startswith(file_id):
            with open(os.path.join(folder_path, filename), "rb") as f:
                return f.read()
    return None
33
+
34
+
35
class WikipediaSummaryTool(Tool):
    """Tool that returns the Wikipedia summary of a topic (English edition)."""

    name = "wikipedia_summary"
    description = "Fetches a summary of a topic from Wikipedia."
    inputs = {
        "query": {
            "type": "string",
            "description": "The topic to search on Wikipedia."
        }
    }
    output_type = "string"

    def __init__(self):
        # Sets the language on the wikipedia module itself.
        wikipedia.set_lang("en")

    def is_initialized(self) -> bool:
        return True

    def forward(self, query: str):
        """Return the summary for ``query``.

        Lookup problems are reported as a readable message rather than an
        exception, matching how the transcript tool reports its errors.
        """
        try:
            # Calls wikipedia api
            return wikipedia.summary(query)
        except wikipedia.exceptions.DisambiguationError as e:
            # Offer a few concrete titles to retry with.
            options = ", ".join(str(option) for option in e.options[:10])
            return f"'{query}' is ambiguous on Wikipedia. Try one of: {options}"
        except wikipedia.exceptions.PageError:
            return f"No Wikipedia page found for '{query}'."
56
+
57
+
58
class WikipediaPageTool(Tool):
    """Tool that returns the full text of a Wikipedia page (English edition)."""

    name = "wikipedia_page"
    description = "Fetches the complete page of a topic from Wikipedia."
    inputs = {
        "query": {
            "type": "string",
            "description": "The topic to search on Wikipedia."
        }
    }
    output_type = "string"

    def __init__(self):
        # Sets the language on the wikipedia module itself.
        wikipedia.set_lang("en")

    def is_initialized(self) -> bool:
        return True

    def forward(self, query: str):
        """Return the page content for ``query``.

        Lookup problems are reported as a readable message rather than an
        exception, matching how the transcript tool reports its errors.
        """
        try:
            # Calls wikipedia api
            page = wikipedia.page(query)
        except wikipedia.exceptions.DisambiguationError as e:
            # Offer a few concrete titles to retry with.
            options = ", ".join(str(option) for option in e.options[:10])
            return f"'{query}' is ambiguous on Wikipedia. Try one of: {options}"
        except wikipedia.exceptions.PageError:
            return f"No Wikipedia page found for '{query}'."
        return page.content
79
+
80
+
81
+ class YouTubeVisionAnalyzer(Tool):
82
+ name = "youtube_vision_analyzer"
83
+ description = "Analyzes visual content from YouTube videos by extracting and processing frames. It does not process audio or subtitles, and is best used for tasks involving objects, scenes, or visual patterns appearing in the video."
84
+ inputs = {
85
+ "video_url": {
86
+ "type": "string",
87
+ "description": "The URL of the YouTube video to process."
88
+ },
89
+ "user_query": {
90
+ "type": "string",
91
+ "description": "The user's query."
92
+ }
93
+ }
94
+ output_type = "string"
95
+
96
+ def __init__(self):
97
+ pass
98
+
99
+ def is_initialized(self) -> bool:
100
+ return True
101
+
102
+ @staticmethod
103
+ def download_youtube_video(url: str):
104
+ # Download the video using yt-dlp (saves as youtube_video.mp4)
105
+ ydl_opts = {
106
+ 'format': 'mp4',
107
+ 'outtmpl': 'youtube_video.mp4'
108
+ }
109
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
110
+ ydl.download([url])
111
+ return 'youtube_video.mp4'
112
+
113
+ @staticmethod
114
+ def extract_frames(video_path: str, output_dir="frames"):
115
+ os.makedirs(output_dir, exist_ok=True)
116
+
117
+ cap = cv2.VideoCapture(video_path)
118
+ fps = cap.get(cv2.CAP_PROP_FPS)
119
+ frame_interval = int(fps * 5) # 5 seconds
120
+
121
+ frame_count = 0
122
+ saved_count = 0
123
+
124
+ while cap.isOpened():
125
+ ret, frame = cap.read()
126
+ if not ret:
127
+ break
128
+ if frame_count % frame_interval == 0:
129
+ frame_filename = os.path.join(output_dir, f"frame_{saved_count:03d}.jpg")
130
+ cv2.imwrite(frame_filename, frame)
131
+ saved_count += 1
132
+
133
+ frame_count += 1
134
+ cap.release()
135
+
136
+ return output_dir
137
+
138
+ @staticmethod
139
+ def encode_image(image_path:str, new_size=512):
140
+ # Resize image to upper 512 pixels and return in base64 format
141
+
142
+ with Image.open(image_path) as image:
143
+ original_width, original_height = image.size
144
+ if original_width > original_height:
145
+ ratio = new_size / original_width
146
+ else:
147
+ ratio = new_size / original_height
148
+
149
+ new_width = int(original_width * ratio)
150
+ new_height = int(original_height * ratio)
151
+
152
+ resized_image = image.resize((new_width, new_height))
153
+
154
+ buffered = io.BytesIO()
155
+ resized_image.save(buffered, format='JPEG')
156
+ return base64.b64encode(buffered.getvalue()).decode('utf-8')
157
+
158
+ @staticmethod
159
+ def call_vision_llm(folder_path: str, user_query: str):
160
+ encoded_images = []
161
+ responses = []
162
+
163
+ model = OpenAIServerModel(
164
+ api_key=OPENAI_API_KEY,
165
+ model_id='gpt-4o-mini',
166
+ temperature=0,
167
+ )
168
+
169
+ for filename in sorted(os.listdir(folder_path)):
170
+ if filename.endswith(".jpg"):
171
+ img_path = os.path.join(folder_path, filename)
172
+ encoded_image = YouTubeVisionAnalyzer.encode_image(img_path)
173
+ encoded_images.append(encoded_image)
174
+
175
+ batch_size = 12
176
+ for i in range(0, len(encoded_images), batch_size):
177
+ batch = encoded_images[i:i+batch_size]
178
+
179
+ messages = [
180
+ {
181
+ "role": "system",
182
+ "content": [
183
+ {
184
+ "type": "text",
185
+ "text": "You are an assistant analyzing image frames extracted from a video. If the user query refers to a video, remember these are frames from the video. Do not provide extra information or external inference.",
186
+ }
187
+ ]
188
+ },
189
+ {
190
+ "role": "user",
191
+ "content": [
192
+ {
193
+ "type": "text",
194
+ "text": user_query,
195
+ },
196
+ *[
197
+ {
198
+ "type": "image_url",
199
+ "image_url": {
200
+ "url": f"data:image/jpeg;base64,{encoded_image}",
201
+ "detail": "low"
202
+ }
203
+ }
204
+ for encoded_image in batch
205
+ ]
206
+ ]
207
+ }
208
+ ]
209
+ responses.append(model(messages).content)
210
+
211
+ messages = [
212
+ {
213
+ "role": "system",
214
+ "content": "You are a helpful assistant that summarizes and extracts the correct answer from multiple partial observations. Each partial response comes from analyzing a batch of video frames. Given the user's query and the list of partial responses, your task is to provide the best final answer to the user's query. Be concise in the final answer."
215
+ },
216
+ {
217
+ "role": "user",
218
+ "content": f"User's query:\n{user_query}.\n\nPartial responses:\n" + "\n".join(f"- {response}" for response in responses)
219
+ }
220
+ ]
221
+
222
+ final_response = model(messages).content
223
+
224
+ return final_response
225
+
226
@staticmethod
def delete_video_file(video_path: str, folder_path: str):
    """Remove the downloaded video and the extracted ``.jpg`` frames.

    Args:
        video_path: Path of the video file to delete; skipped when it does
            not exist.
        folder_path: Directory holding the extracted frames; skipped when it
            does not exist. Only entries ending in ``.jpg`` are removed and
            the directory itself is left in place.
    """
    if os.path.exists(video_path):
        os.remove(video_path)

    if not os.path.exists(folder_path):
        return

    # Only the frame images are removed; any other entry is left untouched.
    frame_names = [entry for entry in os.listdir(folder_path) if entry.endswith(".jpg")]
    for frame_name in frame_names:
        os.remove(os.path.join(folder_path, frame_name))
236
+
237
def forward(self, video_url: str, user_query: str):
    """Answer ``user_query`` about the video at ``video_url``.

    Downloads the video, extracts frames from it, sends the frames to the
    vision LLM together with the query, and removes the temporary files
    afterwards.

    Args:
        video_url: URL of the YouTube video to analyze.
        user_query: Question to answer from the video frames.

    Returns:
        The response produced by ``call_vision_llm``.
    """
    video_path = YouTubeVisionAnalyzer.download_youtube_video(video_url)

    # Stays empty until frame extraction succeeds; delete_video_file skips a
    # folder path that does not exist, so "" is a safe placeholder.
    folder_path = ""
    try:
        folder_path = YouTubeVisionAnalyzer.extract_frames(video_path)
        return YouTubeVisionAnalyzer.call_vision_llm(folder_path, user_query)
    finally:
        # Clean up even when extraction or the LLM call raises, so a failed
        # run does not leave the video and its frames on disk.
        YouTubeVisionAnalyzer.delete_video_file(video_path, folder_path)
245
+
246
+
247
class YouTubeTranscriptTool(Tool):
    """Tool that fetches the captions of a YouTube video."""

    name = "youtube_transcript_tool"
    description = "Extracts textual transcripts (captions) from YouTube videos to analyze spoken content. This tool is useful for identifying what is said in the video, such as dialogue, spoken instructions, or narration. It does not analyze visual elements like scenes or objects. Pay attention because transcriptions may be truncated."
    inputs = {
        "video_url": {
            "type": "string",
            "description": "The YouTube video URL."
        }
    }
    output_type = "string"

    def __init__(self):
        pass

    def is_initialized(self) -> bool:
        """Always report the tool as initialized."""
        return True

    @staticmethod
    def _extract_video_id(video_url: str) -> str:
        """Return the video id contained in ``video_url``.

        Handles ``watch?v=<id>`` URLs (ignoring extra query parameters such
        as ``&t=`` or ``&list=``), ``youtu.be/<id>`` short links and
        ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>``
        paths. Anything else falls back to the text after the last ``v=``,
        so a bare video id is returned unchanged.
        """
        from urllib.parse import parse_qs, urlparse

        parsed = urlparse(video_url.strip())

        # Standard watch URL: the id is the first value of the "v" parameter.
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        host = (parsed.hostname or "").lower()
        segments = [segment for segment in parsed.path.split("/") if segment]

        # Short link: the id is the first path segment.
        if host.endswith("youtu.be") and segments:
            return segments[0]

        # Path-style URLs carry the id right after a fixed prefix segment.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

        return video_url.split("v=")[-1]

    def forward(self, video_url: str):
        """Fetch the transcript of the video at ``video_url``.

        Args:
            video_url: The YouTube video URL.

        Returns:
            Whatever ``YouTubeTranscriptApi.get_transcript`` returns, or the
            error message as a string when fetching fails.
        """
        video_id = self._extract_video_id(video_url)

        try:
            # NOTE(review): the API result is returned as-is although
            # output_type is "string" - confirm what callers expect.
            return YouTubeTranscriptApi.get_transcript(video_id)
        except Exception as e:
            # Best effort: the error text is handed back to the caller
            # instead of raising.
            return str(e)
275
+
276
+
277
class AudioFileTranscriptTool(Tool):
    """Tool that downloads an audio file and transcribes it with Whisper."""

    name = "audio_file_transcript_tool"
    description = "Extracts text transcripts from uploaded audio files (e.g., MP3, WAV). Use this tool to analyze spoken content from user-provided files, not from YouTube or video links. It only processes audio, not visual information."
    inputs = {
        "file_id": {
            "type": "string",
            "description": "Metadata required to download the audio."
        },
        "file_url": {
            "type": "string",
            "description": "Metadata required to download the audio."
        },
    }
    output_type = "string"

    def __init__(self):
        # Load the Whisper "base" model once, on CPU, and reuse it for every call.
        self.whisper_model = whisper.load_model("base", device="cpu")

    def is_initialized(self) -> bool:
        """Always report the tool as initialized."""
        return True

    def forward(self, file_id: str, file_url: str):
        """Download an audio file and return its transcript.

        Args:
            file_id: Identifier passed to ``get_file_content``.
            file_url: Base URL passed to ``get_file_content``.

        Returns:
            The ``'text'`` entry of the Whisper transcription result.
        """
        import tempfile

        response = get_file_content(file_id, file_url)

        # Whisper is given a file path, so the bytes are written to disk. A
        # unique temporary file is used instead of a fixed "audio.mp3" so
        # overlapping calls cannot overwrite each other's audio.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
            audio_file.write(response.content)
            audio_path = audio_file.name

        try:
            result = self.whisper_model.transcribe(audio_path, language="en", fp16=False)
        finally:
            # Remove the file even when transcription fails.
            os.remove(audio_path)

        return result['text']
317
+
318
+
319
class PythonFileReader(Tool):
    """Tool that downloads a Python source file and returns its text."""

    name = "python_file_reader"
    description = "Extracts the full text content of a Python (.py) file so that it can be analyzed by the agent."
    inputs = {
        "file_id": {"type": "string", "description": "Metadata required to download the file."},
        "file_url": {"type": "string", "description": "Metadata required to download the file."},
    }
    output_type = "string"

    def __init__(self):
        """No state to set up."""

    def is_initialized(self) -> bool:
        """Always report the tool as initialized."""
        return True

    def forward(self, file_id: str, file_url: str):
        """Download the file and return its content decoded as UTF-8.

        Args:
            file_id: Identifier passed to ``get_file_content``.
            file_url: Base URL passed to ``get_file_content``.

        Returns:
            The downloaded bytes decoded as UTF-8 text.
        """
        downloaded = get_file_content(file_id, file_url)
        raw_bytes = downloaded.content
        return raw_bytes.decode("utf-8")
351
+
352
+
353
class ExcelFileLoader(Tool):
    """Tool that downloads a spreadsheet and stores it as ``sheet.xlsx``."""

    name = "excel_file_loader"
    description = "Downloads and stores an Excel spreadsheet (.xlsx) locally as 'sheet.xlsx' so it can be programmatically analyzed by the agent using tools like pandas. This tool does not interpret or summarize the data itself — it only ensures the file is available in the environment."
    inputs = {
        "file_id": {"type": "string", "description": "Metadata required to download the file."},
        "file_url": {"type": "string", "description": "Metadata required to download the file."},
    }
    output_type = "string"

    def __init__(self):
        """No state to set up."""

    def is_initialized(self) -> bool:
        """Always report the tool as initialized."""
        return True

    def forward(self, file_id: str, file_url: str):
        """Download the spreadsheet and write it to ``sheet.xlsx``.

        Args:
            file_id: Identifier passed to ``get_file_content``.
            file_url: Base URL passed to ``get_file_content``.

        Returns:
            The name of the file that was written, ``"sheet.xlsx"``.
        """
        target_name = "sheet.xlsx"
        downloaded = get_file_content(file_id, file_url)

        # The file name is fixed because the tool description advertises it.
        with open(target_name, "wb") as sheet_file:
            sheet_file.write(downloaded.content)

        return target_name
386
+
vision_llm.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import base64
3
+ import os
4
+ import requests
5
+ from PIL import Image
6
+ from smolagents import tool, OpenAIServerModel
7
+ from tools import get_file_content
8
+
9
+
10
def encode_image(image_bytes: bytes, new_size=512):
    """Resize an image and return it as a base64-encoded JPEG string.

    The image is converted to RGB and scaled so that its longer side is
    ``new_size`` pixels, keeping the aspect ratio. Images smaller than
    ``new_size`` are enlarged by the same rule.

    Args:
        image_bytes: Raw bytes of an image file that PIL can open.
        new_size: Target length in pixels of the longer side.

    Returns:
        The resized image, saved as JPEG and base64-encoded, as a ``str``.
    """
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    original_width, original_height = image.size

    # Scale by the longer side so the aspect ratio is preserved.
    ratio = new_size / max(original_width, original_height)

    # Clamp to 1 px: for very elongated images, truncation would otherwise
    # produce a zero-sized dimension and the resize would fail.
    new_width = max(1, int(original_width * ratio))
    new_height = max(1, int(original_height * ratio))

    resized_image = image.resize((new_width, new_height))
    buffered = io.BytesIO()
    resized_image.save(buffered, format='JPEG')
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
28
+
29
def download_image(task_id: str, api_url: str) -> str:
    """Download an image file and return it base64-encoded.

    Args:
        task_id: Identifier passed to ``get_file_content``.
        api_url: Base URL passed to ``get_file_content``.

    Returns:
        The result of ``encode_image`` applied to the downloaded bytes.
    """
    response = get_file_content(task_id, api_url)
    return encode_image(response.content)
36
+
37
+
38
@tool
def call_vision_llm(user_query: str, file_id: str, file_url: str) -> str:
    """
    Downloads the image using the file_id and file_url, then analyzes it using a vision-based LLM, following user query.

    Args:
        user_query: User request on image.
        file_id: metadata required to download the image.
        file_url: metadata required to download the image.
    """
    # The docstring above is kept verbatim in case the @tool decorator reads it.
    image_b64 = download_image(file_id, file_url)

    llm = OpenAIServerModel(
        api_key=os.getenv('OPENAI_API_KEY'),
        model_id='gpt-4o-mini',
        temperature=0,
    )

    # One user message carrying the query text followed by the image as a
    # low-detail JPEG data URL.
    text_part = {"type": "text", "text": user_query}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{image_b64}",
            "detail": "low",
        },
    }
    answer = llm([{"role": "user", "content": [text_part, image_part]}])

    return answer.content