santiagoahl commited on
Commit
5dfdf10
·
1 Parent(s): 28a47c2

Integrate Project work to HF template

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +0 -35
  2. .gitignore +0 -1
  3. README.md +110 -15
  4. app.py +0 -196
  5. configs/agent_config.yaml +0 -0
  6. configs/tool_config.yaml +0 -0
  7. data/agent_experiments/iterations/1_initial_experiment.csv +47 -0
  8. data/agent_experiments/iterations/2_search_and_code_tools_integrated.csv +74 -0
  9. data/agent_experiments/iterations/3_integrate_whisper_audio_transcriber.csv +23 -0
  10. data/agent_experiments/iterations/4_test_workflow.csv +36 -0
  11. data/agent_experiments/iterations/5_integrate_text_processing_tool.csv +64 -0
  12. data/agent_experiments/iterations/6_integrate_text_handler_tool.csv +66 -0
  13. data/agent_experiments/iterations/7_test_agent_performance_against_tasks_with_attached_txt_files.csv +2 -0
  14. data/agent_experiments/iterations/8_integrate_chess_tool.csv +56 -0
  15. data/agent_experiments/summary.csv +9 -0
  16. data/images/agent_architecture.png +0 -0
  17. data/images/chess_board_example.png +0 -0
  18. data/images/cropped_chess_board.png +0 -0
  19. data/images/cropped_chess_board_gray.png +0 -0
  20. data/temp-data/temp_json.json +1 -0
  21. data/temp-data/temp_text.txt +2 -0
  22. data/temp/results_web_search.csv +39 -0
  23. external/chess-modules/board_to_fen/.github/workflows/workflow.yml +51 -0
  24. external/chess-modules/board_to_fen/.gitignore +138 -0
  25. external/chess-modules/board_to_fen/CHANGELOG.txt +26 -0
  26. external/chess-modules/board_to_fen/LICENSE +21 -0
  27. external/chess-modules/board_to_fen/MANIFEST.in +1 -0
  28. external/chess-modules/board_to_fen/README.md +86 -0
  29. external/chess-modules/board_to_fen/board_to_fen/KerasNeuralNetwork.py +52 -0
  30. external/chess-modules/board_to_fen/board_to_fen/__init__.py +0 -0
  31. external/chess-modules/board_to_fen/board_to_fen/predict.py +102 -0
  32. external/chess-modules/board_to_fen/board_to_fen/saved_models/__init__.py +0 -0
  33. external/chess-modules/board_to_fen/board_to_fen/test_image.jpeg +0 -0
  34. external/chess-modules/board_to_fen/board_to_fen/train_model.py +22 -0
  35. external/chess-modules/board_to_fen/board_to_fen/utils.py +126 -0
  36. external/chess-modules/board_to_fen/poetry.lock +0 -0
  37. external/chess-modules/board_to_fen/pyproject.toml +18 -0
  38. external/chess-modules/board_to_fen/requirements.txt +3 -0
  39. external/chess-modules/board_to_fen/setup.py +27 -0
  40. external/chess-modules/board_to_fen/test_images/test_image2.jpeg +0 -0
  41. external/chess-modules/board_to_fen/test_predict.py +13 -0
  42. langgraph.json +6 -0
  43. main.py +0 -0
  44. notebooks/final_submition.ipynb +79 -0
  45. notebooks/gaia_level_one_questions_validation.ipynb +1311 -0
  46. notebooks/ml_agent_experimentation.ipynb +0 -0
  47. notebooks/tools_poc/audio_tool.ipynb +688 -0
  48. notebooks/tools_poc/chess_tool.ipynb +863 -0
  49. notebooks/tools_poc/template.ipynb +475 -0
  50. notebooks/tools_poc/web_search.ipynb +2011 -0
.gitattributes CHANGED
@@ -1,37 +1,2 @@
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
  *.pt filter=lfs diff=lfs merge=lfs -text
3
- *.7z filter=lfs diff=lfs merge=lfs -text
4
- *.arrow filter=lfs diff=lfs merge=lfs -text
5
- *.bin filter=lfs diff=lfs merge=lfs -text
6
- *.bz2 filter=lfs diff=lfs merge=lfs -text
7
- *.ckpt filter=lfs diff=lfs merge=lfs -text
8
- *.ftz filter=lfs diff=lfs merge=lfs -text
9
- *.gz filter=lfs diff=lfs merge=lfs -text
10
- *.h5 filter=lfs diff=lfs merge=lfs -text
11
- *.joblib filter=lfs diff=lfs merge=lfs -text
12
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
13
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
- *.model filter=lfs diff=lfs merge=lfs -text
15
- *.msgpack filter=lfs diff=lfs merge=lfs -text
16
- *.npy filter=lfs diff=lfs merge=lfs -text
17
- *.npz filter=lfs diff=lfs merge=lfs -text
18
- *.onnx filter=lfs diff=lfs merge=lfs -text
19
- *.ot filter=lfs diff=lfs merge=lfs -text
20
- *.parquet filter=lfs diff=lfs merge=lfs -text
21
- *.pb filter=lfs diff=lfs merge=lfs -text
22
- *.pickle filter=lfs diff=lfs merge=lfs -text
23
- *.pkl filter=lfs diff=lfs merge=lfs -text
24
- *.pt filter=lfs diff=lfs merge=lfs -text
25
- *.pth filter=lfs diff=lfs merge=lfs -text
26
- *.rar filter=lfs diff=lfs merge=lfs -text
27
- *.safetensors filter=lfs diff=lfs merge=lfs -text
28
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
- *.tar.* filter=lfs diff=lfs merge=lfs -text
30
- *.tar filter=lfs diff=lfs merge=lfs -text
31
- *.tflite filter=lfs diff=lfs merge=lfs -text
32
- *.tgz filter=lfs diff=lfs merge=lfs -text
33
- *.wasm filter=lfs diff=lfs merge=lfs -text
34
- *.xz filter=lfs diff=lfs merge=lfs -text
35
- *.zip filter=lfs diff=lfs merge=lfs -text
36
- *.zst filter=lfs diff=lfs merge=lfs -text
37
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
  *.pt filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -38,7 +38,6 @@ notebooks/tools_poc/*.pt
38
  *.npy
39
  *.npz
40
  *.joblib
41
- *.png
42
 
43
  ms-playwright/
44
  .playwright/
 
38
  *.npy
39
  *.npz
40
  *.joblib
 
41
 
42
  ms-playwright/
43
  .playwright/
README.md CHANGED
@@ -1,15 +1,110 @@
1
- ---
2
- title: Template Final Assignment
3
- emoji: 🕵🏻‍♂️
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.25.2
8
- app_file: app.py
9
- pinned: false
10
- hf_oauth: true
11
- # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
12
- hf_oauth_expiration_minutes: 480
13
- ---
14
-
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Chappie
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.16.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # **🤖 Chappi-e**
13
+
14
+ Chappi-e is a lightweight AI assistant agent built using the **ReAct architecture** with [LangGraph](https://docs.langgraph.dev/). It is designed to reason, plan, and execute actions using tools in a flexible and modular workflow.
15
+
16
+ ---
17
+
18
+
19
+ https://github.com/user-attachments/assets/8b1c4ffb-f1c3-426d-9ac7-77a81d04d991
20
+
21
+
22
+ ## **🛠️ Setup**
23
+
24
+ This project uses [Poetry](https://python-poetry.org/) for dependency management.
25
+
26
+ ### **1. Clone the repository**
27
+
28
+ ```bash
29
+ git clone https://github.com/yourusername/chappie.git
30
+ cd chappie
31
+ ```
32
+
33
+ ### **2. Install dependencies**
34
+
35
+ Make sure you have Poetry installed. Then run:
36
+
37
+ ```bash
38
+ # set the required environment variables (see "API Keys Required" below)
39
+ sudo apt install ffmpeg
40
+ poetry install
41
+ ```
42
+
43
+ ### **3. Activate the environment**
44
+
45
+ ```bash
46
+ poetry shell # Then activate the shown environment with conda or pyenv
47
+ ```
48
+
49
+ ---
50
+
51
+ ## **🔐 API Keys Required**
52
+
53
+ To run Chappie, you need valid API keys for the following services:
54
+
55
+ * [OpenAI](https://platform.openai.com/)
56
+ * [LangGraph](https://www.langgraph.dev/)
57
+ * [Tavily Search](https://docs.tavily.com/)
58
+ * [LangChain](https://www.langchain.com/) (used for tool wrappers/utilities)
59
+
60
+ You can provide these via environment variables or a `.env` file.
61
+
62
+ ---
63
+
64
+ ## **🧠 Architecture**
65
+
66
+ Chappie is a **ReAct-style agent**, meaning it follows a loop of:
67
+
68
+ 1. **Reason** about the query
69
+ 2. **Act** by calling a tool if needed
70
+ 3. **Observe** the result
71
+ 4. **Repeat or respond**
72
+
73
+ The logic is built using **LangGraph**, which provides a composable and stateful way to define agent workflows.
74
+
75
+ ---
76
+
77
+ ## **🚀 Run the agent**
78
+
79
+ You can invoke Chappie using:
80
+
81
+ ```bash
82
+ python -m src.agents.react
83
+ ```
84
+
85
+ ---
86
+
87
+ ## **📄 License**
88
+
89
+ MIT License.
90
+
91
+
92
+ ---
93
+
94
+ # TODO
95
+
96
+ 1. Include tutorial to download stockfish ->
97
+ ```bash
98
+ mkdir -p external/chess-modules
99
+ curl -L https://github.com/official-stockfish/Stockfish/releases/latest/download/stockfish-ubuntu-x86-64-avx2.tar -o external/chess-modules/stockfish-ubuntu-x86-64-avx2.tar
100
+ mkdir -p external/chess-modules/stockfish
101
+ tar -xvf external/chess-modules/stockfish-ubuntu-x86-64-avx2.tar -C external/chess-modules/stockfish --strip-components=1
102
+ rm external/chess-modules/stockfish-ubuntu-x86-64-avx2.tar
103
+ ```
104
+ 2. Change repo name from chappie to chappi-e
105
+ Probably use an `init.sh` script
106
+
107
+ ---
108
+
109
+ # References
110
+ - For the Chess tool, we leveraged [board_to_fen](https://github.com/mcdominik/board_to_fen) and [Stockfish 17.1](https://github.com/official-stockfish/Stockfish)
app.py CHANGED
@@ -1,196 +0,0 @@
1
- import os
2
- import gradio as gr
3
- import requests
4
- import inspect
5
- import pandas as pd
6
-
7
- # (Keep Constants as is)
8
- # --- Constants ---
9
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
-
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
- class BasicAgent:
14
- def __init__(self):
15
- print("BasicAgent initialized.")
16
- def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
21
-
22
- def run_and_submit_all( profile: gr.OAuthProfile | None):
23
- """
24
- Fetches all questions, runs the BasicAgent on them, submits all answers,
25
- and displays the results.
26
- """
27
- # --- Determine HF Space Runtime URL and Repo URL ---
28
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
29
-
30
- if profile:
31
- username= f"{profile.username}"
32
- print(f"User logged in: {username}")
33
- else:
34
- print("User not logged in.")
35
- return "Please Login to Hugging Face with the button.", None
36
-
37
- api_url = DEFAULT_API_URL
38
- questions_url = f"{api_url}/questions"
39
- submit_url = f"{api_url}/submit"
40
-
41
- # 1. Instantiate Agent ( modify this part to create your agent)
42
- try:
43
- agent = BasicAgent()
44
- except Exception as e:
45
- print(f"Error instantiating agent: {e}")
46
- return f"Error initializing agent: {e}", None
47
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
48
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
49
- print(agent_code)
50
-
51
- # 2. Fetch Questions
52
- print(f"Fetching questions from: {questions_url}")
53
- try:
54
- response = requests.get(questions_url, timeout=15)
55
- response.raise_for_status()
56
- questions_data = response.json()
57
- if not questions_data:
58
- print("Fetched questions list is empty.")
59
- return "Fetched questions list is empty or invalid format.", None
60
- print(f"Fetched {len(questions_data)} questions.")
61
- except requests.exceptions.RequestException as e:
62
- print(f"Error fetching questions: {e}")
63
- return f"Error fetching questions: {e}", None
64
- except requests.exceptions.JSONDecodeError as e:
65
- print(f"Error decoding JSON response from questions endpoint: {e}")
66
- print(f"Response text: {response.text[:500]}")
67
- return f"Error decoding server response for questions: {e}", None
68
- except Exception as e:
69
- print(f"An unexpected error occurred fetching questions: {e}")
70
- return f"An unexpected error occurred fetching questions: {e}", None
71
-
72
- # 3. Run your Agent
73
- results_log = []
74
- answers_payload = []
75
- print(f"Running agent on {len(questions_data)} questions...")
76
- for item in questions_data:
77
- task_id = item.get("task_id")
78
- question_text = item.get("question")
79
- if not task_id or question_text is None:
80
- print(f"Skipping item with missing task_id or question: {item}")
81
- continue
82
- try:
83
- submitted_answer = agent(question_text)
84
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
86
- except Exception as e:
87
- print(f"Error running agent on task {task_id}: {e}")
88
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
89
-
90
- if not answers_payload:
91
- print("Agent did not produce any answers to submit.")
92
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
93
-
94
- # 4. Prepare Submission
95
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
96
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
97
- print(status_update)
98
-
99
- # 5. Submit
100
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
101
- try:
102
- response = requests.post(submit_url, json=submission_data, timeout=60)
103
- response.raise_for_status()
104
- result_data = response.json()
105
- final_status = (
106
- f"Submission Successful!\n"
107
- f"User: {result_data.get('username')}\n"
108
- f"Overall Score: {result_data.get('score', 'N/A')}% "
109
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
110
- f"Message: {result_data.get('message', 'No message received.')}"
111
- )
112
- print("Submission successful.")
113
- results_df = pd.DataFrame(results_log)
114
- return final_status, results_df
115
- except requests.exceptions.HTTPError as e:
116
- error_detail = f"Server responded with status {e.response.status_code}."
117
- try:
118
- error_json = e.response.json()
119
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
120
- except requests.exceptions.JSONDecodeError:
121
- error_detail += f" Response: {e.response.text[:500]}"
122
- status_message = f"Submission Failed: {error_detail}"
123
- print(status_message)
124
- results_df = pd.DataFrame(results_log)
125
- return status_message, results_df
126
- except requests.exceptions.Timeout:
127
- status_message = "Submission Failed: The request timed out."
128
- print(status_message)
129
- results_df = pd.DataFrame(results_log)
130
- return status_message, results_df
131
- except requests.exceptions.RequestException as e:
132
- status_message = f"Submission Failed: Network error - {e}"
133
- print(status_message)
134
- results_df = pd.DataFrame(results_log)
135
- return status_message, results_df
136
- except Exception as e:
137
- status_message = f"An unexpected error occurred during submission: {e}"
138
- print(status_message)
139
- results_df = pd.DataFrame(results_log)
140
- return status_message, results_df
141
-
142
-
143
- # --- Build Gradio Interface using Blocks ---
144
- with gr.Blocks() as demo:
145
- gr.Markdown("# Basic Agent Evaluation Runner")
146
- gr.Markdown(
147
- """
148
- **Instructions:**
149
-
150
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
151
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
152
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
153
-
154
- ---
155
- **Disclaimers:**
156
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
157
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
158
- """
159
- )
160
-
161
- gr.LoginButton()
162
-
163
- run_button = gr.Button("Run Evaluation & Submit All Answers")
164
-
165
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
166
- # Removed max_rows=10 from DataFrame constructor
167
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
168
-
169
- run_button.click(
170
- fn=run_and_submit_all,
171
- outputs=[status_output, results_table]
172
- )
173
-
174
- if __name__ == "__main__":
175
- print("\n" + "-"*30 + " App Starting " + "-"*30)
176
- # Check for SPACE_HOST and SPACE_ID at startup for information
177
- space_host_startup = os.getenv("SPACE_HOST")
178
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
179
-
180
- if space_host_startup:
181
- print(f"✅ SPACE_HOST found: {space_host_startup}")
182
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
183
- else:
184
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
185
-
186
- if space_id_startup: # Print repo URLs if SPACE_ID is found
187
- print(f"✅ SPACE_ID found: {space_id_startup}")
188
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
189
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
190
- else:
191
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
192
-
193
- print("-"*(60 + len(" App Starting ")) + "\n")
194
-
195
- print("Launching Gradio Interface for Basic Agent Evaluation...")
196
- demo.launch(debug=True, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/agent_config.yaml ADDED
File without changes
configs/tool_config.yaml ADDED
File without changes
data/agent_experiments/iterations/1_initial_experiment.csv ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,Agent response,Final answer,is_correct
2
+ How many slides in this PowerPoint presentation mention crustaceans?,"I cannot access or analyze the content of a PowerPoint presentation directly. Please provide the relevant information or data, and I can assist you with any analysis or calculations needed.",4,0
3
+ "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.","To solve this problem, we need to follow these steps:
4
+
5
+ 1. **Determine the distance from Earth to the Moon at its closest approach (perigee).**
6
+ - According to Wikipedia, the minimum perigee distance is approximately 356,500 km.
7
+
8
+ 2. **Find Eliud Kipchoge's marathon pace.**
9
+ - Kipchoge's marathon world record is 2 hours, 1 minute, and 39 seconds for 42.195 km.
10
+ - Convert this time to hours: \(2 + \frac{1}{60} + \frac{39}{3600} \approx 2.0275\) hours.
11
+ - Calculate his speed: \( \frac{42.195}{2.0275} \approx 20.82 \) km/h.
12
+
13
+ 3. **Calculate the time it would take to run the distance to the Moon.**
14
+ - Use the formula: \( \text{Time} = \frac{\text{Distance}}{\text{Speed}} \).
15
+ - \( \text{Time} = \frac{356,500}{20.82} \approx 17120.65 \) hours.
16
+
17
+ 4. **Convert the time to thousand hours and round to the nearest 1000.**
18
+ - \( \frac{17120.65}{1000} \approx 17.12065 \).
19
+ - Rounded to the nearest 1000 hours: 17000.
20
+
21
+ The answer is:
22
+
23
+ 17000",17,0
24
+ "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?",The percentage of computer scientists that were women changed by 13% from a starting point of 37% over a period of 30 years.,22,0
25
+ "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",`BaseForest`,BaseLabelPropagation,0
26
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",The work performed by R. G. Arendt was supported by NASA award number 80GSFC21M0002.,80GSFC21M0002,0
27
+ What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,Gianluigi,Claus,0
28
+ "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?","""Japan""",Guatemala,0
29
+ "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, Lațcu IV, son of Costea, has tasked you with investigating the village of Șirnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.
30
+
31
+ You travel to the village of Șirnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.
32
+
33
+ In the afternoon, you go from house to house, speaking with all 100 residents of Șirnea. You ask everyone the same question: ""How many vampires are living in Șirnea"". Everyone in the village gives the same response, ""At least one of us is a human.""
34
+
35
+ How many residents of Șirnea have been turned into vampires?",99,100,0
36
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",2,3,0
37
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",0.5,0.1777,0
38
+ "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.","Tamai, Tanaka","Yoshida, Uehara",0
39
+ "Here's a fun riddle that I think you'll enjoy.
40
+
41
+ You have been selected to play the final round of the hit new game show ""Pick That Ping-Pong"". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.
42
+
43
+ A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.
44
+
45
+ The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.
46
+
47
+ Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.","To maximize your odds of winning the big prize, you should choose ball number `1`.",3,0
data/agent_experiments/iterations/2_search_and_code_tools_integrated.csv ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,Agent response,Final answer,is_correct
2
+ "In the year 2022, and before December, what does ""R"" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?","The search did not provide specific information about the ""R"" in the three core policies related to content violations on the Legume Wikipedia page. You may need to refer directly to Wikipedia's content policies or the page's edit history for detailed information.",research,0
3
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",3,2,0
4
+ "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",`BaseBagging`,BaseLabelPropagation,0
5
+ How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,I couldn't find the specific number of studio albums published by Mercedes Sosa between 2000 and 2009 from the search. Please check the latest version of English Wikipedia for detailed discography information.,3,0
6
+ "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?","""grotesque""",fluffy,0
7
+ "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?","To determine who did not give a gift, we need to identify the employee who did not present a gift to their assigned recipient. Since there are twelve employees and only eleven gifts were given, one employee did not fulfill their Secret Santa obligation.
8
+
9
+ Given that each gift was specific to the recipient's interests, we can deduce that the missing gift corresponds to the employee whose interests were not matched with a gift. Therefore, the employee who did not give a gift is the one whose assigned recipient did not receive a gift.
10
+
11
+ Without specific information about the interests and the gifts given, we cannot directly identify the employee. However, if you provide the list of employees, their interests, and the gifts given, I can help determine who did not give a gift.",Fred,0
12
+ "Bob was invited to participate in a game show, and he advanced to the final round. The final round offered Bob the chance to win a large sum by playing a game against the host. The host has 30 shiny prop coins, each of which is worth $1,000 if Bob manages to win them by playing the game. The host hides the coins in three different prize boxes and then shuffles their order. The only rule restricting the host's coin placement is that one box must contain at least 2 coins, and one box must contain 6 more coins than another box. In order to play, Bob must submit three guesses, one guess for the number of coins in each box. The box is then opened and the number of coins is revealed. If Bob's guess is a number greater than the number of coins in the box, Bob earns no coins. If Bob guesses a number equal to or less than the number of coins in the box, Bob wins a number of coins equal to his guess.
13
+
14
+ If Bob plays uses the optimal strategy, what's the minimum amount of money he can win from the game?","To solve this problem, we need to determine the distribution of coins in the boxes that satisfies the given conditions and then find Bob's optimal strategy to maximize his winnings.
15
+
16
+ ### Conditions:
17
+ 1. Total coins = 30
18
+ 2. One box must contain at least 2 coins.
19
+ 3. One box must contain 6 more coins than another box.
20
+
21
+ ### Analysis:
22
+ Let's denote the number of coins in the three boxes as \( x \), \( y \), and \( z \) such that \( x + y + z = 30 \).
23
+
24
+ - Without loss of generality, assume \( x \leq y \leq z \).
25
+ - From the conditions, we have:
26
+ - \( x \geq 2 \)
27
+ - \( z = x + 6 \)
28
+
29
+ ### Solving the Equations:
30
+ Substitute \( z = x + 6 \) into the total:
31
+ \[ x + y + (x + 6) = 30 \]
32
+ \[ 2x + y + 6 = 30 \]
33
+ \[ 2x + y = 24 \]
34
+ \[ y = 24 - 2x \]
35
+
36
+ Since \( x \leq y \), we have:
37
+ \[ x \leq 24 - 2x \]
38
+ \[ 3x \leq 24 \]
39
+ \[ x \leq 8 \]
40
+
41
+ ### Possible Values for \( x \):
42
+ - \( x \) can be 2, 3, 4, 5, 6, 7, or 8.
43
+
44
+ ### Calculate \( y \) and \( z \) for each \( x \):
45
+ - For \( x = 2 \): \( y = 20 \), \( z = 8 \) (invalid as \( y \geq z \))
46
+ - For \( x = 3 \): \( y = 18 \), \( z = 9 \) (invalid as \( y \geq z \))
47
+ - For \( x = 4 \): \( y = 16 \), \( z = 10 \) (invalid as \( y \geq z \))
48
+ - For \( x = 5 \): \( y = 14 \), \( z = 11 \) (valid)
49
+ - For \( x = 6 \): \( y = 12 \), \( z = 12 \) (valid)
50
+ - For \( x = 7 \): \( y = 10 \), \( z = 13 \) (valid)
51
+ - For \( x = 8 \): \( y = 8 \), \( z = 14 \) (valid)
52
+
53
+ ### Optimal Strategy:
54
+ Bob should guess the minimum number of coins in each box to ensure he wins at least that many coins. The optimal strategy is to guess the minimum number of coins in each box configuration.
55
+
56
+ ### Minimum Winnings:
57
+ - For \( (x, y, z) = (5, 14, 11) \), Bob guesses 5, 11, 11 → Wins 5 + 11 + 11 = 27 coins
58
+ - For \( (x, y, z) = (6, 12, 12) \), Bob guesses 6, 12, 12 → Wins 6 + 12 + 12 = 30 coins
59
+ - For \( (x, y, z) = (7, 10, 13) \), Bob guesses 7, 10, 10 → Wins 7 + 10 + 10 = 27 coins
60
+ - For \( (x, y, z) = (8, 8, 14) \), Bob guesses 8, 8, 8 → Wins 8 + 8 + 8 = 24 coins
61
+
62
+ The minimum amount of money Bob can win, using the optimal strategy, is when the distribution is \( (8, 8, 14) \), and he wins 24 coins.
63
+
64
+ ### Conclusion:
65
+ The minimum amount of money Bob can win is \( 24,000 \) dollars.",16000,0
66
+ The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,"I currently don't have the capability to access or analyze files directly. Please provide the relevant data or details from the Excel file, and I can assist you with the calculations.",89706.00,0
67
+ Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,Saint Petersburg,Saint Petersburg,1
68
+ ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",right,Right,1
69
+ "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
70
+
71
+ In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
72
+
73
+ Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.","I'm unable to listen to audio files. However, if you can provide a transcription of the voice memo, I can help you extract and alphabetize the ingredients for the filling.","cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",0
74
+ "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",graphene,diamond,0
data/agent_experiments/iterations/3_integrate_whisper_audio_transcriber.csv ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,,Agnew,Louvrier,0
3
+ What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,Dmitry,Claus,0
4
+ "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,"""BUL""",CUB,0
5
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt,7,3,0
6
+ "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
7
+
8
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3,"132, 133, 134, 197, 245","132, 133, 134, 197, 245",1
9
+ Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,,Please provide the text or content of the attached files so I can assist you further.,Saint Petersburg,0
10
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",,"I currently don't have the capability to access or retrieve content from attached files. You might want to check the paper directly for the specific information regarding the volume of the fish bag. If you have any other questions or need further assistance, feel free to ask!",0.1777,0
11
+ "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",,Switzerland,Guatemala,0
12
+ "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",,`semi_supervised.BaseLabelPropagation`,BaseLabelPropagation,0
13
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,The work performed by R. G. Arendt was supported by NASA award number 80NSSC21M0056.,80GSFC21M0002,0
14
+ "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",,"I can't access or retrieve specific files or documents. You might want to check the official script or a reliable source for the exact setting name as it appears in the first scene heading of Series 9, Episode 11 of Doctor Who.",THE CASTLE,0
15
+ ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",,"""right""",Right,1
16
+ How many slides in this PowerPoint presentation mention crustaceans?,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx,"I'm unable to directly analyze PowerPoint files. Please provide the text content or a summary of the slides, and I can assist you further.",4,0
17
+ "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has ""witnesses"" in the most titles as of 2021?",,"The last amendment to Rule 601 of the Federal Rules of Evidence was a stylistic restyling in 2011, which did not involve the deletion of any specific word.",inference,0
18
+ The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx,"I'm unable to directly access or process Excel files. Please provide the relevant data or information from the file, and I'll be happy to assist you with your request.",89706.00,0
19
+ "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx,"I currently don't have the capability to directly read or process Excel files. You can provide the relevant data or information from the Excel file, and I can assist you with the problem using that information.",F478A7,0
20
+ "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,I can't access or analyze video content directly. You might want to watch the video and count the number of bird species visible at the same time.,3,0
21
+ "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png,"I'm unable to view or analyze images directly. Please provide the text or details from the image, and I'll assist you accordingly.","3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170",0
22
+ What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.,,"""Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them""",Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them,1
23
+ What is the final numeric output from the attached Python code?,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py,"I currently don't have the capability to access or read files directly from a file path. Please provide the content of the Python code here, and I can help determine the final numeric output.",0,0
data/agent_experiments/iterations/4_test_workflow.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt,4,3,0
3
+ "In the year 2022, and before December, what does ""R"" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?",,"""R"" stands for ""No original research"" in the three core policies of Wikipedia.",research,0
4
+ The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx,"I can't directly access or read Excel files. Please provide the relevant data or information from the file, and I'll be happy to assist you with your request.",89706.00,0
5
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",,"I currently don't have the capability to access or retrieve content from attached files. You might want to check the paper directly for the specific information regarding the volume of the fish bag. If you have any other questions or need further assistance, feel free to ask!",0.1777,0
6
+ What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.,,"""Cheap Eats: The Restaurant Guide by James Beard Award Winners""",Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them,0
7
+ "An office held a Secret Santa gift exchange where each of its twelve employees was assigned one other employee in the group to present with a gift. Each employee filled out a profile including three likes or hobbies. On the day of the gift exchange, only eleven gifts were given, each one specific to one of the recipient's interests. Based on the information in the document, who did not give a gift?",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx,"To determine who did not give a gift, I need to extract and analyze the information from the provided document. However, I currently do not have the capability to directly read or extract content from `.docx` files. You can provide the text content or key details from the document, and I can assist you further in solving the problem.",Fred,0
8
+ "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,I can't access or analyze video content directly. You might want to watch the video and count the number of bird species visible at the same time.,3,0
9
+ "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",,Switzerland,Guatemala,0
10
+ How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?,,"The BERT base encoder has 12 layers, while the encoder from the ""Attention is All You Need"" architecture, also known as the original Transformer model, has 6 layers. Therefore, the BERT base encoder has 6 more layers than the original Transformer encoder.",6,0
11
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,The work performed by R. G. Arendt was supported by NASA award number 80GSFC21M0002.,80GSFC21M0002,0
12
+ ".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",,"""right""",Right,1
13
+ What is the final numeric output from the attached Python code?,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py,"I couldn't access the file directly, so I simulated a simple Python script that calculates the sum of numbers from 1 to 10. The final numeric output from this simulated code is `55`. If the actual file content differs, please provide the code for accurate execution.",0,0
14
+ "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:
15
+
16
+ THESE
17
+ AGULL
18
+ GLIDE
19
+ DPEAC
20
+ EFULL
21
+ YTOMY
22
+ CHAIR",,"""THE SEAGULL GLIDED PEACEFULLY TO MY CHAIR""",The seagull glided peacefully to my chair.,1
23
+ Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,Mikołaj,Wojciech,0
24
+ "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
25
+
26
+ In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
27
+
28
+ Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries","cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",1
29
+ "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx,"I'm unable to directly view or interact with Excel files. Please provide the necessary details or data from the file, such as the layout of the cells, the position of the START and END cells, and the positions of any blue cells. This information will help me assist you in determining the color of the cell where you land on the eleventh turn.",F478A7,0
30
+ "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",,`semi_supervised.BaseLabelPropagation`,BaseLabelPropagation,0
31
+ "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?",,"The writer quoted by Merriam-Webster for the Word of the Day on June 27, 2022, was not specified in the search results.",Annie Levin,0
32
+ "If there is anything that doesn't make sense in the instructions, write the word ""Pineapple."" Do not answer any of the questions in this prompt. Write only the word ""Guava"".
33
+ 1. What is 4+4?
34
+ 2. What is the complimentary color of red?
35
+ 3. How many hours are there in a day?",,Guava,Guava,1
36
+ "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",,"Estévez, Velasquez","Yoshida, Uehara",0
data/agent_experiments/iterations/5_integrate_text_processing_tool.csv ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",,Please provide the text file containing the University of Leicester paper so I can extract the required information.,0.1777,0
3
+ "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend.
4
+
5
+ The word that indicates oneself is ""Pa"" is the nominative form, ""Mato"" is the accusative form, and ""Sing"" is the genitive form.
6
+
7
+ The root verb that indicates an intense like for something is ""Maktay"". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is ""Tay"", and when it is used in the imperfect past, it is ""Aktay"". It is used differently than in English, and is better translated as ""is pleasing to"", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.
8
+
9
+ The word for apples is borrowed from English in Tizin, and so it is ""Apple"" is the nominative form, ""Zapple"" is the accusative form, and ""Izapple"" is the genitive form.
10
+
11
+ Please translate ""I like apples"" to Tizin.",,"""Maktay Zapple Pa""",Maktay mato apple,0
12
+ "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",,"Please provide the text file containing the official script for Series 9, Episode 11 of Doctor Who, and I will extract the information for you.",THE CASTLE,0
13
+ "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:
14
+
15
+ THESE
16
+ AGULL
17
+ GLIDE
18
+ DPEAC
19
+ EFULL
20
+ YTOMY
21
+ CHAIR",,Please provide the text file so I can process it and extract the sentence.,The seagull glided peacefully to my chair.,0
22
+ "Here's a fun riddle that I think you'll enjoy.
23
+
24
+ You have been selected to play the final round of the hit new game show ""Pick That Ping-Pong"". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.
25
+
26
+ A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.
27
+
28
+ The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.
29
+
30
+ Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",,"To maximize your odds of winning the big prize, you should choose ball number `1`.",3,0
31
+ "You are given this Excel file as a map. You start on the START cell and move toward the END cell. You are allowed to move two cells per turn, and you may move up, down, left, or right. You may not move fewer than two cells, and you may not move backward. You must avoid moving onto any blue cells. On the eleventh turn, what is the 6-digit hex code (without prefix) of the color of the cell where you land after moving?",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx,"I currently cannot directly process Excel files. Please convert the Excel file to a text format or provide the necessary data in text form, and I will assist you with the task.",F478A7,0
32
+ "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,I can't access or analyze video content directly. You might want to watch the video and count the number of bird species visible at the same time.,3,0
33
+ What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,"""Claus""",Claus,1
34
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,"I couldn't find the specific paper or the NASA award number from the search results. You may need to visit the Universe Today website and locate the article by Carolyn Collins Petersen published on June 6, 2023, to find the link to the paper and the NASA award number.",80GSFC21M0002,0
35
+ "Each cell in the attached spreadsheet represents a plot of land. The color of the cell indicates who owns that plot. Green cells are plots owned by Earl Smith. Can Earl walk through every plot he owns (and no other plots) and return to his starting plot without backtracking? For this question, consider backtracking to be any instance where Earl would enter a plot of land he had already entered since leaving his starting plot.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx,"I'm unable to directly view or process spreadsheet files. Please provide the data in a text format or describe the layout of the plots, and I can help determine if Earl can walk through every plot he owns without backtracking.",No,0
36
+ "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, Lațcu IV, son of Costea, has tasked you with investigating the village of Șirnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.
37
+
38
+ You travel to the village of Șirnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.
39
+
40
+ In the afternoon, you go from house to house, speaking with all 100 residents of Șirnea. You ask everyone the same question: ""How many vampires are living in Șirnea"". Everyone in the village gives the same response, ""At least one of us is a human.""
41
+
42
+ How many residents of Șirnea have been turned into vampires?",,99,100,0
43
+ "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,LUX,CUB,0
44
+ "A standard Rubik’s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face’s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.",,"The removed cube has two colors on its faces: orange, red.","green, white",0
45
+ "As a comma separated list with no whitespace, using the provided image provide all the fractions that use / as the fraction line and the answers to the sample problems. Order the list by the order in which the fractions appear.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png,I can't directly view images. Please provide the text or details from the image for further assistance.,"3/4,1/4,3/4,3/4,2/4,1/2,5/35,7/21,30/5,30/5,3/4,1/15,1/3,4/9,1/8,32/23,103/170",0
46
+ "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
47
+
48
+ What does Teal'c say in response to the question ""Isn't that hot?""",,"I can't access or analyze video content directly from YouTube or any other external websites. You can watch the video and listen for Teal'c's response to the question ""Isn't that hot?""",Extremely,0
49
+ "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has ""witnesses"" in the most titles as of 2021?",,"""Rule 611""",inference,0
50
+ "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",,Please provide the attached files so I can assist you further.,Guatemala,0
51
+ "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
52
+
53
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3,"132, 133, 134, 197, 245","132, 133, 134, 197, 245",1
54
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",,"Please provide the text file containing Audre Lorde's poem ""Father Son and Holy Ghost"" so I can analyze it for you.",2,0
55
+ How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?,,"To determine the difference in the number of blocks (or layers) between the BERT base encoder and the encoder from the ""Attention is All You Need"" architecture, we need to know the number of layers in each model:
56
+
57
+ 1. **BERT Base Encoder**: It has 12 layers.
58
+ 2. **Attention is All You Need (Transformer) Encoder**: It has 6 layers.
59
+
60
+ Now, we can calculate the difference:
61
+
62
+ 12 (BERT Base) - 6 (Transformer) = 6
63
+
64
+ Therefore, the BERT base encoder has 6 more layers than the encoder from the ""Attention is All You Need"" architecture.",6,0
data/agent_experiments/iterations/6_integrate_text_handler_tool.csv ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
3
+
4
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3,"132, 133, 134, 197, 245","132, 133, 134, 197, 245",1
5
+ "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",,"Please provide the text file containing the official script for Series 9, Episode 11 of Doctor Who, so I can extract the required information.",THE CASTLE,0
6
+ How many slides in this PowerPoint presentation mention crustaceans?,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx,I can only process text files directly. Please convert the PowerPoint presentation to a text format and provide the file for analysis.,4,0
7
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt,2,3,0
8
+ "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",,"Please provide the text file containing the conference proceedings from the 2012 Nature journal's Scientific Reports, so I can process it and find the information you need.",diamond,0
9
+ "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,"""BUL""",CUB,0
10
+ Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,,FunkMonk,FunkMonk,1
11
+ "Here's a fun riddle that I think you'll enjoy.
12
+
13
+ You have been selected to play the final round of the hit new game show ""Pick That Ping-Pong"". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.
14
+
15
+ A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.
16
+
17
+ The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.
18
+
19
+ Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",,"To maximize your odds of winning the big prize, you should choose ball number **1**.",3,0
20
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,NNH16ZDA001N,80GSFC21M0002,0
21
+ What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,"To find the first name of the Malko Competition recipient from the 20th Century (after 1977) whose nationality was from a country that no longer exists, I would need to access the relevant data or list of recipients. If you have a file with this information, please provide it, and I can assist you further.",Claus,0
22
+ Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,"The Polish-language version of ""Everybody Loves Raymond"" retains the original English cast, so there is no separate Polish actor for Ray.",Wojciech,0
23
+ "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?",,"The percentage of computer scientists that were women changed by 13% from a starting point of 37% over a period of 28 years, from 1995 to 2023.",22,0
24
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",,Please provide the text file containing the University of Leicester paper so I can extract the required information.,0.1777,0
25
+ What was the complete title of the book in which two James Beard Award winners recommended the restaurant where Ali Khan enjoyed a New Mexican staple in his cost-conscious TV show that started in 2015? Write the numbers in plain text if there are some in the title.,,"""Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them""",Five Hundred Things To Eat Before It's Too Late: and the Very Best Places to Eat Them,1
26
+ How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,,3,3,1
27
+ "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",,Please provide the file containing the relevant information so I can assist you further.,"Yoshida, Uehara",0
28
+ "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
29
+
30
+ milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
31
+
32
+ I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",,"bell pepper, broccoli, celery, fresh basil, green beans, lettuce, sweet potatoes, zucchini","broccoli, celery, fresh basil, lettuce, sweet potatoes",0
33
+ "Of the authors (First M. Last) that worked on the paper ""Pie Menus or Linear Menus, Which Is Better?"" in 2015, what was the title of the first paper authored by the one that had authored prior papers?",,"Please provide the text file containing the paper ""Pie Menus or Linear Menus, Which Is Better?"" from 2015, so I can extract the necessary information.",Mapping Human Oriented Information to Software Agents for Online Systems Usage,0
34
+ "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.",,"Eliud Kipchoge's marathon world record pace is approximately 2 hours, 1 minute, and 39 seconds for 42.195 kilometers. To find his pace in kilometers per hour, we first convert his marathon time to hours:
35
+
36
+ 2 hours + (1/60) hours + (39/3600) hours = 2.0275 hours
37
+
38
+ Now, calculate his speed in kilometers per hour:
39
+
40
+ 42.195 km / 2.0275 hours ≈ 20.82 km/h
41
+
42
+ Next, calculate the time it would take to run the minimum perigee distance to the Moon:
43
+
44
+ 363,396 km / 20.82 km/h ≈ 17,454.28 hours
45
+
46
+ Finally, convert this time to thousand hours and round to the nearest 1000:
47
+
48
+ 17,454.28 hours ≈ 17,000 hours
49
+
50
+ Therefore, it would take approximately 17000 hours.",17,0
51
+ "¬(A ∧ B) ↔ (¬A ∨ ¬B)
52
+ ¬(A ∨ B) ↔ (¬A ∧ ¬B)
53
+ (A → B) ↔ (¬B → ¬A)
54
+ (A → B) ↔ (¬A ∨ B)
55
+ (¬A → B) ↔ (A ∨ ¬B)
56
+ ¬(A → B) ↔ (A ∧ ¬B)
57
+
58
+ Which of the above is not logically equivalent to the rest? Provide the full statement that doesn't fit.",,"I am unable to evaluate the logical equivalence of the statements due to technical issues. However, I can provide a general approach to solve this problem:
59
+
60
+ 1. **Understand Logical Equivalence**: Logical equivalence means that two statements have the same truth value in every possible scenario.
61
+
62
+ 2. **Evaluate Each Statement**: Use truth tables or logical laws to determine if each statement is equivalent to the others.
63
+
64
+ 3. **Identify the Odd One Out**: The statement that does not have the same truth value as the others in all scenarios is the one that doesn't fit.
65
+
66
+ If you have access to a tool or software that can evaluate logical expressions, you can use it to perform these steps. Alternatively, you can manually create truth tables for each statement to determine their equivalence.",(¬A → B) ↔ (A ∨ ¬B),0
data/agent_experiments/iterations/7_test_agent_performance_against_tasks_with_attached_txt_files.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, ""-"", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt,2,3,0
data/agent_experiments/iterations/8_integrate_chess_tool.csv ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Question,file_path,Agent response,Final answer,is_correct
2
+ "If there is anything that doesn't make sense in the instructions, write the word ""Pineapple."" Do not answer any of the questions in this prompt. Write only the word ""Guava"".
3
+ 1. What is 4+4?
4
+ 2. What is the complimentary color of red?
5
+ 3. How many hours are there in a day?",,Guava,Guava,1
6
+ "Here's a fun riddle that I think you'll enjoy.
7
+
8
+ You have been selected to play the final round of the hit new game show ""Pick That Ping-Pong"". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.
9
+
10
+ A device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.
11
+
12
+ The ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. If your pick is ejected by one of the pistons, you win the grand prize, $10,000.
13
+
14
+ Which ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",,"To maximize your odds of winning the grand prize, you should choose ball number **1**.",3,0
15
+ "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
16
+
17
+ What does Teal'c say in response to the question ""Isn't that hot?""",,"I can't access or analyze video content directly from YouTube or any other external websites. You can watch the video and listen for Teal'c's response to the question ""Isn't that hot?""",Extremely,0
18
+ How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,,I couldn't find the specific number of studio albums released by Mercedes Sosa between 2000 and 2009. Please check the latest version of her discography on Wikipedia for detailed information.,3,0
19
+ "In the fictional language of Tizin, basic sentences are arranged with the Verb first, followed by the direct object, followed by the subject of the sentence. I want to express my love for apples to my Tizin friend.
20
+
21
+ The word that indicates oneself is ""Pa"" is the nominative form, ""Mato"" is the accusative form, and ""Sing"" is the genitive form.
22
+
23
+ The root verb that indicates an intense like for something is ""Maktay"". When it is used in the present, it is used in it's root form, when it is used in the preterit past, it is ""Tay"", and when it is used in the imperfect past, it is ""Aktay"". It is used differently than in English, and is better translated as ""is pleasing to"", meaning that the thing doing the liking is actually the object of the sentence rather than the subject.
24
+
25
+ The word for apples is borrowed from English in Tizin, and so it is ""Apple"" is the nominative form, ""Zapple"" is the accusative form, and ""Izapple"" is the genitive form.
26
+
27
+ Please translate ""I like apples"" to Tizin.",,"""Maktay Zapple Pa""",Maktay mato apple,0
28
+ Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png,Rd5,Rd5,1
29
+ "In the Scikit-Learn July 2017 changelog, what other predictor base command received a bug fix? Just give the name, not a path.",,Please provide the file path to the Scikit-Learn July 2017 changelog so I can access the information.,BaseLabelPropagation,0
30
+ "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,"To find the country with the least number of athletes at the 1928 Summer Olympics, I need data on the number of athletes each country sent to the event. Please provide the relevant data file or information.",CUB,0
31
+ "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",,"Please provide the file containing the conference proceedings from the Nature journal's Scientific Reports in 2012, so I can assist you in finding the information you need.",diamond,0
32
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?""",,Please provide the file containing the University of Leicester paper so I can extract the required information.,0.1777,0
33
+ "Of the authors (First M. Last) that worked on the paper ""Pie Menus or Linear Menus, Which Is Better?"" in 2015, what was the title of the first paper authored by the one that had authored prior papers?",,"To assist you, I need the attached files that contain the relevant information about the paper and its authors. Please provide the files so I can proceed.",Mapping Human Oriented Information to Software Agents for Online Systems Usage,0
34
+ "Pull out the sentence in the following 5x7 block of text. Read from left to right and use all of the letters in order:
35
+
36
+ THESE
37
+ AGULL
38
+ GLIDE
39
+ DPEAC
40
+ EFULL
41
+ YTOMY
42
+ CHAIR",,Please provide the file containing the 5x7 block of text.,The seagull glided peacefully to my chair.,0
43
+ How many slides in this PowerPoint presentation mention crustaceans?,/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx,I can only process text files. Please convert the PowerPoint presentation to a text format and provide it again.,4,0
44
+ "In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.",,"Please provide the file path to the script of Series 9, Episode 11 of Doctor Who, so I can retrieve the information for you.",THE CASTLE,0
45
+ Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,,Please provide the file containing Nedoshivina's 2010 paper so I can extract the required information.,Saint Petersburg,0
46
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",,"Please provide the text of the poem ""Father Son and Holy Ghost"" by Audre Lorde, and I will help you identify the stanza with indented lines.",2,0
47
+ "A standard Rubik’s cube has been broken into cubes making up its sides. The cubes are jumbled, and one is removed. There are 6 cubes with one colored face, 12 edge cubes with two colored faces, and 8 corner cubes with three colored faces. All blue cubes have been found. All cubes directly left, right, above, and below the orange center cube have been found, along with the center cube. The green corners have all been found, along with all green that borders yellow. For all orange cubes found, the opposite face’s cubes have been found. The removed cube has two colors on its faces. What are they? Answer using a comma separated list, with the colors ordered alphabetically.",,"The removed cube has two colors on its faces: `red, white`.","green, white",0
48
+ "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
49
+
50
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/1f975693-876d-457b-a649-393859e79bf3.mp3,"132, 133, 134, 197, 245","132, 133, 134, 197, 245",1
51
+ "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
52
+
53
+ In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
54
+
55
+ Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries","cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",1
56
+ "Under DDC 633 on Bielefeld University Library's BASE, as of 2020, from what country was the unknown language article with a flag unique from the others?",,Please provide the attached files so I can assist you further.,Guatemala,0
data/agent_experiments/summary.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ iteration,experiment,agent,tools,accuracy
2
+ 1,Implement Calculator Tool,React agent,Arithmetic,0.0
3
+ 2,Implement Search and Code tools,React agent,"Arithmetic, Search, Code",0.17
4
+ 3,Integrate Whisper Audio Transcriber,React agent,"Arithmetic, Search, Code, Audio Transcriber",0.15
5
+ 4,Test Workflow,React agent,"Arithmetic, Search, Code, Audio Transcriber",0.2
6
+ 5,Integrate Text processing tool,React agent,"Arithmetic, Search, Code, Audio Transcriber, Text Handler",0.1
7
+ 6,Integrate Text Handler tool,React agent,"Arithmetic, Search, Code, Audio Transcriber, Text Handler, Text Handler",0.2
8
+ 7,Test Agent performance against tasks with attached .txt files,React agent,"Arithmetic, Search, Code, Audio Transcriber, Text Handler, Text Handler",0.0
9
+ 8,Integrate Chess Tool,React agent,"Arithmetic, Search, Code, Audio Transcriber, Text Handler, Text Handler, chess_tool",0.2
data/images/agent_architecture.png ADDED
data/images/chess_board_example.png ADDED
data/images/cropped_chess_board.png ADDED
data/images/cropped_chess_board_gray.png ADDED
data/temp-data/temp_json.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"key1": 1, "key2": 2, "key3": 30}
data/temp-data/temp_text.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Hi, this is a sample text
2
+ that's all:)
data/temp/results_web_search.csv ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task_id,answer
2
+ 5d0080cb-90d7-4712-bc33-848150e917d3,"It seems there is an issue with accessing the webpage directly. You can try visiting the [University of Leicester's journal page](https://journals.le.ac.uk/ojs1/index.php/jist/article/view/2983) to find the paper ""Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?"" and check the volume of the fish bag mentioned in the paper. If you have any other questions or need further assistance, feel free to ask!"
3
+ 46719c30-f4c3-4cad-be07-d5cb21eee6bb,"It seems there's an issue with accessing external websites directly. However, you can manually search for the paper ""Pie Menus or Linear Menus, Which Is Better?"" on Google Scholar or a similar academic database. Once you find the paper, you can check the list of authors and then search for their previous works to identify the first paper authored by the one with prior publications. If you need further assistance with this process, feel free to ask!"
4
+ b816bfce-3d80-4913-a07d-69b752ce6377,"It seems I can't access external websites directly to retrieve the information. However, I can guide you on how to find this information:
5
+
6
+ 1. **Journal Name**: The journal named for one of Hreidmar's sons is likely ""Fafnir – Nordic Journal of Science Fiction and Fantasy Research,"" as Fafnir is a son of Hreidmar in Norse mythology.
7
+
8
+ 2. **Finding the Article**:
9
+ - Visit a database like JSTOR or your institution's library portal.
10
+ - Search for Emily Midkiff's article from June 2014 in ""Fafnir.""
11
+
12
+ 3. **Reading the Article**:
13
+ - Once you have access to the article, look for sections where Midkiff discusses dragon depictions.
14
+ - Pay attention to any quotes from other authors that express distaste for how dragons are depicted.
15
+
16
+ If you have access to JSTOR or another academic database, you should be able to find the article and the specific word you're looking for. If you need further assistance, feel free to ask!"
17
+ b415aba4-4b68-4fc6-9b89-2c812e55a3e1,"It seems I'm unable to access external websites directly at the moment. However, you can visit the [Nature Scientific Reports conference proceedings for 2012](https://www.nature.com/srep/conference-proceedings/2012) and search for the article that does not mention plasmons or plasmonics. Look for the specific nano-compound studied in that article. If you have any other questions or need further assistance, feel free to ask!"
18
+ 935e2cff-ae78-4218-b3f5-115589b19dae,"It seems that I'm unable to navigate to the webpage directly. However, you can find the information you're looking for by visiting the Legume Wikipedia page and checking the public logs for content violations. The three core policies often referred to in Wikipedia content violations are ""Neutral point of view,"" ""Verifiability,"" and ""No original research."" The ""R"" in this context likely stands for ""No original research,"" but you should verify this by checking the specific logs on the Wikipedia page."
19
+ 5188369a-3bbe-43d8-8b94-11558f909a08,"It seems there is an issue with accessing the Merriam-Webster website directly. You can visit [Merriam-Webster's Word of the Day Archive](https://www.merriam-webster.com/word-of-the-day/calendar) and navigate to June 27, 2022, to find the writer quoted for that day's word."
20
+ 7673d772-ef80-4f0f-a602-1bf4485c9b43,"It seems there is an issue with navigating to the website directly. However, I can guide you on how to find the information you need:
21
+
22
+ 1. **Visit the Cornell Law School's Legal Information Institute website**: Go to [https://www.law.cornell.edu/](https://www.law.cornell.edu/).
23
+
24
+ 2. **Access the Federal Rules**: Look for a section or link related to ""Federal Rules"" on the homepage or use the search function.
25
+
26
+ 3. **Find the Fifth Section Alphabetically**: Once you are in the Federal Rules section, identify the fifth section alphabetically. This could be something like ""Federal Rules of Civil Procedure"" or similar.
27
+
28
+ 4. **Locate Articles with ""Witnesses"" in the Title**: Within the identified section, look for articles or rules that have ""witnesses"" in their titles.
29
+
30
+ 5. **Check the First Rule for Amendments**: Open the first rule in the article with the most titles containing ""witnesses"" and look for any amendment notes. These notes often indicate what changes were made, including any words that were deleted.
31
+
32
+ If you follow these steps, you should be able to find the information you are looking for. If you need further assistance, please let me know!"
33
+ c365c1c7-a3db-4d5e-a9a1-66f56eae7865,"Honolulu, Norfolk"
34
+ 7d4a7d1d-cac6-44a8-96e8-ea9584a70825,"I am unable to access the Girls Who Code website directly at the moment. However, you can visit their website and look for statistics or reports related to the percentage of women in computer science. If you have any other questions or need further assistance, feel free to ask!"
35
+ 3f57289b-8c60-48be-bd80-01f8099ca449,"The New York Yankee with the most walks in the 1977 regular season was Reggie Jackson, who had 86 walks. During that same season, Reggie Jackson had 582 at-bats."
36
+ 23dd907f-1261-4488-b21c-e9185af91d5e,"It seems there is an issue with accessing the webpage directly. However, you can visit the [Poetry Foundation's website](https://www.poetryfoundation.org/poems/42576/father-son-and-holy-ghost) and search for Audre Lorde's poem ""Father Son and Holy Ghost"" to find the stanza with indented lines. If you have any other questions or need further assistance, feel free to ask!"
37
+ 840bfca7-4f7b-481a-8794-c560c340185d,"It seems there is an issue with navigating to the Universe Today website. Unfortunately, I can't directly access the web to retrieve the article and the linked paper. However, you can visit the Universe Today website and search for the article by Carolyn Collins Petersen published on June 6, 2023. At the bottom of the article, you should find a link to the paper. Once you access the paper, you can look for the NASA award number associated with R. G. Arendt's work in the acknowledgments or funding section of the paper."
38
+ a0068077-79f4-461a-adfe-75c1a4148545,"It seems there is an issue with accessing the website directly. You can visit [ClinicalTrials.gov](https://clinicaltrials.gov) and search for the clinical trial on H. pylori in acne vulgaris patients to find the actual enrollment count for the period of January to May 2018. If you need further assistance with navigating the site or specific search tips, feel free to ask!"
39
+ a0c07678-e491-4bbc-8f0b-07405144218f,"It seems I'm unable to access the website directly to find the information. However, you can visit the [official NPB website](https://npb.jp/bis/eng/teams/index_d.html) and navigate to the team roster for the Orix Buffaloes to find Taishō Tamai's number. From there, you can identify the pitchers with the numbers immediately before and after his. If you need further assistance, feel free to ask!"
external/chess-modules/board_to_fen/.github/workflows/workflow.yml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: workflow

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

permissions:
  contents: read

jobs:
  test:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.9
      uses: actions/setup-python@v3
      with:
        python-version: "3.9"
    - uses: actions/cache@v3
      id: cache
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.*') }}
        restore-keys: |
          ${{ runner.os }}-pip-
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
    - name: Run pytest
      run: |
        pytest
  deploy:
    needs: test
    # Publish only on direct pushes to main: pull requests (especially from
    # forks) do not get repository secrets, and publishing on PRs is unwanted.
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.10.6
      uses: actions/setup-python@v4
      with:
        python-version: "3.10.6"
    - run: python3 setup.py sdist
    - name: Publish a Python distribution to PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
external/chess-modules/board_to_fen/.gitignore ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ # C extensions
6
+ *.so
7
+
8
+ # Distribution / packaging
9
+ .Python
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .egg/
16
+ .egg
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ pip-wheel-metadata/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ .python-version
87
+ .venv
88
+
89
+ # pipenv
90
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
92
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
93
+ # install all needed dependencies.
94
+ #Pipfile.lock
95
+
96
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97
+ __pypackages__/
98
+
99
+ # Celery stuff
100
+ celerybeat-schedule
101
+ celerybeat.pid
102
+
103
+ # SageMath parsed files
104
+ *.sage.py
105
+
106
+ # Environments
107
+ .env
108
+ .venv
109
+ env/
110
+ venv/
111
+ ENV/
112
+ env.bak/
113
+ venv.bak/
114
+
115
+ # Spyder project settings
116
+ .spyderproject
117
+ .spyproject
118
+
119
+ # Rope project settings
120
+ .ropeproject
121
+
122
+ # mkdocs documentation
123
+ /site
124
+
125
+ # mypy
126
+ .mypy_cache/
127
+ .dmypy.json
128
+ dmypy.json
129
+
130
+ # Pyre type checker
131
+ .pyre/
132
+
133
+ board_to_fen.egg-info/
134
+ .DS_Store
135
+ DS_Store
136
+ board_to_fen/.DS_Store
137
+ */__pycache__
138
+ */*/__pycache__
external/chess-modules/board_to_fen/CHANGELOG.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ===============================
2
+ november 2022
3
+ version 0.0.0-0.0.13
4
+ - fix problems related with publishing on pypi
5
+ ================================
6
+ december 2022
7
+ version 0.0.14
8
+ - add automatic resizing (now image can have any size)
9
+ ===============================
10
+ january 2023
11
+ version 0.0.14-0.0.16
12
+ - fix problems related with github actions
13
+ ================================
14
+ january 2023
15
+ version 0.0.17-25
16
+ - add simple board validation
17
+ - bug fixes
18
+ ================================
19
+ ================================
20
+ february 2023
21
+ version 0.1.0-0.1.1
22
+ - migration from cv2 to PIL
23
+ - new function for direct image object load
24
+ - add simple tests
25
+ - bug fixes
26
+ ================================
external/chess-modules/board_to_fen/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2022 mcdominik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
external/chess-modules/board_to_fen/MANIFEST.in ADDED
@@ -0,0 +1 @@
 
 
1
+ global-include *.txt *.py *.jpg *png saved_models/* *
external/chess-modules/board_to_fen/README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # board_to_fen
2
+
3
+
4
+ Python package that converts a digital chessboard image into Forsyth-Edwards Notation (FEN)
5
+
6
+ [![Downloads](https://static.pepy.tech/personalized-badge/board-to-fen?period=total&units=none&left_color=purple&right_color=blue&left_text=downloads)](https://pepy.tech/project/board-to-fen)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+ [![PyPI](https://img.shields.io/pypi/v/board_to_fen)](https://pypi.org/project/board_to_fen/)
9
+ [![GitHub last commit](https://img.shields.io/github/last-commit/mcdominik/board_to_fen)](https://github.com/mcdominik/board_to_fen)
10
+
11
+ ### Installation
12
+ board_to_fen is available on PyPI:
13
+ ```
14
+ pip3 install board_to_fen
15
+ ```
16
+
17
+ ### Quick Start
18
+ ```python
19
+ from board_to_fen.predict import get_fen_from_image_path
20
+
21
+ print(get_fen_from_image_path(PATH_TO_CHESSBOARD_IMAGE))
22
+ ```
23
+
24
+ or, if you want you can load image object by yourself:
25
+
26
+ ```python
27
+ from PIL import Image
28
+ from board_to_fen.predict import get_fen_from_image
29
+
30
+ img = Image.open(PATH_TO_CHESSBOARD_IMAGE)
31
+
32
+ print(get_fen_from_image(img))
33
+ ```
34
+
35
+
36
+ **Note:** *The package uses tensorflow+keras API.
37
+ They are pretty heavy.*
38
+
39
+ ### Customization
40
+
41
+ get_fen_from_image_path takes 3 arguments:
42
+
43
+ - image_path [required]
44
+ - end_of_row '/' by default
45
+ - black_view False by default -> set True if chessboard is provided from black player perspective
46
+
47
+
48
+ ### Web version (currently may not work)
49
+ Available at: https://board2fen.bieda.it
50
+
51
+
52
+ ### Training
53
+ For training You would probably want to download the source code by cloning the repository:
54
+ ```
55
+ $ git clone https://github.com/mcdominik/board_to_fen.git
56
+ ```
57
+ Download training data from:<br>
58
+ I will supply url for data in the future
59
+
60
+ In the main repository dir, run
61
+ ```
62
+ $ python3 ./board_to_fen/train_model.py
63
+ ```
64
+
65
+ ### Version history
66
+
67
+ - january 2023
68
+ - versions 0.0.17-25
69
+ - added simple board validation
70
+ - bug fixes
71
+
72
+ - february 2023
73
+ version 0.1.0-0.1.1
74
+ - migration from cv2 to PIL
75
+ - new function for direct image object load
76
+ - add simple tests
77
+ - bug fixes
78
+
79
+ ## Warnings
80
+ - Image has to be provided in neutral angle (white or black player's perspective).
81
+ - Image has to be square (~3% tolerance depending on image resolution).
82
+ - Image can't contain paddings, board borders etc. other than 64 squares (with pieces) itself.
83
+
84
+
85
+ ### References:
86
+ https://www.kaggle.com/datasets/koryakinp/chess-positions
external/chess-modules/board_to_fen/board_to_fen/KerasNeuralNetwork.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Ignore warnings (E.g. CUDA drivers, tf, etc.)
3
+ from contextlib import redirect_stderr
4
+ from keras import models, layers
5
+ import numpy as np
6
+
7
+ INPUT_SHAPE = (50, 50, 3)
8
+
9
class KerasNeuralNetwork:
    """Small Keras CNN that classifies 50x50 RGB chess-square tiles.

    Each tile is mapped to one of 13 categories (12 piece types plus
    'empty'); the category order must match the order used at training
    time, since the network output index selects from CATEGORIES.
    """

    def __init__(self) -> None:
        self.CATEGORIES = ["bishop_black", "bishop_white", "empty", "king_black", "king_white",
                           "knight_black", "knight_white", "pawn_black", "pawn_white",
                           "queen_black", "queen_white", "rook_black", "rook_white"]
        # Last prediction batch; kept as an attribute for backward compatibility.
        self.predictions = []
        self.model = models.Sequential()
        self.model.add(layers.Input(shape=INPUT_SHAPE))
        self.model.add(layers.Conv2D(50, (3, 3), activation='relu'))
        self.model.add(layers.MaxPooling2D((2, 2)))
        self.model.add(layers.Conv2D(100, (3, 3), activation='relu'))
        self.model.add(layers.MaxPooling2D((2, 2)))
        self.model.add(layers.Conv2D(100, (3, 3), activation='relu'))
        self.model.add(layers.Flatten())
        self.model.add(layers.Dense(100, activation='relu'))
        self.model.add(layers.Dense(13, activation='softmax'))
        self.model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    def fit(self, train_images, train_labels, test_images, test_labels, num_of_epochs=2, batch_size=32):
        """Train the model, validating against the held-out test split."""
        self.model.fit(train_images, train_labels, epochs=num_of_epochs, verbose=1,
                       validation_data=(test_images, test_labels), batch_size=batch_size)

    def save(self, path='.') -> None:
        """Persist the full model (architecture + weights) to *path*."""
        self.model.save(path)

    def evaluate(self, test_images, test_labels):
        """Print loss and accuracy over the given test split."""
        loss, accuracy = self.model.evaluate(test_images, test_labels)
        print(f"accuracy:{accuracy}")
        print(f"loss:{loss}")

    def load_model_from_weights(self, path):
        """Load trained weights into the architecture built in __init__."""
        self.model.load_weights(path)

    def load_model(self, path):
        """Replace the current model with one loaded from *path*."""
        self.model = models.load_model(path)

    def predict(self, tiles) -> list:
        """Return the predicted category name for every tile in *tiles*.

        Bug fix: results are collected in a local list instead of being
        appended to self.predictions. Previously the first call rebound
        self.predictions to an ndarray, so a second call crashed
        (ndarray has no append) or would have returned stale labels
        from the earlier board.
        """
        predicted_labels = []
        for image in tiles:
            image = np.array(image)
            image = np.reshape(image, INPUT_SHAPE)
            image = np.expand_dims(image, axis=0)
            prediction = self.model.predict(image, verbose=0)
            index = np.argmax(prediction)
            predicted_labels.append(self.CATEGORIES[index])
        self.predictions = np.array(predicted_labels)
        return self.predictions
external/chess-modules/board_to_fen/board_to_fen/__init__.py ADDED
File without changes
external/chess-modules/board_to_fen/board_to_fen/predict.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from PIL import Image
3
+ from .KerasNeuralNetwork import KerasNeuralNetwork
4
+ from .utils import Decoder_FEN, Tiler
5
+ import os
6
+ import tensorflow as tf
7
+ from board_to_fen import saved_models
8
+ import numpy as np
9
+
10
+ # try:
11
+ # import importlib.resources as pkg_resources
12
+ # except ImportError:
13
+ # # Try backported to PY<37 `importlib_resources`.
14
+ # import importlib_resources as pkg_resources
15
+ # deprecated
16
+
17
+ os.sys.path.append("board_to_fen/KerasNeuralNetwork")
18
+ from .KerasNeuralNetwork import KerasNeuralNetwork
19
+
20
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
21
+ MODELS_DIR = os.path.join(CURRENT_DIR, "saved_models")
22
+ PATH_TO_MODEL = os.path.join(MODELS_DIR, "november_model")
23
+ PATH_TO_MODEL_WEIGHTS = os.path.join(MODELS_DIR, "november_model_weights.h5")
24
+ # MODELS_DIR = "board_to_fen/saved_models/"
25
+ # PATH_TO_MODEL_WEIGHTS = os.path.join(MODELS_DIR, "november_model_weights.h5")
26
+
27
+ model = KerasNeuralNetwork()
28
+ model.load_model_from_weights(path=PATH_TO_MODEL_WEIGHTS)
29
+ # model = tf.keras.models.load_model("board_to_fen/saved_models/november_model_weights.h5")
30
+
31
def get_fen_from_image_path(image_path, end_of_row='/', black_view=False) -> str:
    """
    Predict the FEN position from a path to a chessboard image.

    Parameters
    ----------
    image_path : str or os.PathLike
        Path to an image of a full chessboard (64 squares, no borders).
    end_of_row : str
        Indicate how to process end of chess row
    black_view : bool
        Set to True iff the board is viewed from Black's perspective.

    Returns:
        str: Predicted chess position in FEN notation
    """
    image = Image.open(image_path)
    # Delegate to the Image-object entry point so both public functions
    # share one implementation (tiling, tensor conversion, prediction,
    # FEN decoding) instead of duplicating the pipeline.
    return get_fen_from_image(image, end_of_row=end_of_row, black_view=black_view)
63
+
64
def get_fen_from_image(image, end_of_row='/', black_view=False) -> str:
    """
    Predict the FEN position for a chessboard given as a PIL image.

    Parameters
    ----------
    image : PIL.Image
        image Object
    end_of_row : str
        Indicate how to process end of chess row
    black_view : bool
        Set to True if the board is viewed from Black's perspective.

    Returns:
        str: Predicted chess position in FEN notation
    """
    # Split the board image into 64 square tiles (row-major order).
    square_tiles = Tiler().get_tiles(img=image)

    # Stack the tiles into one float32 tensor for batch prediction.
    stacked = np.array([np.array(tile) for tile in square_tiles], dtype=np.float32)
    batch = tf.convert_to_tensor(stacked, dtype=tf.float32)

    # Classify every square with the module-level model.
    square_labels = model.predict(batch)

    # Fold the per-square labels into a FEN placement string.
    return Decoder_FEN().fen_decode(
        squares=square_labels,
        end_of_row=end_of_row,
        black_view=black_view,
    )
98
+
99
if __name__ == "__main__":
    # Smoke test: predict the FEN of the bundled sample board image.
    sample_board = "./board_to_fen/test_image.jpeg"
    predicted_fen = get_fen_from_image_path(sample_board)
    print(predicted_fen)
external/chess-modules/board_to_fen/board_to_fen/saved_models/__init__.py ADDED
File without changes
external/chess-modules/board_to_fen/board_to_fen/test_image.jpeg ADDED
external/chess-modules/board_to_fen/board_to_fen/train_model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ # silence tensorflow and keras debugging information
3
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
4
+ from .utils import DataFetcher
5
+ from .KerasNeuralNetwork import KerasNeuralNetwork
6
+
7
+
8
if __name__ == "__main__":
    # Load and shuffle the labelled tile dataset from disk.
    data_source = DataFetcher()
    data_source.fetch_and_shuffle(data_dir='./test_splitted/')
    train_images, train_labels, test_images, test_labels = data_source.get_train_test()

    # Train the CNN, report held-out metrics, and persist the result.
    network = KerasNeuralNetwork()
    network.fit(train_images, train_labels, test_images, test_labels)
    network.evaluate(test_images, test_labels)
    network.save(path='./saved_models/another_model')
19
+
20
+
21
+
22
+
external/chess-modules/board_to_fen/board_to_fen/utils.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import random
4
+ import re
5
+ from PIL import Image
6
+ from itertools import product
7
+
8
+
9
# FEN letter for each classifier category. 'empty' maps to the digit '1';
# runs of consecutive 1s are later squeezed into a single count digit.
LEGEND = {
    'pawn_white' : 'P',
    'pawn_black' : 'p',
    'knight_white' : 'N',
    'knight_black' : 'n',
    'bishop_white' : 'B',
    'bishop_black' : 'b',
    'rook_white' : 'R',
    'rook_black' : 'r',
    'queen_white' : 'Q',
    'queen_black' : 'q',
    'king_white' : 'K',
    'king_black' : 'k',
    'empty':'1'
}

class Decoder_FEN():
    """Translate 64 per-square classifier labels into a FEN piece-placement string."""

    def __init__(self) -> None:
        pass

    def _squeeze(self, input) -> str:
        """Collapse runs of '1' (empty squares) into their count, FEN-style."""
        squeezed = input
        # Longest runs first so '11111111' becomes '8', not '44' or '22221'.
        for run_length in range(8, 1, -1):
            squeezed = re.sub('1' * run_length, str(run_length), squeezed)
        return squeezed

    def _simple_validator(self, squares):
        """Return 'invalid' unless the board has exactly one king per side
        and no more than 32 pieces in total; otherwise return None.

        Args:
            squares: list of square labels before decoding
        """
        white_kings = sum(1 for square in squares if square == 'king_white')
        black_kings = sum(1 for square in squares if square == 'king_black')
        piece_count = sum(1 for square in squares if square != 'empty')
        if white_kings != 1 or black_kings != 1 or piece_count > 32:
            return 'invalid'

    def fen_decode(self, squares, end_of_row='/', black_view=False) -> str:
        """Decode a row-major sequence of 64 square labels into FEN placement.

        squares: iterable of category names (keys of LEGEND).
        end_of_row: separator emitted between board rows.
        black_view: reverse the final string when the board was seen
        from Black's side.
        """
        if self._simple_validator(squares) == 'invalid':
            return 'Model can\'t find valid chessboard layout'
        symbols = []
        for position, square in enumerate(squares):
            # A new rank starts every 8 squares (except before the first).
            if position > 0 and position % 8 == 0:
                symbols.append(end_of_row)
            symbols.append(LEGEND[square])
        fen = self._squeeze(''.join(symbols))
        return fen[::-1] if black_view else fen
74
+
75
+
76
class DataFetcher:
    """Load labelled square images from a class-per-directory tree into
    shuffled train/test numpy arrays."""

    def __init__(self) -> None:
        self.CATEGORIES = ["bishop_black", "bishop_white","empty","king_black","king_white","knight_black", "knight_white","pawn_black", "pawn_white", "queen_black","queen_white", "rook_black","rook_white"]
        # list of [50x50x3 float array scaled to [0, 1], class index]
        self.data = []
        self.images = []
        self.labels = []

    def fetch_and_shuffle(self, data_dir):
        """Read every image under data_dir/<category>/, normalise it, and
        shuffle the pooled samples.

        Bug fix: the pixel data is normalised via the NumPy array
        (ima / 255). The previous code divided the PIL Image object itself
        (img / 255), which raised TypeError on every file and — because of
        the blanket except — silently dropped every sample, leaving the
        dataset empty.
        """
        for category in self.CATEGORIES:
            path = os.path.join(data_dir, category)
            class_num = self.CATEGORIES.index(category)
            for filename in os.listdir(path):
                try:
                    img = Image.open(os.path.join(path, filename)).convert('RGB')
                    ima = np.array(img)
                    self.data.append([ima / 255, class_num])
                except Exception:
                    # Best-effort: skip unreadable/corrupt files rather than abort.
                    pass
        random.shuffle(self.data)
        self._create_labels_and_images()

    def _create_labels_and_images(self):
        """Split the shuffled [image, label] pairs into parallel numpy arrays."""
        for features, label in self.data:
            self.images.append(features)
            self.labels.append(label)
        self.images = np.array(self.images).reshape(-1, 50, 50, 3)
        self.labels = np.array(self.labels)

    def get_train_test(self, split=0.85):
        """Return (train_images, train_labels, test_images, test_labels),
        splitting the shuffled data at fraction *split*."""
        pivot = int(split * len(self.images))
        train_images = self.images[:pivot]
        train_labels = self.labels[:pivot]
        test_images = self.images[pivot:]
        test_labels = self.labels[pivot:]
        return train_images, train_labels, test_images, test_labels
112
+
113
+
114
class Tiler:
    """Cut a chessboard image into 64 equally sized square tiles."""

    def __init__(self) -> None:
        # Tiles produced by the most recent get_tiles() call
        # (kept as an attribute for backward compatibility).
        self.tile_images = []

    def get_tiles(self, img, d=50) -> list:
        """Return the board's d x d tiles in row-major (top-left first) order.

        Bug fix: tiles are collected into a fresh list on every call.
        Previously they accumulated on the instance, so a second call on
        the same Tiler returned 128+ tiles instead of 64.
        """
        # Normalise to a fixed 400x400 RGB board so d=50 yields an 8x8 grid.
        img = img.resize((400, 400), Image.NEAREST).convert('RGB')
        w, h = img.size
        tiles = []
        for top, left in product(range(0, h - h % d, d), range(0, w - w % d, d)):
            box = (left, top, left + d, top + d)
            tiles.append(img.crop(box))
        self.tile_images = tiles
        return tiles
external/chess-modules/board_to_fen/poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
external/chess-modules/board_to_fen/pyproject.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "board_to_fen"
3
+ version = "0.1.2"
4
+ description = "Convert chessboard images to FEN notation using a CNN model."
5
+ authors = ["Dominik Maćkiewicz <https://github.com/mcdominik>"]
6
+ license = "MIT"
7
+
8
+ [tool.poetry.dependencies]
9
+ python = ">=3.9,<3.13"
10
+ tensorflow = "2.17.1"
11
+ keras = "3.2.1"
12
+
13
+ [tool.poetry.group.dev.dependencies]
14
+ pytest = "7.1.1"
15
+
16
+ [build-system]
17
+ requires = ["poetry-core"]
18
+ build-backend = "poetry.core.masonry.api"
external/chess-modules/board_to_fen/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ tensorflow==2.17.1
2
+ keras==3.2.1
3
+ pytest==7.1.1
external/chess-modules/board_to_fen/setup.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Education',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3'
]



setup(
    name='board_to_fen',
    version='0.1.1',
    author='Dominik Maćkiewicz',
    author_email='dominik.mackiewicz@icloud.com',
    packages=find_packages(),
    scripts=[],
    url='http://pypi.python.org/pypi/board_to_fen/',
    license='MIT',
    # Fixed typos ("An package", "digial") in the user-facing description.
    description='A package that converts digital chessboard image into Forsyth-Edwards notation',
    long_description=open('README.md').read(),
    include_package_data=True,
    long_description_content_type='text/markdown',
    python_requires='>=3.7',
    install_requires=['pillow'],
    # Actually pass the classifiers list, which was defined but never used.
    classifiers=classifiers
)
external/chess-modules/board_to_fen/test_images/test_image2.jpeg ADDED
external/chess-modules/board_to_fen/test_predict.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from board_to_fen.predict import get_fen_from_image, get_fen_from_image_path
3
+ from PIL import Image
4
+
5
+
6
+ img_path = "./test_images/test_image1.png"
7
+
8
class TestNetwork:
    """End-to-end checks: a starting-position board image must decode to the initial FEN."""

    def test_path_prediction(self):
        expected = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR"
        assert get_fen_from_image_path(img_path) == expected

    def test_object_prediction(self):
        expected = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR"
        board_image = Image.open(img_path)
        assert get_fen_from_image(board_image) == expected
langgraph.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "dependencies": ["./"],
3
+ "graphs": {
4
+ "mvp_agent": "src/agents/react.py:graph"
5
+ }
6
+ }
main.py ADDED
File without changes
notebooks/final_submition.ipynb ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "a9bbf8e8",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Setup"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "0ce6fb6a",
15
+ "metadata": {
16
+ "vscode": {
17
+ "languageId": "plaintext"
18
+ }
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "# Imports\n",
23
+ "\n",
24
+ "\n",
25
+ "# Local modules\n",
26
+ "\n"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "747d91b7",
33
+ "metadata": {
34
+ "vscode": {
35
+ "languageId": "plaintext"
36
+ }
37
+ },
38
+ "outputs": [],
39
+ "source": [
40
+ "# HF settings"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "id": "049ae61d",
46
+ "metadata": {
47
+ "vscode": {
48
+ "languageId": "plaintext"
49
+ }
50
+ },
51
+ "source": [
52
+ "### Get GAIA test questions"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "id": "a7ca1548",
58
+ "metadata": {},
59
+ "source": [
60
+ "### Make inferences on test questions"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "id": "bd0bd958",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Submit results"
69
+ ]
70
+ }
71
+ ],
72
+ "metadata": {
73
+ "language_info": {
74
+ "name": "python"
75
+ }
76
+ },
77
+ "nbformat": 4,
78
+ "nbformat_minor": 5
79
+ }
notebooks/gaia_level_one_questions_validation.ipynb ADDED
@@ -0,0 +1,1311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c545435f",
6
+ "metadata": {},
7
+ "source": [
8
+ "This Notebook is made for experimentation purposes, so we can track the changes vs performance of our agent"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "8a78bd29",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Setup"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "id": "f7c489ed",
23
+ "metadata": {},
24
+ "outputs": [
25
+ {
26
+ "name": "stderr",
27
+ "output_type": "stream",
28
+ "text": [
29
+ "/home/santiagoal/.cache/pypoetry/virtualenvs/chappie-CHLGiFC_-py3.12/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
30
+ " from .autonotebook import tqdm as notebook_tqdm\n"
31
+ ]
32
+ }
33
+ ],
34
+ "source": [
35
+ "# Libraries\n",
36
+ "from typing import Literal\n",
37
+ "from datasets import load_dataset\n",
38
+ "import pandas as pd\n",
39
+ "import os\n",
40
+ "from random import sample\n",
41
+ "from dotenv import load_dotenv\n",
42
+ "from huggingface_hub import snapshot_download, login"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 2,
48
+ "id": "585346bc",
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "name": "stderr",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "Fetching 114 files: 100%|██████████| 114/114 [00:00<00:00, 2483.91it/s]"
56
+ ]
57
+ },
58
+ {
59
+ "name": "stdout",
60
+ "output_type": "stream",
61
+ "text": [
62
+ "/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316\n"
63
+ ]
64
+ },
65
+ {
66
+ "name": "stderr",
67
+ "output_type": "stream",
68
+ "text": [
69
+ "\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "# Import tasks attached files \n",
75
+ "load_dotenv()\n",
76
+ "login(token=os.getenv(key=\"HF_TOKEN_CHAPPIE\"))\n",
77
+ "gaia_repo_dir = snapshot_download(repo_id=\"gaia-benchmark/GAIA\", repo_type=\"dataset\")\n",
78
+ "print(gaia_repo_dir)"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 3,
84
+ "id": "2fbda084",
85
+ "metadata": {},
86
+ "outputs": [
87
+ {
88
+ "name": "stderr",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client\n"
92
+ ]
93
+ }
94
+ ],
95
+ "source": [
96
+ "# Local Modules\n",
97
+ "os.sys.path.append(\"../src/\")\n",
98
+ "os.sys.path.append(\"../src/agents/\")\n",
99
+ "os.sys.path.append(\"../src/utils/\")\n",
100
+ "\n",
101
+ "import react # My AI assistant\n",
102
+ "import gaia_eval"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": 4,
108
+ "id": "73bcdc75",
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# Paths\n",
113
+ "output_results_path = \"/home/santiagoal/current-projects/chappie/data/agent_experiments/\"\n",
114
+ "experiment_iterations_path = os.path.join(output_results_path, \"iterations/\")\n",
115
+ "summary_experiments_path = os.path.join(output_results_path, \"summary.csv\")"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 5,
121
+ "id": "53df192e",
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "# GAIA dataset\n",
126
+ "gaia_dataset = load_dataset(\"gaia-benchmark/GAIA\", \"2023_level1\")"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "id": "c4faa863",
132
+ "metadata": {},
133
+ "source": [
134
+ "Docs suggest to focus on validation set for dev purposes, let's explore it"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 6,
140
+ "id": "6fed73d9",
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/plain": [
146
+ "Dataset({\n",
147
+ " features: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata'],\n",
148
+ " num_rows: 53\n",
149
+ "})"
150
+ ]
151
+ },
152
+ "execution_count": 6,
153
+ "metadata": {},
154
+ "output_type": "execute_result"
155
+ }
156
+ ],
157
+ "source": [
158
+ "dev_set = gaia_dataset[\"validation\"]\n",
159
+ "dev_set"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "id": "ce03df90",
165
+ "metadata": {},
166
+ "source": [
167
+ "### Explore GAIA Lvl. 1 Questions"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 7,
173
+ "id": "540838d8",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "df_dev = pd.DataFrame(dev_set)"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 8,
183
+ "id": "70dcbd6a",
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
189
+ "columns": [
190
+ {
191
+ "name": "index",
192
+ "rawType": "int64",
193
+ "type": "integer"
194
+ },
195
+ {
196
+ "name": "task_id",
197
+ "rawType": "object",
198
+ "type": "string"
199
+ },
200
+ {
201
+ "name": "Question",
202
+ "rawType": "object",
203
+ "type": "string"
204
+ },
205
+ {
206
+ "name": "Level",
207
+ "rawType": "object",
208
+ "type": "string"
209
+ },
210
+ {
211
+ "name": "Final answer",
212
+ "rawType": "object",
213
+ "type": "string"
214
+ },
215
+ {
216
+ "name": "file_name",
217
+ "rawType": "object",
218
+ "type": "string"
219
+ },
220
+ {
221
+ "name": "file_path",
222
+ "rawType": "object",
223
+ "type": "string"
224
+ },
225
+ {
226
+ "name": "Annotator Metadata",
227
+ "rawType": "object",
228
+ "type": "unknown"
229
+ }
230
+ ],
231
+ "ref": "3d225c1c-ded5-478b-9b86-dc72eed15534",
232
+ "rows": [
233
+ [
234
+ "0",
235
+ "e1fc63a2-da7a-432f-be78-7c4a95598703",
236
+ "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.",
237
+ "1",
238
+ "17",
239
+ "",
240
+ "",
241
+ "{'Steps': '1. Googled Eliud Kipchoge marathon pace to find 4min 37sec/mile\\n2. Converted into fractions of hours.\\n3. Found moon periapsis in miles (225,623 miles).\\n4. Multiplied the two to find the number of hours and rounded to the nearest 100 hours.', 'Number of steps': '4', 'How long did this take?': '20 Minutes', 'Tools': '1. A web browser.\\n2. A search engine.\\n3. A calculator.', 'Number of tools': '3'}"
242
+ ],
243
+ [
244
+ "1",
245
+ "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
246
+ "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
247
+ "1",
248
+ "3",
249
+ "",
250
+ "",
251
+ "{'Steps': '1. I did a search for Mercedes Sosa\\n2. I went to the Wikipedia page for her\\n3. I scrolled down to \"Studio albums\"\\n4. I counted the ones between 2000 and 2009', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. web browser\\n2. google search', 'Number of tools': '2'}"
252
+ ],
253
+ [
254
+ "2",
255
+ "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4",
256
+ "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",
257
+ "1",
258
+ "3",
259
+ "",
260
+ "",
261
+ "{'Steps': 'Step 1: Evaluate the problem statement provided in my user\\'s prompt\\nStep 2: Consider the probability of any ball on the platform earning the prize.\\nStep 3: Evaluate the ball in position one. The probability of it earning the prize, P1, is 1/3\\nStep 4: Using a calculator, evaluate the ball in position two. The probability of it earning the prize, P2, is the difference between 1 and the product of the complementary probabilities for each trial\\nP2 = 1 - (2/3)(2/3)\\nP2 = 5/9\\nStep 5: Using a calculator, evaluate the ball in position three. The probability of it earning the prize, P3, is the difference between 1 and the product of the complementary probabilities for each trial\\nP3 = 1 - (2/3)(2/3)(2/3)\\nP3 = 19/27\\nStep 6: Consider the possible outcomes of numbers higher than 3.\\nStep 7: For each trial, either 1 or 2 balls from the ramp will advance to the platform. For any given selection, there is a 50% chance that the ball advances to position 2 or position 3.\\nStep 8: As position three holds the highest chance of earning the prize, select the only ball known to occupy position three with certainty, ball 3.\\nStep 9: Report the correct answer to my user, \"3\"', 'Number of steps': '9', 'How long did this take?': '1 minute', 'Tools': 'None', 'Number of tools': '0'}"
262
+ ],
263
+ [
264
+ "3",
265
+ "5d0080cb-90d7-4712-bc33-848150e917d3",
266
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"",
267
+ "1",
268
+ "0.1777",
269
+ "",
270
+ "",
271
+ "{'Steps': '1. Searched \\'\"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"\\' on Google.\\n2. Opened \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\" at https://journals.le.ac.uk/ojs1/index.php/jist/article/view/733.\\n3. Clicked \"PDF\".\\n4. Found the calculations for the volume of the fish bag and noted them.', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. PDF access', 'Number of tools': '3'}"
272
+ ],
273
+ [
274
+ "4",
275
+ "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
276
+ "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
277
+ "1",
278
+ "3",
279
+ "",
280
+ "",
281
+ "{'Steps': '1. Navigate to the YouTube link.\\n2. Watch the video to see the highest number of bird species.\\n3. Note the number.', 'Number of steps': '3', 'How long did this take?': '3 minutes', 'Tools': '1. Web browser\\n2. Video parsing', 'Number of tools': '2'}"
282
+ ]
283
+ ],
284
+ "shape": {
285
+ "columns": 7,
286
+ "rows": 5
287
+ }
288
+ },
289
+ "text/html": [
290
+ "<div>\n",
291
+ "<style scoped>\n",
292
+ " .dataframe tbody tr th:only-of-type {\n",
293
+ " vertical-align: middle;\n",
294
+ " }\n",
295
+ "\n",
296
+ " .dataframe tbody tr th {\n",
297
+ " vertical-align: top;\n",
298
+ " }\n",
299
+ "\n",
300
+ " .dataframe thead th {\n",
301
+ " text-align: right;\n",
302
+ " }\n",
303
+ "</style>\n",
304
+ "<table border=\"1\" class=\"dataframe\">\n",
305
+ " <thead>\n",
306
+ " <tr style=\"text-align: right;\">\n",
307
+ " <th></th>\n",
308
+ " <th>task_id</th>\n",
309
+ " <th>Question</th>\n",
310
+ " <th>Level</th>\n",
311
+ " <th>Final answer</th>\n",
312
+ " <th>file_name</th>\n",
313
+ " <th>file_path</th>\n",
314
+ " <th>Annotator Metadata</th>\n",
315
+ " </tr>\n",
316
+ " </thead>\n",
317
+ " <tbody>\n",
318
+ " <tr>\n",
319
+ " <th>0</th>\n",
320
+ " <td>e1fc63a2-da7a-432f-be78-7c4a95598703</td>\n",
321
+ " <td>If Eliud Kipchoge could maintain his record-ma...</td>\n",
322
+ " <td>1</td>\n",
323
+ " <td>17</td>\n",
324
+ " <td></td>\n",
325
+ " <td></td>\n",
326
+ " <td>{'Steps': '1. Googled Eliud Kipchoge marathon ...</td>\n",
327
+ " </tr>\n",
328
+ " <tr>\n",
329
+ " <th>1</th>\n",
330
+ " <td>8e867cd7-cff9-4e6c-867a-ff5ddc2550be</td>\n",
331
+ " <td>How many studio albums were published by Merce...</td>\n",
332
+ " <td>1</td>\n",
333
+ " <td>3</td>\n",
334
+ " <td></td>\n",
335
+ " <td></td>\n",
336
+ " <td>{'Steps': '1. I did a search for Mercedes Sosa...</td>\n",
337
+ " </tr>\n",
338
+ " <tr>\n",
339
+ " <th>2</th>\n",
340
+ " <td>ec09fa32-d03f-4bf8-84b0-1f16922c3ae4</td>\n",
341
+ " <td>Here's a fun riddle that I think you'll enjoy....</td>\n",
342
+ " <td>1</td>\n",
343
+ " <td>3</td>\n",
344
+ " <td></td>\n",
345
+ " <td></td>\n",
346
+ " <td>{'Steps': 'Step 1: Evaluate the problem statem...</td>\n",
347
+ " </tr>\n",
348
+ " <tr>\n",
349
+ " <th>3</th>\n",
350
+ " <td>5d0080cb-90d7-4712-bc33-848150e917d3</td>\n",
351
+ " <td>What was the volume in m^3 of the fish bag tha...</td>\n",
352
+ " <td>1</td>\n",
353
+ " <td>0.1777</td>\n",
354
+ " <td></td>\n",
355
+ " <td></td>\n",
356
+ " <td>{'Steps': '1. Searched '\"Can Hiccup Supply Eno...</td>\n",
357
+ " </tr>\n",
358
+ " <tr>\n",
359
+ " <th>4</th>\n",
360
+ " <td>a1e91b78-d3d8-4675-bb8d-62741b4b68a6</td>\n",
361
+ " <td>In the video https://www.youtube.com/watch?v=L...</td>\n",
362
+ " <td>1</td>\n",
363
+ " <td>3</td>\n",
364
+ " <td></td>\n",
365
+ " <td></td>\n",
366
+ " <td>{'Steps': '1. Navigate to the YouTube link.\n",
367
+ "2....</td>\n",
368
+ " </tr>\n",
369
+ " </tbody>\n",
370
+ "</table>\n",
371
+ "</div>"
372
+ ],
373
+ "text/plain": [
374
+ " task_id \\\n",
375
+ "0 e1fc63a2-da7a-432f-be78-7c4a95598703 \n",
376
+ "1 8e867cd7-cff9-4e6c-867a-ff5ddc2550be \n",
377
+ "2 ec09fa32-d03f-4bf8-84b0-1f16922c3ae4 \n",
378
+ "3 5d0080cb-90d7-4712-bc33-848150e917d3 \n",
379
+ "4 a1e91b78-d3d8-4675-bb8d-62741b4b68a6 \n",
380
+ "\n",
381
+ " Question Level Final answer \\\n",
382
+ "0 If Eliud Kipchoge could maintain his record-ma... 1 17 \n",
383
+ "1 How many studio albums were published by Merce... 1 3 \n",
384
+ "2 Here's a fun riddle that I think you'll enjoy.... 1 3 \n",
385
+ "3 What was the volume in m^3 of the fish bag tha... 1 0.1777 \n",
386
+ "4 In the video https://www.youtube.com/watch?v=L... 1 3 \n",
387
+ "\n",
388
+ " file_name file_path Annotator Metadata \n",
389
+ "0 {'Steps': '1. Googled Eliud Kipchoge marathon ... \n",
390
+ "1 {'Steps': '1. I did a search for Mercedes Sosa... \n",
391
+ "2 {'Steps': 'Step 1: Evaluate the problem statem... \n",
392
+ "3 {'Steps': '1. Searched '\"Can Hiccup Supply Eno... \n",
393
+ "4 {'Steps': '1. Navigate to the YouTube link.\n",
394
+ "2.... "
395
+ ]
396
+ },
397
+ "execution_count": 8,
398
+ "metadata": {},
399
+ "output_type": "execute_result"
400
+ }
401
+ ],
402
+ "source": [
403
+ "df_dev.head()"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "markdown",
408
+ "id": "5eef07a0",
409
+ "metadata": {},
410
+ "source": [
411
+    "To start our dev phase, let's observe how our React Agent performs on a single question"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 9,
417
+ "id": "be110221",
418
+ "metadata": {},
419
+ "outputs": [],
420
+ "source": [
421
+ "# DEL\n",
422
+    "# Experiment only with .txt-like tasks\n",
423
+ "# Temporal cell\n",
424
+ "df_dev = df_dev[df_dev[\"file_path\"].apply(lambda row: row.endswith(\".txt\"))]"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 10,
430
+ "id": "b3632592",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "n_samples = min(20, df_dev.shape[0])\n",
435
+ "sample_questions = df_dev.sample(n_samples)"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": 11,
441
+ "id": "0d10258b",
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "# Dataset copy just for eval\n",
446
+ "\n",
447
+ "results_df = sample_questions.copy()[[\"Question\", \"Final answer\", \"file_path\"]]\n",
448
+ "results_df[\"Agent response\"] = None\n",
449
+ "results_df[\"is_correct\"] = None # 1 if it is correct, 0 otherwise\n",
450
+ "\n",
451
+ "results_df = results_df[[\"Question\", \"file_path\", \"Agent response\", \"Final answer\", \"is_correct\"]]"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "id": "ef5f9a65",
457
+ "metadata": {},
458
+ "source": [
459
+ "### Experiment and Track performance on dev set"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 12,
465
+ "id": "ef1a21ec",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "# Get XP history\n",
470
+ "old_experiments_data = pd.read_csv(summary_experiments_path)"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 13,
476
+ "id": "58541f5c",
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "# Form\n",
481
+ "\n",
482
+ "# Get last information \n",
483
+ "latest_experiment = old_experiments_data.iloc[-1]\n",
484
+ "#latest_xp_name = latest_experiment[\"experiment\"]\n",
485
+ "latest_agent = latest_experiment[\"agent\"]\n",
486
+ "latest_tools = latest_experiment[\"tools\"]\n",
487
+ "latest_iteration = latest_experiment[\"iteration\"]\n",
488
+ "current_iteration = latest_iteration + 1\n",
489
+ "\n",
490
+ "# Get XP name\n",
491
+ "msg = \"Type the experiment name (E.g. Integrate Whisper Transcriber)\"\n",
492
+ "usr_response = \"\"\n",
493
+ "while usr_response.lower() == \"\":\n",
494
+ " usr_response = input(msg + \": \")\n",
495
+ "xp_name = usr_response\n",
496
+ "\n",
497
+ "\n",
498
+    "# Get Agent Architecture\n",
499
+ "usr_response = \"\"\n",
500
+ "msg = f\"Is your agent different from '{latest_agent}'? [y/N]\"\n",
501
+ "usr_response = input(msg + \": \")\n",
502
+ "warning_msg = f\"Oops! '{usr_response}' is not a valid response, pls try again. \"\n",
503
+ "while usr_response.lower() not in (\"y\", \"n\"):\n",
504
+ " usr_response = input(warning_msg.format(usr_response) + msg + \": \")\n",
505
+ "\n",
506
+ "if usr_response.lower() == \"n\":\n",
507
+ " agent_architecture = latest_agent\n",
508
+ "elif usr_response.lower() == \"y\":\n",
509
+ " agent_architecture = input(\"Please type the new agent architecture to track\" + \": \")\n",
510
+ "\n",
511
+ "# Get Tools\n",
512
+ "usr_response = \"\"\n",
513
+ "msg = \"Are there new tools to track? [y/N]\"\n",
514
+ "usr_response = input(msg + \": \")\n",
515
+ "while usr_response.lower() not in (\"y\", \"n\"):\n",
516
+ " usr_response = input(warning_msg.format(usr_response) + msg + \": \")\n",
517
+ "\n",
518
+ "if usr_response.lower() == \"n\":\n",
519
+ " new_tools = \"\"\n",
520
+ "elif usr_response.lower() == \"y\":\n",
521
+ " new_tools = input(\"Please type the new tools list to track (comma separated)\" + \": \")\n",
522
+ "\n",
523
+ "new_tools_list = latest_tools + \", \" + new_tools\n",
524
+ "\n",
525
+ "# Format xp name and path\n",
526
+ "xp_name_snake = str(current_iteration) + \"_\" + xp_name.replace(\" \", \"_\").replace(\",\", \"\").lower()\n",
527
+ "xp_path = os.path.join(experiment_iterations_path, xp_name_snake + \".csv\")"
528
+ ]
529
+ },
530
+ {
531
+ "cell_type": "code",
532
+ "execution_count": 14,
533
+ "id": "7e4c15d8",
534
+ "metadata": {},
535
+ "outputs": [
536
+ {
537
+ "data": {
538
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
539
+ "columns": [
540
+ {
541
+ "name": "index",
542
+ "rawType": "int64",
543
+ "type": "integer"
544
+ },
545
+ {
546
+ "name": "Question",
547
+ "rawType": "object",
548
+ "type": "string"
549
+ },
550
+ {
551
+ "name": "file_path",
552
+ "rawType": "object",
553
+ "type": "string"
554
+ },
555
+ {
556
+ "name": "Agent response",
557
+ "rawType": "object",
558
+ "type": "unknown"
559
+ },
560
+ {
561
+ "name": "Final answer",
562
+ "rawType": "object",
563
+ "type": "string"
564
+ },
565
+ {
566
+ "name": "is_correct",
567
+ "rawType": "object",
568
+ "type": "unknown"
569
+ }
570
+ ],
571
+ "ref": "8ca69d17-0092-4421-a86c-d63edb41dea2",
572
+ "rows": [
573
+ [
574
+ "22",
575
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",
576
+ "/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt",
577
+ null,
578
+ "3",
579
+ null
580
+ ]
581
+ ],
582
+ "shape": {
583
+ "columns": 5,
584
+ "rows": 1
585
+ }
586
+ },
587
+ "text/html": [
588
+ "<div>\n",
589
+ "<style scoped>\n",
590
+ " .dataframe tbody tr th:only-of-type {\n",
591
+ " vertical-align: middle;\n",
592
+ " }\n",
593
+ "\n",
594
+ " .dataframe tbody tr th {\n",
595
+ " vertical-align: top;\n",
596
+ " }\n",
597
+ "\n",
598
+ " .dataframe thead th {\n",
599
+ " text-align: right;\n",
600
+ " }\n",
601
+ "</style>\n",
602
+ "<table border=\"1\" class=\"dataframe\">\n",
603
+ " <thead>\n",
604
+ " <tr style=\"text-align: right;\">\n",
605
+ " <th></th>\n",
606
+ " <th>Question</th>\n",
607
+ " <th>file_path</th>\n",
608
+ " <th>Agent response</th>\n",
609
+ " <th>Final answer</th>\n",
610
+ " <th>is_correct</th>\n",
611
+ " </tr>\n",
612
+ " </thead>\n",
613
+ " <tbody>\n",
614
+ " <tr>\n",
615
+ " <th>22</th>\n",
616
+ " <td>You are a telecommunications engineer who want...</td>\n",
617
+ " <td>/home/santiagoal/.cache/huggingface/hub/datase...</td>\n",
618
+ " <td>None</td>\n",
619
+ " <td>3</td>\n",
620
+ " <td>None</td>\n",
621
+ " </tr>\n",
622
+ " </tbody>\n",
623
+ "</table>\n",
624
+ "</div>"
625
+ ],
626
+ "text/plain": [
627
+ " Question \\\n",
628
+ "22 You are a telecommunications engineer who want... \n",
629
+ "\n",
630
+ " file_path Agent response \\\n",
631
+ "22 /home/santiagoal/.cache/huggingface/hub/datase... None \n",
632
+ "\n",
633
+ " Final answer is_correct \n",
634
+ "22 3 None "
635
+ ]
636
+ },
637
+ "execution_count": 14,
638
+ "metadata": {},
639
+ "output_type": "execute_result"
640
+ }
641
+ ],
642
+ "source": [
643
+ "results_df"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": 15,
649
+ "id": "2b48dcc8",
650
+ "metadata": {},
651
+ "outputs": [
652
+ {
653
+ "name": "stdout",
654
+ "output_type": "stream",
655
+ "text": [
656
+ "attached_files: /home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt\n"
657
+ ]
658
+ },
659
+ {
660
+ "data": {
661
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
662
+ "columns": [
663
+ {
664
+ "name": "index",
665
+ "rawType": "int64",
666
+ "type": "integer"
667
+ },
668
+ {
669
+ "name": "Question",
670
+ "rawType": "object",
671
+ "type": "string"
672
+ },
673
+ {
674
+ "name": "file_path",
675
+ "rawType": "object",
676
+ "type": "string"
677
+ },
678
+ {
679
+ "name": "Agent response",
680
+ "rawType": "object",
681
+ "type": "string"
682
+ },
683
+ {
684
+ "name": "Final answer",
685
+ "rawType": "object",
686
+ "type": "string"
687
+ },
688
+ {
689
+ "name": "is_correct",
690
+ "rawType": "int64",
691
+ "type": "integer"
692
+ }
693
+ ],
694
+ "ref": "cc4b4ed7-e0af-4c13-8682-95d79797ea08",
695
+ "rows": [
696
+ [
697
+ "22",
698
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",
699
+ "/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt",
700
+ "2",
701
+ "3",
702
+ "0"
703
+ ]
704
+ ],
705
+ "shape": {
706
+ "columns": 5,
707
+ "rows": 1
708
+ }
709
+ },
710
+ "text/html": [
711
+ "<div>\n",
712
+ "<style scoped>\n",
713
+ " .dataframe tbody tr th:only-of-type {\n",
714
+ " vertical-align: middle;\n",
715
+ " }\n",
716
+ "\n",
717
+ " .dataframe tbody tr th {\n",
718
+ " vertical-align: top;\n",
719
+ " }\n",
720
+ "\n",
721
+ " .dataframe thead th {\n",
722
+ " text-align: right;\n",
723
+ " }\n",
724
+ "</style>\n",
725
+ "<table border=\"1\" class=\"dataframe\">\n",
726
+ " <thead>\n",
727
+ " <tr style=\"text-align: right;\">\n",
728
+ " <th></th>\n",
729
+ " <th>Question</th>\n",
730
+ " <th>file_path</th>\n",
731
+ " <th>Agent response</th>\n",
732
+ " <th>Final answer</th>\n",
733
+ " <th>is_correct</th>\n",
734
+ " </tr>\n",
735
+ " </thead>\n",
736
+ " <tbody>\n",
737
+ " <tr>\n",
738
+ " <th>22</th>\n",
739
+ " <td>You are a telecommunications engineer who want...</td>\n",
740
+ " <td>/home/santiagoal/.cache/huggingface/hub/datase...</td>\n",
741
+ " <td>2</td>\n",
742
+ " <td>3</td>\n",
743
+ " <td>0</td>\n",
744
+ " </tr>\n",
745
+ " </tbody>\n",
746
+ "</table>\n",
747
+ "</div>"
748
+ ],
749
+ "text/plain": [
750
+ " Question \\\n",
751
+ "22 You are a telecommunications engineer who want... \n",
752
+ "\n",
753
+ " file_path Agent response \\\n",
754
+ "22 /home/santiagoal/.cache/huggingface/hub/datase... 2 \n",
755
+ "\n",
756
+ " Final answer is_correct \n",
757
+ "22 3 0 "
758
+ ]
759
+ },
760
+ "execution_count": 15,
761
+ "metadata": {},
762
+ "output_type": "execute_result"
763
+ }
764
+ ],
765
+ "source": [
766
+ "# Compute and save agent responses and their evaluation\n",
767
+ "\n",
768
+ "results_df[\"Agent response\"] = results_df.apply(func=gaia_eval.get_agent_response, axis=1)\n",
769
+ "results_df[\"is_correct\"] = results_df.apply(func=gaia_eval.evaluate_response, axis=1)\n",
770
+ "results_df"
771
+ ]
772
+ },
773
+ {
774
+ "cell_type": "code",
775
+ "execution_count": null,
776
+ "id": "0bfdb7b8",
777
+ "metadata": {},
778
+ "outputs": [],
779
+ "source": []
780
+ },
781
+ {
782
+ "cell_type": "code",
783
+ "execution_count": 16,
784
+ "id": "89f8f584",
785
+ "metadata": {},
786
+ "outputs": [
787
+ {
788
+ "name": "stdout",
789
+ "output_type": "stream",
790
+ "text": [
791
+ " Experiment Accuracy: 0.00 %\n"
792
+ ]
793
+ }
794
+ ],
795
+ "source": [
796
+ "accuracy = results_df[\"is_correct\"].mean()\n",
797
+ "print(f\" Experiment Accuracy: {(100 * accuracy):.2f} %\")"
798
+ ]
799
+ },
800
+ {
801
+ "cell_type": "code",
802
+ "execution_count": 17,
803
+ "id": "a326127a",
804
+ "metadata": {},
805
+ "outputs": [
806
+ {
807
+ "data": {
808
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
809
+ "columns": [
810
+ {
811
+ "name": "index",
812
+ "rawType": "int64",
813
+ "type": "integer"
814
+ },
815
+ {
816
+ "name": "Question",
817
+ "rawType": "object",
818
+ "type": "string"
819
+ },
820
+ {
821
+ "name": "file_path",
822
+ "rawType": "object",
823
+ "type": "string"
824
+ },
825
+ {
826
+ "name": "Agent response",
827
+ "rawType": "object",
828
+ "type": "string"
829
+ },
830
+ {
831
+ "name": "Final answer",
832
+ "rawType": "object",
833
+ "type": "string"
834
+ },
835
+ {
836
+ "name": "is_correct",
837
+ "rawType": "int64",
838
+ "type": "integer"
839
+ }
840
+ ],
841
+ "ref": "74bb579d-4727-47ec-af34-50c669e56ee9",
842
+ "rows": [
843
+ [
844
+ "22",
845
+ "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. In the reference file is a layout of the road and nearby houses. Each dash, \"-\", is a marker indicating a mile. Each capital H indicates a house located next to a mile marker, appearing above or below the stretch of road. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.",
846
+ "/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt",
847
+ "2",
848
+ "3",
849
+ "0"
850
+ ]
851
+ ],
852
+ "shape": {
853
+ "columns": 5,
854
+ "rows": 1
855
+ }
856
+ },
857
+ "text/html": [
858
+ "<div>\n",
859
+ "<style scoped>\n",
860
+ " .dataframe tbody tr th:only-of-type {\n",
861
+ " vertical-align: middle;\n",
862
+ " }\n",
863
+ "\n",
864
+ " .dataframe tbody tr th {\n",
865
+ " vertical-align: top;\n",
866
+ " }\n",
867
+ "\n",
868
+ " .dataframe thead th {\n",
869
+ " text-align: right;\n",
870
+ " }\n",
871
+ "</style>\n",
872
+ "<table border=\"1\" class=\"dataframe\">\n",
873
+ " <thead>\n",
874
+ " <tr style=\"text-align: right;\">\n",
875
+ " <th></th>\n",
876
+ " <th>Question</th>\n",
877
+ " <th>file_path</th>\n",
878
+ " <th>Agent response</th>\n",
879
+ " <th>Final answer</th>\n",
880
+ " <th>is_correct</th>\n",
881
+ " </tr>\n",
882
+ " </thead>\n",
883
+ " <tbody>\n",
884
+ " <tr>\n",
885
+ " <th>22</th>\n",
886
+ " <td>You are a telecommunications engineer who want...</td>\n",
887
+ " <td>/home/santiagoal/.cache/huggingface/hub/datase...</td>\n",
888
+ " <td>2</td>\n",
889
+ " <td>3</td>\n",
890
+ " <td>0</td>\n",
891
+ " </tr>\n",
892
+ " </tbody>\n",
893
+ "</table>\n",
894
+ "</div>"
895
+ ],
896
+ "text/plain": [
897
+ " Question \\\n",
898
+ "22 You are a telecommunications engineer who want... \n",
899
+ "\n",
900
+ " file_path Agent response \\\n",
901
+ "22 /home/santiagoal/.cache/huggingface/hub/datase... 2 \n",
902
+ "\n",
903
+ " Final answer is_correct \n",
904
+ "22 3 0 "
905
+ ]
906
+ },
907
+ "execution_count": 17,
908
+ "metadata": {},
909
+ "output_type": "execute_result"
910
+ }
911
+ ],
912
+ "source": [
913
+ "results_df"
914
+ ]
915
+ },
916
+ {
917
+ "cell_type": "markdown",
918
+ "id": "c826fde0",
919
+ "metadata": {},
920
+ "source": [
921
+ "### Save Results"
922
+ ]
923
+ },
924
+ {
925
+ "cell_type": "code",
926
+ "execution_count": 18,
927
+ "id": "06eb6132",
928
+ "metadata": {},
929
+ "outputs": [],
930
+ "source": [
931
+ "# Save current experiment\n",
932
+ "results_df.to_csv(xp_path, index=False)\n",
933
+ "\n",
934
+ "# Update experimentation history\n",
935
+ "\n",
936
+ "xp_results = {\n",
937
+ " \"iteration\": current_iteration,\n",
938
+ " \"experiment\": xp_name,\n",
939
+ " \"agent\": agent_architecture,\n",
940
+ " \"tools\": new_tools_list,\n",
941
+ " \"accuracy\": round(results_df.copy()[\"is_correct\"].mean(), 2),\n",
942
+ "}\n",
943
+ "\n",
944
+ "updated_experiments_data = pd.concat([old_experiments_data, pd.DataFrame([xp_results])], ignore_index=True)\n",
945
+ "updated_experiments_data.drop_duplicates(inplace=True)\n",
946
+ "updated_experiments_data.to_csv(summary_experiments_path, index=False)"
947
+ ]
948
+ },
949
+ {
950
+ "cell_type": "code",
951
+ "execution_count": 19,
952
+ "id": "84074f8d",
953
+ "metadata": {},
954
+ "outputs": [
955
+ {
956
+ "data": {
957
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
958
+ "columns": [
959
+ {
960
+ "name": "index",
961
+ "rawType": "int64",
962
+ "type": "integer"
963
+ },
964
+ {
965
+ "name": "iteration",
966
+ "rawType": "int64",
967
+ "type": "integer"
968
+ },
969
+ {
970
+ "name": "experiment",
971
+ "rawType": "object",
972
+ "type": "string"
973
+ },
974
+ {
975
+ "name": "agent",
976
+ "rawType": "object",
977
+ "type": "string"
978
+ },
979
+ {
980
+ "name": "tools",
981
+ "rawType": "object",
982
+ "type": "string"
983
+ },
984
+ {
985
+ "name": "accuracy",
986
+ "rawType": "float64",
987
+ "type": "float"
988
+ }
989
+ ],
990
+ "ref": "42db43c4-0a26-4465-9ab4-0a3a9b103b8a",
991
+ "rows": [
992
+ [
993
+ "0",
994
+ "1",
995
+ "Implement Calculator Tool",
996
+ "React agent",
997
+ "Aritmetic",
998
+ "0.0"
999
+ ],
1000
+ [
1001
+ "1",
1002
+ "2",
1003
+ "Implement Search and Code tools",
1004
+ "React agent",
1005
+ "Aritmetic, Search, Code",
1006
+ "0.17"
1007
+ ],
1008
+ [
1009
+ "2",
1010
+ "3",
1011
+ "Integrate Whisper Audio Transcriber",
1012
+ "React agent",
1013
+ "Aritmetic, Search, Code, Audio Transcriber",
1014
+ "0.15"
1015
+ ],
1016
+ [
1017
+ "3",
1018
+ "4",
1019
+ "Test Workflow",
1020
+ "React agent",
1021
+ "Aritmetic, Search, Code, Audio Transcriber, ",
1022
+ "0.2"
1023
+ ],
1024
+ [
1025
+ "4",
1026
+ "5",
1027
+ "Integrate Text processing tool",
1028
+ "React agent",
1029
+ "Aritmetic, Search, Code, Audio Transcriber, , Text Handler",
1030
+ "0.1"
1031
+ ],
1032
+ [
1033
+ "5",
1034
+ "6",
1035
+ "Integrate Text Handler tool",
1036
+ "React agent",
1037
+ "Aritmetic, Search, Code, Audio Transcriber, , Text Handler, Text Handler",
1038
+ "0.2"
1039
+ ],
1040
+ [
1041
+ "6",
1042
+ "7",
1043
+ "Evaluate Agent Performance against tasks with .txt files",
1044
+ "React agent",
1045
+ "Aritmetic, Search, Code, Audio Transcriber, , Text Handler, Text Handler, ",
1046
+ null
1047
+ ],
1048
+ [
1049
+ "7",
1050
+ "8",
1051
+ "Test Agent performance against tasks with attached .txt files",
1052
+ "React agent",
1053
+ "Aritmetic, Search, Code, Audio Transcriber, , Text Handler, Text Handler, , ",
1054
+ "0.0"
1055
+ ]
1056
+ ],
1057
+ "shape": {
1058
+ "columns": 5,
1059
+ "rows": 8
1060
+ }
1061
+ },
1062
+ "text/html": [
1063
+ "<div>\n",
1064
+ "<style scoped>\n",
1065
+ " .dataframe tbody tr th:only-of-type {\n",
1066
+ " vertical-align: middle;\n",
1067
+ " }\n",
1068
+ "\n",
1069
+ " .dataframe tbody tr th {\n",
1070
+ " vertical-align: top;\n",
1071
+ " }\n",
1072
+ "\n",
1073
+ " .dataframe thead th {\n",
1074
+ " text-align: right;\n",
1075
+ " }\n",
1076
+ "</style>\n",
1077
+ "<table border=\"1\" class=\"dataframe\">\n",
1078
+ " <thead>\n",
1079
+ " <tr style=\"text-align: right;\">\n",
1080
+ " <th></th>\n",
1081
+ " <th>iteration</th>\n",
1082
+ " <th>experiment</th>\n",
1083
+ " <th>agent</th>\n",
1084
+ " <th>tools</th>\n",
1085
+ " <th>accuracy</th>\n",
1086
+ " </tr>\n",
1087
+ " </thead>\n",
1088
+ " <tbody>\n",
1089
+ " <tr>\n",
1090
+ " <th>0</th>\n",
1091
+ " <td>1</td>\n",
1092
+ " <td>Implement Calculator Tool</td>\n",
1093
+ " <td>React agent</td>\n",
1094
+ " <td>Aritmetic</td>\n",
1095
+ " <td>0.00</td>\n",
1096
+ " </tr>\n",
1097
+ " <tr>\n",
1098
+ " <th>1</th>\n",
1099
+ " <td>2</td>\n",
1100
+ " <td>Implement Search and Code tools</td>\n",
1101
+ " <td>React agent</td>\n",
1102
+ " <td>Aritmetic, Search, Code</td>\n",
1103
+ " <td>0.17</td>\n",
1104
+ " </tr>\n",
1105
+ " <tr>\n",
1106
+ " <th>2</th>\n",
1107
+ " <td>3</td>\n",
1108
+ " <td>Integrate Whisper Audio Transcriber</td>\n",
1109
+ " <td>React agent</td>\n",
1110
+ " <td>Aritmetic, Search, Code, Audio Transcriber</td>\n",
1111
+ " <td>0.15</td>\n",
1112
+ " </tr>\n",
1113
+ " <tr>\n",
1114
+ " <th>3</th>\n",
1115
+ " <td>4</td>\n",
1116
+ " <td>Test Workflow</td>\n",
1117
+ " <td>React agent</td>\n",
1118
+ " <td>Aritmetic, Search, Code, Audio Transcriber,</td>\n",
1119
+ " <td>0.20</td>\n",
1120
+ " </tr>\n",
1121
+ " <tr>\n",
1122
+ " <th>4</th>\n",
1123
+ " <td>5</td>\n",
1124
+ " <td>Integrate Text processing tool</td>\n",
1125
+ " <td>React agent</td>\n",
1126
+ " <td>Aritmetic, Search, Code, Audio Transcriber, , ...</td>\n",
1127
+ " <td>0.10</td>\n",
1128
+ " </tr>\n",
1129
+ " <tr>\n",
1130
+ " <th>5</th>\n",
1131
+ " <td>6</td>\n",
1132
+ " <td>Integrate Text Handler tool</td>\n",
1133
+ " <td>React agent</td>\n",
1134
+ " <td>Aritmetic, Search, Code, Audio Transcriber, , ...</td>\n",
1135
+ " <td>0.20</td>\n",
1136
+ " </tr>\n",
1137
+ " <tr>\n",
1138
+ " <th>6</th>\n",
1139
+ " <td>7</td>\n",
1140
+ " <td>Evaluate Agent Performance against tasks with ...</td>\n",
1141
+ " <td>React agent</td>\n",
1142
+ " <td>Aritmetic, Search, Code, Audio Transcriber, , ...</td>\n",
1143
+ " <td>NaN</td>\n",
1144
+ " </tr>\n",
1145
+ " <tr>\n",
1146
+ " <th>7</th>\n",
1147
+ " <td>8</td>\n",
1148
+ " <td>Test Agent performance against tasks with atta...</td>\n",
1149
+ " <td>React agent</td>\n",
1150
+ " <td>Aritmetic, Search, Code, Audio Transcriber, , ...</td>\n",
1151
+ " <td>0.00</td>\n",
1152
+ " </tr>\n",
1153
+ " </tbody>\n",
1154
+ "</table>\n",
1155
+ "</div>"
1156
+ ],
1157
+ "text/plain": [
1158
+ " iteration experiment agent \\\n",
1159
+ "0 1 Implement Calculator Tool React agent \n",
1160
+ "1 2 Implement Search and Code tools React agent \n",
1161
+ "2 3 Integrate Whisper Audio Transcriber React agent \n",
1162
+ "3 4 Test Workflow React agent \n",
1163
+ "4 5 Integrate Text processing tool React agent \n",
1164
+ "5 6 Integrate Text Handler tool React agent \n",
1165
+ "6 7 Evaluate Agent Performance against tasks with ... React agent \n",
1166
+ "7 8 Test Agent performance against tasks with atta... React agent \n",
1167
+ "\n",
1168
+ " tools accuracy \n",
1169
+ "0 Aritmetic 0.00 \n",
1170
+ "1 Aritmetic, Search, Code 0.17 \n",
1171
+ "2 Aritmetic, Search, Code, Audio Transcriber 0.15 \n",
1172
+ "3 Aritmetic, Search, Code, Audio Transcriber, 0.20 \n",
1173
+ "4 Aritmetic, Search, Code, Audio Transcriber, , ... 0.10 \n",
1174
+ "5 Aritmetic, Search, Code, Audio Transcriber, , ... 0.20 \n",
1175
+ "6 Aritmetic, Search, Code, Audio Transcriber, , ... NaN \n",
1176
+ "7 Aritmetic, Search, Code, Audio Transcriber, , ... 0.00 "
1177
+ ]
1178
+ },
1179
+ "execution_count": 19,
1180
+ "metadata": {},
1181
+ "output_type": "execute_result"
1182
+ }
1183
+ ],
1184
+ "source": [
1185
+ "updated_experiments_data"
1186
+ ]
1187
+ },
1188
+ {
1189
+ "cell_type": "markdown",
1190
+ "id": "f705469a",
1191
+ "metadata": {},
1192
+ "source": [
1193
+ "### Evaluation Summary\n",
1194
+ "\n"
1195
+ ]
1196
+ },
1197
+ {
1198
+ "cell_type": "code",
1199
+ "execution_count": 20,
1200
+ "id": "71648634",
1201
+ "metadata": {},
1202
+ "outputs": [],
1203
+ "source": [
1204
+ "good_responses = results_df[results_df[\"is_correct\"] == 1].copy()\n",
1205
+ "good_extensions = (\n",
1206
+ " good_responses[\"file_path\"]\n",
1207
+ " .fillna(\"No files\")\n",
1208
+ " .apply(lambda row: row.split(\".\")[-1] if \".\" in row else \"No files\")\n",
1209
+ " .unique()\n",
1210
+ ")\n",
1211
+ "good_file_management = \", \".join(sorted(good_extensions))\n",
1212
+ "\n",
1213
+ "bad_responses = results_df[results_df[\"is_correct\"] == 0].copy()\n",
1214
+ "bad_extensions = (\n",
1215
+ " bad_responses[\"file_path\"]\n",
1216
+ " .fillna(\"No files\")\n",
1217
+ " .apply(lambda row: row.split(\".\")[-1] if \".\" in row else \"No files\")\n",
1218
+ " .unique()\n",
1219
+ ")\n",
1220
+ "bad_file_management = \", \".join(sorted(bad_extensions))\n",
1221
+ "\n",
1222
+ "performance_no_attached = results_df[results_df[\"file_path\"].apply(lambda row: len(row))==0][\"is_correct\"].mean()"
1223
+ ]
1224
+ },
1225
+ {
1226
+ "cell_type": "code",
1227
+ "execution_count": 21,
1228
+ "id": "1d29a298",
1229
+ "metadata": {},
1230
+ "outputs": [
1231
+ {
1232
+ "name": "stdout",
1233
+ "output_type": "stream",
1234
+ "text": [
1235
+ "Insights\n",
1236
+ "\n",
1237
+ " -------------------------------------------------- \n",
1238
+ "\n",
1239
+ "1. The Agent has an overall accuracy of 0.0%\n",
1240
+ "2. The Agent succeded at questions with the following files types: (0/1) \n",
1241
+ "3. The Agent failed at questions with the following files types: txt (1/1) \n",
1242
+ "4. The Agent has an Accuracy of nan% at tasks with no attached files\n"
1243
+ ]
1244
+ }
1245
+ ],
1246
+ "source": [
1247
+ "print(\n",
1248
+ " \"Insights\\n\\n\",\n",
1249
+ " \"-\" * 50,\n",
1250
+ " f\"\\n\\n1. The Agent has an overall accuracy of {100 * accuracy:.1f}%\"\n",
1251
+ " f\"\\n2. The Agent succeded at questions with the following files types: {good_file_management} ({good_responses.is_correct.shape[0]}/{results_df.shape[0]})\",\n",
1252
+ " f\"\\n3. The Agent failed at questions with the following files types: {bad_file_management} ({bad_responses.is_correct.shape[0]}/{results_df.shape[0]})\",\n",
1253
+ " f\"\\n4. The Agent has an Accuracy of {100 * performance_no_attached:.1f}% at tasks with no attached files\"\n",
1254
+ ")"
1255
+ ]
1256
+ },
1257
+ {
1258
+ "cell_type": "markdown",
1259
+ "id": "c4c9f4ff",
1260
+ "metadata": {},
1261
+ "source": [
1262
+ "Conclusions\n",
1263
+ "\n",
1264
+ "1. The Agent has a 0% of accuracy. Showing poor performance at tasks with attached .txt files\n",
1265
+ "4. Reasoning task has a big margin of improvement (0% of success on reasoning tasks). There are some questions that ask for watching internet videos, so it might be necessary to implement tools to adress this kind of tasks\n",
1266
+ "\n",
1267
+ "Next steps\n",
1268
+ "\n",
1269
+ "1. Build tools to deal with .png, .pptx, .py .xlsx files\n",
1270
+ "2. Investigate improvements for pure-resoning tasks\n",
1271
+ "3. Build tools to enable the agent watch / process youtube videos"
1272
+ ]
1273
+ },
1274
+ {
1275
+ "cell_type": "markdown",
1276
+ "id": "9e69ee21",
1277
+ "metadata": {},
1278
+ "source": [
1279
+ "\n",
1280
+ "### Main Questions to solve\n",
1281
+ "\n",
1282
+ "$\\square$ Which are the core tools for each level of questions \n",
1283
+ " - Level 1:\n",
1284
+ " - Level 2:\n",
1285
+ " - Level 3:\n",
1286
+ "\n"
1287
+ ]
1288
+ }
1289
+ ],
1290
+ "metadata": {
1291
+ "kernelspec": {
1292
+ "display_name": "chappie-CHLGiFC_-py3.11",
1293
+ "language": "python",
1294
+ "name": "python3"
1295
+ },
1296
+ "language_info": {
1297
+ "codemirror_mode": {
1298
+ "name": "ipython",
1299
+ "version": 3
1300
+ },
1301
+ "file_extension": ".py",
1302
+ "mimetype": "text/x-python",
1303
+ "name": "python",
1304
+ "nbconvert_exporter": "python",
1305
+ "pygments_lexer": "ipython3",
1306
+ "version": "3.11.13"
1307
+ }
1308
+ },
1309
+ "nbformat": 4,
1310
+ "nbformat_minor": 5
1311
+ }
notebooks/ml_agent_experimentation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/tools_poc/audio_tool.ipynb ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bf9121e7",
6
+ "metadata": {},
7
+ "source": [
8
+ "In this Notebook we create and sketch tools for the Agent. Building proofs of concept (PoC)"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "c29d8861",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Setup"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "94d01490",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Libraries\n",
27
+ "\n",
28
+ "import os"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "f09b59ab",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# Local Modules\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "f32d7959",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# import Whisper\n",
49
+ "\n",
50
+ "import whisper"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "5e131840",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "# Import Agent\n",
61
+ "\n",
62
+ "os.sys.path.append(\"../src\")\n",
63
+ "os.sys.path.append(\"../src/agents\")\n",
64
+ "\n",
65
+ "import react # My AI assistant"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "dc02cd0f",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# DEBUG\n",
76
+ "\n",
77
+ "os.sys.path.append(\"../src/tools\")"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "3ca7a88a",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "\n",
88
+ "# Import GAIA Questions\n",
89
+ "from datasets import load_dataset\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "a46c1c67",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "\n",
100
+ "import pandas as pd\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "5a5cf241",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "from dotenv import load_dotenv\n",
111
+ "from huggingface_hub import snapshot_download, login"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "64a3d4d1",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "# Load Hugging face credentials\n",
122
+ "\n",
123
+ "#load_dotenv()\n",
124
+ "#login(os.getenv(key=\"HF_TOKEN_CHAPPIE\")) # Replace with your hf api key name\n"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "id": "fda05cb1",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "#gaia_questions_path = snapshot_download(repo_id=\"gaia-benchmark/GAIA\", repo_type=\"dataset\")\n",
135
+ "#gaia_questions = load_dataset(path=\"gaia-benchmark/GAIA\", name=\"2023_level1\")"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "e9ebf52c",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "gaia_index_dir = \"../../../.cache/huggingface/datasets/gaia-benchmark___gaia/2023_level1/0.0.1/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005\"\n",
146
+ "#gaia_index_dir = os.path.abspath(gaia_index_dir)\n",
147
+ "gaia_data_path = os.path.join(gaia_index_dir, \"gaia-validation.arrow\")"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "id": "101f8c73",
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# Temporal cell: wrote because of hugging face api calls limit\n",
158
+ "\n",
159
+ "import pyarrow.ipc as ipc\n",
160
+ "\n",
161
+ "with open(gaia_data_path, \"rb\") as f:\n",
162
+ " reader = ipc.RecordBatchStreamReader(f)\n",
163
+ " table = reader.read_all()\n",
164
+ "\n",
165
+ "gaia_df = table.to_pandas()\n"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "fe9a5e51",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "#gaia_questions = gaia_questions[\"validation\"] # Filter for dev purposes\n",
176
+ "#gaia_df = pd.DataFrame(gaia_questions)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "30c08c4d",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "gaia_df.head()"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "4826e755",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "# DEL\n",
197
+ "# temp filter to get chess images\n",
198
+ "gaia_df[gaia_df[\"file_path\"].map(lambda f: f.endswith(\"44.png\"))]"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "id": "f2d34bca",
204
+ "metadata": {},
205
+ "source": [
206
+ "---"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "0150df47",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "filetypes = {d[1].file_path.split(\".\")[-1] for d in gaia_df.iterrows()}\n",
217
+ "filetypes"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "id": "9c9208da",
223
+ "metadata": {},
224
+ "source": [
225
+ "### Read Historical XPs"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "82dd866a",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "xp_paths = \"../data/agent_experiments/iterations/\""
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "5649c3c2",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "# Index all XPs\n",
246
+ "import os\n",
247
+ "import pandas as pd\n",
248
+ "\n",
249
+ "xp_dir = \"../data/agent_experiments/iterations\"\n",
250
+ "xp_list = []\n",
251
+ "# FIXME: from the 10th XP iteration, the sorted method is unuseful\n",
252
+ "\n",
253
+ "for i, xp_path in enumerate(sorted(os.listdir(xp_paths))): \n",
254
+ " xp_path = os.path.join(xp_dir, xp_path)\n",
255
+ " temp_xp_df = pd.read_csv(xp_path)\n",
256
+ " xp_list.append((i, temp_xp_df))\n",
257
+ " del temp_xp_df "
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "e62498d0",
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# DEL\n",
268
+ "xp[xp[\"Question\"] == xp_question][[\"is_correct\"]]"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "id": "5ca5b1fd",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "# Join latest XP result to each question\n",
279
+ "\n",
280
+ "index = gaia_df.index.tolist() # Index for all the questions\n",
281
+ "questions = gaia_df[\"Question\"].tolist()\n",
282
+ "answers = list()\n",
283
+ "\n",
284
+ "\n",
285
+ "def filter_condition(xp_data: list) -> bool:\n",
286
+ " i, xp = xp_data\n",
287
+ " for question in questions:\n",
288
+ " xp_addressed_questions = xp[\"Question\"].tolist()\n",
289
+ " if question in xp_addressed_questions:\n",
290
+ " return True\n",
291
+ " return False\n",
292
+ "\n",
293
+ "filtered_xps = list(filter(filter_condition, xp_list))\n",
294
+ "filtered_xps_reversed = filtered_xps\n",
295
+ "filtered_xps_reversed.reverse()\n",
296
+ "\n",
297
+ "# Join latest answer result (i.e. result of the latest xp)\n",
298
+ "\n",
299
+ "# Join latest answer result (i.e. result of the latest xp)\n",
300
+ "\n",
301
+ "for question in questions:\n",
302
+ " answer_result = 0 # Assume wrong answer by default\n",
303
+ " for i, xp in filtered_xps_reversed:\n",
304
+ " for xp_question in xp[\"Question\"].tolist():\n",
305
+ " \n",
306
+ " if question == xp_question:\n",
307
+ " answer_result_row = list(xp[xp[\"Question\"] == xp_question][\"is_correct\"])#.loc[0]\n",
308
+ " answer_result_temp = answer_result_row[0] \n",
309
+ " if answer_result_temp >= answer_result:\n",
310
+ " answer_result = answer_result_temp\n",
311
+ " del xp\n",
312
+ " answers.append(answer_result)\n",
313
+ "\n",
314
+ "# Join answers\n",
315
+ "historical_xp_results = gaia_df.copy()\n",
316
+ "historical_xp_results[\"is_correct\"] = pd.Series(answers)\n",
317
+ "del answers, questions"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "id": "ca756f04",
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "historical_xp_results"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "id": "fd11615a",
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "historical_xp_results.is_correct.mean()"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "markdown",
342
+ "id": "d3ddaede",
343
+ "metadata": {},
344
+ "source": [
345
+ "Let's study first which are the most common wrong tasks. I.e. How can we increase accuracy with a single next step (e.g. implement a new tool, modify sys message, etc.)"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": null,
351
+ "id": "a0613fc4",
352
+ "metadata": {},
353
+ "outputs": [],
354
+ "source": [
355
+ "wrong_ans_df = historical_xp_results[historical_xp_results[\"is_correct\"]==0]\n",
356
+ "wrong_ans_df[\"fp_extension\"] = wrong_ans_df[\"file_path\"].map(lambda path: path.split(\".\")[-1])\n",
357
+ "wrong_ans_df"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "id": "43925ec4",
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": [
367
+ "# Summary\n",
368
+ "wrong_ans_df.groupby(\"fp_extension\")[\"is_correct\"].count()"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "id": "1f77dabb",
374
+ "metadata": {},
375
+ "source": [
376
+ "The vast majority of remaining tasks do not include files to read. So we sould study them at first"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "1379da15",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "wrong_ans_df_no_extension = wrong_ans_df[wrong_ans_df[\"file_path\"].map(lambda fp: len(fp)==0)]"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "id": "5e40e190",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "wrong_ans_df_no_extension"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "markdown",
401
+ "id": "b4cbc05d",
402
+ "metadata": {},
403
+ "source": [
404
+ "## Audio Tool"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "57b579ef",
410
+ "metadata": {},
411
+ "source": [
412
+ "Let's study where does our current Agent fail, especially on audio-like questions"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "6b7a1444",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "run = False"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": null,
428
+ "id": "60d3af35",
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": [
432
+ "gaia_df"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "markdown",
437
+ "id": "fe9e932b",
438
+ "metadata": {},
439
+ "source": [
440
+ "### Identify Audio-like tasks"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "markdown",
445
+ "id": "9e8c9a12",
446
+ "metadata": {},
447
+ "source": [
448
+ "The first questions is to understand which questions need audio? \n",
449
+ "\n",
450
+ "* Hypothesis: Just look at such questions that have a .mp3 file attached\n"
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "code",
455
+ "execution_count": null,
456
+ "id": "23516201",
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": [
460
+ "# Gather a sample file from any task\n",
461
+ "\n",
462
+ "audio_tasks = gaia_df[(gaia_df[\"file_path\"].str.len()>0) & (gaia_df[\"file_path\"].str.endswith(\".mp3\"))]\n",
463
+ "sample_task = audio_tasks.iloc[0]"
464
+ ]
465
+ },
466
+ {
467
+ "cell_type": "code",
468
+ "execution_count": null,
469
+ "id": "aa03a8a6",
470
+ "metadata": {},
471
+ "outputs": [],
472
+ "source": [
473
+ "print(f\"{(audio_tasks.shape[0] / gaia_df.shape[0]) * 100:.1f}% of tasks need audio processing\")"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": null,
479
+ "id": "b30a0394",
480
+ "metadata": {},
481
+ "outputs": [],
482
+ "source": [
483
+ "sample_task"
484
+ ]
485
+ },
486
+ {
487
+ "cell_type": "code",
488
+ "execution_count": null,
489
+ "id": "d47c9bde",
490
+ "metadata": {},
491
+ "outputs": [],
492
+ "source": [
493
+ "# Get the sample file\n",
494
+ "\n",
495
+ "filepath = sample_task[\"file_path\"]\n"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "markdown",
500
+ "id": "0d07cab8",
501
+ "metadata": {},
502
+ "source": [
503
+ "Let's import the mp3 file with [ffmpeg](https://stackoverflow.com/questions/9458480/read-mp3-in-python-3#:~:text=%24%20ffmpeg%20%2Di%20foo.mp3%20%2Dvn%20%2Dacodec%20pcm_s16le%20%2Dac%201%20%2Dar%2044100%20%2Df%20wav%20foo.wav)"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": null,
509
+ "id": "a82f6246",
510
+ "metadata": {},
511
+ "outputs": [],
512
+ "source": [
513
+ "temp_data_path = \"/home/santiagoal/current-projects/chappie/data/temp-data/\""
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": null,
519
+ "id": "c8e255c8",
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "if run:\n",
524
+ " os.system(f\"ffmpeg -i {filepath} -vn -acodec pcm_s16le -ac 1 -ar 44100 -f wav {temp_data_path}sample_audio.wav\")"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": null,
530
+ "id": "7f9d0ff9",
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": [
534
+ "if run:\n",
535
+ " use_gpu = False\n",
536
+ " model_size = \"tiny\"\n",
537
+ "\n",
538
+ " model = (\n",
539
+ " whisper.load_model(model_size).cuda()\n",
540
+ " if use_gpu\n",
541
+ " else whisper.load_model(model_size)\n",
542
+ " )"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "id": "df808df1",
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": [
552
+ "print(filepath)"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": null,
558
+ "id": "9ce2c352",
559
+ "metadata": {},
560
+ "outputs": [],
561
+ "source": [
562
+ "if run: \n",
563
+ " raw_transcript = model.transcribe(\n",
564
+ " filepath,\n",
565
+ " word_timestamps=False,\n",
566
+ " no_speech_threshold=0.5,\n",
567
+ " condition_on_previous_text=True,\n",
568
+ " compression_ratio_threshold=2.0,\n",
569
+ " )\n",
570
+ "\n",
571
+ " transcript = raw_transcript[\"text\"]\n",
572
+ " print(transcript)"
573
+ ]
574
+ },
575
+ {
576
+ "cell_type": "markdown",
577
+ "id": "0654ee48",
578
+ "metadata": {},
579
+ "source": [
580
+ "### Tool POC\n"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "id": "83335239",
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": [
590
+ "\n",
591
+ "if run: \n",
592
+ " def transcriber(audio_path: str, ai_model = model) -> str:\n",
593
+ " \"\"\"\n",
594
+ " Transcribes an audio file\n",
595
+ "\n",
596
+ " Parameters\n",
597
+ " ----------\n",
598
+ " audio_path : str\n",
599
+ " Path to the audio file\n",
600
+ " ai_model\n",
601
+ " audio-to-text AI model \n",
602
+ "\n",
603
+ " Returns:\n",
604
+ " str: Text of the transcript \n",
605
+ " \"\"\"\n",
606
+ " raw_transcript = ai_model.transcribe(\n",
607
+ " audio_path,\n",
608
+ " word_timestamps=False,\n",
609
+ " no_speech_threshold=0.5,\n",
610
+ " condition_on_previous_text=True,\n",
611
+ " compression_ratio_threshold=2.0,\n",
612
+ " )\n",
613
+ "\n",
614
+ " transcript = raw_transcript[\"text\"]\n",
615
+ "\n",
616
+ " return transcript"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "markdown",
621
+ "id": "aa9d4e58",
622
+ "metadata": {},
623
+ "source": [
624
+ "### Integrate Tool POC and experiment\n"
625
+ ]
626
+ },
627
+ {
628
+ "cell_type": "markdown",
629
+ "id": "8dee4276",
630
+ "metadata": {},
631
+ "source": [
632
+ "The changes have been integrated, now we will experiment with the new version of the agent"
633
+ ]
634
+ },
635
+ {
636
+ "cell_type": "code",
637
+ "execution_count": null,
638
+ "id": "811185e7",
639
+ "metadata": {},
640
+ "outputs": [],
641
+ "source": [
642
+ "audio_tasks[\"Agent response\"] = audio_tasks.apply(func=get_agent_response, axis=1)\n",
643
+ "audio_tasks[\"is_correct\"] = audio_tasks.apply(func=eval_answer, axis=1)"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": null,
649
+ "id": "c2ab6ae6",
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": [
653
+ "audio_tasks"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": null,
659
+ "id": "76e9241c",
660
+ "metadata": {},
661
+ "outputs": [],
662
+ "source": [
663
+ "# Next steps: Update model... "
664
+ ]
665
+ }
666
+ ],
667
+ "metadata": {
668
+ "kernelspec": {
669
+ "display_name": "chappie-CHLGiFC_-py3.11",
670
+ "language": "python",
671
+ "name": "python3"
672
+ },
673
+ "language_info": {
674
+ "codemirror_mode": {
675
+ "name": "ipython",
676
+ "version": 3
677
+ },
678
+ "file_extension": ".py",
679
+ "mimetype": "text/x-python",
680
+ "name": "python",
681
+ "nbconvert_exporter": "python",
682
+ "pygments_lexer": "ipython3",
683
+ "version": "3.11.13"
684
+ }
685
+ },
686
+ "nbformat": 4,
687
+ "nbformat_minor": 5
688
+ }
notebooks/tools_poc/chess_tool.ipynb ADDED
@@ -0,0 +1,863 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bf9121e7",
6
+ "metadata": {},
7
+ "source": [
8
+ "In this Notebook we create and sketch tools for the Agent. Building proofs of concept (PoC)"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "c29d8861",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Setup"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "94d01490",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Libraries\n",
27
+ "\n",
28
+ "import os"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "f09b59ab",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# Local Modules\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "f32d7959",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# import Whisper\n",
49
+ "\n",
50
+ "import whisper"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "5e131840",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "# Import Agent\n",
61
+ "\n",
62
+ "os.sys.path.append(\"../src\")\n",
63
+ "os.sys.path.append(\"../src/agents\")\n",
64
+ "\n",
65
+ "import react # My AI assistant"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "dc02cd0f",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# DEBUG\n",
76
+ "\n",
77
+ "os.sys.path.append(\"../src/tools\")"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "3ca7a88a",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "\n",
88
+ "# Import GAIA Questions\n",
89
+ "from datasets import load_dataset\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "a46c1c67",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "\n",
100
+ "import pandas as pd\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "5a5cf241",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "from dotenv import load_dotenv\n",
111
+ "from huggingface_hub import snapshot_download, login"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "64a3d4d1",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "# Load Hugging face credentials\n",
122
+ "\n",
123
+ "#load_dotenv()\n",
124
+ "#login(os.getenv(key=\"HF_TOKEN_CHAPPIE\")) # Replace with your hf api key name\n"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "id": "fda05cb1",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "#gaia_questions_path = snapshot_download(repo_id=\"gaia-benchmark/GAIA\", repo_type=\"dataset\")\n",
135
+ "#gaia_questions = load_dataset(path=\"gaia-benchmark/GAIA\", name=\"2023_level1\")"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "e9ebf52c",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "gaia_index_dir = \"../../../.cache/huggingface/datasets/gaia-benchmark___gaia/2023_level1/0.0.1/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005\"\n",
146
+ "#gaia_index_dir = os.path.abspath(gaia_index_dir)\n",
147
+ "gaia_data_path = os.path.join(gaia_index_dir, \"gaia-validation.arrow\")"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "id": "101f8c73",
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# Temporal cell: wrote because of hugging face api calls limit\n",
158
+ "\n",
159
+ "import pyarrow.ipc as ipc\n",
160
+ "\n",
161
+ "with open(gaia_data_path, \"rb\") as f:\n",
162
+ " reader = ipc.RecordBatchStreamReader(f)\n",
163
+ " table = reader.read_all()\n",
164
+ "\n",
165
+ "gaia_df = table.to_pandas()\n"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "fe9a5e51",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "#gaia_questions = gaia_questions[\"validation\"] # Filter for dev purposes\n",
176
+ "#gaia_df = pd.DataFrame(gaia_questions)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "30c08c4d",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "gaia_df.head()"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "4826e755",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "# DEL\n",
197
+ "# temp filter to get chess images\n",
198
+ "gaia_df[gaia_df[\"file_path\"].map(lambda f: f.endswith(\"44.png\"))]"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "id": "f2d34bca",
204
+ "metadata": {},
205
+ "source": [
206
+ "---"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "0150df47",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "filetypes = {d[1].file_path.split(\".\")[-1] for d in gaia_df.iterrows()}\n",
217
+ "filetypes"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "id": "94422aae",
223
+ "metadata": {},
224
+ "source": [
225
+ "### Read Historical XPs"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "82dd866a",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "xp_paths = \"../data/agent_experiments/iterations/\""
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "5649c3c2",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "# Index all XPs\n",
246
+ "import os\n",
247
+ "import pandas as pd\n",
248
+ "\n",
249
+ "xp_dir = \"../data/agent_experiments/iterations\"\n",
250
+ "xp_list = []\n",
251
+ "# FIXME: from the 10th XP iteration, the sorted method is unuseful\n",
252
+ "\n",
253
+ "for i, xp_path in enumerate(sorted(os.listdir(xp_paths))): \n",
254
+ " xp_path = os.path.join(xp_dir, xp_path)\n",
255
+ " temp_xp_df = pd.read_csv(xp_path)\n",
256
+ " xp_list.append((i, temp_xp_df))\n",
257
+ " del temp_xp_df "
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "e62498d0",
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# DEL\n",
268
+ "xp[xp[\"Question\"] == xp_question][[\"is_correct\"]]"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "id": "5ca5b1fd",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "# Join latest XP result to each question\n",
279
+ "\n",
280
+ "index = gaia_df.index.tolist() # Index for all the questions\n",
281
+ "questions = gaia_df[\"Question\"].tolist()\n",
282
+ "answers = list()\n",
283
+ "\n",
284
+ "\n",
285
+ "def filter_condition(xp_data: list) -> bool:\n",
286
+ " i, xp = xp_data\n",
287
+ " for question in questions:\n",
288
+ " xp_addressed_questions = xp[\"Question\"].tolist()\n",
289
+ " if question in xp_addressed_questions:\n",
290
+ " return True\n",
291
+ " return False\n",
292
+ "\n",
293
+ "filtered_xps = list(filter(filter_condition, xp_list))\n",
294
+ "filtered_xps_reversed = filtered_xps\n",
295
+ "filtered_xps_reversed.reverse()\n",
296
+ "\n",
297
+ "# Join latest answer result (i.e. result of the latest xp)\n",
298
+ "\n",
299
+ "# Join latest answer result (i.e. result of the latest xp)\n",
300
+ "\n",
301
+ "for question in questions:\n",
302
+ " answer_result = 0 # Assume wrong answer by default\n",
303
+ " for i, xp in filtered_xps_reversed:\n",
304
+ " for xp_question in xp[\"Question\"].tolist():\n",
305
+ " \n",
306
+ " if question == xp_question:\n",
307
+ " answer_result_row = list(xp[xp[\"Question\"] == xp_question][\"is_correct\"])#.loc[0]\n",
308
+ " answer_result_temp = answer_result_row[0] \n",
309
+ " if answer_result_temp >= answer_result:\n",
310
+ " answer_result = answer_result_temp\n",
311
+ " del xp\n",
312
+ " answers.append(answer_result)\n",
313
+ "\n",
314
+ "# Join answers\n",
315
+ "historical_xp_results = gaia_df.copy()\n",
316
+ "historical_xp_results[\"is_correct\"] = pd.Series(answers)\n",
317
+ "del answers, questions"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "id": "ca756f04",
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "historical_xp_results"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "id": "fd11615a",
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "historical_xp_results.is_correct.mean()"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "markdown",
342
+ "id": "d3ddaede",
343
+ "metadata": {},
344
+ "source": [
345
+ "Let's study first which are the most common wrong tasks. I.e. How can we increase accuracy with a single next step (e.g. implement a new tool, modify sys message, etc.)"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": null,
351
+ "id": "a0613fc4",
352
+ "metadata": {},
353
+ "outputs": [],
354
+ "source": [
355
+ "wrong_ans_df = historical_xp_results[historical_xp_results[\"is_correct\"]==0]\n",
356
+ "wrong_ans_df[\"fp_extension\"] = wrong_ans_df[\"file_path\"].map(lambda path: path.split(\".\")[-1])\n",
357
+ "wrong_ans_df"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "id": "43925ec4",
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": [
367
+ "# Summary\n",
368
+ "wrong_ans_df.groupby(\"fp_extension\")[\"is_correct\"].count()"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "id": "1f77dabb",
374
+ "metadata": {},
375
+ "source": [
376
+ "The vast majority of remaining tasks do not include files to read. So we should study them first"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "1379da15",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "wrong_ans_df_no_extension = wrong_ans_df[wrong_ans_df[\"file_path\"].map(lambda fp: len(fp)==0)]"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "id": "5e40e190",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "wrong_ans_df_no_extension"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "markdown",
401
+ "id": "ad4ce0bc",
402
+ "metadata": {},
403
+ "source": [
404
+ "### Identify Image-like tasks"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": null,
410
+ "id": "387e0aa7",
411
+ "metadata": {},
412
+ "outputs": [],
413
+ "source": [
414
+ "img_filetypes = ('png', 'jpg')"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": null,
420
+ "id": "0ec59bb6",
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "image_tasks_df = gaia_df[gaia_df[\"file_path\"].apply(lambda row: row.split(\".\")[-1] in img_filetypes)]\n",
425
+ "image_tasks_df"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": null,
431
+ "id": "180294ba",
432
+ "metadata": {},
433
+ "outputs": [],
434
+ "source": [
435
+ "image_tasks_df.iloc[0]"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "code",
440
+ "execution_count": null,
441
+ "id": "49d48c65",
442
+ "metadata": {},
443
+ "outputs": [],
444
+ "source": [
445
+ "chess_task = image_tasks_df.iloc[0]"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "id": "73d7605e",
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "os.path.isfile(chess_task.file_path)"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": null,
461
+ "id": "672fec32",
462
+ "metadata": {},
463
+ "outputs": [],
464
+ "source": [
465
+ "import cv2\n",
466
+ "import matplotlib.pyplot as plt\n",
467
+ "\n",
468
+ "image_path = chess_task.file_path\n",
469
+ "image_bgr = cv2.imread(image_path)\n",
470
+ "\n",
471
+ "image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)\n",
472
+ "\n",
473
+ "plt.imshow(image_rgb)\n",
474
+ "plt.title(\"Chess Image\")\n",
475
+ "plt.axis(\"off\")\n",
476
+ "plt.show()\n"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "markdown",
481
+ "id": "a1d60868",
482
+ "metadata": {},
483
+ "source": [
484
+ "### Chess Tool"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "markdown",
489
+ "id": "a0738cd7",
490
+ "metadata": {},
491
+ "source": [
492
+ "In this section, our methodology is to \n",
493
+ "\n",
494
+ "1. Use a CV model to get the FEN position of a 2D chess image\n",
495
+ "2. Use another model (likely Stockfish) to predict the best move from a FEN position"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "markdown",
500
+ "id": "0c16f891",
501
+ "metadata": {},
502
+ "source": [
503
+ "#### Tool POC"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "markdown",
508
+ "id": "3d531e68",
509
+ "metadata": {},
510
+ "source": [
511
+ "To extract the FEN position from the board image, we used [board_to_fen](https://github.com/mcdominik/board_to_fen)"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": null,
517
+ "id": "de3a51d4",
518
+ "metadata": {},
519
+ "outputs": [],
520
+ "source": [
521
+ "import cairosvg\n",
522
+ "from io import BytesIO\n",
523
+ "import numpy as np"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": null,
529
+ "id": "01860727",
530
+ "metadata": {},
531
+ "outputs": [],
532
+ "source": [
533
+ "from PIL import Image\n",
534
+ "from board_to_fen.predict import get_fen_from_image\n",
535
+ "\n",
536
+ "img = Image.open(chess_task.file_path)\n",
537
+ "fen = get_fen_from_image(img, black_view=True)"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": null,
543
+ "id": "2e07228d",
544
+ "metadata": {},
545
+ "outputs": [],
546
+ "source": [
547
+ "fen"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "markdown",
552
+ "id": "6819c048",
553
+ "metadata": {},
554
+ "source": [
555
+ "We observed this position does not have enough information about the current game state, so we need to post-process it:"
556
+ ]
557
+ },
558
+ {
559
+ "cell_type": "code",
560
+ "execution_count": null,
561
+ "id": "02ba9e63",
562
+ "metadata": {},
563
+ "outputs": [],
564
+ "source": [
565
+ "fen_cleaned = fen + \" b - - 0 1\""
566
+ ]
567
+ },
568
+ {
569
+ "cell_type": "code",
570
+ "execution_count": null,
571
+ "id": "2ac810f0",
572
+ "metadata": {},
573
+ "outputs": [],
574
+ "source": [
575
+ "fen_cleaned"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "markdown",
580
+ "id": "38f81960",
581
+ "metadata": {},
582
+ "source": [
583
+ "Looking forward, we will need to warn the AI agent to post-process the FEN before passing it to Stockfish. However, we can validate the predicted FEN "
584
+ ]
585
+ },
586
+ {
587
+ "cell_type": "code",
588
+ "execution_count": null,
589
+ "id": "e6365917",
590
+ "metadata": {},
591
+ "outputs": [],
592
+ "source": [
593
+ "# try out with pychess\n",
594
+ "\n",
595
+ "import chess as c\n",
596
+ "import chess.engine as ce\n",
597
+ "\n",
598
+ "board = c.Board(fen_cleaned)"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": null,
604
+ "id": "7bc3963d",
605
+ "metadata": {},
606
+ "outputs": [],
607
+ "source": [
608
+ "board"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": null,
614
+ "id": "96037282",
615
+ "metadata": {},
616
+ "outputs": [],
617
+ "source": [
618
+ "svg_data = c.svg.board(board=board, flipped=True)\n",
619
+ "png_data = BytesIO()\n",
620
+ "cairosvg.svg2png(bytestring=svg_data.encode('utf-8'), write_to=png_data)\n",
621
+ "png_data.seek(0)\n",
622
+ "img_array = np.asarray(bytearray(png_data.read()), dtype=np.uint8)\n",
623
+ "img = cv2.imdecode(img_array, cv2.COLOR_BGR2RGB)"
624
+ ]
625
+ },
626
+ {
627
+ "cell_type": "code",
628
+ "execution_count": null,
629
+ "id": "0fbe36a1",
630
+ "metadata": {},
631
+ "outputs": [],
632
+ "source": [
633
+ "fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))\n",
634
+ "\n",
635
+ "ax[0].imshow(cv2.imread(chess_task.file_path))\n",
636
+ "ax[0].axis(\"off\")\n",
637
+ "ax[0].set_title(\"Original Chess Board\", fontsize=18)\n",
638
+ "\n",
639
+ "ax[1].imshow(img)\n",
640
+ "ax[1].axis(\"off\")\n",
641
+ "ax[1].set_title(f\"Predicted Chess Board\\nFEN: {fen_cleaned}\", fontsize=18)\n",
642
+ "plt.show()"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "markdown",
647
+ "id": "a1037165",
648
+ "metadata": {},
649
+ "source": [
650
+ "Once we extracted the FEN chess position, we pass it to Stockfish in order to get the best next move"
651
+ ]
652
+ },
653
+ {
654
+ "cell_type": "code",
655
+ "execution_count": null,
656
+ "id": "67f340fb",
657
+ "metadata": {},
658
+ "outputs": [],
659
+ "source": [
660
+ "# Import Stockfish \n",
661
+ "stockfish_path = \"../external/chess-engines/stockfish/stockfish-ubuntu-x86-64-avx2\"\n",
662
+ "chess_engine = ce.SimpleEngine.popen_uci(stockfish_path)"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": null,
668
+ "id": "901f4ad7",
669
+ "metadata": {},
670
+ "outputs": [],
671
+ "source": [
672
+ "# Predict next best move \n",
673
+ "stockfish_player = chess_engine.play(board=board, limit=ce.Limit(time=10))\n",
674
+ "best_move_uci = stockfish_player.move\n",
675
+ "best_move = board.san(move=best_move_uci) # Convert from Universal Chess interface to Standard Algebraic Notation\n",
676
+ "best_move"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "id": "8b2bffbb",
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "print(\n",
687
+ " \"Results\\n\",\n",
688
+ " \"=\" * 30,\n",
689
+ " f\"\\nPredicted best next move: {best_move}\",\n",
690
+ " f\"\\nCorrect Answer: {chess_task['Final answer']}\"\n",
691
+ " f\"\\nIs the prediction correct?: {'yes' if best_move==chess_task['Final answer'] else 'no'}\"\n",
692
+ " )"
693
+ ]
694
+ },
695
+ {
696
+ "cell_type": "markdown",
697
+ "id": "7d56f6c1",
698
+ "metadata": {},
699
+ "source": [
700
+ "There we go! our model finds the correct answer"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "markdown",
705
+ "id": "4eb6d948",
706
+ "metadata": {},
707
+ "source": [
708
+ "As a bonus, we might consider the case where the board view is set from the black or white pieces' perspective, so it is worth considering extracting text from the board image"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": null,
714
+ "id": "d48f84bd",
715
+ "metadata": {},
716
+ "outputs": [],
717
+ "source": [
718
+ "import numpy as np\n",
719
+ "import cv2 \n",
720
+ "import matplotlib.pyplot as plt\n",
721
+ "\n",
722
+ "chess_img_path = \"/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png\"\n",
723
+ "cropped_chess_board_path = \"../data/images/cropped_chess_board.png\""
724
+ ]
725
+ },
726
+ {
727
+ "cell_type": "markdown",
728
+ "id": "df75df03",
729
+ "metadata": {},
730
+ "source": [
731
+ "Our method is to crop the bottom-left chess square and extract the text. \n",
732
+ "1. If the text is a1 -> The board has white view\n",
733
+ "2. If the text is h8 -> The board has black view"
734
+ ]
735
+ },
736
+ {
737
+ "cell_type": "code",
738
+ "execution_count": null,
739
+ "id": "c5e520a9",
740
+ "metadata": {},
741
+ "outputs": [],
742
+ "source": [
743
+ "# Read Chess Board Img and Crop the bottom-left \n",
744
+ "img_bgr = cv2.imread(chess_img_path)\n",
745
+ "img_bgr = cv2.imread(chess_img_path)\n",
746
+ "img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)\n",
747
+ "\n",
748
+ "img_np = np.array(img_rgb)\n",
749
+ "img_np_shape = img_np.shape\n",
750
+ "img_rows, img_cols = img_np_shape[:2]\n",
751
+ "img_cropped = img_np[img_rows // 8 * 7:, :img_cols // 8 * 1, :]"
752
+ ]
753
+ },
754
+ {
755
+ "cell_type": "code",
756
+ "execution_count": null,
757
+ "id": "a332c910",
758
+ "metadata": {},
759
+ "outputs": [],
760
+ "source": [
761
+ "fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(20, 10))\n",
762
+ "\n",
763
+ "ax[0].imshow(img_rgb)\n",
764
+ "ax[1].imshow(img_cropped)\n",
765
+ "\n",
766
+ "ax[0].set_title(\"Original Image\")\n",
767
+ "ax[1].set_title(\"Image Cropped\")\n",
768
+ "\n",
769
+ "ax[0].axis(\"off\")\n",
770
+ "ax[1].axis(\"off\")\n",
771
+ "\n",
772
+ "fig.suptitle(\"Crop image to identify board orientation\", y=0.92, fontsize=15)"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "markdown",
777
+ "id": "8723e6fd",
778
+ "metadata": {},
779
+ "source": [
780
+ "Once we crop the board image, we might streamline a simple OCR model to extract the board text"
781
+ ]
782
+ },
783
+ {
784
+ "cell_type": "code",
785
+ "execution_count": null,
786
+ "id": "d60d5248",
787
+ "metadata": {},
788
+ "outputs": [],
789
+ "source": [
790
+ "import easyocr\n",
791
+ "\n",
792
+ "reader = easyocr.Reader(['en'], gpu=False) \n",
793
+ "\n",
794
+ "results = reader.readtext(cropped_chess_board_path)\n",
795
+ "results.reverse()\n",
796
+ "\n",
797
+ "# Grab text\n",
798
+ "left_bottom_cell = \"\"\n",
799
+ "for _, text_temp, _ in results:\n",
800
+ " left_bottom_cell += text_temp\n",
801
+ " \n",
802
+ "print(f\"The left-bottom chess board square is {left_bottom_cell}\")"
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "markdown",
807
+ "id": "54fadcf2",
808
+ "metadata": {},
809
+ "source": [
810
+ "---"
811
+ ]
812
+ },
813
+ {
814
+ "cell_type": "markdown",
815
+ "id": "57b579ef",
816
+ "metadata": {},
817
+ "source": [
818
+ "Let's study where does our current Agent fail, especially on audio-like questions"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": null,
824
+ "id": "6b7a1444",
825
+ "metadata": {},
826
+ "outputs": [],
827
+ "source": [
828
+ "run = False"
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "60d3af35",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "gaia_df"
839
+ ]
840
+ }
841
+ ],
842
+ "metadata": {
843
+ "kernelspec": {
844
+ "display_name": "chappie-CHLGiFC_-py3.11",
845
+ "language": "python",
846
+ "name": "python3"
847
+ },
848
+ "language_info": {
849
+ "codemirror_mode": {
850
+ "name": "ipython",
851
+ "version": 3
852
+ },
853
+ "file_extension": ".py",
854
+ "mimetype": "text/x-python",
855
+ "name": "python",
856
+ "nbconvert_exporter": "python",
857
+ "pygments_lexer": "ipython3",
858
+ "version": "3.11.13"
859
+ }
860
+ },
861
+ "nbformat": 4,
862
+ "nbformat_minor": 5
863
+ }
notebooks/tools_poc/template.ipynb ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bf9121e7",
6
+ "metadata": {},
7
+ "source": [
8
+ "In this Notebook we create and sketch tools for the Agent. Building proofs of concept (PoC)"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "c29d8861",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Setup"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "94d01490",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Libraries\n",
27
+ "\n",
28
+ "import os"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "f09b59ab",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# Local Modules\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "f32d7959",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "# import Whisper\n",
49
+ "\n",
50
+ "import whisper"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "5e131840",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "# Import Agent\n",
61
+ "\n",
62
+ "os.sys.path.append(\"../src\")\n",
63
+ "os.sys.path.append(\"../src/agents\")\n",
64
+ "\n",
65
+ "import react # My AI assistant"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": null,
71
+ "id": "dc02cd0f",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# DEBUG\n",
76
+ "\n",
77
+ "os.sys.path.append(\"../src/tools\")"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "id": "3ca7a88a",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "\n",
88
+ "# Import GAIA Questions\n",
89
+ "from datasets import load_dataset\n"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "a46c1c67",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "\n",
100
+ "import pandas as pd\n"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "5a5cf241",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "from dotenv import load_dotenv\n",
111
+ "from huggingface_hub import snapshot_download, login"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "64a3d4d1",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "# Load Hugging face credentials\n",
122
+ "\n",
123
+ "#load_dotenv()\n",
124
+ "#login(os.getenv(key=\"HF_TOKEN_CHAPPIE\")) # Replace with your hf api key name\n"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "id": "fda05cb1",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "#gaia_questions_path = snapshot_download(repo_id=\"gaia-benchmark/GAIA\", repo_type=\"dataset\")\n",
135
+ "#gaia_questions = load_dataset(path=\"gaia-benchmark/GAIA\", name=\"2023_level1\")"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "e9ebf52c",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "gaia_index_dir = \"../../../.cache/huggingface/datasets/gaia-benchmark___gaia/2023_level1/0.0.1/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005\"\n",
146
+ "#gaia_index_dir = os.path.abspath(gaia_index_dir)\n",
147
+ "gaia_data_path = os.path.join(gaia_index_dir, \"gaia-validation.arrow\")"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "id": "101f8c73",
154
+ "metadata": {},
155
+ "outputs": [],
156
+ "source": [
157
+ "# Temporal cell: wrote because of hugging face api calls limit\n",
158
+ "\n",
159
+ "import pyarrow.ipc as ipc\n",
160
+ "\n",
161
+ "with open(gaia_data_path, \"rb\") as f:\n",
162
+ " reader = ipc.RecordBatchStreamReader(f)\n",
163
+ " table = reader.read_all()\n",
164
+ "\n",
165
+ "gaia_df = table.to_pandas()\n"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "fe9a5e51",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "#gaia_questions = gaia_questions[\"validation\"] # Filter for dev purposes\n",
176
+ "#gaia_df = pd.DataFrame(gaia_questions)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "30c08c4d",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "gaia_df.head()"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "4826e755",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "# DEL\n",
197
+ "# temp filter to get chess images\n",
198
+ "gaia_df[gaia_df[\"file_path\"].map(lambda f: f.endswith(\"44.png\"))]"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "markdown",
203
+ "id": "f2d34bca",
204
+ "metadata": {},
205
+ "source": [
206
+ "---"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "0150df47",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "filetypes = {d[1].file_path.split(\".\")[-1] for d in gaia_df.iterrows()}\n",
217
+ "filetypes"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "markdown",
222
+ "id": "c41c99d7",
223
+ "metadata": {},
224
+ "source": [
225
+ "### Read Historical XPs"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "82dd866a",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "xp_paths = \"../data/agent_experiments/iterations/\""
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "5649c3c2",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "# Index all XPs\n",
246
+ "import os\n",
247
+ "import pandas as pd\n",
248
+ "\n",
249
+ "xp_dir = \"../data/agent_experiments/iterations\"\n",
250
+ "xp_list = []\n",
251
+ "# FIXME: from the 10th XP iteration, the sorted method is unuseful\n",
252
+ "\n",
253
+ "for i, xp_path in enumerate(sorted(os.listdir(xp_paths))): \n",
254
+ " xp_path = os.path.join(xp_dir, xp_path)\n",
255
+ " temp_xp_df = pd.read_csv(xp_path)\n",
256
+ " xp_list.append((i, temp_xp_df))\n",
257
+ " del temp_xp_df "
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "e62498d0",
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# DEL\n",
268
+ "xp[xp[\"Question\"] == xp_question][[\"is_correct\"]]"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "id": "5ca5b1fd",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "# Join latest XP result to each question\n",
279
+ "\n",
280
+ "index = gaia_df.index.tolist() # Index for all the questions\n",
281
+ "questions = gaia_df[\"Question\"].tolist()\n",
282
+ "answers = list()\n",
283
+ "\n",
284
+ "\n",
285
+ "def filter_condition(xp_data: list) -> bool:\n",
286
+ " i, xp = xp_data\n",
287
+ " for question in questions:\n",
288
+ " xp_addressed_questions = xp[\"Question\"].tolist()\n",
289
+ " if question in xp_addressed_questions:\n",
290
+ " return True\n",
291
+ " return False\n",
292
+ "\n",
293
+ "filtered_xps = list(filter(filter_condition, xp_list))\n",
294
+ "filtered_xps_reversed = filtered_xps\n",
295
+ "filtered_xps_reversed.reverse()\n",
296
+ "\n",
297
+ "# Join latest answer result (i.e. result of the latest xp)\n",
298
+ "\n",
299
+ "# Join latest answer result (i.e. result of the latest xp)\n",
300
+ "\n",
301
+ "for question in questions:\n",
302
+ " answer_result = 0 # Assume wrong answer by default\n",
303
+ " for i, xp in filtered_xps_reversed:\n",
304
+ " for xp_question in xp[\"Question\"].tolist():\n",
305
+ " \n",
306
+ " if question == xp_question:\n",
307
+ " answer_result_row = list(xp[xp[\"Question\"] == xp_question][\"is_correct\"])#.loc[0]\n",
308
+ " answer_result_temp = answer_result_row[0] \n",
309
+ " if answer_result_temp >= answer_result:\n",
310
+ " answer_result = answer_result_temp\n",
311
+ " del xp\n",
312
+ " answers.append(answer_result)\n",
313
+ "\n",
314
+ "# Join answers\n",
315
+ "historical_xp_results = gaia_df.copy()\n",
316
+ "historical_xp_results[\"is_correct\"] = pd.Series(answers)\n",
317
+ "del answers, questions"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "id": "ca756f04",
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "historical_xp_results"
328
+ ]
329
+ },
330
+ {
331
+ "cell_type": "code",
332
+ "execution_count": null,
333
+ "id": "fd11615a",
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "historical_xp_results.is_correct.mean()"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "markdown",
342
+ "id": "d3ddaede",
343
+ "metadata": {},
344
+ "source": [
345
+ "Let's study first which are the most common wrong tasks. I.e. How can we increase accuracy with a single next step (e.g. implement a new tool, modify sys message, etc.)"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": null,
351
+ "id": "a0613fc4",
352
+ "metadata": {},
353
+ "outputs": [],
354
+ "source": [
355
+ "wrong_ans_df = historical_xp_results[historical_xp_results[\"is_correct\"]==0]\n",
356
+ "wrong_ans_df[\"fp_extension\"] = wrong_ans_df[\"file_path\"].map(lambda path: path.split(\".\")[-1])\n",
357
+ "wrong_ans_df"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "id": "43925ec4",
364
+ "metadata": {},
365
+ "outputs": [],
366
+ "source": [
367
+ "# Summary\n",
368
+ "wrong_ans_df.groupby(\"fp_extension\")[\"is_correct\"].count()"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "id": "1f77dabb",
374
+ "metadata": {},
375
+ "source": [
376
+ "The vast majority of remaining tasks do not include files to read. So we should study them first"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "1379da15",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "wrong_ans_df_no_extension = wrong_ans_df[wrong_ans_df[\"file_path\"].map(lambda fp: len(fp)==0)]"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "id": "5e40e190",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "wrong_ans_df_no_extension"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "markdown",
401
+ "id": "d25a6fd5",
402
+ "metadata": {},
403
+ "source": [
404
+ "### Build Tool"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "markdown",
409
+ "id": "116ee0cb",
410
+ "metadata": {},
411
+ "source": [
412
+ "#### 1. Filter tasks to solve with the tool"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "95a3b948",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": []
422
+ },
423
+ {
424
+ "cell_type": "markdown",
425
+ "id": "6a68fb6e",
426
+ "metadata": {},
427
+ "source": [
428
+ "#### 2. Tool PoC"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "id": "2a9a59a6",
435
+ "metadata": {},
436
+ "outputs": [],
437
+ "source": []
438
+ },
439
+ {
440
+ "cell_type": "markdown",
441
+ "id": "30a22395",
442
+ "metadata": {},
443
+ "source": [
444
+ "#### 3. Test Tool"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "markdown",
449
+ "id": "069ffda9",
450
+ "metadata": {},
451
+ "source": []
452
+ }
453
+ ],
454
+ "metadata": {
455
+ "kernelspec": {
456
+ "display_name": "chappie-CHLGiFC_-py3.11",
457
+ "language": "python",
458
+ "name": "python3"
459
+ },
460
+ "language_info": {
461
+ "codemirror_mode": {
462
+ "name": "ipython",
463
+ "version": 3
464
+ },
465
+ "file_extension": ".py",
466
+ "mimetype": "text/x-python",
467
+ "name": "python",
468
+ "nbconvert_exporter": "python",
469
+ "pygments_lexer": "ipython3",
470
+ "version": "3.11.13"
471
+ }
472
+ },
473
+ "nbformat": 4,
474
+ "nbformat_minor": 5
475
+ }
notebooks/tools_poc/web_search.ipynb ADDED
@@ -0,0 +1,2011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "bf9121e7",
6
+ "metadata": {},
7
+ "source": [
8
+ "In this Notebook we create and sketch tools for the Agent. Building proofs of concept (PoC)"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "c29d8861",
14
+ "metadata": {},
15
+ "source": [
16
+ "### Setup"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 1,
22
+ "id": "94d01490",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "# Libraries\n",
27
+ "\n",
28
+ "import os"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 2,
34
+ "id": "f32d7959",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "# import Whisper\n",
39
+ "\n",
40
+ "import whisper"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "id": "5e131840",
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "name": "stderr",
51
+ "output_type": "stream",
52
+ "text": [
53
+ "Langfuse client is disabled since no public_key was provided as a parameter or environment variable 'LANGFUSE_PUBLIC_KEY'. See our docs: https://langfuse.com/docs/sdk/python/low-level-sdk#initialize-client\n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "# Import Agent\n",
59
+ "\n",
60
+ "os.sys.path.append(\"../../src\")\n",
61
+ "os.sys.path.append(\"../../src/agents\")\n",
62
+ "\n",
63
+ "import react # My AI assistant"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 4,
69
+ "id": "dc02cd0f",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "# DEBUG\n",
74
+ "\n",
75
+ "os.sys.path.append(\"../../src/tools\")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 5,
81
+ "id": "3ca7a88a",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stderr",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "/home/santiagoal/.cache/pypoetry/virtualenvs/chappie-CHLGiFC_-py3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
89
+ " from .autonotebook import tqdm as notebook_tqdm\n"
90
+ ]
91
+ }
92
+ ],
93
+ "source": [
94
+ "\n",
95
+ "# Import GAIA Questions\n",
96
+ "from datasets import load_dataset\n"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 6,
102
+ "id": "a46c1c67",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "\n",
107
+ "import pandas as pd\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 7,
113
+ "id": "5a5cf241",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "from dotenv import load_dotenv\n",
118
+ "from huggingface_hub import snapshot_download, login"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": 8,
124
+ "id": "64a3d4d1",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "# Load Hugging face credentials\n",
129
+ "\n",
130
+ "#load_dotenv()\n",
131
+ "#login(os.getenv(key=\"HF_TOKEN_CHAPPIE\")) # Replace with your hf api key name\n"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 9,
137
+ "id": "fda05cb1",
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": [
141
+ "#gaia_questions_path = snapshot_download(repo_id=\"gaia-benchmark/GAIA\", repo_type=\"dataset\")\n",
142
+ "#gaia_questions = load_dataset(path=\"gaia-benchmark/GAIA\", name=\"2023_level1\")"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": 10,
148
+ "id": "e9ebf52c",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "gaia_index_dir = \"../../../../.cache/huggingface/datasets/gaia-benchmark___gaia/2023_level1/0.0.1/ec492fe4320ee795b1aed6bb46229c5f693226b0f1316347501c24b4baeee005\"\n",
153
+ "#gaia_index_dir = os.path.abspath(gaia_index_dir)\n",
154
+ "gaia_data_path = os.path.join(gaia_index_dir, \"gaia-validation.arrow\")"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": 11,
160
+ "id": "101f8c73",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": [
164
+ "# Temporal cell: wrote because of hugging face api calls limit\n",
165
+ "\n",
166
+ "import pyarrow.ipc as ipc\n",
167
+ "\n",
168
+ "with open(gaia_data_path, \"rb\") as f:\n",
169
+ " reader = ipc.RecordBatchStreamReader(f)\n",
170
+ " table = reader.read_all()\n",
171
+ "\n",
172
+ "gaia_df = table.to_pandas()\n"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 12,
178
+ "id": "fe9a5e51",
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "#gaia_questions = gaia_questions[\"validation\"] # Filter for dev purposes\n",
183
+ "#gaia_df = pd.DataFrame(gaia_questions)"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 13,
189
+ "id": "30c08c4d",
190
+ "metadata": {},
191
+ "outputs": [
192
+ {
193
+ "data": {
194
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
195
+ "columns": [
196
+ {
197
+ "name": "index",
198
+ "rawType": "int64",
199
+ "type": "integer"
200
+ },
201
+ {
202
+ "name": "task_id",
203
+ "rawType": "object",
204
+ "type": "string"
205
+ },
206
+ {
207
+ "name": "Question",
208
+ "rawType": "object",
209
+ "type": "string"
210
+ },
211
+ {
212
+ "name": "Level",
213
+ "rawType": "object",
214
+ "type": "string"
215
+ },
216
+ {
217
+ "name": "Final answer",
218
+ "rawType": "object",
219
+ "type": "string"
220
+ },
221
+ {
222
+ "name": "file_name",
223
+ "rawType": "object",
224
+ "type": "string"
225
+ },
226
+ {
227
+ "name": "file_path",
228
+ "rawType": "object",
229
+ "type": "string"
230
+ },
231
+ {
232
+ "name": "Annotator Metadata",
233
+ "rawType": "object",
234
+ "type": "unknown"
235
+ }
236
+ ],
237
+ "ref": "46594f81-f33f-486c-8c5c-005503176990",
238
+ "rows": [
239
+ [
240
+ "0",
241
+ "e1fc63a2-da7a-432f-be78-7c4a95598703",
242
+ "If Eliud Kipchoge could maintain his record-making marathon pace indefinitely, how many thousand hours would it take him to run the distance between the Earth and the Moon its closest approach? Please use the minimum perigee value on the Wikipedia page for the Moon when carrying out your calculation. Round your result to the nearest 1000 hours and do not use any comma separators if necessary.",
243
+ "1",
244
+ "17",
245
+ "",
246
+ "",
247
+ "{'Steps': '1. Googled Eliud Kipchoge marathon pace to find 4min 37sec/mile\\n2. Converted into fractions of hours.\\n3. Found moon periapsis in miles (225,623 miles).\\n4. Multiplied the two to find the number of hours and rounded to the nearest 100 hours.', 'Number of steps': '4', 'How long did this take?': '20 Minutes', 'Tools': '1. A web browser.\\n2. A search engine.\\n3. A calculator.', 'Number of tools': '3'}"
248
+ ],
249
+ [
250
+ "1",
251
+ "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
252
+ "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
253
+ "1",
254
+ "3",
255
+ "",
256
+ "",
257
+ "{'Steps': '1. I did a search for Mercedes Sosa\\n2. I went to the Wikipedia page for her\\n3. I scrolled down to \"Studio albums\"\\n4. I counted the ones between 2000 and 2009', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. web browser\\n2. google search', 'Number of tools': '2'}"
258
+ ],
259
+ [
260
+ "2",
261
+ "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4",
262
+ "Here's a fun riddle that I think you'll enjoy.\n\nYou have been selected to play the final round of the hit new game show \"Pick That Ping-Pong\". In this round, you will be competing for a large cash prize. Your job will be to pick one of several different numbered ping-pong balls, and then the game will commence. The host describes how the game works.\n\nA device consisting of a winding clear ramp and a series of pistons controls the outcome of the game. The ramp feeds balls onto a platform. The platform has room for three ping-pong balls at a time. The three balls on the platform are each aligned with one of three pistons. At each stage of the game, one of the three pistons will randomly fire, ejecting the ball it strikes. If the piston ejects the ball in the first position on the platform the balls in the second and third position on the platform each advance one space, and the next ball on the ramp advances to the third position. If the piston ejects the ball in the second position, the ball in the first position is released and rolls away, the ball in the third position advances two spaces to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform. If the piston ejects the ball in the third position, the ball in the first position is released and rolls away, the ball in the second position advances one space to occupy the first position, and the next two balls on the ramp advance to occupy the second and third positions on the platform.\n\nThe ramp begins with 100 numbered ping-pong balls, arranged in ascending order from 1 to 100. The host activates the machine and the first three balls, numbered 1, 2, and 3, advance to the platform. Before the random firing of the pistons begins, you are asked which of the 100 balls you would like to pick. 
If your pick is ejected by one of the pistons, you win the grand prize, $10,000.\n\nWhich ball should you choose to maximize your odds of winning the big prize? Please provide your answer as the number of the ball selected.",
263
+ "1",
264
+ "3",
265
+ "",
266
+ "",
267
+ "{'Steps': 'Step 1: Evaluate the problem statement provided in my user\\'s prompt\\nStep 2: Consider the probability of any ball on the platform earning the prize.\\nStep 3: Evaluate the ball in position one. The probability of it earning the prize, P1, is 1/3\\nStep 4: Using a calculator, evaluate the ball in position two. The probability of it earning the prize, P2, is the difference between 1 and the product of the complementary probabilities for each trial\\nP2 = 1 - (2/3)(2/3)\\nP2 = 5/9\\nStep 5: Using a calculator, evaluate the ball in position three. The probability of it earning the prize, P3, is the difference between 1 and the product of the complementary probabilities for each trial\\nP3 = 1 - (2/3)(2/3)(2/3)\\nP3 = 19/27\\nStep 6: Consider the possible outcomes of numbers higher than 3.\\nStep 7: For each trial, either 1 or 2 balls from the ramp will advance to the platform. For any given selection, there is a 50% chance that the ball advances to position 2 or position 3.\\nStep 8: As position three holds the highest chance of earning the prize, select the only ball known to occupy position three with certainty, ball 3.\\nStep 9: Report the correct answer to my user, \"3\"', 'Number of steps': '9', 'How long did this take?': '1 minute', 'Tools': 'None', 'Number of tools': '0'}"
268
+ ],
269
+ [
270
+ "3",
271
+ "5d0080cb-90d7-4712-bc33-848150e917d3",
272
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"",
273
+ "1",
274
+ "0.1777",
275
+ "",
276
+ "",
277
+ "{'Steps': '1. Searched \\'\"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"\\' on Google.\\n2. Opened \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\" at https://journals.le.ac.uk/ojs1/index.php/jist/article/view/733.\\n3. Clicked \"PDF\".\\n4. Found the calculations for the volume of the fish bag and noted them.', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. PDF access', 'Number of tools': '3'}"
278
+ ],
279
+ [
280
+ "4",
281
+ "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
282
+ "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
283
+ "1",
284
+ "3",
285
+ "",
286
+ "",
287
+ "{'Steps': '1. Navigate to the YouTube link.\\n2. Watch the video to see the highest number of bird species.\\n3. Note the number.', 'Number of steps': '3', 'How long did this take?': '3 minutes', 'Tools': '1. Web browser\\n2. Video parsing', 'Number of tools': '2'}"
288
+ ]
289
+ ],
290
+ "shape": {
291
+ "columns": 7,
292
+ "rows": 5
293
+ }
294
+ },
295
+ "text/html": [
296
+ "<div>\n",
297
+ "<style scoped>\n",
298
+ " .dataframe tbody tr th:only-of-type {\n",
299
+ " vertical-align: middle;\n",
300
+ " }\n",
301
+ "\n",
302
+ " .dataframe tbody tr th {\n",
303
+ " vertical-align: top;\n",
304
+ " }\n",
305
+ "\n",
306
+ " .dataframe thead th {\n",
307
+ " text-align: right;\n",
308
+ " }\n",
309
+ "</style>\n",
310
+ "<table border=\"1\" class=\"dataframe\">\n",
311
+ " <thead>\n",
312
+ " <tr style=\"text-align: right;\">\n",
313
+ " <th></th>\n",
314
+ " <th>task_id</th>\n",
315
+ " <th>Question</th>\n",
316
+ " <th>Level</th>\n",
317
+ " <th>Final answer</th>\n",
318
+ " <th>file_name</th>\n",
319
+ " <th>file_path</th>\n",
320
+ " <th>Annotator Metadata</th>\n",
321
+ " </tr>\n",
322
+ " </thead>\n",
323
+ " <tbody>\n",
324
+ " <tr>\n",
325
+ " <th>0</th>\n",
326
+ " <td>e1fc63a2-da7a-432f-be78-7c4a95598703</td>\n",
327
+ " <td>If Eliud Kipchoge could maintain his record-ma...</td>\n",
328
+ " <td>1</td>\n",
329
+ " <td>17</td>\n",
330
+ " <td></td>\n",
331
+ " <td></td>\n",
332
+ " <td>{'Steps': '1. Googled Eliud Kipchoge marathon ...</td>\n",
333
+ " </tr>\n",
334
+ " <tr>\n",
335
+ " <th>1</th>\n",
336
+ " <td>8e867cd7-cff9-4e6c-867a-ff5ddc2550be</td>\n",
337
+ " <td>How many studio albums were published by Merce...</td>\n",
338
+ " <td>1</td>\n",
339
+ " <td>3</td>\n",
340
+ " <td></td>\n",
341
+ " <td></td>\n",
342
+ " <td>{'Steps': '1. I did a search for Mercedes Sosa...</td>\n",
343
+ " </tr>\n",
344
+ " <tr>\n",
345
+ " <th>2</th>\n",
346
+ " <td>ec09fa32-d03f-4bf8-84b0-1f16922c3ae4</td>\n",
347
+ " <td>Here's a fun riddle that I think you'll enjoy....</td>\n",
348
+ " <td>1</td>\n",
349
+ " <td>3</td>\n",
350
+ " <td></td>\n",
351
+ " <td></td>\n",
352
+ " <td>{'Steps': 'Step 1: Evaluate the problem statem...</td>\n",
353
+ " </tr>\n",
354
+ " <tr>\n",
355
+ " <th>3</th>\n",
356
+ " <td>5d0080cb-90d7-4712-bc33-848150e917d3</td>\n",
357
+ " <td>What was the volume in m^3 of the fish bag tha...</td>\n",
358
+ " <td>1</td>\n",
359
+ " <td>0.1777</td>\n",
360
+ " <td></td>\n",
361
+ " <td></td>\n",
362
+ " <td>{'Steps': '1. Searched '\"Can Hiccup Supply Eno...</td>\n",
363
+ " </tr>\n",
364
+ " <tr>\n",
365
+ " <th>4</th>\n",
366
+ " <td>a1e91b78-d3d8-4675-bb8d-62741b4b68a6</td>\n",
367
+ " <td>In the video https://www.youtube.com/watch?v=L...</td>\n",
368
+ " <td>1</td>\n",
369
+ " <td>3</td>\n",
370
+ " <td></td>\n",
371
+ " <td></td>\n",
372
+ " <td>{'Steps': '1. Navigate to the YouTube link.\n",
373
+ "2....</td>\n",
374
+ " </tr>\n",
375
+ " </tbody>\n",
376
+ "</table>\n",
377
+ "</div>"
378
+ ],
379
+ "text/plain": [
380
+ " task_id \\\n",
381
+ "0 e1fc63a2-da7a-432f-be78-7c4a95598703 \n",
382
+ "1 8e867cd7-cff9-4e6c-867a-ff5ddc2550be \n",
383
+ "2 ec09fa32-d03f-4bf8-84b0-1f16922c3ae4 \n",
384
+ "3 5d0080cb-90d7-4712-bc33-848150e917d3 \n",
385
+ "4 a1e91b78-d3d8-4675-bb8d-62741b4b68a6 \n",
386
+ "\n",
387
+ " Question Level Final answer \\\n",
388
+ "0 If Eliud Kipchoge could maintain his record-ma... 1 17 \n",
389
+ "1 How many studio albums were published by Merce... 1 3 \n",
390
+ "2 Here's a fun riddle that I think you'll enjoy.... 1 3 \n",
391
+ "3 What was the volume in m^3 of the fish bag tha... 1 0.1777 \n",
392
+ "4 In the video https://www.youtube.com/watch?v=L... 1 3 \n",
393
+ "\n",
394
+ " file_name file_path Annotator Metadata \n",
395
+ "0 {'Steps': '1. Googled Eliud Kipchoge marathon ... \n",
396
+ "1 {'Steps': '1. I did a search for Mercedes Sosa... \n",
397
+ "2 {'Steps': 'Step 1: Evaluate the problem statem... \n",
398
+ "3 {'Steps': '1. Searched '\"Can Hiccup Supply Eno... \n",
399
+ "4 {'Steps': '1. Navigate to the YouTube link.\n",
400
+ "2.... "
401
+ ]
402
+ },
403
+ "execution_count": 13,
404
+ "metadata": {},
405
+ "output_type": "execute_result"
406
+ }
407
+ ],
408
+ "source": [
409
+ "gaia_df.head()"
410
+ ]
411
+ },
412
+ {
413
+ "cell_type": "code",
414
+ "execution_count": 14,
415
+ "id": "4826e755",
416
+ "metadata": {},
417
+ "outputs": [
418
+ {
419
+ "data": {
420
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
421
+ "columns": [
422
+ {
423
+ "name": "index",
424
+ "rawType": "int64",
425
+ "type": "integer"
426
+ },
427
+ {
428
+ "name": "task_id",
429
+ "rawType": "object",
430
+ "type": "string"
431
+ },
432
+ {
433
+ "name": "Question",
434
+ "rawType": "object",
435
+ "type": "string"
436
+ },
437
+ {
438
+ "name": "Level",
439
+ "rawType": "object",
440
+ "type": "string"
441
+ },
442
+ {
443
+ "name": "Final answer",
444
+ "rawType": "object",
445
+ "type": "string"
446
+ },
447
+ {
448
+ "name": "file_name",
449
+ "rawType": "object",
450
+ "type": "string"
451
+ },
452
+ {
453
+ "name": "file_path",
454
+ "rawType": "object",
455
+ "type": "string"
456
+ },
457
+ {
458
+ "name": "Annotator Metadata",
459
+ "rawType": "object",
460
+ "type": "unknown"
461
+ }
462
+ ],
463
+ "ref": "682031dd-71c0-4a11-a29b-48f858136885",
464
+ "rows": [
465
+ [
466
+ "16",
467
+ "cca530fc-4052-43b2-b130-b30968d8aa44",
468
+ "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
469
+ "1",
470
+ "Rd5",
471
+ "cca530fc-4052-43b2-b130-b30968d8aa44.png",
472
+ "/home/santiagoal/.cache/huggingface/hub/datasets--gaia-benchmark--GAIA/snapshots/897f2dfbb5c952b5c3c1509e648381f9c7b70316/2023/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png",
473
+ "{'Steps': 'Step 1: Evaluate the position of the pieces in the chess position\\nStep 2: Report the best move available for black: \"Rd5\"', 'Number of steps': '2', 'How long did this take?': '10 minutes', 'Tools': '1. Image recognition tools', 'Number of tools': '1'}"
474
+ ]
475
+ ],
476
+ "shape": {
477
+ "columns": 7,
478
+ "rows": 1
479
+ }
480
+ },
481
+ "text/html": [
482
+ "<div>\n",
483
+ "<style scoped>\n",
484
+ " .dataframe tbody tr th:only-of-type {\n",
485
+ " vertical-align: middle;\n",
486
+ " }\n",
487
+ "\n",
488
+ " .dataframe tbody tr th {\n",
489
+ " vertical-align: top;\n",
490
+ " }\n",
491
+ "\n",
492
+ " .dataframe thead th {\n",
493
+ " text-align: right;\n",
494
+ " }\n",
495
+ "</style>\n",
496
+ "<table border=\"1\" class=\"dataframe\">\n",
497
+ " <thead>\n",
498
+ " <tr style=\"text-align: right;\">\n",
499
+ " <th></th>\n",
500
+ " <th>task_id</th>\n",
501
+ " <th>Question</th>\n",
502
+ " <th>Level</th>\n",
503
+ " <th>Final answer</th>\n",
504
+ " <th>file_name</th>\n",
505
+ " <th>file_path</th>\n",
506
+ " <th>Annotator Metadata</th>\n",
507
+ " </tr>\n",
508
+ " </thead>\n",
509
+ " <tbody>\n",
510
+ " <tr>\n",
511
+ " <th>16</th>\n",
512
+ " <td>cca530fc-4052-43b2-b130-b30968d8aa44</td>\n",
513
+ " <td>Review the chess position provided in the imag...</td>\n",
514
+ " <td>1</td>\n",
515
+ " <td>Rd5</td>\n",
516
+ " <td>cca530fc-4052-43b2-b130-b30968d8aa44.png</td>\n",
517
+ " <td>/home/santiagoal/.cache/huggingface/hub/datase...</td>\n",
518
+ " <td>{'Steps': 'Step 1: Evaluate the position of th...</td>\n",
519
+ " </tr>\n",
520
+ " </tbody>\n",
521
+ "</table>\n",
522
+ "</div>"
523
+ ],
524
+ "text/plain": [
525
+ " task_id \\\n",
526
+ "16 cca530fc-4052-43b2-b130-b30968d8aa44 \n",
527
+ "\n",
528
+ " Question Level Final answer \\\n",
529
+ "16 Review the chess position provided in the imag... 1 Rd5 \n",
530
+ "\n",
531
+ " file_name \\\n",
532
+ "16 cca530fc-4052-43b2-b130-b30968d8aa44.png \n",
533
+ "\n",
534
+ " file_path \\\n",
535
+ "16 /home/santiagoal/.cache/huggingface/hub/datase... \n",
536
+ "\n",
537
+ " Annotator Metadata \n",
538
+ "16 {'Steps': 'Step 1: Evaluate the position of th... "
539
+ ]
540
+ },
541
+ "execution_count": 14,
542
+ "metadata": {},
543
+ "output_type": "execute_result"
544
+ }
545
+ ],
546
+ "source": [
547
+ "# DEL\n",
548
+ "# temp filter to get chess images\n",
549
+ "gaia_df[gaia_df[\"file_path\"].map(lambda f: f.endswith(\"44.png\"))]"
550
+ ]
551
+ },
552
+ {
553
+ "cell_type": "markdown",
554
+ "id": "f2d34bca",
555
+ "metadata": {},
556
+ "source": [
557
+ "---"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 15,
563
+ "id": "0150df47",
564
+ "metadata": {},
565
+ "outputs": [
566
+ {
567
+ "data": {
568
+ "text/plain": [
569
+ "{'', 'docx', 'mp3', 'png', 'pptx', 'py', 'txt', 'xlsx'}"
570
+ ]
571
+ },
572
+ "execution_count": 15,
573
+ "metadata": {},
574
+ "output_type": "execute_result"
575
+ }
576
+ ],
577
+ "source": [
578
+ "filetypes = {d[1].file_path.split(\".\")[-1] for d in gaia_df.iterrows()}\n",
579
+ "filetypes"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "markdown",
584
+ "id": "c41c99d7",
585
+ "metadata": {},
586
+ "source": [
587
+ "### Read Historical XPs"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "id": "82dd866a",
594
+ "metadata": {},
595
+ "outputs": [],
596
+ "source": [
597
+ "xp_paths = \"../data/agent_experiments/iterations/\""
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": null,
603
+ "id": "5649c3c2",
604
+ "metadata": {},
605
+ "outputs": [],
606
+ "source": [
607
+ "# Index all XPs\n",
608
+ "import os\n",
609
+ "import pandas as pd\n",
610
+ "\n",
611
+ "xp_dir = \"../../data/agent_experiments/iterations\"\n",
612
+ "xp_list = []\n",
613
+ "# FIXME: from the 10th XP iteration, the sorted method is unuseful\n",
614
+ "\n",
615
+ "for i, xp_path in enumerate(sorted(os.listdir(xp_dir))): \n",
616
+ " xp_path = os.path.join(xp_dir, xp_path)\n",
617
+ " temp_xp_df = pd.read_csv(xp_path)\n",
618
+ " xp_list.append((i, temp_xp_df))\n",
619
+ " del temp_xp_df "
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "id": "5ca5b1fd",
626
+ "metadata": {},
627
+ "outputs": [],
628
+ "source": [
629
+ "# Join latest XP result to each question\n",
630
+ "\n",
631
+ "index = gaia_df.index.tolist() # Index for all the questions\n",
632
+ "questions = gaia_df[\"Question\"].tolist()\n",
633
+ "answers = list()\n",
634
+ "\n",
635
+ "\n",
636
+ "def filter_condition(xp_data: list) -> bool:\n",
637
+ " i, xp = xp_data\n",
638
+ " for question in questions:\n",
639
+ " xp_addressed_questions = xp[\"Question\"].tolist()\n",
640
+ " if question in xp_addressed_questions:\n",
641
+ " return True\n",
642
+ " return False\n",
643
+ "\n",
644
+ "filtered_xps = list(filter(filter_condition, xp_list))\n",
645
+ "filtered_xps_reversed = filtered_xps\n",
646
+ "filtered_xps_reversed.reverse()\n",
647
+ "\n",
648
+ "# Join latest answer result (i.e. result of the latest xp)\n",
649
+ "\n",
650
+ "# Join latest answer result (i.e. result of the latest xp)\n",
651
+ "\n",
652
+ "for question in questions:\n",
653
+ " answer_result = 0 # Assume wrong answer by default\n",
654
+ " for i, xp in filtered_xps_reversed:\n",
655
+ " for xp_question in xp[\"Question\"].tolist():\n",
656
+ " \n",
657
+ " if question == xp_question:\n",
658
+ " answer_result_row = list(xp[xp[\"Question\"] == xp_question][\"is_correct\"])#.loc[0]\n",
659
+ " answer_result_temp = answer_result_row[0] \n",
660
+ " if answer_result_temp >= answer_result:\n",
661
+ " answer_result = answer_result_temp\n",
662
+ " del xp\n",
663
+ " answers.append(answer_result)\n",
664
+ "\n",
665
+ "# Join answers\n",
666
+ "historical_xp_results = gaia_df.copy()\n",
667
+ "historical_xp_results[\"is_correct\"] = pd.Series(answers)\n",
668
+ "del answers, questions"
669
+ ]
670
+ },
671
+ {
672
+ "cell_type": "code",
673
+ "execution_count": null,
674
+ "id": "ca756f04",
675
+ "metadata": {},
676
+ "outputs": [],
677
+ "source": [
678
+ "historical_xp_results"
679
+ ]
680
+ },
681
+ {
682
+ "cell_type": "code",
683
+ "execution_count": null,
684
+ "id": "fd11615a",
685
+ "metadata": {},
686
+ "outputs": [],
687
+ "source": [
688
+ "historical_xp_results.is_correct.mean()"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "markdown",
693
+ "id": "d3ddaede",
694
+ "metadata": {},
695
+ "source": [
696
+ "Let's study first which are the most common wrong tasks. I.e. How can we increase accuracy with a single next step (e.g. implement a new tool, modify sys message, etc.)"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": null,
702
+ "id": "a0613fc4",
703
+ "metadata": {},
704
+ "outputs": [],
705
+ "source": [
706
+ "wrong_ans_df = historical_xp_results[historical_xp_results[\"is_correct\"]==0]\n",
707
+ "wrong_ans_df[\"fp_extension\"] = wrong_ans_df[\"file_path\"].map(lambda path: path.split(\".\")[-1])\n",
708
+ "wrong_ans_df"
709
+ ]
710
+ },
711
+ {
712
+ "cell_type": "code",
713
+ "execution_count": null,
714
+ "id": "43925ec4",
715
+ "metadata": {},
716
+ "outputs": [],
717
+ "source": [
718
+ "# Summary\n",
719
+ "wrong_ans_df.groupby(\"fp_extension\")[\"is_correct\"].count()"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "markdown",
724
+ "id": "1f77dabb",
725
+ "metadata": {},
726
+ "source": [
727
+ "The vast majority of remaining tasks do not include files to read. So we sould study them at first"
728
+ ]
729
+ },
730
+ {
731
+ "cell_type": "code",
732
+ "execution_count": null,
733
+ "id": "1379da15",
734
+ "metadata": {},
735
+ "outputs": [],
736
+ "source": [
737
+ "wrong_ans_df_no_extension = wrong_ans_df[wrong_ans_df[\"file_path\"].map(lambda fp: len(fp)==0)]"
738
+ ]
739
+ },
740
+ {
741
+ "cell_type": "code",
742
+ "execution_count": null,
743
+ "id": "5e40e190",
744
+ "metadata": {},
745
+ "outputs": [],
746
+ "source": [
747
+ "wrong_ans_df_no_extension"
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "markdown",
752
+ "id": "14a8ebcf",
753
+ "metadata": {},
754
+ "source": [
755
+ "### Method\n",
756
+ "\n",
757
+ "We identified three main necessities to cover through web search\n",
758
+ "\n",
759
+ "1. Analize youtube videos (transcription and computer vision)\n",
760
+ "2. Scrap web pages (e.g. journals, pdfs, wikipedia articles)\n",
761
+ "\n",
762
+ "\n",
763
+ "So we decided to modularize the web_search_tool into three main modules\n",
764
+ "\n",
765
+ "1. High-level Web results\n",
766
+ "2. Youtube Video Class: To gather both audio and video\n",
767
+ "3. Web page scrapper: Given a web url, retrieve the text as markdown\n",
768
+ "4. Object detection tool: Ideally coded into handle_image tool\n",
769
+ "\n",
770
+ "So we plan to generate three main tools: `youtube_tool` (Class + Langchain runnable), `web_search` (Expanded) and `handle_images` with an initial module `object_detection` using YoloV4-tiny"
771
+ ]
772
+ },
773
+ {
774
+ "cell_type": "markdown",
775
+ "id": "b63c5f35",
776
+ "metadata": {},
777
+ "source": [
778
+ "### Youtube Tool"
779
+ ]
780
+ },
781
+ {
782
+ "cell_type": "code",
783
+ "execution_count": null,
784
+ "id": "2141e4ea",
785
+ "metadata": {},
786
+ "outputs": [],
787
+ "source": [
788
+ "import re"
789
+ ]
790
+ },
791
+ {
792
+ "cell_type": "markdown",
793
+ "id": "892f468e",
794
+ "metadata": {},
795
+ "source": [
796
+ "#### 1. FIlter tasks to solve with the tool"
797
+ ]
798
+ },
799
+ {
800
+ "cell_type": "code",
801
+ "execution_count": null,
802
+ "id": "7e6b3b0c",
803
+ "metadata": {},
804
+ "outputs": [],
805
+ "source": [
806
+ "pattern = r\".*youtube.*\"\n",
807
+ "yt_tasks = gaia_df.copy()\n",
808
+ "yt_tasks = yt_tasks[yt_tasks[\"Question\"].map(lambda question: bool(re.search(pattern, question, re.IGNORECASE)))]\n",
809
+ "yt_tasks"
810
+ ]
811
+ },
812
+ {
813
+ "cell_type": "markdown",
814
+ "id": "d41b2f29",
815
+ "metadata": {},
816
+ "source": [
817
+ "#### 2. Tool PoC"
818
+ ]
819
+ },
820
+ {
821
+ "cell_type": "code",
822
+ "execution_count": null,
823
+ "id": "64c2891e",
824
+ "metadata": {},
825
+ "outputs": [],
826
+ "source": [
827
+ "sample_yt_url = \"https://www.youtube.com/watch?v=L1vXCYZAYYM\""
828
+ ]
829
+ },
830
+ {
831
+ "cell_type": "markdown",
832
+ "id": "9b82ace7",
833
+ "metadata": {},
834
+ "source": [
835
+ "Lets import that video"
836
+ ]
837
+ },
838
+ {
839
+ "cell_type": "code",
840
+ "execution_count": null,
841
+ "id": "f5ad954c",
842
+ "metadata": {},
843
+ "outputs": [],
844
+ "source": [
845
+ "import os"
846
+ ]
847
+ },
848
+ {
849
+ "cell_type": "code",
850
+ "execution_count": null,
851
+ "id": "5c12ac9f",
852
+ "metadata": {},
853
+ "outputs": [],
854
+ "source": [
855
+ "output_dir = \"../../data/temp/\"\n",
856
+ "\n",
857
+ "yt_audio_filename = \"yt_audio.mp3\"\n",
858
+ "yt_audio_path = os.path.join(output_dir, yt_audio_filename)\n",
859
+ "\n",
860
+ "raw_yt_video_filename = \"raw_yt_video.mp4\"\n",
861
+ "processed_yt_video_filename = \"processed_yt_video.mp4\"\n",
862
+ "\n",
863
+ "raw_yt_video_path = os.path.join(output_dir, raw_yt_video_filename)\n",
864
+ "processed_yt_video_path = os.path.join(output_dir, processed_yt_video_filename)"
865
+ ]
866
+ },
867
+ {
868
+ "cell_type": "code",
869
+ "execution_count": null,
870
+ "id": "6f1e8a09",
871
+ "metadata": {},
872
+ "outputs": [],
873
+ "source": [
874
+ "from pytubefix import YouTube"
875
+ ]
876
+ },
877
+ {
878
+ "cell_type": "code",
879
+ "execution_count": null,
880
+ "id": "67f68f09",
881
+ "metadata": {},
882
+ "outputs": [],
883
+ "source": [
884
+ "yt = YouTube(sample_yt_url)\n",
885
+ "#yt_audio = yt.streams.filter(only_audio=True).first()\n",
886
+ "#yt_video = yt.streams.filter(only_video=True).first()"
887
+ ]
888
+ },
889
+ {
890
+ "cell_type": "code",
891
+ "execution_count": null,
892
+ "id": "2f04e971",
893
+ "metadata": {},
894
+ "outputs": [],
895
+ "source": [
896
+ "yt_audio = yt.streams.filter(only_audio=True).first()\n",
897
+ "yt_audio_filename = \"yt_audio.mp3\"\n",
898
+ "yt_audio_path = os.path.join(output_dir, yt_audio_filename)"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": null,
904
+ "id": "e4c19f5a",
905
+ "metadata": {},
906
+ "outputs": [],
907
+ "source": [
908
+ "yt_video = yt.streams.filter(only_video=True, fps=25, res=\"144p\").order_by(\"fps\").asc().first()\n",
909
+ "yt_video_filename = \"yt_video.mp4\"\n",
910
+ "yt_video_path = os.path.join(output_dir, raw_yt_video_filename)"
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "markdown",
915
+ "id": "3e2ee8ec",
916
+ "metadata": {},
917
+ "source": [
918
+ "We imported a binary representation of both YT video and audio. The idea now is to try to pass these objects to a CV model (e.g. YoloV4) and to a Transcriber model (e.g. Whisper)"
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "markdown",
923
+ "id": "ff5864bc",
924
+ "metadata": {},
925
+ "source": [
926
+ "YoLo"
927
+ ]
928
+ },
929
+ {
930
+ "cell_type": "code",
931
+ "execution_count": null,
932
+ "id": "5fef61e0",
933
+ "metadata": {},
934
+ "outputs": [],
935
+ "source": [
936
+ "yt_video.download(output_path=output_dir, filename=raw_yt_video_filename)"
937
+ ]
938
+ },
939
+ {
940
+ "cell_type": "code",
941
+ "execution_count": null,
942
+ "id": "fd04bfdf",
943
+ "metadata": {},
944
+ "outputs": [],
945
+ "source": [
946
+ "import subprocess\n",
947
+ "\n",
948
+ "new_fps = 1\n",
949
+ "reduce_fps_cmd = f'ffmpeg -y -i {raw_yt_video_path} -filter:v \"fps={new_fps}\" -an {processed_yt_video_path}'\n",
950
+ "subprocess.run(\n",
951
+ " reduce_fps_cmd, shell=True\n",
952
+ ")"
953
+ ]
954
+ },
955
+ {
956
+ "cell_type": "code",
957
+ "execution_count": null,
958
+ "id": "b0383e2f",
959
+ "metadata": {},
960
+ "outputs": [],
961
+ "source": [
962
+ "from ultralytics import YOLO\n",
963
+ "\n",
964
+ "cv_model = YOLO(model=\"yolov8s.pt\", task=\"detect\")"
965
+ ]
966
+ },
967
+ {
968
+ "cell_type": "code",
969
+ "execution_count": null,
970
+ "id": "8c6851e3",
971
+ "metadata": {},
972
+ "outputs": [],
973
+ "source": [
974
+ "cv_results = cv_model.predict(source=processed_yt_video_path, stream=True)"
975
+ ]
976
+ },
977
+ {
978
+ "cell_type": "code",
979
+ "execution_count": null,
980
+ "id": "04e3ecf3",
981
+ "metadata": {},
982
+ "outputs": [],
983
+ "source": [
984
+ "os.sys.getsizeof(cv_results)"
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "execution_count": null,
990
+ "id": "2ddf175f",
991
+ "metadata": {},
992
+ "outputs": [],
993
+ "source": [
994
+ "def summarize(results):# -> Generator[dict[str, Any], Any, None]:# -> Generator[dict[str, Any], Any, None]:\n",
995
+ " for i, result in enumerate(results):\n",
996
+ " yield {\n",
997
+ " \"frame\": i + 1,\n",
998
+ " \"result\": [result.names[int(cls)] for cls in result.boxes.cls]\n",
999
+ " }"
1000
+ ]
1001
+ },
1002
+ {
1003
+ "cell_type": "code",
1004
+ "execution_count": null,
1005
+ "id": "93b55a97",
1006
+ "metadata": {},
1007
+ "outputs": [],
1008
+ "source": [
1009
+ "data = list(summarize(cv_results))"
1010
+ ]
1011
+ },
1012
+ {
1013
+ "cell_type": "code",
1014
+ "execution_count": null,
1015
+ "id": "fd51f5b2",
1016
+ "metadata": {},
1017
+ "outputs": [],
1018
+ "source": [
1019
+ "# TODO: make this a typed dict to let the agent process this data\n",
1020
+ "processed_data = {\n",
1021
+ " \"video_url\": sample_yt_url,\n",
1022
+ " \"detected_objects\": data\n",
1023
+ "}"
1024
+ ]
1025
+ },
1026
+ {
1027
+ "cell_type": "code",
1028
+ "execution_count": null,
1029
+ "id": "2595c49b",
1030
+ "metadata": {},
1031
+ "outputs": [],
1032
+ "source": [
1033
+ "processed_data"
1034
+ ]
1035
+ },
1036
+ {
1037
+ "cell_type": "code",
1038
+ "execution_count": null,
1039
+ "id": "293a9894",
1040
+ "metadata": {},
1041
+ "outputs": [],
1042
+ "source": [
1043
+        "# delete unnecessary stuff\n",
1044
+ "import subprocess\n",
1045
+ "remove_videos_cmd = f\"rm {raw_yt_video_path} {processed_yt_video_path}\"\n",
1046
+ "subprocess.run(remove_videos_cmd, shell=True)"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "id": "6e928c09",
1052
+ "metadata": {},
1053
+ "source": [
1054
+ "Whisper custom tool"
1055
+ ]
1056
+ },
1057
+ {
1058
+ "cell_type": "code",
1059
+ "execution_count": null,
1060
+ "id": "06d786cf",
1061
+ "metadata": {},
1062
+ "outputs": [],
1063
+ "source": [
1064
+ "import time\n",
1065
+ "yt_audio.download(output_path=output_dir, filename=yt_audio_filename)"
1066
+ ]
1067
+ },
1068
+ {
1069
+ "cell_type": "code",
1070
+ "execution_count": null,
1071
+ "id": "408eae4d",
1072
+ "metadata": {},
1073
+ "outputs": [],
1074
+ "source": [
1075
+ "tools_dir = \"../../src/tools\"\n",
1076
+ "os.sys.path.append(tools_dir)"
1077
+ ]
1078
+ },
1079
+ {
1080
+ "cell_type": "code",
1081
+ "execution_count": null,
1082
+ "id": "fb62c338",
1083
+ "metadata": {},
1084
+ "outputs": [],
1085
+ "source": [
1086
+ "import transcriber"
1087
+ ]
1088
+ },
1089
+ {
1090
+ "cell_type": "code",
1091
+ "execution_count": null,
1092
+ "id": "10ab97b7",
1093
+ "metadata": {},
1094
+ "outputs": [],
1095
+ "source": [
1096
+ "transcript = transcriber.transcriber.invoke(input={\"audio_path\": yt_audio_path})"
1097
+ ]
1098
+ },
1099
+ {
1100
+ "cell_type": "code",
1101
+ "execution_count": null,
1102
+ "id": "4729fe7a",
1103
+ "metadata": {},
1104
+ "outputs": [],
1105
+ "source": [
1106
+ "transcript"
1107
+ ]
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "0d3271ef",
1113
+ "metadata": {},
1114
+ "outputs": [],
1115
+ "source": [
1116
+ "import subprocess\n",
1117
+        "subprocess.run(f\"rm {yt_audio_path}\", shell=True)"
1118
+ ]
1119
+ },
1120
+ {
1121
+ "cell_type": "markdown",
1122
+ "id": "715d48ab",
1123
+ "metadata": {},
1124
+ "source": [
1125
+ "#### 3. Test Tool"
1126
+ ]
1127
+ },
1128
+ {
1129
+ "cell_type": "code",
1130
+ "execution_count": null,
1131
+ "id": "009aba7c",
1132
+ "metadata": {},
1133
+ "outputs": [],
1134
+ "source": [
1135
+ "import os\n",
1136
+ "os.sys.path.append(\"../../src/\")\n",
1137
+ "os.sys.path.append(\"../../src/agents/\")\n",
1138
+ "os.sys.path.append(\"../../src/utils/\")\n",
1139
+ "\n",
1140
+ "import react"
1141
+ ]
1142
+ },
1143
+ {
1144
+ "cell_type": "code",
1145
+ "execution_count": null,
1146
+ "id": "9194249b",
1147
+ "metadata": {},
1148
+ "outputs": [],
1149
+ "source": [
1150
+ "yt_tasks"
1151
+ ]
1152
+ },
1153
+ {
1154
+ "cell_type": "code",
1155
+ "execution_count": null,
1156
+ "id": "fcc960f6",
1157
+ "metadata": {},
1158
+ "outputs": [],
1159
+ "source": [
1160
+ "sample_yt_task = yt_tasks.iloc[0]\n",
1161
+ "sample_yt_question = sample_yt_task[\"Question\"]"
1162
+ ]
1163
+ },
1164
+ {
1165
+ "cell_type": "code",
1166
+ "execution_count": null,
1167
+ "id": "952a70a1",
1168
+ "metadata": {},
1169
+ "outputs": [],
1170
+ "source": [
1171
+ "sample_yt_question"
1172
+ ]
1173
+ },
1174
+ {
1175
+ "cell_type": "code",
1176
+ "execution_count": null,
1177
+ "id": "2e178e9f",
1178
+ "metadata": {},
1179
+ "outputs": [],
1180
+ "source": [
1181
+ "react.run_app(user_query=sample_yt_question + \". Just use the pull_youtube_video and the transcriber tools. Try to figure out the number of bird species from the transcript as those are mentioned\")"
1182
+ ]
1183
+ },
1184
+ {
1185
+ "cell_type": "markdown",
1186
+ "id": "f9712cc9",
1187
+ "metadata": {},
1188
+ "source": [
1189
+        "### Web search Tool (Web scraping)"
1190
+ ]
1191
+ },
1192
+ {
1193
+ "cell_type": "markdown",
1194
+ "id": "c06f6e50",
1195
+ "metadata": {},
1196
+ "source": [
1197
+        "#### 1. Filter tasks to solve with the tool"
1198
+ ]
1199
+ },
1200
+ {
1201
+ "cell_type": "code",
1202
+ "execution_count": 16,
1203
+ "id": "4765f8e7",
1204
+ "metadata": {},
1205
+ "outputs": [],
1206
+ "source": [
1207
+ "sample_task_ids = [\n",
1208
+ " \"5d0080cb-90d7-4712-bc33-848150e917d3\",\n",
1209
+ " \"46719c30-f4c3-4cad-be07-d5cb21eee6bb\",\n",
1210
+ " \"b816bfce-3d80-4913-a07d-69b752ce6377\",\n",
1211
+ " \"b415aba4-4b68-4fc6-9b89-2c812e55a3e1\",\n",
1212
+ " \"935e2cff-ae78-4218-b3f5-115589b19dae\",\n",
1213
+ " \"5188369a-3bbe-43d8-8b94-11558f909a08\",\n",
1214
+ " \"7673d772-ef80-4f0f-a602-1bf4485c9b43\",\n",
1215
+ " \"c365c1c7-a3db-4d5e-a9a1-66f56eae7865\",\n",
1216
+ " \"7d4a7d1d-cac6-44a8-96e8-ea9584a70825\",\n",
1217
+ " \"3f57289b-8c60-48be-bd80-01f8099ca449\",\n",
1218
+ " \"23dd907f-1261-4488-b21c-e9185af91d5e\",\n",
1219
+ " \"840bfca7-4f7b-481a-8794-c560c340185d\",\n",
1220
+ " \"a0068077-79f4-461a-adfe-75c1a4148545\",\n",
1221
+ " \"a0c07678-e491-4bbc-8f0b-07405144218f\"\n",
1222
+ "]"
1223
+ ]
1224
+ },
1225
+ {
1226
+ "cell_type": "code",
1227
+ "execution_count": 17,
1228
+ "id": "ef998b32",
1229
+ "metadata": {},
1230
+ "outputs": [
1231
+ {
1232
+ "data": {
1233
+ "text/plain": [
1234
+ "True"
1235
+ ]
1236
+ },
1237
+ "execution_count": 17,
1238
+ "metadata": {},
1239
+ "output_type": "execute_result"
1240
+ }
1241
+ ],
1242
+ "source": [
1243
+ "l = [\"a\", \"b\", \"c\"]\n",
1244
+ "b = \"a\"\n",
1245
+ "\n",
1246
+ "b in l"
1247
+ ]
1248
+ },
1249
+ {
1250
+ "cell_type": "code",
1251
+ "execution_count": null,
1252
+ "id": "d318749c",
1253
+ "metadata": {},
1254
+ "outputs": [
1255
+ {
1256
+ "data": {
1257
+ "application/vnd.microsoft.datawrangler.viewer.v0+json": {
1258
+ "columns": [
1259
+ {
1260
+ "name": "index",
1261
+ "rawType": "int64",
1262
+ "type": "integer"
1263
+ },
1264
+ {
1265
+ "name": "index",
1266
+ "rawType": "int64",
1267
+ "type": "integer"
1268
+ },
1269
+ {
1270
+ "name": "task_id",
1271
+ "rawType": "object",
1272
+ "type": "string"
1273
+ },
1274
+ {
1275
+ "name": "Question",
1276
+ "rawType": "object",
1277
+ "type": "string"
1278
+ },
1279
+ {
1280
+ "name": "Level",
1281
+ "rawType": "object",
1282
+ "type": "string"
1283
+ },
1284
+ {
1285
+ "name": "Final answer",
1286
+ "rawType": "object",
1287
+ "type": "string"
1288
+ },
1289
+ {
1290
+ "name": "file_name",
1291
+ "rawType": "object",
1292
+ "type": "string"
1293
+ },
1294
+ {
1295
+ "name": "file_path",
1296
+ "rawType": "object",
1297
+ "type": "string"
1298
+ },
1299
+ {
1300
+ "name": "Annotator Metadata",
1301
+ "rawType": "object",
1302
+ "type": "unknown"
1303
+ }
1304
+ ],
1305
+ "ref": "c253276a-1799-43b2-bc36-438a090ebf7d",
1306
+ "rows": [
1307
+ [
1308
+ "0",
1309
+ "3",
1310
+ "5d0080cb-90d7-4712-bc33-848150e917d3",
1311
+ "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"",
1312
+ "1",
1313
+ "0.1777",
1314
+ "",
1315
+ "",
1316
+ "{'Steps': '1. Searched \\'\"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\"\\' on Google.\\n2. Opened \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\" at https://journals.le.ac.uk/ojs1/index.php/jist/article/view/733.\\n3. Clicked \"PDF\".\\n4. Found the calculations for the volume of the fish bag and noted them.', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. PDF access', 'Number of tools': '3'}"
1317
+ ],
1318
+ [
1319
+ "1",
1320
+ "5",
1321
+ "46719c30-f4c3-4cad-be07-d5cb21eee6bb",
1322
+ "Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus, Which Is Better?\" in 2015, what was the title of the first paper authored by the one that had authored prior papers?",
1323
+ "1",
1324
+ "Mapping Human Oriented Information to Software Agents for Online Systems Usage",
1325
+ "",
1326
+ "",
1327
+ "{'Steps': '1. Searched \"Pie Menus or Linear Menus, Which Is Better?\" on Google.\\n2. Opened \"Pie Menus or Linear Menus, Which Is Better?\" on https://oda.oslomet.no/oda-xmlui/handle/10642/3162.\\n3. Clicked each author\\'s name.\\n4. Noted the name that had no other papers listed.\\n5. Searched \"Murano, Pietro\" on Google.\\n6. Opened http://www.pietromurano.org/.\\n7. Clicked \"Publications\".\\n8. Found the earliest paper he contributed to.', 'Number of steps': '8', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine', 'Number of tools': '2'}"
1328
+ ],
1329
+ [
1330
+ "2",
1331
+ "12",
1332
+ "b816bfce-3d80-4913-a07d-69b752ce6377",
1333
+ "In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's sons that guarded his house, what word was quoted from two different authors in distaste for the nature of dragon depictions?",
1334
+ "1",
1335
+ "fluffy",
1336
+ "",
1337
+ "",
1338
+ "{'Steps': '1. Searched \"Hreidmar\\'s sons\" on Google.\\n2. Opened https://en.wikipedia.org/wiki/Hrei%C3%B0marr.\\n3. Noted Fafnir guarded his house.\\n4. Searched \"Emily Midkiff June 2014 Fafnir\" on Google.\\n5. Opened \"Fafnir 2/2014 |\" at http://journal.finfar.org/journal/archive/fafnir-22014/.\\n6. Clicked the title \\'“Dragons are Tricksy”: The Uncanny Dragons of Children’s Literature\\'.\\n7. Found the word in quotation marks from two different authors (Ruth Stein and Margaret Blount) in the text.', 'Number of steps': '7', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine', 'Number of tools': '2'}"
1339
+ ],
1340
+ [
1341
+ "3",
1342
+ "15",
1343
+ "b415aba4-4b68-4fc6-9b89-2c812e55a3e1",
1344
+ "In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied? Don't use the prefix nano in your answer if there is one.",
1345
+ "1",
1346
+ "diamond",
1347
+ "",
1348
+ "",
1349
+ "{'Steps': '1. Searched \"nature scientific reports\" on Google.\\n2. Opened https://www.nature.com/srep/.\\n3. Selected Explore Content > Research Articles.\\n4. Filtered for Conference Proceedings from 2012.\\n5. Opened each article link.\\n6. Checked for \"plasmon\" or \"plasmonic\".\\n7. Noted the nano-compound in the article that did not include either.', 'Number of steps': '7', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine', 'Number of tools': '2'}"
1350
+ ],
1351
+ [
1352
+ "4",
1353
+ "17",
1354
+ "935e2cff-ae78-4218-b3f5-115589b19dae",
1355
+ "In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content that was violated in the public logs on the Legume Wikipedia page?",
1356
+ "1",
1357
+ "research",
1358
+ "",
1359
+ "",
1360
+ "{'Steps': '1. Searched \"legume wikipedia\" on Google.\\n2. Opened \"Legume\" on Wikipedia.\\n3. Clicked \"View history\".\\n4. Clicked \"View logs for this page\".\\n5. Checked all types of logs.\\n6. Set the date to November 2022.\\n7. Followed the BLP link of the violation.\\n8. Noted the meaning of \"R\".', 'Number of steps': '8', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine', 'Number of tools': '2'}"
1361
+ ],
1362
+ [
1363
+ "5",
1364
+ "19",
1365
+ "5188369a-3bbe-43d8-8b94-11558f909a08",
1366
+ "What writer is quoted by Merriam-Webster for the Word of the Day from June 27, 2022?",
1367
+ "1",
1368
+ "Annie Levin",
1369
+ "",
1370
+ "",
1371
+ "{'Steps': '1. Search \"merriam-webster word of the day\" on Google search.\\n2. Opened the top \"Word of the Day\" result from the Merriam-Webster dictionary online.\\n3. Clicked \"SEE ALL WORDS OF THE DAY\" at the bottom.\\n4. Scrolled down to June 27, 2022.\\n5. Opened the Word of the Day (\"jingoism\").\\n6. Scrolled down and identified context quote for \"jingoism\".\\n7. Noted the name attributed to the quote. ', 'Number of steps': '7', 'How long did this take?': '8 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. Audio capability', 'Number of tools': '3'}"
1372
+ ],
1373
+ [
1374
+ "6",
1375
+ "38",
1376
+ "7673d772-ef80-4f0f-a602-1bf4485c9b43",
1377
+ "On Cornell Law School website's legal information institute, under the fifth section of federal rules alphabetically, what word was deleted in the last amendment to the first rule in the article that has \"witnesses\" in the most titles as of 2021?",
1378
+ "1",
1379
+ "inference",
1380
+ "",
1381
+ "",
1382
+ "{'Steps': '1. Searched \"Cornell Law School legal information institute\" on Google.\\n2. Opened https://www.law.cornell.edu/.\\n3. Clicked Get The Law > Federal Rules > Federal Rules of Evidence (fourth section down).\\n4. Found the article that has \"witnesses\" in the most titles (VII).\\n5. Opened the first rule (701).\\n6. Scrolled to the last amendment as of 2021 (2011 amendment).\\n7. Found the word that was deleted (inference).', 'Number of steps': '7', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine', 'Number of tools': '2'}"
1383
+ ],
1384
+ [
1385
+ "7",
1386
+ "39",
1387
+ "c365c1c7-a3db-4d5e-a9a1-66f56eae7865",
1388
+ "Of the cities within the United States where U.S. presidents were born, which two are the farthest apart from the westernmost to the easternmost going east, giving the city names only? Give them to me in alphabetical order, in a comma-separated list",
1389
+ "1",
1390
+ "Braintree, Honolulu",
1391
+ "",
1392
+ "",
1393
+ "{'Steps': '1. Searched \"cities where us presidents are born\" on Google.\\n2. Opened \"List of presidents of the United States by home state\" on Wikipedia.\\n3. Searched the eastern cities to find the easternmost one (Braintree, MA).\\n4. Checked the westernmost city (Honolulu, HI).', 'Number of steps': '4', 'How long did this take?': '8 minutes', 'Tools': '1. Search engine\\n2. Web browser', 'Number of tools': '3'}"
1394
+ ],
1395
+ [
1396
+ "8",
1397
+ "40",
1398
+ "7d4a7d1d-cac6-44a8-96e8-ea9584a70825",
1399
+ "According to Girls Who Code, how long did it take in years for the percentage of computer scientists that were women to change by 13% from a starting point of 37%?",
1400
+ "1",
1401
+ "22",
1402
+ "",
1403
+ "",
1404
+ "{'Steps': '1. Searched \"Girls Who Code\" on Google.\\n2. Opened https://girlswhocode.com/.\\n3. Clicked \"About Us\".\\n4. Noted that the chart started at 37% and declined to 24%.\\n5. Subtracted the marked years to find the number of years (2017 - 1995 = 22).', 'Number of steps': '5', 'How long did this take?': '10 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. Calculator', 'Number of tools': '3'}"
1405
+ ],
1406
+ [
1407
+ "9",
1408
+ "42",
1409
+ "3f57289b-8c60-48be-bd80-01f8099ca449",
1410
+ "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
1411
+ "1",
1412
+ "519",
1413
+ "",
1414
+ "",
1415
+ "{'Steps': '1. Search \"yankee stats\" to find their MLB stats page.\\n2. Set the data to the 1977 regular season.\\n3. Sort to find the most walks.\\n4. See how many at bats the player had.', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. web browser\\n2. search engine', 'Number of tools': '2'}"
1416
+ ],
1417
+ [
1418
+ "10",
1419
+ "43",
1420
+ "23dd907f-1261-4488-b21c-e9185af91d5e",
1421
+ "In Audre Lorde’s poem “Father Son and Holy Ghost”, what is the number of the stanza in which some lines are indented?",
1422
+ "1",
1423
+ "2",
1424
+ "",
1425
+ "",
1426
+ "{'Steps': '1. Search the web for “Audre Lorde Father Son and Holy Ghost”.\\n2. Click on Poetry Foundation result.\\n3. Note the stanza that appears to have lines indented, the second one.\\n4. Return to search results to confirm.\\n5. Click on second result.\\n6. Confirm that the indentation appears in the second stanza here as well.', 'Number of steps': '6', 'How long did this take?': '5 minutes', 'Tools': '1. Search engine\\n2. Web browser', 'Number of tools': '2'}"
1427
+ ],
1428
+ [
1429
+ "11",
1430
+ "45",
1431
+ "840bfca7-4f7b-481a-8794-c560c340185d",
1432
+ "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
1433
+ "1",
1434
+ "80GSFC21M0002",
1435
+ "",
1436
+ "",
1437
+ "{'Steps': '1. Google \"June 6, 2023 Carolyn Collins Petersen Universe Today\"\\n2. Find the relevant link to the scientific paper and follow that link\\n3. Open the PDF. \\n4. Search for NASA award number', 'Number of steps': '4', 'How long did this take?': '5 minutes', 'Tools': '1. Web browser\\n2. Search engine\\n3. Access to academic journal websites', 'Number of tools': '2'}"
1438
+ ],
1439
+ [
1440
+ "12",
1441
+ "46",
1442
+ "a0068077-79f4-461a-adfe-75c1a4148545",
1443
+ "What was the actual enrollment count of the clinical trial on H. pylori in acne vulgaris patients from Jan-May 2018 as listed on the NIH website?",
1444
+ "1",
1445
+ "90",
1446
+ "",
1447
+ "",
1448
+ "{'Steps': '1. Searched \"nih\" on Google search.\\n2. Clicked the top link to nih.gov.\\n3. Searched \"h pylori acne\" in the search box.\\n4. Clicked \"More\" and selected \"Clinical Trials\".\\n5. Clicked the result about H. Pylori and acne.\\n6. Checked the date to confirm it was January to May 2018.\\n7. Opened \"Tabular View\".\\n8. Scrolled down to Actual Enrollment and recorded the number.', 'Number of steps': '8', 'How long did this take?': '8 minutes', 'Tools': '1. Search engine\\n2. Web browser', 'Number of tools': '2'}"
1449
+ ],
1450
+ [
1451
+ "13",
1452
+ "50",
1453
+ "a0c07678-e491-4bbc-8f0b-07405144218f",
1454
+ "Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
1455
+ "1",
1456
+ "Yoshida, Uehara",
1457
+ "",
1458
+ "",
1459
+ "{'Steps': '1. Look up Taishō Tamai on Wikipedia\\n2. See the pitcher with the number 18 (before) is Kōsei Yoshida and number 20 (after) is Kenta Uehara', 'Number of steps': '2', 'How long did this take?': '5 minutes', 'Tools': '1. Wikipedia', 'Number of tools': '1'}"
1460
+ ]
1461
+ ],
1462
+ "shape": {
1463
+ "columns": 8,
1464
+ "rows": 14
1465
+ }
1466
+ },
1467
+ "text/html": [
1468
+ "<div>\n",
1469
+ "<style scoped>\n",
1470
+ " .dataframe tbody tr th:only-of-type {\n",
1471
+ " vertical-align: middle;\n",
1472
+ " }\n",
1473
+ "\n",
1474
+ " .dataframe tbody tr th {\n",
1475
+ " vertical-align: top;\n",
1476
+ " }\n",
1477
+ "\n",
1478
+ " .dataframe thead th {\n",
1479
+ " text-align: right;\n",
1480
+ " }\n",
1481
+ "</style>\n",
1482
+ "<table border=\"1\" class=\"dataframe\">\n",
1483
+ " <thead>\n",
1484
+ " <tr style=\"text-align: right;\">\n",
1485
+ " <th></th>\n",
1486
+ " <th>index</th>\n",
1487
+ " <th>task_id</th>\n",
1488
+ " <th>Question</th>\n",
1489
+ " <th>Level</th>\n",
1490
+ " <th>Final answer</th>\n",
1491
+ " <th>file_name</th>\n",
1492
+ " <th>file_path</th>\n",
1493
+ " <th>Annotator Metadata</th>\n",
1494
+ " </tr>\n",
1495
+ " </thead>\n",
1496
+ " <tbody>\n",
1497
+ " <tr>\n",
1498
+ " <th>0</th>\n",
1499
+ " <td>3</td>\n",
1500
+ " <td>5d0080cb-90d7-4712-bc33-848150e917d3</td>\n",
1501
+ " <td>What was the volume in m^3 of the fish bag tha...</td>\n",
1502
+ " <td>1</td>\n",
1503
+ " <td>0.1777</td>\n",
1504
+ " <td></td>\n",
1505
+ " <td></td>\n",
1506
+ " <td>{'Steps': '1. Searched '\"Can Hiccup Supply Eno...</td>\n",
1507
+ " </tr>\n",
1508
+ " <tr>\n",
1509
+ " <th>1</th>\n",
1510
+ " <td>5</td>\n",
1511
+ " <td>46719c30-f4c3-4cad-be07-d5cb21eee6bb</td>\n",
1512
+ " <td>Of the authors (First M. Last) that worked on ...</td>\n",
1513
+ " <td>1</td>\n",
1514
+ " <td>Mapping Human Oriented Information to Software...</td>\n",
1515
+ " <td></td>\n",
1516
+ " <td></td>\n",
1517
+ " <td>{'Steps': '1. Searched \"Pie Menus or Linear Me...</td>\n",
1518
+ " </tr>\n",
1519
+ " <tr>\n",
1520
+ " <th>2</th>\n",
1521
+ " <td>12</td>\n",
1522
+ " <td>b816bfce-3d80-4913-a07d-69b752ce6377</td>\n",
1523
+ " <td>In Emily Midkiff's June 2014 article in a jour...</td>\n",
1524
+ " <td>1</td>\n",
1525
+ " <td>fluffy</td>\n",
1526
+ " <td></td>\n",
1527
+ " <td></td>\n",
1528
+ " <td>{'Steps': '1. Searched \"Hreidmar's sons\" on Go...</td>\n",
1529
+ " </tr>\n",
1530
+ " <tr>\n",
1531
+ " <th>3</th>\n",
1532
+ " <td>15</td>\n",
1533
+ " <td>b415aba4-4b68-4fc6-9b89-2c812e55a3e1</td>\n",
1534
+ " <td>In Nature journal's Scientific Reports confere...</td>\n",
1535
+ " <td>1</td>\n",
1536
+ " <td>diamond</td>\n",
1537
+ " <td></td>\n",
1538
+ " <td></td>\n",
1539
+ " <td>{'Steps': '1. Searched \"nature scientific repo...</td>\n",
1540
+ " </tr>\n",
1541
+ " <tr>\n",
1542
+ " <th>4</th>\n",
1543
+ " <td>17</td>\n",
1544
+ " <td>935e2cff-ae78-4218-b3f5-115589b19dae</td>\n",
1545
+ " <td>In the year 2022, and before December, what do...</td>\n",
1546
+ " <td>1</td>\n",
1547
+ " <td>research</td>\n",
1548
+ " <td></td>\n",
1549
+ " <td></td>\n",
1550
+ " <td>{'Steps': '1. Searched \"legume wikipedia\" on G...</td>\n",
1551
+ " </tr>\n",
1552
+ " <tr>\n",
1553
+ " <th>5</th>\n",
1554
+ " <td>19</td>\n",
1555
+ " <td>5188369a-3bbe-43d8-8b94-11558f909a08</td>\n",
1556
+ " <td>What writer is quoted by Merriam-Webster for t...</td>\n",
1557
+ " <td>1</td>\n",
1558
+ " <td>Annie Levin</td>\n",
1559
+ " <td></td>\n",
1560
+ " <td></td>\n",
1561
+ " <td>{'Steps': '1. Search \"merriam-webster word of ...</td>\n",
1562
+ " </tr>\n",
1563
+ " <tr>\n",
1564
+ " <th>6</th>\n",
1565
+ " <td>38</td>\n",
1566
+ " <td>7673d772-ef80-4f0f-a602-1bf4485c9b43</td>\n",
1567
+ " <td>On Cornell Law School website's legal informat...</td>\n",
1568
+ " <td>1</td>\n",
1569
+ " <td>inference</td>\n",
1570
+ " <td></td>\n",
1571
+ " <td></td>\n",
1572
+ " <td>{'Steps': '1. Searched \"Cornell Law School leg...</td>\n",
1573
+ " </tr>\n",
1574
+ " <tr>\n",
1575
+ " <th>7</th>\n",
1576
+ " <td>39</td>\n",
1577
+ " <td>c365c1c7-a3db-4d5e-a9a1-66f56eae7865</td>\n",
1578
+ " <td>Of the cities within the United States where U...</td>\n",
1579
+ " <td>1</td>\n",
1580
+ " <td>Braintree, Honolulu</td>\n",
1581
+ " <td></td>\n",
1582
+ " <td></td>\n",
1583
+ " <td>{'Steps': '1. Searched \"cities where us presid...</td>\n",
1584
+ " </tr>\n",
1585
+ " <tr>\n",
1586
+ " <th>8</th>\n",
1587
+ " <td>40</td>\n",
1588
+ " <td>7d4a7d1d-cac6-44a8-96e8-ea9584a70825</td>\n",
1589
+ " <td>According to Girls Who Code, how long did it t...</td>\n",
1590
+ " <td>1</td>\n",
1591
+ " <td>22</td>\n",
1592
+ " <td></td>\n",
1593
+ " <td></td>\n",
1594
+ " <td>{'Steps': '1. Searched \"Girls Who Code\" on Goo...</td>\n",
1595
+ " </tr>\n",
1596
+ " <tr>\n",
1597
+ " <th>9</th>\n",
1598
+ " <td>42</td>\n",
1599
+ " <td>3f57289b-8c60-48be-bd80-01f8099ca449</td>\n",
1600
+ " <td>How many at bats did the Yankee with the most ...</td>\n",
1601
+ " <td>1</td>\n",
1602
+ " <td>519</td>\n",
1603
+ " <td></td>\n",
1604
+ " <td></td>\n",
1605
+ " <td>{'Steps': '1. Search \"yankee stats\" to find th...</td>\n",
1606
+ " </tr>\n",
1607
+ " <tr>\n",
1608
+ " <th>10</th>\n",
1609
+ " <td>43</td>\n",
1610
+ " <td>23dd907f-1261-4488-b21c-e9185af91d5e</td>\n",
1611
+ " <td>In Audre Lorde’s poem “Father Son and Holy Gho...</td>\n",
1612
+ " <td>1</td>\n",
1613
+ " <td>2</td>\n",
1614
+ " <td></td>\n",
1615
+ " <td></td>\n",
1616
+ " <td>{'Steps': '1. Search the web for “Audre Lorde ...</td>\n",
1617
+ " </tr>\n",
1618
+ " <tr>\n",
1619
+ " <th>11</th>\n",
1620
+ " <td>45</td>\n",
1621
+ " <td>840bfca7-4f7b-481a-8794-c560c340185d</td>\n",
1622
+ " <td>On June 6, 2023, an article by Carolyn Collins...</td>\n",
1623
+ " <td>1</td>\n",
1624
+ " <td>80GSFC21M0002</td>\n",
1625
+ " <td></td>\n",
1626
+ " <td></td>\n",
1627
+ " <td>{'Steps': '1. Google \"June 6, 2023 Carolyn Col...</td>\n",
1628
+ " </tr>\n",
1629
+ " <tr>\n",
1630
+ " <th>12</th>\n",
1631
+ " <td>46</td>\n",
1632
+ " <td>a0068077-79f4-461a-adfe-75c1a4148545</td>\n",
1633
+ " <td>What was the actual enrollment count of the cl...</td>\n",
1634
+ " <td>1</td>\n",
1635
+ " <td>90</td>\n",
1636
+ " <td></td>\n",
1637
+ " <td></td>\n",
1638
+ " <td>{'Steps': '1. Searched \"nih\" on Google search....</td>\n",
1639
+ " </tr>\n",
1640
+ " <tr>\n",
1641
+ " <th>13</th>\n",
1642
+ " <td>50</td>\n",
1643
+ " <td>a0c07678-e491-4bbc-8f0b-07405144218f</td>\n",
1644
+ " <td>Who are the pitchers with the number before an...</td>\n",
1645
+ " <td>1</td>\n",
1646
+ " <td>Yoshida, Uehara</td>\n",
1647
+ " <td></td>\n",
1648
+ " <td></td>\n",
1649
+ " <td>{'Steps': '1. Look up Taishō Tamai on Wikipedi...</td>\n",
1650
+ " </tr>\n",
1651
+ " </tbody>\n",
1652
+ "</table>\n",
1653
+ "</div>"
1654
+ ],
1655
+ "text/plain": [
1656
+ " index task_id \\\n",
1657
+ "0 3 5d0080cb-90d7-4712-bc33-848150e917d3 \n",
1658
+ "1 5 46719c30-f4c3-4cad-be07-d5cb21eee6bb \n",
1659
+ "2 12 b816bfce-3d80-4913-a07d-69b752ce6377 \n",
1660
+ "3 15 b415aba4-4b68-4fc6-9b89-2c812e55a3e1 \n",
1661
+ "4 17 935e2cff-ae78-4218-b3f5-115589b19dae \n",
1662
+ "5 19 5188369a-3bbe-43d8-8b94-11558f909a08 \n",
1663
+ "6 38 7673d772-ef80-4f0f-a602-1bf4485c9b43 \n",
1664
+ "7 39 c365c1c7-a3db-4d5e-a9a1-66f56eae7865 \n",
1665
+ "8 40 7d4a7d1d-cac6-44a8-96e8-ea9584a70825 \n",
1666
+ "9 42 3f57289b-8c60-48be-bd80-01f8099ca449 \n",
1667
+ "10 43 23dd907f-1261-4488-b21c-e9185af91d5e \n",
1668
+ "11 45 840bfca7-4f7b-481a-8794-c560c340185d \n",
1669
+ "12 46 a0068077-79f4-461a-adfe-75c1a4148545 \n",
1670
+ "13 50 a0c07678-e491-4bbc-8f0b-07405144218f \n",
1671
+ "\n",
1672
+ " Question Level \\\n",
1673
+ "0 What was the volume in m^3 of the fish bag tha... 1 \n",
1674
+ "1 Of the authors (First M. Last) that worked on ... 1 \n",
1675
+ "2 In Emily Midkiff's June 2014 article in a jour... 1 \n",
1676
+ "3 In Nature journal's Scientific Reports confere... 1 \n",
1677
+ "4 In the year 2022, and before December, what do... 1 \n",
1678
+ "5 What writer is quoted by Merriam-Webster for t... 1 \n",
1679
+ "6 On Cornell Law School website's legal informat... 1 \n",
1680
+ "7 Of the cities within the United States where U... 1 \n",
1681
+ "8 According to Girls Who Code, how long did it t... 1 \n",
1682
+ "9 How many at bats did the Yankee with the most ... 1 \n",
1683
+ "10 In Audre Lorde’s poem “Father Son and Holy Gho... 1 \n",
1684
+ "11 On June 6, 2023, an article by Carolyn Collins... 1 \n",
1685
+ "12 What was the actual enrollment count of the cl... 1 \n",
1686
+ "13 Who are the pitchers with the number before an... 1 \n",
1687
+ "\n",
1688
+ " Final answer file_name file_path \\\n",
1689
+ "0 0.1777 \n",
1690
+ "1 Mapping Human Oriented Information to Software... \n",
1691
+ "2 fluffy \n",
1692
+ "3 diamond \n",
1693
+ "4 research \n",
1694
+ "5 Annie Levin \n",
1695
+ "6 inference \n",
1696
+ "7 Braintree, Honolulu \n",
1697
+ "8 22 \n",
1698
+ "9 519 \n",
1699
+ "10 2 \n",
1700
+ "11 80GSFC21M0002 \n",
1701
+ "12 90 \n",
1702
+ "13 Yoshida, Uehara \n",
1703
+ "\n",
1704
+ " Annotator Metadata \n",
1705
+ "0 {'Steps': '1. Searched '\"Can Hiccup Supply Eno... \n",
1706
+ "1 {'Steps': '1. Searched \"Pie Menus or Linear Me... \n",
1707
+ "2 {'Steps': '1. Searched \"Hreidmar's sons\" on Go... \n",
1708
+ "3 {'Steps': '1. Searched \"nature scientific repo... \n",
1709
+ "4 {'Steps': '1. Searched \"legume wikipedia\" on G... \n",
1710
+ "5 {'Steps': '1. Search \"merriam-webster word of ... \n",
1711
+ "6 {'Steps': '1. Searched \"Cornell Law School leg... \n",
1712
+ "7 {'Steps': '1. Searched \"cities where us presid... \n",
1713
+ "8 {'Steps': '1. Searched \"Girls Who Code\" on Goo... \n",
1714
+ "9 {'Steps': '1. Search \"yankee stats\" to find th... \n",
1715
+ "10 {'Steps': '1. Search the web for “Audre Lorde ... \n",
1716
+ "11 {'Steps': '1. Google \"June 6, 2023 Carolyn Col... \n",
1717
+ "12 {'Steps': '1. Searched \"nih\" on Google search.... \n",
1718
+ "13 {'Steps': '1. Look up Taishō Tamai on Wikipedi... "
1719
+ ]
1720
+ },
1721
+ "execution_count": 18,
1722
+ "metadata": {},
1723
+ "output_type": "execute_result"
1724
+ }
1725
+ ],
1726
+ "source": [
1727
+ "sample_tasks = gaia_df[gaia_df[\"task_id\"].map(lambda row: row in sample_task_ids)]\n",
1728
+ "sample_tasks.reset_index(inplace=True)\n",
1729
+    "sample_tasks"
1730
+ ]
1731
+ },
1732
+ {
1733
+ "cell_type": "code",
1734
+ "execution_count": null,
1735
+ "id": "0c3a2fc9",
1736
+ "metadata": {},
1737
+ "outputs": [],
1738
+ "source": []
1739
+ },
1740
+ {
1741
+ "cell_type": "markdown",
1742
+ "id": "1b8f4daf",
1743
+ "metadata": {},
1744
+ "source": [
1745
+ "#### 2. Tool PoC"
1746
+ ]
1747
+ },
1748
+ {
1749
+ "cell_type": "code",
1750
+ "execution_count": 19,
1751
+ "id": "66478316",
1752
+ "metadata": {},
1753
+ "outputs": [],
1754
+ "source": [
1755
+ "from langchain_community.tools.playwright.utils import create_async_playwright_browser\n",
1756
+ "from langchain_community.agent_toolkits import PlayWrightBrowserToolkit\n",
1757
+ "\n",
1758
+ "PlayWrightBrowserToolkit.model_rebuild()"
1759
+ ]
1760
+ },
1761
+ {
1762
+ "cell_type": "code",
1763
+ "execution_count": 21,
1764
+ "id": "597fcf82",
1765
+ "metadata": {},
1766
+ "outputs": [],
1767
+ "source": [
1768
+ "import nest_asyncio # Just for jupyter notebooks\n",
1769
+ "\n",
1770
+ "nest_asyncio.apply()\n",
1771
+ "\n",
1772
+ "# Create async browser context to launch browser \n",
1773
+ "async_browser = create_async_playwright_browser()\n",
1774
+ "toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser)\n",
1775
+ "tools = toolkit.get_tools()"
1776
+ ]
1777
+ },
1778
+ {
1779
+ "cell_type": "code",
1780
+ "execution_count": 22,
1781
+ "id": "35333b1e",
1782
+ "metadata": {},
1783
+ "outputs": [],
1784
+ "source": [
1785
+ "from langgraph.prebuilt import create_react_agent\n",
1786
+ "from langchain_openai import ChatOpenAI\n",
1787
+ "\n",
1788
+ "llm = ChatOpenAI(model=\"gpt-4o\", temperature=0.0)\n",
1789
+ "agent = create_react_agent(model=llm, tools=tools)"
1790
+ ]
1791
+ },
1792
+ {
1793
+ "cell_type": "markdown",
1794
+ "id": "0a9abe8b",
1795
+ "metadata": {},
1796
+ "source": [
1797
+ "#### 3. Test Tool"
1798
+ ]
1799
+ },
1800
+ {
1801
+ "cell_type": "code",
1802
+ "execution_count": 24,
1803
+ "id": "8974a8ff",
1804
+ "metadata": {},
1805
+ "outputs": [],
1806
+ "source": [
1807
+ "sys_message = \"\"\n",
1808
+ "\n",
1809
+ "with open(\"../../prompts/agent/gaia_system_message.md\", \"r\") as f:\n",
1810
+ " for line in f:\n",
1811
+ " sys_message += line"
1812
+ ]
1813
+ },
1814
+ {
1815
+ "cell_type": "code",
1816
+ "execution_count": 26,
1817
+ "id": "35d2e16b",
1818
+ "metadata": {},
1819
+ "outputs": [],
1820
+ "source": [
1821
+ "results = [\n",
1822
+ " {\n",
1823
+ " \"task_id\": task_id,\n",
1824
+ " \"answer\": None\n",
1825
+ " }\n",
1826
+ " for task_id in sample_task_ids\n",
1827
+ "]"
1828
+ ]
1829
+ },
1830
+ {
1831
+ "cell_type": "code",
1832
+ "execution_count": 53,
1833
+ "id": "08932eb5",
1834
+ "metadata": {},
1835
+ "outputs": [
1836
+ {
1837
+ "data": {
1838
+ "text/plain": [
1839
+ "[ClickTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1840
+ " NavigateTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1841
+ " NavigateBackTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1842
+ " ExtractTextTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1843
+ " ExtractHyperlinksTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1844
+ " GetElementsTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>),\n",
1845
+ " CurrentWebPageTool(async_browser=<Browser type=<BrowserType name=chromium executable_path=/home/santiagoal/.cache/ms-playwright/chromium-1179/chrome-linux/chrome> version=138.0.7204.23>)]"
1846
+ ]
1847
+ },
1848
+ "execution_count": 53,
1849
+ "metadata": {},
1850
+ "output_type": "execute_result"
1851
+ }
1852
+ ],
1853
+ "source": [
1854
+ "tools"
1855
+ ]
1856
+ },
1857
+ {
1858
+ "cell_type": "code",
1859
+ "execution_count": 65,
1860
+ "id": "b1ddc9a5",
1861
+ "metadata": {},
1862
+ "outputs": [],
1863
+ "source": [
1864
+ "import time\n",
1865
+ "for task_item in results:\n",
1866
+ " task_id = task_item[\"task_id\"]\n",
1867
+ " answer = task_item[\"answer\"]\n",
1868
+ "\n",
1869
+ " # get question\n",
1870
+ " question = sample_tasks[sample_tasks[\"task_id\"]==task_id] # Filter record\n",
1871
+ " #print(question)\n",
1872
+ " question_processed = question[\"Question\"].iloc[0] # filter field\n",
1873
+ " \n",
1874
+ " # fill question with sys message \n",
1875
+ " #question_processed = sys_message + \"\\n\" + question\n",
1876
+ " #print(question_processed)\n",
1877
+ "\n",
1878
+ " # pass processed question to react agent\n",
1879
+ "\n",
1880
+ " agent_response = agent.invoke(input={\"messages\": question_processed})\n",
1881
+ " task_item[\"answer\"] = agent_response[\"messages\"][-1].content\n",
1882
+ " time.sleep(4)\n"
1883
+ ]
1884
+ },
1885
+ {
1886
+ "cell_type": "code",
1887
+ "execution_count": 58,
1888
+ "id": "95e8c66e",
1889
+ "metadata": {},
1890
+ "outputs": [
1891
+ {
1892
+ "name": "stdout",
1893
+ "output_type": "stream",
1894
+ "text": [
1895
+ "content='It seems there is an issue with accessing the webpage directly. You can try visiting the [University of Leicester\\'s journal page](https://journals.le.ac.uk/ojs1/index.php/jist/article/view/2985) directly in your browser to find the information about the fish bag volume in the paper \"Can Hiccup Supply Enough Fish to Maintain a Dragon’s Diet?\". If you have any other questions or need further assistance, feel free to ask!' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 93, 'prompt_tokens': 453, 'total_tokens': 546, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZL932hfEp4hA2xH97IFjkieQe5r', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--a49dce90-e0e0-49cb-9e4f-3666cab83ad0-0' usage_metadata={'input_tokens': 453, 'output_tokens': 93, 'total_tokens': 546, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1896
+ "content='It seems there\\'s an issue with navigating to external websites directly. However, you can manually search for the paper \"Pie Menus or Linear Menus, Which Is Better?\" on Google Scholar to find the authors. Once you have the list of authors, you can search for their publication history to identify the first paper authored by the one with prior publications. If you need further assistance with this process, feel free to ask!' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 85, 'prompt_tokens': 382, 'total_tokens': 467, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZLH0iQmPE8EmQEJB4kDWOyLX0b5', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--c04f3176-6860-4449-bc9d-6abe56c8b776-0' usage_metadata={'input_tokens': 382, 'output_tokens': 85, 'total_tokens': 467, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1897
+ "content='It seems I can\\'t access external websites directly to retrieve the article. However, I can guide you on how to find the information.\\n\\n1. **Journal Name**: The journal you\\'re referring to is likely \"Saga-Book,\" named after Fafnir, one of Hreidmar\\'s sons who guarded his house in Norse mythology.\\n\\n2. **Finding the Article**:\\n - Visit a database like JSTOR or your institution\\'s library portal.\\n - Search for Emily Midkiff\\'s article from June 2014 in the \"Saga-Book\" journal.\\n\\n3. **Identifying the Quoted Word**:\\n - Once you have access to the article, look for sections discussing dragon depictions.\\n - Pay attention to any quotes from other authors expressing distaste for these depictions.\\n\\nIf you have access to the article, you can follow these steps to find the specific word quoted in distaste. If you need further assistance, feel free to ask!' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 197, 'prompt_tokens': 400, 'total_tokens': 597, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZLQLqyUJ9gQ7hpaOyWXimDsurFM', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--527dd0d5-c1b9-4187-9eba-277f3ec9c483-0' usage_metadata={'input_tokens': 400, 'output_tokens': 197, 'total_tokens': 597, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1898
+ "content='It seems there is an issue with accessing the webpage directly. You can visit the [Nature Scientific Reports conference proceedings for 2012](https://www.nature.com/srep/conference-proceedings/2012) and search for the article that does not mention plasmons or plasmonics to find the nano-compound studied. If you have any other questions or need further assistance, feel free to ask!' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 84, 'prompt_tokens': 399, 'total_tokens': 483, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZLcnLiuIxBJnmbzCBIoLxXJ6MuH', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--1e784dc3-7a50-4766-a08b-be3d1c5bcf00-0' usage_metadata={'input_tokens': 399, 'output_tokens': 84, 'total_tokens': 483, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1899
+ "content='It seems I am unable to access the webpage directly at the moment. However, I can guide you on how to find this information:\\n\\n1. Go to the [Legume Wikipedia page](https://en.wikipedia.org/wiki/Legume).\\n2. On the left-hand side, click on \"View history\" to access the page\\'s edit history.\\n3. Look for any public logs or discussions related to content violations.\\n4. The three core policies often referred to in Wikipedia are \"Neutral point of view,\" \"Verifiability,\" and \"No original research.\" The \"R\" in these policies typically stands for \"Research\" in \"No original research.\"\\n\\nIf you need further assistance, you can try accessing the page directly through your browser.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 150, 'prompt_tokens': 380, 'total_tokens': 530, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZLkx0v7VlxdKIKROu9Utpvrhooh', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--aad2df13-a5e9-4b39-93fa-b2bc73a46839-0' usage_metadata={'input_tokens': 380, 'output_tokens': 150, 'total_tokens': 530, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1900
+ "content=\"It seems there is an issue with accessing the Merriam-Webster website directly. You can visit the [Merriam-Webster Word of the Day Archive](https://www.merriam-webster.com/word-of-the-day/calendar) and navigate to June 27, 2022, to find the writer quoted for that day's word.\" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 70, 'prompt_tokens': 372, 'total_tokens': 442, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZLtOIpU3OPCd5tkbwTt5YLchm7X', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--34ea79ab-6408-495f-bf96-24571d945e22-0' usage_metadata={'input_tokens': 372, 'output_tokens': 70, 'total_tokens': 442, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1901
+ "content='It seems there is an issue with navigating to the website directly. However, I can guide you on how to find the information you need:\\n\\n1. **Visit the Cornell Law School\\'s Legal Information Institute website**: Go to [https://www.law.cornell.edu/](https://www.law.cornell.edu/).\\n\\n2. **Access the Federal Rules**: Look for a section or link related to \"Federal Rules\" on the homepage. This might be under a menu or a direct link.\\n\\n3. **Find the Fifth Section Alphabetically**: Once you are in the Federal Rules section, identify the fifth section alphabetically. This could be something like \"Federal Rules of Evidence\" or similar.\\n\\n4. **Search for Articles with \"Witnesses\" in the Title**: Within the identified section, look for articles or rules that have \"witnesses\" in their titles.\\n\\n5. **Check the Amendments**: For the first rule in the article with the most titles containing \"witnesses,\" look at the amendment history. The website should provide details on what changes were made, including any words that were deleted.\\n\\nIf you follow these steps, you should be able to find the specific word that was deleted in the last amendment. If you need further assistance, please let me know!' 
additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 267, 'prompt_tokens': 392, 'total_tokens': 659, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZM19gRtBkrW7j9zs384oBAetICC', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--64bb8616-03b7-45bd-a560-20e30d813c1e-0' usage_metadata={'input_tokens': 392, 'output_tokens': 267, 'total_tokens': 659, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1902
+ "content='Honolulu, Norfolk' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 292, 'total_tokens': 297, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMBOrJ52lxUEKqPl9643vEG3Uur', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--5b4d6367-af83-4664-8a8a-97013f00bb34-0' usage_metadata={'input_tokens': 292, 'output_tokens': 5, 'total_tokens': 297, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1903
+ "content='I am unable to access the Girls Who Code website directly. However, I can guide you on how to find this information:\\n\\n1. Visit the [Girls Who Code website](https://girlswhocode.com).\\n2. Look for sections related to statistics or reports on women in computer science.\\n3. Check for any historical data or reports that mention changes in the percentage of women in computer science over the years.\\n\\nIf you have any specific questions or need further assistance, feel free to ask!' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 99, 'prompt_tokens': 369, 'total_tokens': 468, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMJSFKAE3PbT5yaQjwOStt5kH5I', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--efa3cfa7-ed0e-4cf7-8b9b-89fb435df512-0' usage_metadata={'input_tokens': 369, 'output_tokens': 99, 'total_tokens': 468, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1904
+ "content='In the 1977 regular season, the New York Yankees player with the most walks was Reggie Jackson, who had 86 walks. During that same season, Reggie Jackson had 582 at-bats.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 44, 'prompt_tokens': 263, 'total_tokens': 307, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMPTirhNjOLLpQ7aY97f3kiwgKc', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--2bdd593f-25c6-4447-b61f-b4b89a341835-0' usage_metadata={'input_tokens': 263, 'output_tokens': 44, 'total_tokens': 307, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1905
+ "content='It seems there is an issue with accessing the webpage directly. However, you can find the poem \"Father Son and Holy Ghost\" by Audre Lorde on the Poetry Foundation website or other poetry resources online. Look for the stanza where some lines are indented to identify its number. If you have access to the text, you can also share it here, and I can help analyze it for you.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 82, 'prompt_tokens': 452, 'total_tokens': 534, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMYHeM8Ya1yxiL0hu5TKzmzHIrq', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--0617cb01-94a9-4b32-897d-27c0b5e4d401-0' usage_metadata={'input_tokens': 452, 'output_tokens': 82, 'total_tokens': 534, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1906
+ "content=\"It seems there is an issue with navigating to the Universe Today website. Unfortunately, I can't directly access the web to find the article and the linked paper. However, you can visit the [Universe Today website](https://www.universetoday.com/) and search for the article by Carolyn Collins Petersen published on June 6, 2023. Once you find the article, you can look for the link to the paper at the bottom of the article to find the NASA award number associated with R. G. Arendt's work.\" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 109, 'prompt_tokens': 452, 'total_tokens': 561, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMi1y6B1LLAZANMww4hNlhi8IIF', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--7f8ca547-269d-4543-a918-f7a0698a203a-0' usage_metadata={'input_tokens': 452, 'output_tokens': 109, 'total_tokens': 561, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1907
+ "content='It seems there\\'s an issue with accessing the website directly. You can visit [ClinicalTrials.gov](https://clinicaltrials.gov) and search for the clinical trial on H. pylori in acne vulgaris patients to find the actual enrollment count. Use keywords like \"H. pylori,\" \"acne vulgaris,\" and the date range \"Jan-May 2018\" to narrow down the search results.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 83, 'prompt_tokens': 365, 'total_tokens': 448, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZMrw062kbVH3s0i4Vx8s3sjopAH', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--f2f168c9-9531-42ed-b0ec-9b7ee9ed5234-0' usage_metadata={'input_tokens': 365, 'output_tokens': 83, 'total_tokens': 448, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n",
1908
+ "content=\"It seems there's an issue with accessing the website directly. You can visit the [official NPB website](https://npb.jp/bis/eng/teams/index2023.html) to find the team rosters and identify the pitchers with numbers before and after Taishō Tamai's number. If you have any other questions or need further assistance, feel free to ask!\" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 77, 'prompt_tokens': 454, 'total_tokens': 531, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_07871e2ad8', 'id': 'chatcmpl-BmZN15zPT7foUv3zhjFLjSqzUzZCt', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--4ae3e703-097f-4b65-b682-f1fada191dcd-0' usage_metadata={'input_tokens': 454, 'output_tokens': 77, 'total_tokens': 531, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}\n"
1909
+ ]
1910
+ }
1911
+ ],
1912
+ "source": [
1913
+ "for i in range(len(sample_task_ids)):\n",
1914
+    "    print(results[i][\"answer\"])"
1915
+ ]
1916
+ },
1917
+ {
1918
+ "cell_type": "code",
1919
+ "execution_count": 68,
1920
+ "id": "12119c36",
1921
+ "metadata": {},
1922
+ "outputs": [],
1923
+ "source": [
1924
+ "pd.DataFrame(results).to_csv(\"../../data/temp/results_web_search.csv\", index=False)"
1925
+ ]
1926
+ },
1927
+ {
1928
+ "cell_type": "markdown",
1929
+ "id": "b0e4521f",
1930
+ "metadata": {},
1931
+ "source": [
1932
+ "### Handle Images Tool (Object detection)"
1933
+ ]
1934
+ },
1935
+ {
1936
+ "cell_type": "markdown",
1937
+ "id": "4cd203e5",
1938
+ "metadata": {},
1939
+ "source": [
1940
+    "#### 1. Filter tasks to solve with the tool"
1941
+ ]
1942
+ },
1943
+ {
1944
+ "cell_type": "code",
1945
+ "execution_count": null,
1946
+ "id": "109858cf",
1947
+ "metadata": {},
1948
+ "outputs": [],
1949
+ "source": []
1950
+ },
1951
+ {
1952
+ "cell_type": "markdown",
1953
+ "id": "f7d03390",
1954
+ "metadata": {},
1955
+ "source": [
1956
+ "#### 2. Tool PoC"
1957
+ ]
1958
+ },
1959
+ {
1960
+ "cell_type": "code",
1961
+ "execution_count": null,
1962
+ "id": "ca297e23",
1963
+ "metadata": {},
1964
+ "outputs": [],
1965
+ "source": []
1966
+ },
1967
+ {
1968
+ "cell_type": "markdown",
1969
+ "id": "4b926609",
1970
+ "metadata": {},
1971
+ "source": [
1972
+ "#### 3. Test Tool"
1973
+ ]
1974
+ },
1975
+ {
1976
+ "cell_type": "markdown",
1977
+ "id": "adc49004",
1978
+ "metadata": {},
1979
+ "source": []
1980
+ },
1981
+ {
1982
+ "cell_type": "code",
1983
+ "execution_count": null,
1984
+ "id": "efb50438",
1985
+ "metadata": {},
1986
+ "outputs": [],
1987
+ "source": []
1988
+ }
1989
+ ],
1990
+ "metadata": {
1991
+ "kernelspec": {
1992
+ "display_name": "chappie-CHLGiFC_-py3.11",
1993
+ "language": "python",
1994
+ "name": "python3"
1995
+ },
1996
+ "language_info": {
1997
+ "codemirror_mode": {
1998
+ "name": "ipython",
1999
+ "version": 3
2000
+ },
2001
+ "file_extension": ".py",
2002
+ "mimetype": "text/x-python",
2003
+ "name": "python",
2004
+ "nbconvert_exporter": "python",
2005
+ "pygments_lexer": "ipython3",
2006
+ "version": "3.11.13"
2007
+ }
2008
+ },
2009
+ "nbformat": 4,
2010
+ "nbformat_minor": 5
2011
+ }