Paperbag committed on
Commit
d4fac05
·
1 Parent(s): 09be979

Refactor LLM provider invocation and add new providers

Browse files

- Updated `invoke_llm` function to improve provider fallback logic and error handling.
- Added new LLM providers: OpenRouter, Together, ZAI, HF Inference, and Opencode Zen.
- Modified provider order to include new providers and adjusted model retrieval logic.
- Enhanced error handling for rate limits and other exceptions during invocation.
- Added spreadsheet parsing functionality to read Excel and CSV files.
- Improved web search tool to use DDGS for better results.
- Introduced a new tool for fetching full Wikipedia page content.
- Updated local run script to handle AI message extraction more robustly.
- Added environment variable checks in `test_env.py` for better debugging.

__pycache__/agent.cpython-39.pyc CHANGED
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
 
agent.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import re
 
3
  from typing import TypedDict, List, Union
4
 
5
  from dotenv import load_dotenv
@@ -15,15 +16,15 @@ load_dotenv()
15
  class AgentState(TypedDict):
16
  messages: List[Union[HumanMessage, AIMessage, SystemMessage, ToolMessage]]
17
  reflection_count: int
 
 
18
 
19
 
20
  def _invoke_llm_with_tools(messages, fallback_count=0):
21
- """Invoke LLM with provider fallback."""
22
  return invoke_llm(messages, tools, fallback_count)
23
 
24
- # --- Helper Functions ---
25
  def is_reversed_text(question: str) -> bool:
26
- """Check if text appears to be reversed."""
27
  words = question.split()
28
  if len(words) < 3:
29
  return False
@@ -35,75 +36,64 @@ def is_reversed_text(question: str) -> bool:
35
  orig_valid = len([w for w in orig_words if w in common_words])
36
  return rev_valid > orig_valid
37
 
38
- # --- Graph Nodes ---
 
 
 
 
 
39
  def call_model(state: AgentState):
40
- messages = state["messages"]
41
-
42
- # Pre-processing: Detect and handle reversed text in the first message
43
  if len(messages) == 1 and isinstance(messages[0], HumanMessage):
44
  user_msg = messages[0].content
45
  if is_reversed_text(user_msg):
46
  fixed_msg = user_msg[::-1]
47
  messages = [HumanMessage(content=f"The following message was detected as reversed. I have reversed it back for you:\n{fixed_msg}")]
48
-
49
- # Add System Message if not present
50
  if not any(isinstance(m, SystemMessage) for m in messages):
51
- system_prompt = """You are a highly capable General AI Assistant (GAIA). Your goal is to solve complex, multi-step tasks.
52
-
53
- Your thought process MUST be methodical:
54
- 1. THINK:
55
- - Analyze the question deeply. Identify the core goal and ALL constraints (units, date formats, precision, etc.).
56
- - If the task involves an image or video, describe the visual elements before attempting to solve.
57
- - Plan your steps. Break the problem into smaller sub-problems.
58
- 2. ACT (Python-First):
59
- - Use `python_repl` for ANY task involving: math, counting, data analysis, list filtering (e.g., botany), or verifying logic (e.g., commutativity). DO NOT do these manually.
60
- - Use `web_search` for initial discovery and `browse_url` to verify details from the source.
61
- 3. OBSERVE: Carefully review tool outputs. If a result is ambiguous, search for a second source to triangulate.
62
- 4. REFINE: Question your assumptions. If the answer seems too simple for a complex GAIA task, you likely missed a constraint.
63
- 5. VERIFY: Before finalizing, double-check units and precision.
64
- 6. FINALIZE: Provide the result in the exact format: FINAL ANSWER: <answer>.
65
-
66
- Guidelines:
67
- - [Attached Files]: Always use `read_file` for local files.
68
- - Research: Don't trust a single snippet; browse the full page if the answer is buried.
69
- - Constraints: If the question says 'alphabetize' or 'comma-separated', use Python to ensure it is perfect.
70
- - Final Output: Return ONLY the final answer in the requested format.
71
- """
72
- messages = [SystemMessage(content=system_prompt)] + messages
73
 
74
  response = _invoke_llm_with_tools(messages)
75
- return {"messages": [response]}
 
76
 
77
  def reflect(state: AgentState):
78
- """Node to reflect on the final answer and verify correctness."""
79
  messages = state["messages"]
80
  last_message = messages[-1]
81
-
82
  if "FINAL ANSWER:" not in last_message.content:
83
- return {"messages": []} # Should not happen based on routing
84
 
85
  reflection_prompt = (
86
- "You have provided a FINAL ANSWER. Before we finish, please perform a final a self-critique:\n"
87
  "1. Did you miss any constraints from the original question?\n"
88
- "2. Are the units and precision exactly as requested?\n"
89
- "3. Is there any step in your reasoning that could be flawed?\n"
90
- "If the answer is correct, simply repeat the FINAL ANSWER: <answer> exactly as before.\n"
91
- "If you find an error, explain it and provide a corrected FINAL ANSWER: <answer>."
92
  )
93
-
94
- # We add the reflection prompt as a human message to trigger a new response
95
  response = _invoke_llm_with_tools(messages + [HumanMessage(content=reflection_prompt)])
96
- return {"messages": [response], "reflection_count": state.get("reflection_count", 0) + 1}
 
97
 
98
  def call_tool(state: AgentState):
99
  messages = state["messages"]
100
  last_message = messages[-1]
101
-
 
 
 
102
  tool_outputs = []
103
- for tool_call in last_message.tool_calls:
 
104
  tool_name = tool_call["name"]
105
  tool_args = tool_call["args"]
106
-
 
 
 
107
  if tool_name not in tools_by_name:
108
  tool_outputs.append(ToolMessage(
109
  content=f"Error: Tool {tool_name} not found.",
@@ -111,13 +101,16 @@ def call_tool(state: AgentState):
111
  name=tool_name
112
  ))
113
  continue
114
-
115
  tool = tools_by_name[tool_name]
116
- print(f"Calling tool: {tool_name} with args: {tool_args}")
117
  try:
118
  output = tool.invoke(tool_args)
 
 
 
119
  tool_outputs.append(ToolMessage(
120
- content=str(output),
121
  tool_call_id=tool_call["id"],
122
  name=tool_name
123
  ))
@@ -127,18 +120,34 @@ def call_tool(state: AgentState):
127
  tool_call_id=tool_call["id"],
128
  name=tool_name
129
  ))
130
- return {"messages": tool_outputs}
 
131
 
132
  def should_continue(state: AgentState):
133
  messages = state["messages"]
134
  last_message = messages[-1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  if hasattr(last_message, "tool_calls") and last_message.tool_calls:
136
  return "action"
137
- if "FINAL ANSWER:" in last_message.content and state.get("reflection_count", 0) == 0:
 
138
  return "reflect"
139
  return END
140
 
141
- # --- Graph Construction ---
142
  def build_graph():
143
  workflow = StateGraph(AgentState)
144
  workflow.add_node("agent", call_model)
 
1
  import os
2
  import re
3
+ from collections import Counter
4
  from typing import TypedDict, List, Union
5
 
6
  from dotenv import load_dotenv
 
16
  class AgentState(TypedDict):
17
  messages: List[Union[HumanMessage, AIMessage, SystemMessage, ToolMessage]]
18
  reflection_count: int
19
+ tool_call_count: int
20
+ tool_call_history: List[str]
21
 
22
 
23
  def _invoke_llm_with_tools(messages, fallback_count=0):
 
24
  return invoke_llm(messages, tools, fallback_count)
25
 
26
+
27
  def is_reversed_text(question: str) -> bool:
 
28
  words = question.split()
29
  if len(words) < 3:
30
  return False
 
36
  orig_valid = len([w for w in orig_words if w in common_words])
37
  return rev_valid > orig_valid
38
 
39
+
40
+ SYSTEM_PROMPT = """Answer with FINAL ANSWER: <value>.
41
+ Use tools to research. Read full Wikipedia pages (browse_url) rather than just searching.
42
+ Never repeat the same tool call."""
43
+
44
+
45
  def call_model(state: AgentState):
46
+ messages = list(state["messages"])
47
+
 
48
  if len(messages) == 1 and isinstance(messages[0], HumanMessage):
49
  user_msg = messages[0].content
50
  if is_reversed_text(user_msg):
51
  fixed_msg = user_msg[::-1]
52
  messages = [HumanMessage(content=f"The following message was detected as reversed. I have reversed it back for you:\n{fixed_msg}")]
53
+
 
54
  if not any(isinstance(m, SystemMessage) for m in messages):
55
+ messages = [SystemMessage(content=SYSTEM_PROMPT)] + messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  response = _invoke_llm_with_tools(messages)
58
+ return {"messages": state["messages"] + [response]}
59
+
60
 
61
  def reflect(state: AgentState):
 
62
  messages = state["messages"]
63
  last_message = messages[-1]
64
+
65
  if "FINAL ANSWER:" not in last_message.content:
66
+ return {"messages": []}
67
 
68
  reflection_prompt = (
69
+ "Before finalizing, double-check:\n"
70
  "1. Did you miss any constraints from the original question?\n"
71
+ "2. Are units and precision exactly as requested?\n"
72
+ "3. Could any step in reasoning be flawed?\n"
73
+ "If correct, repeat FINAL ANSWER: <answer> exactly.\n"
74
+ "If wrong, explain and provide corrected FINAL ANSWER: <answer>."
75
  )
76
+
 
77
  response = _invoke_llm_with_tools(messages + [HumanMessage(content=reflection_prompt)])
78
+ return {"messages": state["messages"] + [response], "reflection_count": state.get("reflection_count", 0) + 1}
79
+
80
 
81
  def call_tool(state: AgentState):
82
  messages = state["messages"]
83
  last_message = messages[-1]
84
+
85
+ tool_call_history = state.get("tool_call_history", [])
86
+ tool_call_count = state.get("tool_call_count", 0)
87
+
88
  tool_outputs = []
89
+ # Limit 5 tool calls per response
90
+ for tool_call in last_message.tool_calls[:5]:
91
  tool_name = tool_call["name"]
92
  tool_args = tool_call["args"]
93
+ key = f"{tool_name}({tool_args})"
94
+ tool_call_history.append(key)
95
+ tool_call_count += 1
96
+
97
  if tool_name not in tools_by_name:
98
  tool_outputs.append(ToolMessage(
99
  content=f"Error: Tool {tool_name} not found.",
 
101
  name=tool_name
102
  ))
103
  continue
104
+
105
  tool = tools_by_name[tool_name]
106
+ print(f"Calling tool: {tool_name} with args: {tool_args}", flush=True)
107
  try:
108
  output = tool.invoke(tool_args)
109
+ output_str = str(output)
110
+ if len(output_str) > 15000:
111
+ output_str = output_str[:15000] + "\n...[truncated]"
112
  tool_outputs.append(ToolMessage(
113
+ content=output_str,
114
  tool_call_id=tool_call["id"],
115
  name=tool_name
116
  ))
 
120
  tool_call_id=tool_call["id"],
121
  name=tool_name
122
  ))
123
+ return {"messages": state["messages"] + tool_outputs, "tool_call_count": tool_call_count, "tool_call_history": tool_call_history}
124
+
125
 
126
  def should_continue(state: AgentState):
127
  messages = state["messages"]
128
  last_message = messages[-1]
129
+ tool_call_count = state.get("tool_call_count", 0)
130
+ tool_call_history = state.get("tool_call_history", [])
131
+ reflection_count = state.get("reflection_count", 0)
132
+
133
+ # Max 8 tool calls (128K context handles it)
134
+ if tool_call_count >= 8:
135
+ return END
136
+
137
+ # Detect loop: same tool name called 4+ times
138
+ if len(tool_call_history) >= 4:
139
+ tool_names = [h.split("(")[0] for h in tool_call_history]
140
+ if any(tool_names.count(n) >= 4 for n in set(tool_names)):
141
+ return END
142
+
143
  if hasattr(last_message, "tool_calls") and last_message.tool_calls:
144
  return "action"
145
+ content = getattr(last_message, "content", "") or ""
146
+ if "FINAL ANSWER:" in content and reflection_count == 0:
147
  return "reflect"
148
  return END
149
 
150
+
151
  def build_graph():
152
  workflow = StateGraph(AgentState)
153
  workflow.add_node("agent", call_model)
analyze_results.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Analyze existing gaia_results.json and produce a diagnostic report.
3
+ """
4
+ import json
5
+ import re
6
+ import sys
7
+
8
+ # Fix Windows console encoding issues
9
+ try:
10
+ sys.stdout.reconfigure(encoding="utf-8", errors="replace")
11
+ except Exception:
12
+ pass
13
+
14
+
15
+ with open("gaia_results.json", "r") as f:
16
+ data = json.load(f)
17
+
18
+ results = data["results"]
19
+
20
+ # Categorize failures
21
+ categories = {
22
+ "groq_rate_limit_tpm": [],
23
+ "recursion_limit": [],
24
+ "tool_call_format_error": [],
25
+ "wrong_output_format": [],
26
+ "other_error": [],
27
+ "correct": [],
28
+ }
29
+
30
+ for r in results:
31
+ ans = r["submitted_answer"]
32
+ q = r["question"][:80]
33
+ tid = r["task_id"][:8]
34
+ gt = r["ground_truth"]
35
+ is_correct = r["correct"]
36
+
37
+ if is_correct:
38
+ categories["correct"].append(r)
39
+ elif "413" in ans and "tokens per minute" in ans.lower():
40
+ categories["groq_rate_limit_tpm"].append(r)
41
+ elif "Recursion limit" in ans:
42
+ categories["recursion_limit"].append(r)
43
+ elif "tool_use_failed" in ans or "tool call validation" in ans:
44
+ categories["tool_call_format_error"].append(r)
45
+ elif "<|python_tag|>" in ans or ("AGENT ERROR" in ans and "tool" in ans.lower()):
46
+ categories["wrong_output_format"].append(r)
47
+ else:
48
+ categories["other_error"].append(r)
49
+
50
+ print("=" * 70)
51
+ print("GAIA BENCHMARK - FAILURE ANALYSIS REPORT")
52
+ print(f"Score: {data['correct']}/{data['total']} = {data['score']:.0f}%")
53
+ print("=" * 70)
54
+
55
+ print("\n## CATEGORY BREAKDOWN")
56
+ for cat, items in categories.items():
57
+ print(f" {cat}: {len(items)} questions")
58
+
59
+ print("\n" + "=" * 70)
60
+
61
+ for cat, label in [
62
+ ("groq_rate_limit_tpm", "[RATE_LIMIT] GROQ TPM RATE LIMIT (request too large for fallback model)"),
63
+ ("recursion_limit", "[RECURSION] RECURSION LIMIT (agent stuck in tool loop, no answer found)"),
64
+ ("tool_call_format_error", "[FORMAT_ERR] TOOL CALL FORMAT ERROR (LLM generated malformed tool invocations)"),
65
+ ("wrong_output_format", "[WRONG_OUT] WRONG OUTPUT FORMAT (agent returned tool calls as text, not answer)"),
66
+ ("other_error", "[OTHER] OTHER ERROR"),
67
+ ]:
68
+ items = categories[cat]
69
+ if not items:
70
+ continue
71
+ print(f"\n### {label} ({len(items)} questions)")
72
+ for r in items:
73
+ print(f" - [{r['task_id'][:8]}] GT={r['ground_truth'][:40]!r}")
74
+ print(f" Q: {r['question'][:100]}")
75
+ # Classify what tool/skill would solve it
76
+ q_lower = r["question"].lower()
77
+ skills = []
78
+ if "youtube.com" in r["question"]:
79
+ skills.append("YouTube Transcript / Video Analysis")
80
+ if "mp3" in q_lower or "audio" in q_lower or "voice memo" in q_lower or "recording" in q_lower:
81
+ skills.append("Audio Transcription (Whisper)")
82
+ if "image" in q_lower or "chess" in q_lower:
83
+ skills.append("Image Analysis (Vision LLM)")
84
+ if "excel" in q_lower or ".xlsx" in q_lower:
85
+ skills.append("Excel/File Reading")
86
+ if "wikipedia" in q_lower or "wiki" in q_lower:
87
+ skills.append("Wikipedia Search")
88
+ if "paper" in q_lower or "article" in q_lower:
89
+ skills.append("Web Browsing/Research")
90
+ if "python code" in q_lower or "code" in q_lower:
91
+ skills.append("Python REPL execution")
92
+ if "table" in q_lower or "commutative" in q_lower:
93
+ skills.append("Python REPL (logic check)")
94
+ if not skills:
95
+ skills.append("Web Search")
96
+ print(f" Needed: {', '.join(skills)}")
97
+ print()
98
+
99
+ print("\n" + "=" * 70)
100
+ print("## PRIORITIZED IMPROVEMENT AREAS")
101
+ print("""
102
+ 1. CRITICAL - PROVIDER FALLBACK (affects 8 questions):
103
+ - Groq falls back to llama-3.1-8b-instant (6000 TPM limit)
104
+ - Gemini API quota exhausted (free tier daily limit hit)
105
+ - Fix: Use gemini-1.5-flash or gemini-2.5-flash as PRIMARY provider
106
+ - Fix: Add proper provider rotation that skips quota-exhausted models
107
+
108
+ 2. CRITICAL - RECURSION LIMIT (affects 8 questions):
109
+ - Agent loops indefinitely (25 steps) without providing an answer
110
+ - Causes: Tool keeps failing or returning unhelpful results
111
+ - Fix: Add a MAX_TOOL_CALLS guard and force FINAL ANSWER after N iterations
112
+
113
+ 3. HIGH - TOOL CALL FORMAT ERRORS (affects 3 questions):
114
+ - LLM generates tool calls not matching the registered tool names/schema
115
+ - wiki_search called with wrong JSON format
116
+ - web_search called with 'keywords' instead of 'query' parameter
117
+ - Fix: Add function signature validation / tool schema alignment
118
+
119
+ 4. HIGH - AUDIO/VIDEO QUESTIONS (affects 3 questions):
120
+ - YouTube video analysis requires video frames, not just transcript
121
+ - Audio transcription (mp3) failing due to context overflow
122
+ - Fix: Ensure transcribe_audio + get_youtube_transcript work reliably
123
+
124
+ 5. HIGH - IMAGE/CHESS QUESTIONS (affects 1 question):
125
+ - Chess position from image requires multimodal vision model
126
+ - Current setup can't directly process images
127
+ - Fix: Pass image URL to Gemini vision model
128
+
129
+ 6. MEDIUM - CONTEXT OVERFLOW on multi-step research questions:
130
+ - Long Wikipedia/web searches fill context window before finding answer
131
+ - Fix: Summarize intermediate tool results before appending to messages
132
+ """)
app.py CHANGED
@@ -24,7 +24,10 @@ class BasicAgent:
24
  def __call__(self, question: str) -> str:
25
  print(f"Agent received question (first 50 chars): {question[:50]}...")
26
  messages = [HumanMessage(content=question)]
27
- result = self.graph.invoke({"messages": messages})
 
 
 
28
  answer = result['messages'][-1].content
29
  print(f"Agent returning answer: {answer}")
30
  return answer
 
24
  def __call__(self, question: str) -> str:
25
  print(f"Agent received question (first 50 chars): {question[:50]}...")
26
  messages = [HumanMessage(content=question)]
27
+ result = self.graph.invoke(
28
+ {"messages": messages, "tool_call_count": 0, "reflection_count": 0, "tool_call_history": []},
29
+ config={"recursion_limit": 50},
30
+ )
31
  answer = result['messages'][-1].content
32
  print(f"Agent returning answer: {answer}")
33
  return answer
gaia_results.csv CHANGED
@@ -1,11 +1,9 @@
1
  task_id,question,submitted_answer,ground_truth,correct
2
- 8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,"ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': 'tool call validation failed: attempted to call tool \'wiki_search{""query"": ""Mercedes Sosa discography""}\' which was not in request.tools', 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=wiki_search{""query"": ""Mercedes Sosa discography""}></function>'}}",3,False
3
- a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?","ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 29530, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",3,False
4
- 2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI","ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': ""Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details."", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=reverse_text>{""text"": ""The following message was detected as reversed. I have reversed it back for you: If you understand this sentence, write the opposite of the word ""left"" as the answer.""}</function>'}}",Right,False
5
- cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,"ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
6
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",Rd5,False
7
- 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,"ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
8
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",FunkMonk,False
9
  6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
10
 
11
  |*|a|b|c|d|e|
@@ -16,37 +14,30 @@ For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/er
16
  |d|b|e|b|e|d|
17
  |e|d|b|a|d|c|
18
 
19
- provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 9367, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}","b, e",False
20
  9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
21
 
22
- What does Teal'c say in response to the question ""Isn't that hot?""","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
23
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",Extremely,False
24
- cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,"ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': ""Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details."", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=web_search {""keywords"": ""equine veterinarian surname CK-12 license LibreText Introductory Chemistry materials Marisa Alviar-Agnew Henry Agnew""} </function>'}}",Louvrier,False
25
  3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
26
 
27
  milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
28
 
29
- I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
30
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT","broccoli, celery, fresh basil, lettuce, sweet potatoes",False
31
  99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
32
 
33
  In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
34
 
35
- Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.","ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 8415, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}","cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",False
36
- 305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,"ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
37
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",Wojciech,False
38
- f918266a-b3e0-4914-865d-4faa564f1aef,What is the final numeric output from the attached Python code?,"<|python_tag|>web_search{""keywords"": ""definition of artificial intelligence""}; browse_url{""url"": ""https://www.example.com/what-is-ai""}; browse_url{""url"": ""https://www.example.com/ai-definition""}; python_repl{""code"": ""print('Artificial Intelligence (AI) is a field of computer science that focuses on creating intelligent machines that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, and decision-making.')""}",0,False
39
- 3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 6414, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",519,False
40
  1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
41
 
42
- Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
43
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT","132, 133, 134, 197, 245",False
44
- 840bfca7-4f7b-481a-8794-c560c340185d,"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
45
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",80GSFC21M0002,False
46
- bda648d7-d618-4883-88f4-3466eabd860e,Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 7915, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",Saint Petersburg,False
47
- cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
48
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",CUB,False
49
- a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
50
- For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT","Yoshida, Uehara",False
51
- 7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 14486, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",89706.00,False
52
- 5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 23154, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",Claus,False
 
1
  task_id,question,submitted_answer,ground_truth,correct
2
+ 8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,,3,False
3
+ a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,3,False
4
+ 2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",left,Right,False
5
+ cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,,Rd5,False
6
+ 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,,FunkMonk,False
 
 
7
  6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
8
 
9
  |*|a|b|c|d|e|
 
14
  |d|b|e|b|e|d|
15
  |e|d|b|a|d|c|
16
 
17
+ provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","b,e","b, e",False
18
  9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
19
 
20
+ What does Teal'c say in response to the question ""Isn't that hot?""",,Extremely,False
21
+ cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,,Louvrier,False
 
22
  3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
23
 
24
  milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
25
 
26
+ I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, fresh basil, lettuce, sweet potatoes","broccoli, celery, fresh basil, lettuce, sweet potatoes",True
 
27
  99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
28
 
29
  In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
30
 
31
+ Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",False
32
+ 305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,Wojciech,False
33
+ f918266a-b3e0-4914-865d-4faa564f1aef,What is the final numeric output from the attached Python code?,,0,False
34
+ 3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,519,519,True
 
35
  1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
36
 
37
+ Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",,"132, 133, 134, 197, 245",False
38
+ 840bfca7-4f7b-481a-8794-c560c340185d,"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,80GSFC21M0002,False
39
+ bda648d7-d618-4883-88f4-3466eabd860e,Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,Saint Petersburg,Saint Petersburg,True
40
+ cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,CUB,False
41
+ a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",,"Yoshida, Uehara",False
42
+ 7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,89706.00,89706.00,True
43
+ 5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,Claus,False
 
 
 
 
gaia_results.json CHANGED
@@ -1,145 +1,145 @@
1
  {
2
- "score": 0.0,
3
- "correct": 0,
4
  "total": 20,
5
  "results": [
6
  {
7
  "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
8
  "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
9
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': 'tool call validation failed: attempted to call tool \\'wiki_search{\"query\": \"Mercedes Sosa discography\"}\\' which was not in request.tools', 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=wiki_search{\"query\": \"Mercedes Sosa discography\"}></function>'}}",
10
  "ground_truth": "3",
11
  "correct": false
12
  },
13
  {
14
  "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
15
  "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
16
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 29530, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
17
  "ground_truth": "3",
18
  "correct": false
19
  },
20
  {
21
  "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
22
  "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
23
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': \"Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.\", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=reverse_text>{\"text\": \"The following message was detected as reversed. I have reversed it back for you: If you understand this sentence, write the opposite of the word \"left\" as the answer.\"}</function>'}}",
24
  "ground_truth": "Right",
25
  "correct": false
26
  },
27
  {
28
  "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
29
  "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
30
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
31
  "ground_truth": "Rd5",
32
  "correct": false
33
  },
34
  {
35
  "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
36
  "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
37
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
38
  "ground_truth": "FunkMonk",
39
  "correct": false
40
  },
41
  {
42
  "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
43
  "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
44
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 9367, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
45
  "ground_truth": "b, e",
46
  "correct": false
47
  },
48
  {
49
  "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
50
  "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
51
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
52
  "ground_truth": "Extremely",
53
  "correct": false
54
  },
55
  {
56
  "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
57
  "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
58
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': \"Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details.\", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=web_search {\"keywords\": \"equine veterinarian surname CK-12 license LibreText Introductory Chemistry materials Marisa Alviar-Agnew Henry Agnew\"} </function>'}}",
59
  "ground_truth": "Louvrier",
60
  "correct": false
61
  },
62
  {
63
  "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
64
  "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
65
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
66
  "ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
67
- "correct": false
68
  },
69
  {
70
  "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
71
  "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
72
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 8415, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
73
  "ground_truth": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
74
  "correct": false
75
  },
76
  {
77
  "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
78
  "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
79
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
80
  "ground_truth": "Wojciech",
81
  "correct": false
82
  },
83
  {
84
  "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
85
  "question": "What is the final numeric output from the attached Python code?",
86
- "submitted_answer": "<|python_tag|>web_search{\"keywords\": \"definition of artificial intelligence\"}; browse_url{\"url\": \"https://www.example.com/what-is-ai\"}; browse_url{\"url\": \"https://www.example.com/ai-definition\"}; python_repl{\"code\": \"print('Artificial Intelligence (AI) is a field of computer science that focuses on creating intelligent machines that can perform tasks that typically require human intelligence, such as visual perception, speech recognition, and decision-making.')\"}",
87
  "ground_truth": "0",
88
  "correct": false
89
  },
90
  {
91
  "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
92
  "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
93
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 6414, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
94
  "ground_truth": "519",
95
- "correct": false
96
  },
97
  {
98
  "task_id": "1f975693-876d-457b-a649-393859e79bf3",
99
  "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
100
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
101
  "ground_truth": "132, 133, 134, 197, 245",
102
  "correct": false
103
  },
104
  {
105
  "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
106
  "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
107
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
108
  "ground_truth": "80GSFC21M0002",
109
  "correct": false
110
  },
111
  {
112
  "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
113
  "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
114
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 7915, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
115
  "ground_truth": "Saint Petersburg",
116
- "correct": false
117
  },
118
  {
119
  "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
120
  "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
121
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
122
  "ground_truth": "CUB",
123
  "correct": false
124
  },
125
  {
126
  "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
127
  "question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
128
- "submitted_answer": "ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",
129
  "ground_truth": "Yoshida, Uehara",
130
  "correct": false
131
  },
132
  {
133
  "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
134
  "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
135
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 14486, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
136
  "ground_truth": "89706.00",
137
- "correct": false
138
  },
139
  {
140
  "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
141
  "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
142
- "submitted_answer": "ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 23154, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",
143
  "ground_truth": "Claus",
144
  "correct": false
145
  }
 
1
  {
2
+ "score": 20.0,
3
+ "correct": 4,
4
  "total": 20,
5
  "results": [
6
  {
7
  "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
8
  "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
9
+ "submitted_answer": "",
10
  "ground_truth": "3",
11
  "correct": false
12
  },
13
  {
14
  "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
15
  "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
16
+ "submitted_answer": "",
17
  "ground_truth": "3",
18
  "correct": false
19
  },
20
  {
21
  "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
22
  "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
23
+ "submitted_answer": "left",
24
  "ground_truth": "Right",
25
  "correct": false
26
  },
27
  {
28
  "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
29
  "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
30
+ "submitted_answer": "",
31
  "ground_truth": "Rd5",
32
  "correct": false
33
  },
34
  {
35
  "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
36
  "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
37
+ "submitted_answer": "",
38
  "ground_truth": "FunkMonk",
39
  "correct": false
40
  },
41
  {
42
  "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
43
  "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
44
+ "submitted_answer": "b,e",
45
  "ground_truth": "b, e",
46
  "correct": false
47
  },
48
  {
49
  "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
50
  "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
51
+ "submitted_answer": "",
52
  "ground_truth": "Extremely",
53
  "correct": false
54
  },
55
  {
56
  "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
57
  "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
58
+ "submitted_answer": "",
59
  "ground_truth": "Louvrier",
60
  "correct": false
61
  },
62
  {
63
  "task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
64
  "question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
65
+ "submitted_answer": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
66
  "ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
67
+ "correct": true
68
  },
69
  {
70
  "task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
71
  "question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
72
+ "submitted_answer": "",
73
  "ground_truth": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
74
  "correct": false
75
  },
76
  {
77
  "task_id": "305ac316-eef6-4446-960a-92d80d542f82",
78
  "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
79
+ "submitted_answer": "",
80
  "ground_truth": "Wojciech",
81
  "correct": false
82
  },
83
  {
84
  "task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
85
  "question": "What is the final numeric output from the attached Python code?",
86
+ "submitted_answer": "",
87
  "ground_truth": "0",
88
  "correct": false
89
  },
90
  {
91
  "task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
92
  "question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
93
+ "submitted_answer": "519",
94
  "ground_truth": "519",
95
+ "correct": true
96
  },
97
  {
98
  "task_id": "1f975693-876d-457b-a649-393859e79bf3",
99
  "question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
100
+ "submitted_answer": "",
101
  "ground_truth": "132, 133, 134, 197, 245",
102
  "correct": false
103
  },
104
  {
105
  "task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
106
  "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
107
+ "submitted_answer": "",
108
  "ground_truth": "80GSFC21M0002",
109
  "correct": false
110
  },
111
  {
112
  "task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
113
  "question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
114
+ "submitted_answer": "Saint Petersburg",
115
  "ground_truth": "Saint Petersburg",
116
+ "correct": true
117
  },
118
  {
119
  "task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
120
  "question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
121
+ "submitted_answer": "",
122
  "ground_truth": "CUB",
123
  "correct": false
124
  },
125
  {
126
  "task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
127
  "question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
128
+ "submitted_answer": "",
129
  "ground_truth": "Yoshida, Uehara",
130
  "correct": false
131
  },
132
  {
133
  "task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
134
  "question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
135
+ "submitted_answer": "89706.00",
136
  "ground_truth": "89706.00",
137
+ "correct": true
138
  },
139
  {
140
  "task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
141
  "question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
142
+ "submitted_answer": "",
143
  "ground_truth": "Claus",
144
  "correct": false
145
  }
llm/client.py CHANGED
@@ -1,66 +1,50 @@
1
  import os
2
- from typing import List
3
 
4
  from langchain_core.messages import AIMessage
5
  from llm.providers import PROVIDERS
6
 
7
- PROVIDER_ORDER = os.getenv("LLM_PROVIDER_ORDER", "groq, gemini, gemini_gemma").split(",")
8
 
9
- _degraded_providers = {}
10
 
 
 
 
11
 
12
- def _get_next_provider():
13
- """Get next available provider in priority order."""
14
- for name in PROVIDER_ORDER:
15
- if name not in _degraded_providers:
16
- yield name
17
 
 
 
 
18
 
19
- def invoke_llm(messages: List, tools: List, fallback_count: int = 0) -> AIMessage:
20
- """Invoke LLM with provider fallback.
21
-
22
- Args:
23
- messages: Chat messages to send to LLM
24
- tools: List of tools to bind
25
- fallback_count: Current retry attempt
26
-
27
- Returns:
28
- AIMessage response from successful provider
29
- """
30
- provider_name = None
31
- provider = None
32
-
33
- for name in _get_next_provider():
34
- provider_name = name
35
- provider = PROVIDERS.get(name)
36
- if provider:
37
- break
38
-
39
- if not provider:
40
- return AIMessage(content="ERROR: No available LLM providers")
41
-
42
- try:
43
  models = provider.get_models()
44
- model_index = min(fallback_count // 3, len(models) - 1)
45
- model_name = models[model_index]
46
-
47
- print(f"Invoking {provider_name} with model {model_name}")
48
- return provider.invoke(messages, tools, model_name)
49
-
50
- except Exception as e:
51
- error_msg = str(e).lower()
52
-
53
- if "rate limit" in error_msg or "429" in error_msg or "quota" in error_msg:
54
- print(f"{provider_name} rate limit hit. Waiting before retry...")
55
- import time
56
- wait_time = 10 * (fallback_count + 1)
57
- time.sleep(wait_time)
58
- _degraded_providers[provider_name] = True
59
- else:
60
- print(f"{provider_name} error: {e}. Trying next provider.")
61
-
62
- remaining = [n for n in PROVIDER_ORDER if n not in _degraded_providers]
63
- if remaining:
64
- return invoke_llm(messages, tools, fallback_count + 1)
65
-
66
- return AIMessage(content=f"ERROR: All LLM providers failed: {e}")
 
 
 
 
 
 
1
  import os
2
+ import time
3
 
4
  from langchain_core.messages import AIMessage
5
  from llm.providers import PROVIDERS
6
 
7
+ PROVIDER_ORDER = [p.strip() for p in os.getenv("LLM_PROVIDER_ORDER", "opencode_zen, groq").split(",")]
8
 
 
9
 
10
+ def invoke_llm(messages, tools, fallback_count=0, _degraded=None):
11
+ if _degraded is None:
12
+ _degraded = {}
13
 
14
+ for provider_name in PROVIDER_ORDER:
15
+ if provider_name in _degraded:
16
+ continue
 
 
17
 
18
+ provider = PROVIDERS.get(provider_name)
19
+ if not provider:
20
+ continue
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  models = provider.get_models()
23
+ model_attempts = 0
24
+
25
+ while model_attempts < len(models):
26
+ model_name = models[model_attempts]
27
+ print(f"Invoking {provider_name} with {model_name}", flush=True)
28
+
29
+ retries = 0
30
+ while retries < 2:
31
+ try:
32
+ return provider.invoke(messages, tools, model_name)
33
+ except Exception as e:
34
+ err_str = str(e)
35
+ err = err_str.lower()
36
+ if any(x in err for x in ("rate limit", "429", "quota", "resource ex")):
37
+ print(f"{provider_name}/{model_name} rate limited, waiting...", flush=True)
38
+ time.sleep(65)
39
+ retries += 1
40
+ elif any(x in err for x in ("payment required", "402", "tool_use_failed", "model_not_found", "too large", "413")):
41
+ print(f"{provider_name}/{model_name} skip, trying next", flush=True)
42
+ break
43
+ else:
44
+ print(f"{provider_name}/{model_name} error: {type(e).__name__}: {err_str[:150]}", flush=True)
45
+ break
46
+ model_attempts += 1
47
+
48
+ _degraded[provider_name] = True
49
+
50
+ return AIMessage(content="ERROR: All LLM providers failed")
llm/providers/__init__.py CHANGED
@@ -1,9 +1,14 @@
1
- from llm.providers import gemini, gemini_gemma, groq
2
 
3
  PROVIDERS = {
4
  "gemini": gemini,
5
  "gemini_gemma": gemini_gemma,
6
  "groq": groq,
 
 
 
 
 
7
  }
8
 
9
- __all__ = ["PROVIDERS", "gemini", "gemini_gemma", "groq"]
 
1
+ from llm.providers import gemini, gemini_gemma, groq, openrouter, together, zai, hf_inference, opencode_zen
2
 
3
# Registry of provider modules, keyed by the name used in LLM_PROVIDER_ORDER.
PROVIDERS = dict(
    gemini=gemini,
    gemini_gemma=gemini_gemma,
    groq=groq,
    openrouter=openrouter,
    together=together,
    zai=zai,
    hf_inference=hf_inference,
    opencode_zen=opencode_zen,
)

__all__ = [
    "PROVIDERS",
    "gemini",
    "gemini_gemma",
    "groq",
    "openrouter",
    "together",
    "zai",
    "hf_inference",
    "opencode_zen",
]
llm/providers/groq.py CHANGED
@@ -10,4 +10,4 @@ def invoke(messages, tools, model_name: str = "llama-3.3-70b-versatile"):
10
 
11
  def get_models():
12
  """List available Groq models for fallback."""
13
- return ["llama-3.3-70b-versatile", "llama-3.1-8b-instant"]
 
10
 
11
def get_models():
    """Return the Groq model names in the order they should be tried."""
    fallback_order = ("llama-3.1-8b-instant", "llama-3.3-70b-versatile")
    return list(fallback_order)
llm/providers/hf_inference.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from dotenv import load_dotenv
4
+ from huggingface_hub import InferenceClient
5
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
6
+
7
# Load variables from a local .env file into the process environment.
load_dotenv()

# HF_TOKEN is read once at import time; it is None when the variable is unset.
token = os.getenv("HF_TOKEN")
# Module-level client shared by every invoke() call in this module.
client = InferenceClient(token=token)
11
+
12
+
13
+ def _convert_message(msg):
14
+ role_map = {
15
+ "HumanMessage": "user",
16
+ "AIMessage": "assistant",
17
+ "SystemMessage": "system",
18
+ "ToolMessage": "tool",
19
+ }
20
+ role = role_map.get(type(msg).__name__, "user")
21
+ d = {"role": role, "content": msg.content if msg.content else ""}
22
+
23
+ if role == "tool":
24
+ d["tool_call_id"] = getattr(msg, "tool_call_id", "")
25
+ d["name"] = getattr(msg, "name", "")
26
+
27
+ if role == "assistant" and hasattr(msg, "tool_calls") and msg.tool_calls:
28
+ d["tool_calls"] = []
29
+ for tc in msg.tool_calls:
30
+ d["tool_calls"].append({
31
+ "id": tc.get("id", ""),
32
+ "type": "function",
33
+ "function": {
34
+ "name": tc["name"],
35
+ "arguments": json.dumps(tc["args"]) if isinstance(tc["args"], dict) else tc["args"],
36
+ },
37
+ })
38
+ return d
39
+
40
+
41
+ def _convert_tools(tools):
42
+ result = []
43
+ for t in tools:
44
+ result.append({
45
+ "type": "function",
46
+ "function": {
47
+ "name": t.name,
48
+ "description": t.description,
49
+ "parameters": t.args_schema.schema() if hasattr(t, "args_schema") and t.args_schema else {},
50
+ },
51
+ })
52
+ return result
53
+
54
+
55
def invoke(messages, tools, model_name: str = "deepseek-ai/DeepSeek-V3-0324"):
    """Send a chat completion request through the module-level client and wrap the reply.

    Args:
        messages: Message objects; each is converted with ``_convert_message``.
        tools: Tool objects converted with ``_convert_tools``; a falsy value
            sends the request without tools.
        model_name: Model identifier passed to ``client.chat_completion``.

    Returns:
        An ``AIMessage`` built from the first choice. Its content is "" when
        the model returned none. When the model requested tools, they are
        exposed both as ``tool_calls`` (parsed arguments) and under
        ``additional_kwargs["tool_calls"]`` (arguments as a JSON string).

    Raises:
        Whatever ``client.chat_completion`` raises, and ``json.JSONDecodeError``
        if a tool call carries argument text that is not valid JSON.
    """
    hf_messages = [_convert_message(m) for m in messages]
    hf_tools = _convert_tools(tools) if tools else None

    resp = client.chat_completion(
        model=model_name,
        messages=hf_messages,
        tools=hf_tools,
        tool_choice="auto" if hf_tools else None,
        max_tokens=2048,
        temperature=0,
    )

    msg = resp.choices[0].message
    response_kwargs = {"content": msg.content or ""}

    if msg.tool_calls:
        parsed_calls = []
        raw_calls = []
        for tc in msg.tool_calls:
            raw_args = tc.function.arguments
            if isinstance(raw_args, str):
                args = json.loads(raw_args) if raw_args else {}
                args_json = raw_args
            else:
                # NOTE(review): some client/backend versions appear to return
                # already-parsed arguments; json.loads would raise TypeError on them.
                args = raw_args or {}
                args_json = json.dumps(args)
            parsed_calls.append({"id": tc.id, "name": tc.function.name, "args": args})
            raw_calls.append({
                "id": tc.id,
                "type": "function",
                "function": {"name": tc.function.name, "arguments": args_json},
            })
        response_kwargs["tool_calls"] = parsed_calls
        response_kwargs["additional_kwargs"] = {"tool_calls": raw_calls}

    return AIMessage(**response_kwargs)
94
+
95
+
96
def get_models():
    """Return the Hugging Face Inference model names to try, in order."""
    model_ids = ("deepseek-ai/DeepSeek-V3-0324",)
    return list(model_ids)
llm/providers/opencode_zen.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import requests
4
+ from dotenv import load_dotenv
5
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
6
+
7
+ load_dotenv()
8
+
9
# SECURITY: the key must come from the environment (or .env) only. A real-looking
# key was previously hard-coded here as the getenv() fallback; anything that has
# been committed to source control should be treated as leaked and revoked.
# With no key configured the Authorization header carries an empty token and the
# request is expected to fail authentication rather than use a shared secret.
API_KEY = os.getenv("OPENCODE_ZEN_API_KEY", "")
BASE_URL = "https://opencode.ai/zen/v1"
# Headers sent with every request made by this module.
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}
15
+
16
+
17
+ def _convert_message(msg):
18
+ role_map = {
19
+ "HumanMessage": "user",
20
+ "AIMessage": "assistant",
21
+ "SystemMessage": "system",
22
+ "ToolMessage": "tool",
23
+ }
24
+ role = role_map.get(type(msg).__name__, "user")
25
+ d = {"role": role, "content": msg.content if msg.content else ""}
26
+
27
+ if role == "tool":
28
+ d["tool_call_id"] = getattr(msg, "tool_call_id", "")
29
+ d["name"] = getattr(msg, "name", "")
30
+
31
+ if role == "assistant":
32
+ rc = None
33
+ if hasattr(msg, "additional_kwargs") and msg.additional_kwargs:
34
+ rc = msg.additional_kwargs.get("reasoning_content")
35
+ if rc:
36
+ d["reasoning_content"] = rc
37
+
38
+ if hasattr(msg, "tool_calls") and msg.tool_calls:
39
+ d["tool_calls"] = []
40
+ for tc in msg.tool_calls:
41
+ d["tool_calls"].append({
42
+ "id": tc.get("id", ""),
43
+ "type": "function",
44
+ "function": {
45
+ "name": tc["name"],
46
+ "arguments": json.dumps(tc["args"]) if isinstance(tc["args"], dict) else tc["args"],
47
+ },
48
+ })
49
+ return d
50
+
51
+
52
+ def _convert_tools(tools):
53
+ result = []
54
+ for t in tools:
55
+ result.append({
56
+ "type": "function",
57
+ "function": {
58
+ "name": t.name,
59
+ "description": t.description,
60
+ "parameters": t.args_schema.schema() if hasattr(t, "args_schema") and t.args_schema else {},
61
+ },
62
+ })
63
+ return result
64
+
65
+
66
def _parse_tool_arguments(raw):
    """Return tool-call arguments as a dict.

    Accepts either a JSON string or an already-decoded mapping; empty or
    missing arguments become ``{}``. Malformed JSON still raises.
    """
    if not raw:
        return {}
    if isinstance(raw, dict):
        return raw
    return json.loads(raw)


def invoke(messages, tools, model_name: str = None):
    """POST a chat-completion request to ``BASE_URL`` and wrap the reply.

    Args:
        messages: message objects accepted by ``_convert_message``.
        tools: tool objects accepted by ``_convert_tools``; when falsy the
            request carries neither ``tools`` nor ``tool_choice``.
        model_name: model identifier; ``None`` selects
            "deepseek-v4-flash-free".

    Returns:
        An ``AIMessage`` holding the reply text, any ``reasoning_content``
        (under ``additional_kwargs``), and any tool calls both parsed and in
        wire format.

    Raises:
        requests.HTTPError: for 4xx/5xx responses (via ``raise_for_status``).
    """
    if model_name is None:
        model_name = "deepseek-v4-flash-free"

    wire_messages = [_convert_message(m) for m in messages]
    wire_tools = _convert_tools(tools) if tools else None

    data = {
        "model": model_name,
        "messages": wire_messages,
        "max_tokens": 4096,
        "temperature": 0,
    }
    if wire_tools:
        data["tools"] = wire_tools
        data["tool_choice"] = "auto"

    resp = requests.post(f"{BASE_URL}/chat/completions", headers=HEADERS, json=data, timeout=120)
    if resp.status_code != 200:
        # Report the real status code; the message used to claim "400" for
        # every failure, which misdirects debugging of 401/429/5xx responses.
        print(f"opencode_zen {resp.status_code} body: {resp.text[:300]}", flush=True)
        print(f"opencode_zen request model={model_name} tools={bool(wire_tools)} msgs={len(wire_messages)}", flush=True)
    resp.raise_for_status()

    msg = resp.json()["choices"][0]["message"]

    response_kwargs = {"content": msg.get("content") or ""}
    additional_kwargs = {}

    reasoning = msg.get("reasoning_content")
    if reasoning:
        # Kept so _convert_message can send it back on the next turn.
        additional_kwargs["reasoning_content"] = reasoning

    tool_calls_data = msg.get("tool_calls")
    if tool_calls_data:
        parsed_calls = []
        wire_calls = []
        for tc in tool_calls_data:
            fn = tc["function"]
            # .get() on both paths: a call without "arguments" used to parse
            # fine and then raise KeyError while building the wire copy.
            raw_args = fn.get("arguments")
            parsed_calls.append({
                "id": tc["id"],
                "name": fn["name"],
                "args": _parse_tool_arguments(raw_args),
            })
            wire_calls.append({
                "id": tc["id"],
                "type": "function",
                "function": {
                    "name": fn["name"],
                    # The wire format carries arguments as a JSON string.
                    "arguments": raw_args if isinstance(raw_args, str) else json.dumps(raw_args or {}),
                },
            })
        response_kwargs["tool_calls"] = parsed_calls
        additional_kwargs["tool_calls"] = wire_calls

    if additional_kwargs:
        response_kwargs["additional_kwargs"] = additional_kwargs

    return AIMessage(**response_kwargs)
121
+
122
+
123
def get_models():
    """Return a fresh list of the model identifiers this provider exposes."""
    models = [
        "deepseek-v4-flash-free",
        "nemotron-3-super-free",
        "big-pickle",
    ]
    return models
llm/providers/openrouter.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain_openai import ChatOpenAI
4
+
5
+ load_dotenv()
6
+
7
+
8
def invoke(messages, tools, model_name: str = "deepseek/deepseek-chat"):
    """Invoke a chat model through OpenRouter's OpenAI-compatible endpoint.

    Args:
        messages: messages to send to the model.
        tools: tools the model may call; ``None`` or an empty list sends the
            request without tools.
        model_name: OpenRouter model identifier.

    Returns:
        Whatever the model's ``invoke`` returns for ``messages``.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY"),
    )
    # Bind only when there is something to bind: bind_tools(None) cannot
    # iterate its argument, and an empty tool list may be rejected by the API.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
18
+
19
+
20
def get_models():
    """Return a fresh list of the OpenRouter model identifiers to try."""
    models = [
        "deepseek/deepseek-chat",
        "meta-llama/llama-3.2-3b-instruct",
    ]
    return models
llm/providers/together.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain_openai import ChatOpenAI
4
+
5
+ load_dotenv()
6
+
7
+
8
def invoke(messages, tools, model_name: str = "meta-llama/Llama-4-Scout-17B-16E-Instruct"):
    """Invoke a chat model through Together AI's OpenAI-compatible endpoint.

    Args:
        messages: messages to send to the model.
        tools: tools the model may call; ``None`` or an empty list sends the
            request without tools.
        model_name: Together AI model identifier.

    Returns:
        Whatever the model's ``invoke`` returns for ``messages``.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://api.together.xyz/v1",
        api_key=os.getenv("TOGETHER_API_KEY"),
    )
    # Bind only when there is something to bind: bind_tools(None) cannot
    # iterate its argument, and an empty tool list may be rejected by the API.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
18
+
19
+
20
def get_models():
    """Return a fresh list of the Together AI model identifiers to try."""
    models = []
    models.append("meta-llama/Llama-4-Scout-17B-16E-Instruct")
    models.append("mistralai/Mixtral-8x7B-Instruct-v0.1")
    return models
llm/providers/zai.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from langchain_openai import ChatOpenAI
4
+
5
+ load_dotenv()
6
+
7
+
8
def invoke(messages, tools, model_name: str = "z-ai/glm-5"):
    """Invoke a chat model through the ZAI OpenAI-compatible endpoint.

    Args:
        messages: messages to send to the model.
        tools: tools the model may call; ``None`` or an empty list sends the
            request without tools.
        model_name: ZAI model identifier.

    Returns:
        Whatever the model's ``invoke`` returns for ``messages``.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://api.z.ai/api/paas/v4",
        api_key=os.getenv("ZAI_API_KEY"),
    )
    # Bind only when there is something to bind: bind_tools(None) cannot
    # iterate its argument, and an empty tool list may be rejected by the API.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
18
+
19
+
20
def get_models():
    """Return a fresh list of the ZAI model identifiers to try."""
    models = ["z-ai/glm-5", "z-ai/glm-5.1"]
    return models
run_local.py CHANGED
@@ -16,10 +16,26 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
 
17
  def extract_answer(content) -> str:
18
  if isinstance(content, str):
19
- match = re.search(r'FINAL ANSWER:\s*(.+?)(?:\n|$)', content, re.IGNORECASE)
 
 
 
 
20
  if match:
21
  return match.group(1).strip()
22
- return content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
23
  return str(content)
24
 
25
  class BasicAgent:
@@ -29,9 +45,16 @@ class BasicAgent:
29
 
30
  def __call__(self, question: str) -> str:
31
  messages = [HumanMessage(content=question)]
32
- result = self.graph.invoke({"messages": messages})
33
- answer = result['messages'][-1].content
34
- return extract_answer(answer)
 
 
 
 
 
 
 
35
 
36
  def file_extract(local_file_path, task_id):
37
  if not local_file_path:
@@ -107,8 +130,9 @@ def main():
107
  })
108
 
109
  status = "OK" if is_correct else "FAIL"
110
- print(f" {status} Submitted: {str(answer)[:40]}")
111
- print(f" Ground: {str(ground_truth)[:40]}")
 
112
 
113
  time.sleep(1.5)
114
 
 
16
 
17
def extract_answer(content) -> str:
    """Pull the final answer out of a model reply.

    Non-string content is returned as ``str(content)``. For strings the
    following are tried in order, and the first hit wins:

    1. ``FINAL ANSWER: ...`` (rest of that line)
    2. ``Answer: ...`` (rest of that line)
    3. ``[the] answer is ...`` up to the end of the sentence or line
    4. the last non-empty line

    All matching is case-insensitive; a blank string yields "".
    """
    if not isinstance(content, str):
        return str(content)

    cleaned = content.strip()
    if not cleaned:
        return ""

    patterns = (
        # Most specific first.
        r'FINAL ANSWER:\s*(.+?)(?:\n|$)',
        r'Answer:\s*(.+?)(?:\n|$)',
        # Stop at a sentence-ending period (one followed by whitespace or the
        # end of the text) or at a line break. Stopping at *any* period cut
        # decimals short ("3.14" -> "3"), and without the line-break
        # alternative an answer followed by more lines never matched.
        r'(?:the\s+)?answer\s+is\s*:?\s*(.+?)(?:\.(?=\s|$)|\n|$)',
    )
    for pattern in patterns:
        match = re.search(pattern, cleaned, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # No marker found: fall back to the last non-empty line.
    lines = [line.strip() for line in cleaned.split('\n') if line.strip()]
    return lines[-1] if lines else cleaned
40
 
41
  class BasicAgent:
 
45
 
46
  def __call__(self, question: str) -> str:
47
  messages = [HumanMessage(content=question)]
48
+ result = self.graph.invoke(
49
+ {"messages": messages, "tool_call_count": 0, "reflection_count": 0, "tool_call_history": []},
50
+ config={"recursion_limit": 50},
51
+ )
52
+ # Find last AIMessage with content (skip ToolMessages and tool-call-only AIMessages)
53
+ for m in reversed(result['messages']):
54
+ cls = type(m).__name__
55
+ if cls == 'AIMessage' and m.content:
56
+ return extract_answer(m.content)
57
+ return ""
58
 
59
  def file_extract(local_file_path, task_id):
60
  if not local_file_path:
 
130
  })
131
 
132
  status = "OK" if is_correct else "FAIL"
133
+ def safe(s): return str(s).encode('utf-8', errors='replace').decode('utf-8', errors='replace')[:40]
134
+ print(f" {status} Submitted: {safe(answer)}")
135
+ print(f" Ground: {safe(ground_truth)}")
136
 
137
  time.sleep(1.5)
138
 
test_env.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os

from dotenv import load_dotenv

# Diagnostic script: load .env (overriding already-set variables) and report
# which credentials are present, without printing their values.
load_dotenv(override=True)

for key in ("GROQ_API_KEY", "GOOGLE_API_KEY", "HF_TOKEN"):
    print(f"{key} exists:", key in os.environ)
print("LLM_PROVIDER_ORDER:", os.getenv("LLM_PROVIDER_ORDER"))
test_gemini.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from dotenv import load_dotenv
from langchain_core.messages import HumanMessage
from langchain_google_genai import ChatGoogleGenerativeAI

# Diagnostic script: check that the Gemini credentials work end to end.
load_dotenv(override=True)
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Report only whether the key is configured. Echoing even a prefix of the key
# puts secret material into terminals and CI logs.
print("GOOGLE_API_KEY set:", bool(GOOGLE_API_KEY))

try:
    model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0, google_api_key=GOOGLE_API_KEY)
    response = model.invoke([HumanMessage(content="Hello, who are you?")])
    print("Gemini response:", response.content)
except Exception as e:
    # Top level of a smoke test: show any failure as a message, not a traceback.
    print("Gemini failed:", e)
tools/__init__.py CHANGED
@@ -1,19 +1,21 @@
1
  from tools.web.search import web_search
2
  from tools.web.wiki import wiki_search
 
3
  from tools.web.browse import browse_url
4
  from tools.file.reader import read_file
 
5
  from tools.python import python_repl
6
- from tools.reverse import reverse_text
7
  from tools.youtube import get_youtube_transcript
8
  from tools.audio import transcribe_audio
9
 
10
  __all__ = [
11
  web_search,
12
  wiki_search,
 
13
  browse_url,
14
  read_file,
 
15
  python_repl,
16
- reverse_text,
17
  get_youtube_transcript,
18
  transcribe_audio,
19
  ]
 
1
  from tools.web.search import web_search
2
  from tools.web.wiki import wiki_search
3
+ from tools.web.wiki_page import wiki_page
4
  from tools.web.browse import browse_url
5
  from tools.file.reader import read_file
6
+ from tools.file.spreadsheet import parse_spreadsheet
7
  from tools.python import python_repl
 
8
  from tools.youtube import get_youtube_transcript
9
  from tools.audio import transcribe_audio
10
 
11
# Tools exposed by this package, in the order they are listed here.
# NOTE(review): the entries are the imported tool objects, not their names as
# strings, so `from tools import *` would raise TypeError. Presumably this list
# is consumed directly as the agent's tool list — confirm against the caller
# before converting it to strings.
__all__ = [
    web_search,
    wiki_search,
    wiki_page,
    browse_url,
    read_file,
    parse_spreadsheet,
    python_repl,
    get_youtube_transcript,
    transcribe_audio,
]
tools/file/reader.py CHANGED
@@ -20,6 +20,15 @@ def read_file(path: str) -> str:
20
  loader = UnstructuredImageLoader(path)
21
  docs = loader.load()
22
  content = "\n\n".join([doc.page_content for doc in docs])
 
 
 
 
 
 
 
 
 
23
  elif ext == ".pdf":
24
  try:
25
  doc = fitz.open(path)
 
20
  loader = UnstructuredImageLoader(path)
21
  docs = loader.load()
22
  content = "\n\n".join([doc.page_content for doc in docs])
23
+ elif ext in (".xlsx", ".xls", ".csv"):
24
+ import pandas as pd
25
+ df = pd.read_excel(path) if ext != ".csv" else pd.read_csv(path)
26
+ buf = [f"Rows: {len(df)}, Columns: {list(df.columns)}"]
27
+ buf.append(" | ".join(str(c) for c in df.columns))
28
+ buf.append("-" * min(200, 10 + 12 * len(df.columns)))
29
+ for i, (_, row) in enumerate(df.iterrows()):
30
+ buf.append(f"{i} | " + " | ".join(str(v) if pd.notna(v) else "" for v in row))
31
+ content = "\n".join(buf)
32
  elif ext == ".pdf":
33
  try:
34
  doc = fitz.open(path)
tools/file/spreadsheet.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.tools import tool
2
+
3
+
4
@tool
def parse_spreadsheet(path: str) -> str:
    """Read an Excel (.xlsx) or CSV file and return its contents as a formatted text table.
    Use this instead of read_file for spreadsheet files to get properly structured data.
    Returns all rows with column headers and row numbers."""
    # The docstring above is left untouched: the @tool decorator presumably
    # uses it as the tool description — confirm before rewording it.
    try:
        import pandas as pd
        import os

        if not os.path.exists(path):
            return f"FILE_NOT_FOUND: {path}"

        suffix = os.path.splitext(path)[1].lower()
        if suffix == ".csv":
            frame = pd.read_csv(path)
        elif suffix == ".xlsx":
            frame = pd.read_excel(path, engine="openpyxl")
        elif suffix == ".xls":
            frame = pd.read_excel(path, engine="xlrd")
        else:
            return f"UNSUPPORTED_FORMAT: {suffix}"

        column_count = len(frame.columns)
        header = [
            f"Sheet: {os.path.basename(path)} | Rows: {len(frame)} | Columns: {column_count}",
            " | " + " | ".join(str(col) for col in frame.columns),
            # Separator grows with the column count but is capped at 200 chars.
            "-" * min(200, 10 + 12 * column_count),
        ]
        # One line per row, prefixed with its 0-based position; missing cells
        # are rendered as empty strings.
        body = [
            f"{position} | " + " | ".join(str(cell) if pd.notna(cell) else "" for cell in row)
            for position, (_, row) in enumerate(frame.iterrows())
        ]

        text = "\n".join(header + body)
        if len(text) <= 25000:
            return text
        return text[:25000] + "\n... [TRUNCATED]"
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"SPREADSHEET_ERROR: {e}"
tools/python.py CHANGED
@@ -8,6 +8,7 @@ def python_repl(code: str) -> str:
8
  """Execute python code and return the output. Use this for calculations, data analysis, or processing files.
9
  The code should be a valid python script that prints the final result.
10
  You can use libraries like pandas, numpy, PIL, etc.
 
11
  Example: print(df.head()) or print(2 + 2)"""
12
  try:
13
  old_stdout = sys.stdout
 
8
  """Execute python code and return the output. Use this for calculations, data analysis, or processing files.
9
  The code should be a valid python script that prints the final result.
10
  You can use libraries like pandas, numpy, PIL, etc.
11
+ IMPORTANT: Variables persist between calls to this tool (same Python process). You can define a variable in one call and use it in the next.
12
  Example: print(df.head()) or print(2 + 2)"""
13
  try:
14
  old_stdout = sys.stdout
tools/web/search.py CHANGED
@@ -1,18 +1,20 @@
1
- from langchain_tavily import TavilySearch
2
  from langchain_core.tools import tool
3
 
4
 
5
  @tool
6
  def web_search(keywords: str) -> str:
7
- """Search the web using Tavily. This tool performs a concise, focused search to answer factual questions or gather brief information snippets.
8
- For deeper research or browsing specific URLs, additional tools may be required.
9
- """
10
  try:
11
- tavily = TavilySearch(max_results=5)
12
- results = tavily.invoke(keywords)
13
- formatted_results = []
 
 
14
  for r in results:
15
- formatted_results.append(f"Title: {r['title']}\nURL: {r['url']}\nContent: {r['content'][:300]}")
16
- return "\n".join(formatted_results) or "NO_RESULTS"
 
 
 
17
  except Exception as e:
18
  return f"SEARCH_ERROR: {e}"
 
 
1
  from langchain_core.tools import tool
2
 
3
 
4
@tool
def web_search(keywords: str) -> str:
    """Search the web. Use this for finding information, facts, URLs, and general web content. Returns title, URL, and snippet for each result."""
    # The docstring above is left untouched: the @tool decorator presumably
    # uses it as the tool description — confirm before rewording it.
    try:
        from ddgs import DDGS

        hits = list(DDGS().text(keywords, max_results=5))
        if not hits:
            return "NO_RESULTS"
        # One "Title / URL / Content" stanza per hit, snippet capped at 300
        # characters, stanzas separated by a blank line.
        return "\n\n".join(
            f"Title: {hit.get('title', '')}\nURL: {hit.get('href', '')}\nContent: {hit.get('body', '')[:300]}"
            for hit in hits
        )
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"SEARCH_ERROR: {e}"
tools/web/wiki_page.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.tools import tool
2
+
3
+
4
@tool
def wiki_page(title: str) -> str:
    """Fetch the full text content of a Wikipedia page by its exact title.
    Use this when wiki_search snippets are insufficient and you need the complete article.
    Provide the exact Wikipedia page title (case-sensitive, spaces allowed).
    Returns the first 25000 characters of the article."""
    # The docstring above is left untouched: the @tool decorator presumably
    # uses it as the tool description — confirm before rewording it.
    try:
        import requests

        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
            "exlimit": 1,
            # Resolve redirects so an alternate title returns the target
            # article's text instead of an empty result.
            "redirects": 1,
        }
        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params=params,
            headers={"User-Agent": "GAIA-Benchmark-Agent/1.0"},
            timeout=15,
        )
        resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
        if not pages:
            return "NO_RESULTS"

        # Only one title is requested, so only the first entry is inspected.
        page_id, page = next(iter(pages.items()))
        # A nonexistent or invalid title is flagged with a "missing"/"invalid"
        # key; the "-1" id check is kept from the original implementation.
        if page_id == "-1" or "missing" in page or "invalid" in page:
            return "PAGE_NOT_FOUND"

        extract = page.get("extract", "")
        if not extract:
            return "NO_CONTENT"
        if len(extract) > 25000:
            extract = extract[:25000] + "\n... [TRUNCATED]"
        return extract
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"WIKI_PAGE_ERROR: {e}"