Refactor LLM provider invocation and add new providers
Browse files
- Updated `invoke_llm` function to improve provider fallback logic and error handling.
- Added new LLM providers: OpenRouter, Together, ZAI, HF Inference, and Opencode Zen.
- Modified provider order to include new providers and adjusted model retrieval logic.
- Enhanced error handling for rate limits and other exceptions during invocation.
- Added spreadsheet parsing functionality to read Excel and CSV files.
- Improved web search tool to use DDGS for better results.
- Introduced a new tool for fetching full Wikipedia page content.
- Updated local run script to handle AI message extraction more robustly.
- Added environment variable checks in `test_env.py` for better debugging.
- __pycache__/agent.cpython-39.pyc +0 -0
- agent.py +61 -52
- analyze_results.py +132 -0
- app.py +4 -1
- gaia_results.csv +20 -29
- gaia_results.json +26 -26
- llm/client.py +39 -55
- llm/providers/__init__.py +7 -2
- llm/providers/groq.py +1 -1
- llm/providers/hf_inference.py +97 -0
- llm/providers/opencode_zen.py +124 -0
- llm/providers/openrouter.py +22 -0
- llm/providers/together.py +25 -0
- llm/providers/zai.py +22 -0
- run_local.py +31 -7
- test_env.py +8 -0
- test_gemini.py +16 -0
- tools/__init__.py +4 -2
- tools/file/reader.py +9 -0
- tools/file/spreadsheet.py +37 -0
- tools/python.py +1 -0
- tools/web/search.py +11 -9
- tools/web/wiki_page.py +42 -0
__pycache__/agent.cpython-39.pyc
CHANGED
|
Binary files a/__pycache__/agent.cpython-39.pyc and b/__pycache__/agent.cpython-39.pyc differ
|
|
|
agent.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import re
|
|
|
|
| 3 |
from typing import TypedDict, List, Union
|
| 4 |
|
| 5 |
from dotenv import load_dotenv
|
|
@@ -15,15 +16,15 @@ load_dotenv()
|
|
| 15 |
class AgentState(TypedDict):
|
| 16 |
messages: List[Union[HumanMessage, AIMessage, SystemMessage, ToolMessage]]
|
| 17 |
reflection_count: int
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def _invoke_llm_with_tools(messages, fallback_count=0):
|
| 21 |
-
"""Invoke LLM with provider fallback."""
|
| 22 |
return invoke_llm(messages, tools, fallback_count)
|
| 23 |
|
| 24 |
-
|
| 25 |
def is_reversed_text(question: str) -> bool:
|
| 26 |
-
"""Check if text appears to be reversed."""
|
| 27 |
words = question.split()
|
| 28 |
if len(words) < 3:
|
| 29 |
return False
|
|
@@ -35,75 +36,64 @@ def is_reversed_text(question: str) -> bool:
|
|
| 35 |
orig_valid = len([w for w in orig_words if w in common_words])
|
| 36 |
return rev_valid > orig_valid
|
| 37 |
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def call_model(state: AgentState):
|
| 40 |
-
messages = state["messages"]
|
| 41 |
-
|
| 42 |
-
# Pre-processing: Detect and handle reversed text in the first message
|
| 43 |
if len(messages) == 1 and isinstance(messages[0], HumanMessage):
|
| 44 |
user_msg = messages[0].content
|
| 45 |
if is_reversed_text(user_msg):
|
| 46 |
fixed_msg = user_msg[::-1]
|
| 47 |
messages = [HumanMessage(content=f"The following message was detected as reversed. I have reversed it back for you:\n{fixed_msg}")]
|
| 48 |
-
|
| 49 |
-
# Add System Message if not present
|
| 50 |
if not any(isinstance(m, SystemMessage) for m in messages):
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
Your thought process MUST be methodical:
|
| 54 |
-
1. THINK:
|
| 55 |
-
- Analyze the question deeply. Identify the core goal and ALL constraints (units, date formats, precision, etc.).
|
| 56 |
-
- If the task involves an image or video, describe the visual elements before attempting to solve.
|
| 57 |
-
- Plan your steps. Break the problem into smaller sub-problems.
|
| 58 |
-
2. ACT (Python-First):
|
| 59 |
-
- Use `python_repl` for ANY task involving: math, counting, data analysis, list filtering (e.g., botany), or verifying logic (e.g., commutativity). DO NOT do these manually.
|
| 60 |
-
- Use `web_search` for initial discovery and `browse_url` to verify details from the source.
|
| 61 |
-
3. OBSERVE: Carefully review tool outputs. If a result is ambiguous, search for a second source to triangulate.
|
| 62 |
-
4. REFINE: Question your assumptions. If the answer seems too simple for a complex GAIA task, you likely missed a constraint.
|
| 63 |
-
5. VERIFY: Before finalizing, double-check units and precision.
|
| 64 |
-
6. FINALIZE: Provide the result in the exact format: FINAL ANSWER: <answer>.
|
| 65 |
-
|
| 66 |
-
Guidelines:
|
| 67 |
-
- [Attached Files]: Always use `read_file` for local files.
|
| 68 |
-
- Research: Don't trust a single snippet; browse the full page if the answer is buried.
|
| 69 |
-
- Constraints: If the question says 'alphabetize' or 'comma-separated', use Python to ensure it is perfect.
|
| 70 |
-
- Final Output: Return ONLY the final answer in the requested format.
|
| 71 |
-
"""
|
| 72 |
-
messages = [SystemMessage(content=system_prompt)] + messages
|
| 73 |
|
| 74 |
response = _invoke_llm_with_tools(messages)
|
| 75 |
-
return {"messages": [response]}
|
|
|
|
| 76 |
|
| 77 |
def reflect(state: AgentState):
|
| 78 |
-
"""Node to reflect on the final answer and verify correctness."""
|
| 79 |
messages = state["messages"]
|
| 80 |
last_message = messages[-1]
|
| 81 |
-
|
| 82 |
if "FINAL ANSWER:" not in last_message.content:
|
| 83 |
-
return {"messages": []}
|
| 84 |
|
| 85 |
reflection_prompt = (
|
| 86 |
-
"
|
| 87 |
"1. Did you miss any constraints from the original question?\n"
|
| 88 |
-
"2. Are
|
| 89 |
-
"3.
|
| 90 |
-
"If
|
| 91 |
-
"If
|
| 92 |
)
|
| 93 |
-
|
| 94 |
-
# We add the reflection prompt as a human message to trigger a new response
|
| 95 |
response = _invoke_llm_with_tools(messages + [HumanMessage(content=reflection_prompt)])
|
| 96 |
-
return {"messages": [response], "reflection_count": state.get("reflection_count", 0) + 1}
|
|
|
|
| 97 |
|
| 98 |
def call_tool(state: AgentState):
|
| 99 |
messages = state["messages"]
|
| 100 |
last_message = messages[-1]
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
| 102 |
tool_outputs = []
|
| 103 |
-
|
|
|
|
| 104 |
tool_name = tool_call["name"]
|
| 105 |
tool_args = tool_call["args"]
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
if tool_name not in tools_by_name:
|
| 108 |
tool_outputs.append(ToolMessage(
|
| 109 |
content=f"Error: Tool {tool_name} not found.",
|
|
@@ -111,13 +101,16 @@ def call_tool(state: AgentState):
|
|
| 111 |
name=tool_name
|
| 112 |
))
|
| 113 |
continue
|
| 114 |
-
|
| 115 |
tool = tools_by_name[tool_name]
|
| 116 |
-
print(f"Calling tool: {tool_name} with args: {tool_args}")
|
| 117 |
try:
|
| 118 |
output = tool.invoke(tool_args)
|
|
|
|
|
|
|
|
|
|
| 119 |
tool_outputs.append(ToolMessage(
|
| 120 |
-
content=
|
| 121 |
tool_call_id=tool_call["id"],
|
| 122 |
name=tool_name
|
| 123 |
))
|
|
@@ -127,18 +120,34 @@ def call_tool(state: AgentState):
|
|
| 127 |
tool_call_id=tool_call["id"],
|
| 128 |
name=tool_name
|
| 129 |
))
|
| 130 |
-
return {"messages": tool_outputs}
|
|
|
|
| 131 |
|
| 132 |
def should_continue(state: AgentState):
|
| 133 |
messages = state["messages"]
|
| 134 |
last_message = messages[-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
if hasattr(last_message, "tool_calls") and last_message.tool_calls:
|
| 136 |
return "action"
|
| 137 |
-
|
|
|
|
| 138 |
return "reflect"
|
| 139 |
return END
|
| 140 |
|
| 141 |
-
|
| 142 |
def build_graph():
|
| 143 |
workflow = StateGraph(AgentState)
|
| 144 |
workflow.add_node("agent", call_model)
|
|
|
|
| 1 |
import os
|
| 2 |
import re
|
| 3 |
+
from collections import Counter
|
| 4 |
from typing import TypedDict, List, Union
|
| 5 |
|
| 6 |
from dotenv import load_dotenv
|
|
|
|
| 16 |
class AgentState(TypedDict):
|
| 17 |
messages: List[Union[HumanMessage, AIMessage, SystemMessage, ToolMessage]]
|
| 18 |
reflection_count: int
|
| 19 |
+
tool_call_count: int
|
| 20 |
+
tool_call_history: List[str]
|
| 21 |
|
| 22 |
|
| 23 |
def _invoke_llm_with_tools(messages, fallback_count=0):
|
|
|
|
| 24 |
return invoke_llm(messages, tools, fallback_count)
|
| 25 |
|
| 26 |
+
|
| 27 |
def is_reversed_text(question: str) -> bool:
|
|
|
|
| 28 |
words = question.split()
|
| 29 |
if len(words) < 3:
|
| 30 |
return False
|
|
|
|
| 36 |
orig_valid = len([w for w in orig_words if w in common_words])
|
| 37 |
return rev_valid > orig_valid
|
| 38 |
|
| 39 |
+
|
| 40 |
+
SYSTEM_PROMPT = """Answer with FINAL ANSWER: <value>.
|
| 41 |
+
Use tools to research. Read full Wikipedia pages (browse_url) rather than just searching.
|
| 42 |
+
Never repeat the same tool call."""
|
| 43 |
+
|
| 44 |
+
|
| 45 |
def call_model(state: AgentState):
    """Run one LLM turn over the conversation and append the reply.

    The outgoing prompt is built from a copy of ``state["messages"]``:

    * If the first message is a ``HumanMessage`` whose text is flagged by
      ``is_reversed_text``, it is swapped for an un-reversed version. This is
      done on every turn, not only the first one, so the model keeps seeing
      the readable question after tool calls have been added to the history.
    * If no ``SystemMessage`` is present, ``SYSTEM_PROMPT`` is prepended.

    Neither adjustment is written back to the state: the returned history is
    the stored ``state["messages"]`` plus the new response.

    Args:
        state: Current agent state; only ``state["messages"]`` is read.

    Returns:
        A state update of the form ``{"messages": <history + [response]>}``.
    """
    # Work on a copy so the stored history is never mutated in place.
    messages = list(state["messages"])

    if messages and isinstance(messages[0], HumanMessage):
        user_msg = messages[0].content
        # Message content is not guaranteed to be a plain string (it may be a
        # list of content blocks); only plain text can be checked/reversed.
        if isinstance(user_msg, str) and is_reversed_text(user_msg):
            fixed_msg = user_msg[::-1]
            messages[0] = HumanMessage(content=f"The following message was detected as reversed. I have reversed it back for you:\n{fixed_msg}")

    if not any(isinstance(m, SystemMessage) for m in messages):
        messages = [SystemMessage(content=SYSTEM_PROMPT)] + messages

    response = _invoke_llm_with_tools(messages)
    return {"messages": state["messages"] + [response]}
|
| 59 |
+
|
| 60 |
|
| 61 |
def reflect(state: AgentState):
    """Ask the model to double-check a proposed final answer.

    If the last message contains ``FINAL ANSWER:``, a checklist prompt is
    appended (for this call only) as a ``HumanMessage`` and the model is
    invoked again; its reply is added to the history and
    ``reflection_count`` is incremented. Otherwise the state is left as is.

    Args:
        state: Current agent state; reads ``messages`` and
            ``reflection_count`` (treated as 0 when absent).

    Returns:
        A state update. On the reflection path it carries the extended
        ``messages`` list and the incremented ``reflection_count``; on the
        early-exit path it only re-writes the unchanged ``reflection_count``.
    """
    messages = state["messages"]
    last_message = messages[-1]

    # Same normalisation as should_continue: tolerate a missing/None content.
    content = getattr(last_message, "content", "") or ""
    if "FINAL ANSWER:" not in content:
        # No-op update. This node returns the *full* history on its success
        # path, so `messages` appears to be an overwrite-style field; handing
        # back an empty list here would then erase the conversation.
        # NOTE(review): confirm no reducer is attached to `messages`.
        return {"reflection_count": state.get("reflection_count", 0)}

    reflection_prompt = (
        "Before finalizing, double-check:\n"
        "1. Did you miss any constraints from the original question?\n"
        "2. Are units and precision exactly as requested?\n"
        "3. Could any step in reasoning be flawed?\n"
        "If correct, repeat FINAL ANSWER: <answer> exactly.\n"
        "If wrong, explain and provide corrected FINAL ANSWER: <answer>."
    )

    # The checklist is sent to the model but not stored in the history.
    response = _invoke_llm_with_tools(messages + [HumanMessage(content=reflection_prompt)])
    return {"messages": state["messages"] + [response], "reflection_count": state.get("reflection_count", 0) + 1}
|
| 79 |
+
|
| 80 |
|
| 81 |
def call_tool(state: AgentState):
|
| 82 |
messages = state["messages"]
|
| 83 |
last_message = messages[-1]
|
| 84 |
+
|
| 85 |
+
tool_call_history = state.get("tool_call_history", [])
|
| 86 |
+
tool_call_count = state.get("tool_call_count", 0)
|
| 87 |
+
|
| 88 |
tool_outputs = []
|
| 89 |
+
# Limit 5 tool calls per response
|
| 90 |
+
for tool_call in last_message.tool_calls[:5]:
|
| 91 |
tool_name = tool_call["name"]
|
| 92 |
tool_args = tool_call["args"]
|
| 93 |
+
key = f"{tool_name}({tool_args})"
|
| 94 |
+
tool_call_history.append(key)
|
| 95 |
+
tool_call_count += 1
|
| 96 |
+
|
| 97 |
if tool_name not in tools_by_name:
|
| 98 |
tool_outputs.append(ToolMessage(
|
| 99 |
content=f"Error: Tool {tool_name} not found.",
|
|
|
|
| 101 |
name=tool_name
|
| 102 |
))
|
| 103 |
continue
|
| 104 |
+
|
| 105 |
tool = tools_by_name[tool_name]
|
| 106 |
+
print(f"Calling tool: {tool_name} with args: {tool_args}", flush=True)
|
| 107 |
try:
|
| 108 |
output = tool.invoke(tool_args)
|
| 109 |
+
output_str = str(output)
|
| 110 |
+
if len(output_str) > 15000:
|
| 111 |
+
output_str = output_str[:15000] + "\n...[truncated]"
|
| 112 |
tool_outputs.append(ToolMessage(
|
| 113 |
+
content=output_str,
|
| 114 |
tool_call_id=tool_call["id"],
|
| 115 |
name=tool_name
|
| 116 |
))
|
|
|
|
| 120 |
tool_call_id=tool_call["id"],
|
| 121 |
name=tool_name
|
| 122 |
))
|
| 123 |
+
return {"messages": state["messages"] + tool_outputs, "tool_call_count": tool_call_count, "tool_call_history": tool_call_history}
|
| 124 |
+
|
| 125 |
|
| 126 |
def should_continue(state: AgentState):
    """Pick the next step after the model has produced a message.

    Checks, in order:

    1. Stop (``END``) once ``tool_call_count`` has reached 8.
    2. Stop (``END``) once any single tool name occurs 4 or more times in
       ``tool_call_history`` (entries look like ``"name(args)"``).
    3. Go to ``"action"`` if the last message carries tool calls.
    4. Go to ``"reflect"`` if the last message contains ``FINAL ANSWER:`` and
       no reflection has happened yet.
    5. Otherwise stop (``END``).

    Args:
        state: Current agent state; reads ``messages``, ``tool_call_count``,
            ``tool_call_history`` and ``reflection_count`` (the last three
            fall back to 0 / empty list / 0 when absent).

    Returns:
        ``"action"``, ``"reflect"`` or ``END``.
    """
    max_total_calls = 8
    max_calls_per_tool = 4

    latest = state["messages"][-1]
    calls_made = state.get("tool_call_count", 0)
    history = state.get("tool_call_history", [])
    reflections_done = state.get("reflection_count", 0)

    # Overall tool budget exhausted.
    if calls_made >= max_total_calls:
        return END

    # Loop detection: the tool name is everything before the first "(".
    names_used = [entry.split("(")[0] for entry in history]
    for name in set(names_used):
        if names_used.count(name) >= max_calls_per_tool:
            return END

    if getattr(latest, "tool_calls", None):
        return "action"

    text = getattr(latest, "content", "") or ""
    wants_reflection = "FINAL ANSWER:" in text and reflections_done == 0
    return "reflect" if wants_reflection else END
|
| 149 |
|
| 150 |
+
|
| 151 |
def build_graph():
|
| 152 |
workflow = StateGraph(AgentState)
|
| 153 |
workflow.add_node("agent", call_model)
|
analyze_results.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Analyze existing gaia_results.json and produce a diagnostic report.
|
| 3 |
+
"""
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
# Fix Windows console encoding issues
|
| 9 |
+
try:
|
| 10 |
+
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
| 11 |
+
except Exception:
|
| 12 |
+
pass
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
with open("gaia_results.json", "r") as f:
|
| 16 |
+
data = json.load(f)
|
| 17 |
+
|
| 18 |
+
results = data["results"]
|
| 19 |
+
|
| 20 |
+
# Categorize failures
|
| 21 |
+
categories = {
|
| 22 |
+
"groq_rate_limit_tpm": [],
|
| 23 |
+
"recursion_limit": [],
|
| 24 |
+
"tool_call_format_error": [],
|
| 25 |
+
"wrong_output_format": [],
|
| 26 |
+
"other_error": [],
|
| 27 |
+
"correct": [],
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
for r in results:
|
| 31 |
+
ans = r["submitted_answer"]
|
| 32 |
+
q = r["question"][:80]
|
| 33 |
+
tid = r["task_id"][:8]
|
| 34 |
+
gt = r["ground_truth"]
|
| 35 |
+
is_correct = r["correct"]
|
| 36 |
+
|
| 37 |
+
if is_correct:
|
| 38 |
+
categories["correct"].append(r)
|
| 39 |
+
elif "413" in ans and "tokens per minute" in ans.lower():
|
| 40 |
+
categories["groq_rate_limit_tpm"].append(r)
|
| 41 |
+
elif "Recursion limit" in ans:
|
| 42 |
+
categories["recursion_limit"].append(r)
|
| 43 |
+
elif "tool_use_failed" in ans or "tool call validation" in ans:
|
| 44 |
+
categories["tool_call_format_error"].append(r)
|
| 45 |
+
elif "<|python_tag|>" in ans or ("AGENT ERROR" in ans and "tool" in ans.lower()):
|
| 46 |
+
categories["wrong_output_format"].append(r)
|
| 47 |
+
else:
|
| 48 |
+
categories["other_error"].append(r)
|
| 49 |
+
|
| 50 |
+
print("=" * 70)
|
| 51 |
+
print("GAIA BENCHMARK - FAILURE ANALYSIS REPORT")
|
| 52 |
+
print(f"Score: {data['correct']}/{data['total']} = {data['score']:.0f}%")
|
| 53 |
+
print("=" * 70)
|
| 54 |
+
|
| 55 |
+
print("\n## CATEGORY BREAKDOWN")
|
| 56 |
+
for cat, items in categories.items():
|
| 57 |
+
print(f" {cat}: {len(items)} questions")
|
| 58 |
+
|
| 59 |
+
print("\n" + "=" * 70)
|
| 60 |
+
|
| 61 |
+
for cat, label in [
|
| 62 |
+
("groq_rate_limit_tpm", "[RATE_LIMIT] GROQ TPM RATE LIMIT (request too large for fallback model)"),
|
| 63 |
+
("recursion_limit", "[RECURSION] RECURSION LIMIT (agent stuck in tool loop, no answer found)"),
|
| 64 |
+
("tool_call_format_error", "[FORMAT_ERR] TOOL CALL FORMAT ERROR (LLM generated malformed tool invocations)"),
|
| 65 |
+
("wrong_output_format", "[WRONG_OUT] WRONG OUTPUT FORMAT (agent returned tool calls as text, not answer)"),
|
| 66 |
+
("other_error", "[OTHER] OTHER ERROR"),
|
| 67 |
+
]:
|
| 68 |
+
items = categories[cat]
|
| 69 |
+
if not items:
|
| 70 |
+
continue
|
| 71 |
+
print(f"\n### {label} ({len(items)} questions)")
|
| 72 |
+
for r in items:
|
| 73 |
+
print(f" - [{r['task_id'][:8]}] GT={r['ground_truth'][:40]!r}")
|
| 74 |
+
print(f" Q: {r['question'][:100]}")
|
| 75 |
+
# Classify what tool/skill would solve it
|
| 76 |
+
q_lower = r["question"].lower()
|
| 77 |
+
skills = []
|
| 78 |
+
if "youtube.com" in r["question"]:
|
| 79 |
+
skills.append("YouTube Transcript / Video Analysis")
|
| 80 |
+
if "mp3" in q_lower or "audio" in q_lower or "voice memo" in q_lower or "recording" in q_lower:
|
| 81 |
+
skills.append("Audio Transcription (Whisper)")
|
| 82 |
+
if "image" in q_lower or "chess" in q_lower:
|
| 83 |
+
skills.append("Image Analysis (Vision LLM)")
|
| 84 |
+
if "excel" in q_lower or ".xlsx" in q_lower:
|
| 85 |
+
skills.append("Excel/File Reading")
|
| 86 |
+
if "wikipedia" in q_lower or "wiki" in q_lower:
|
| 87 |
+
skills.append("Wikipedia Search")
|
| 88 |
+
if "paper" in q_lower or "article" in q_lower:
|
| 89 |
+
skills.append("Web Browsing/Research")
|
| 90 |
+
if "python code" in q_lower or "code" in q_lower:
|
| 91 |
+
skills.append("Python REPL execution")
|
| 92 |
+
if "table" in q_lower or "commutative" in q_lower:
|
| 93 |
+
skills.append("Python REPL (logic check)")
|
| 94 |
+
if not skills:
|
| 95 |
+
skills.append("Web Search")
|
| 96 |
+
print(f" Needed: {', '.join(skills)}")
|
| 97 |
+
print()
|
| 98 |
+
|
| 99 |
+
print("\n" + "=" * 70)
|
| 100 |
+
print("## PRIORITIZED IMPROVEMENT AREAS")
|
| 101 |
+
print("""
|
| 102 |
+
1. CRITICAL - PROVIDER FALLBACK (affects 8 questions):
|
| 103 |
+
- Groq falls back to llama-3.1-8b-instant (6000 TPM limit)
|
| 104 |
+
- Gemini API quota exhausted (free tier daily limit hit)
|
| 105 |
+
- Fix: Use gemini-1.5-flash or gemini-2.5-flash as PRIMARY provider
|
| 106 |
+
- Fix: Add proper provider rotation that skips quota-exhausted models
|
| 107 |
+
|
| 108 |
+
2. CRITICAL - RECURSION LIMIT (affects 8 questions):
|
| 109 |
+
- Agent loops indefinitely (25 steps) without providing an answer
|
| 110 |
+
- Causes: Tool keeps failing or returning unhelpful results
|
| 111 |
+
- Fix: Add a MAX_TOOL_CALLS guard and force FINAL ANSWER after N iterations
|
| 112 |
+
|
| 113 |
+
3. HIGH - TOOL CALL FORMAT ERRORS (affects 3 questions):
|
| 114 |
+
- LLM generates tool calls not matching the registered tool names/schema
|
| 115 |
+
- wiki_search called with wrong JSON format
|
| 116 |
+
- web_search called with 'keywords' instead of 'query' parameter
|
| 117 |
+
- Fix: Add function signature validation / tool schema alignment
|
| 118 |
+
|
| 119 |
+
4. HIGH - AUDIO/VIDEO QUESTIONS (affects 3 questions):
|
| 120 |
+
- YouTube video analysis requires video frames, not just transcript
|
| 121 |
+
- Audio transcription (mp3) failing due to context overflow
|
| 122 |
+
- Fix: Ensure transcribe_audio + get_youtube_transcript work reliably
|
| 123 |
+
|
| 124 |
+
5. HIGH - IMAGE/CHESS QUESTIONS (affects 1 question):
|
| 125 |
+
- Chess position from image requires multimodal vision model
|
| 126 |
+
- Current setup can't directly process images
|
| 127 |
+
- Fix: Pass image URL to Gemini vision model
|
| 128 |
+
|
| 129 |
+
6. MEDIUM - CONTEXT OVERFLOW on multi-step research questions:
|
| 130 |
+
- Long Wikipedia/web searches fill context window before finding answer
|
| 131 |
+
- Fix: Summarize intermediate tool results before appending to messages
|
| 132 |
+
""")
|
app.py
CHANGED
|
@@ -24,7 +24,10 @@ class BasicAgent:
|
|
| 24 |
def __call__(self, question: str) -> str:
|
| 25 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 26 |
messages = [HumanMessage(content=question)]
|
| 27 |
-
result = self.graph.invoke(
|
|
|
|
|
|
|
|
|
|
| 28 |
answer = result['messages'][-1].content
|
| 29 |
print(f"Agent returning answer: {answer}")
|
| 30 |
return answer
|
|
|
|
| 24 |
def __call__(self, question: str) -> str:
|
| 25 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 26 |
messages = [HumanMessage(content=question)]
|
| 27 |
+
result = self.graph.invoke(
|
| 28 |
+
{"messages": messages, "tool_call_count": 0, "reflection_count": 0, "tool_call_history": []},
|
| 29 |
+
config={"recursion_limit": 50},
|
| 30 |
+
)
|
| 31 |
answer = result['messages'][-1].content
|
| 32 |
print(f"Agent returning answer: {answer}")
|
| 33 |
return answer
|
gaia_results.csv
CHANGED
|
@@ -1,11 +1,9 @@
|
|
| 1 |
task_id,question,submitted_answer,ground_truth,correct
|
| 2 |
-
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,
|
| 3 |
-
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 4 |
-
2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 5 |
-
cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,
|
| 6 |
-
|
| 7 |
-
4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,"ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
|
| 8 |
-
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT",FunkMonk,False
|
| 9 |
6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
|
| 10 |
|
| 11 |
|*|a|b|c|d|e|
|
|
@@ -16,37 +14,30 @@ For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/er
|
|
| 16 |
|d|b|e|b|e|d|
|
| 17 |
|e|d|b|a|d|c|
|
| 18 |
|
| 19 |
-
provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","
|
| 20 |
9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
|
| 21 |
|
| 22 |
-
What does Teal'c say in response to the question ""Isn't that hot?""",
|
| 23 |
-
|
| 24 |
-
cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,"ERROR: LLM invocation failed: Error code: 400 - {'error': {'message': ""Failed to call a function. Please adjust your prompt. See 'failed_generation' for more details."", 'type': 'invalid_request_error', 'code': 'tool_use_failed', 'failed_generation': '<function=web_search {""keywords"": ""equine veterinarian surname CK-12 license LibreText Introductory Chemistry materials Marisa Alviar-Agnew Henry Agnew""} </function>'}}",Louvrier,False
|
| 25 |
3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
|
| 26 |
|
| 27 |
milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
|
| 28 |
|
| 29 |
-
I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","
|
| 30 |
-
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT","broccoli, celery, fresh basil, lettuce, sweet potatoes",False
|
| 31 |
99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
|
| 32 |
|
| 33 |
In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
|
| 34 |
|
| 35 |
-
Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
|
| 36 |
-
305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 6414, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",519,False
|
| 40 |
1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
|
| 41 |
|
| 42 |
-
Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.","
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.","ERROR: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
|
| 50 |
-
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT","Yoshida, Uehara",False
|
| 51 |
-
7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 14486, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",89706.00,False
|
| 52 |
-
5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,"ERROR: LLM invocation failed: Error code: 413 - {'error': {'message': 'Request too large for model `llama-3.1-8b-instant` in organization `org_01kk9kwrt4fcxtaxyjc5ze79eq` service tier `on_demand` on tokens per minute (TPM): Limit 6000, Requested 23154, please reduce your message size and try again. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}",Claus,False
|
|
|
|
| 1 |
task_id,question,submitted_answer,ground_truth,correct
|
| 2 |
+
8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,,3,False
|
| 3 |
+
a1e91b78-d3d8-4675-bb8d-62741b4b68a6,"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",,3,False
|
| 4 |
+
2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",left,Right,False
|
| 5 |
+
cca530fc-4052-43b2-b130-b30968d8aa44,Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.,,Rd5,False
|
| 6 |
+
4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,,FunkMonk,False
|
|
|
|
|
|
|
| 7 |
6f37996b-2ac7-44b0-8e68-6d28256631b4,"Given this table defining * on the set S = {a, b, c, d, e}
|
| 8 |
|
| 9 |
|*|a|b|c|d|e|
|
|
|
|
| 14 |
|d|b|e|b|e|d|
|
| 15 |
|e|d|b|a|d|c|
|
| 16 |
|
| 17 |
+
provide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.","b,e","b, e",False
|
| 18 |
9d191bce-651d-4746-be2d-7ef8ecadb9c2,"Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.
|
| 19 |
|
| 20 |
+
What does Teal'c say in response to the question ""Isn't that hot?""",,Extremely,False
|
| 21 |
+
cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,,Louvrier,False
|
|
|
|
| 22 |
3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
|
| 23 |
|
| 24 |
milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
|
| 25 |
|
| 26 |
+
I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, fresh basil, lettuce, sweet potatoes","broccoli, celery, fresh basil, lettuce, sweet potatoes",True
|
|
|
|
| 27 |
99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3,"Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.
|
| 28 |
|
| 29 |
In your response, please only list the ingredients, not any measurements. So if the recipe calls for ""a pinch of salt"" or ""two cups of ripe strawberries"" the ingredients on the list would be ""salt"" and ""ripe strawberries"".
|
| 30 |
|
| 31 |
+
Please format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",,"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",False
|
| 32 |
+
305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,,Wojciech,False
|
| 33 |
+
f918266a-b3e0-4914-865d-4faa564f1aef,What is the final numeric output from the attached Python code?,,0,False
|
| 34 |
+
3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,519,519,True
|
|
|
|
| 35 |
1f975693-876d-457b-a649-393859e79bf3,"Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(
|
| 36 |
|
| 37 |
+
Could you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",,"132, 133, 134, 197, 245",False
|
| 38 |
+
840bfca7-4f7b-481a-8794-c560c340185d,"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",,80GSFC21M0002,False
|
| 39 |
+
bda648d7-d618-4883-88f4-3466eabd860e,Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.,Saint Petersburg,Saint Petersburg,True
|
| 40 |
+
cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",,CUB,False
|
| 41 |
+
a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",,"Yoshida, Uehara",False
|
| 42 |
+
7bd855d8-463d-4ed5-93ca-5fe35145f733,The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.,89706.00,89706.00,True
|
| 43 |
+
5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,,Claus,False
|
|
|
|
|
|
|
|
|
|
|
|
gaia_results.json
CHANGED
|
@@ -1,145 +1,145 @@
|
|
| 1 |
{
|
| 2 |
-
"score":
|
| 3 |
-
"correct":
|
| 4 |
"total": 20,
|
| 5 |
"results": [
|
| 6 |
{
|
| 7 |
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 8 |
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 9 |
-
"submitted_answer": "
|
| 10 |
"ground_truth": "3",
|
| 11 |
"correct": false
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 15 |
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 16 |
-
"submitted_answer": "
|
| 17 |
"ground_truth": "3",
|
| 18 |
"correct": false
|
| 19 |
},
|
| 20 |
{
|
| 21 |
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 22 |
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 23 |
-
"submitted_answer": "
|
| 24 |
"ground_truth": "Right",
|
| 25 |
"correct": false
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 29 |
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 30 |
-
"submitted_answer": "
|
| 31 |
"ground_truth": "Rd5",
|
| 32 |
"correct": false
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 36 |
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 37 |
-
"submitted_answer": "
|
| 38 |
"ground_truth": "FunkMonk",
|
| 39 |
"correct": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
|
| 43 |
"question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
|
| 44 |
-
"submitted_answer": "
|
| 45 |
"ground_truth": "b, e",
|
| 46 |
"correct": false
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
|
| 50 |
"question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
|
| 51 |
-
"submitted_answer": "
|
| 52 |
"ground_truth": "Extremely",
|
| 53 |
"correct": false
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
|
| 57 |
"question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
|
| 58 |
-
"submitted_answer": "
|
| 59 |
"ground_truth": "Louvrier",
|
| 60 |
"correct": false
|
| 61 |
},
|
| 62 |
{
|
| 63 |
"task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
|
| 64 |
"question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
|
| 65 |
-
"submitted_answer": "
|
| 66 |
"ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
| 67 |
-
"correct":
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
|
| 71 |
"question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
|
| 72 |
-
"submitted_answer": "
|
| 73 |
"ground_truth": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
|
| 74 |
"correct": false
|
| 75 |
},
|
| 76 |
{
|
| 77 |
"task_id": "305ac316-eef6-4446-960a-92d80d542f82",
|
| 78 |
"question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
|
| 79 |
-
"submitted_answer": "
|
| 80 |
"ground_truth": "Wojciech",
|
| 81 |
"correct": false
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
|
| 85 |
"question": "What is the final numeric output from the attached Python code?",
|
| 86 |
-
"submitted_answer": "
|
| 87 |
"ground_truth": "0",
|
| 88 |
"correct": false
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
|
| 92 |
"question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
|
| 93 |
-
"submitted_answer": "
|
| 94 |
"ground_truth": "519",
|
| 95 |
-
"correct":
|
| 96 |
},
|
| 97 |
{
|
| 98 |
"task_id": "1f975693-876d-457b-a649-393859e79bf3",
|
| 99 |
"question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
|
| 100 |
-
"submitted_answer": "
|
| 101 |
"ground_truth": "132, 133, 134, 197, 245",
|
| 102 |
"correct": false
|
| 103 |
},
|
| 104 |
{
|
| 105 |
"task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
|
| 106 |
"question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
|
| 107 |
-
"submitted_answer": "
|
| 108 |
"ground_truth": "80GSFC21M0002",
|
| 109 |
"correct": false
|
| 110 |
},
|
| 111 |
{
|
| 112 |
"task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
|
| 113 |
"question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
|
| 114 |
-
"submitted_answer": "
|
| 115 |
"ground_truth": "Saint Petersburg",
|
| 116 |
-
"correct":
|
| 117 |
},
|
| 118 |
{
|
| 119 |
"task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
|
| 120 |
"question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
|
| 121 |
-
"submitted_answer": "
|
| 122 |
"ground_truth": "CUB",
|
| 123 |
"correct": false
|
| 124 |
},
|
| 125 |
{
|
| 126 |
"task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
|
| 127 |
"question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
|
| 128 |
-
"submitted_answer": "
|
| 129 |
"ground_truth": "Yoshida, Uehara",
|
| 130 |
"correct": false
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
|
| 134 |
"question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
|
| 135 |
-
"submitted_answer": "
|
| 136 |
"ground_truth": "89706.00",
|
| 137 |
-
"correct":
|
| 138 |
},
|
| 139 |
{
|
| 140 |
"task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
|
| 141 |
"question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
|
| 142 |
-
"submitted_answer": "
|
| 143 |
"ground_truth": "Claus",
|
| 144 |
"correct": false
|
| 145 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"score": 20.0,
|
| 3 |
+
"correct": 4,
|
| 4 |
"total": 20,
|
| 5 |
"results": [
|
| 6 |
{
|
| 7 |
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 8 |
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 9 |
+
"submitted_answer": "",
|
| 10 |
"ground_truth": "3",
|
| 11 |
"correct": false
|
| 12 |
},
|
| 13 |
{
|
| 14 |
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 15 |
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 16 |
+
"submitted_answer": "",
|
| 17 |
"ground_truth": "3",
|
| 18 |
"correct": false
|
| 19 |
},
|
| 20 |
{
|
| 21 |
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 22 |
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 23 |
+
"submitted_answer": "left",
|
| 24 |
"ground_truth": "Right",
|
| 25 |
"correct": false
|
| 26 |
},
|
| 27 |
{
|
| 28 |
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 29 |
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 30 |
+
"submitted_answer": "",
|
| 31 |
"ground_truth": "Rd5",
|
| 32 |
"correct": false
|
| 33 |
},
|
| 34 |
{
|
| 35 |
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 36 |
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 37 |
+
"submitted_answer": "",
|
| 38 |
"ground_truth": "FunkMonk",
|
| 39 |
"correct": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
|
| 43 |
"question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
|
| 44 |
+
"submitted_answer": "b,e",
|
| 45 |
"ground_truth": "b, e",
|
| 46 |
"correct": false
|
| 47 |
},
|
| 48 |
{
|
| 49 |
"task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
|
| 50 |
"question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
|
| 51 |
+
"submitted_answer": "",
|
| 52 |
"ground_truth": "Extremely",
|
| 53 |
"correct": false
|
| 54 |
},
|
| 55 |
{
|
| 56 |
"task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
|
| 57 |
"question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
|
| 58 |
+
"submitted_answer": "",
|
| 59 |
"ground_truth": "Louvrier",
|
| 60 |
"correct": false
|
| 61 |
},
|
| 62 |
{
|
| 63 |
"task_id": "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
|
| 64 |
"question": "I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:\n\nmilk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts\n\nI need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.",
|
| 65 |
+
"submitted_answer": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
| 66 |
"ground_truth": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
| 67 |
+
"correct": true
|
| 68 |
},
|
| 69 |
{
|
| 70 |
"task_id": "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
|
| 71 |
"question": "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it as a voice memo and the speaker on my phone is buzzing so I can't quite make out what she's saying. Could you please listen to the recipe and list all of the ingredients that my friend described? I only want the ingredients for the filling, as I have everything I need to make my favorite pie crust. I've attached the recipe as Strawberry pie.mp3.\n\nIn your response, please only list the ingredients, not any measurements. So if the recipe calls for \"a pinch of salt\" or \"two cups of ripe strawberries\" the ingredients on the list would be \"salt\" and \"ripe strawberries\".\n\nPlease format your response as a comma separated list of ingredients. Also, please alphabetize the ingredients.",
|
| 72 |
+
"submitted_answer": "",
|
| 73 |
"ground_truth": "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
|
| 74 |
"correct": false
|
| 75 |
},
|
| 76 |
{
|
| 77 |
"task_id": "305ac316-eef6-4446-960a-92d80d542f82",
|
| 78 |
"question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.",
|
| 79 |
+
"submitted_answer": "",
|
| 80 |
"ground_truth": "Wojciech",
|
| 81 |
"correct": false
|
| 82 |
},
|
| 83 |
{
|
| 84 |
"task_id": "f918266a-b3e0-4914-865d-4faa564f1aef",
|
| 85 |
"question": "What is the final numeric output from the attached Python code?",
|
| 86 |
+
"submitted_answer": "",
|
| 87 |
"ground_truth": "0",
|
| 88 |
"correct": false
|
| 89 |
},
|
| 90 |
{
|
| 91 |
"task_id": "3f57289b-8c60-48be-bd80-01f8099ca449",
|
| 92 |
"question": "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
|
| 93 |
+
"submitted_answer": "519",
|
| 94 |
"ground_truth": "519",
|
| 95 |
+
"correct": true
|
| 96 |
},
|
| 97 |
{
|
| 98 |
"task_id": "1f975693-876d-457b-a649-393859e79bf3",
|
| 99 |
"question": "Hi, I was out sick from my classes on Friday, so I'm trying to figure out what I need to study for my Calculus mid-term next week. My friend from class sent me an audio recording of Professor Willowbrook giving out the recommended reading for the test, but my headphones are broken :(\n\nCould you please listen to the recording for me and tell me the page numbers I'm supposed to go over? I've attached a file called Homework.mp3 that has the recording. Please provide just the page numbers as a comma-delimited list. And please provide the list in ascending order.",
|
| 100 |
+
"submitted_answer": "",
|
| 101 |
"ground_truth": "132, 133, 134, 197, 245",
|
| 102 |
"correct": false
|
| 103 |
},
|
| 104 |
{
|
| 105 |
"task_id": "840bfca7-4f7b-481a-8794-c560c340185d",
|
| 106 |
"question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?",
|
| 107 |
+
"submitted_answer": "",
|
| 108 |
"ground_truth": "80GSFC21M0002",
|
| 109 |
"correct": false
|
| 110 |
},
|
| 111 |
{
|
| 112 |
"task_id": "bda648d7-d618-4883-88f4-3466eabd860e",
|
| 113 |
"question": "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited? Just give me the city name without abbreviations.",
|
| 114 |
+
"submitted_answer": "Saint Petersburg",
|
| 115 |
"ground_truth": "Saint Petersburg",
|
| 116 |
+
"correct": true
|
| 117 |
},
|
| 118 |
{
|
| 119 |
"task_id": "cf106601-ab4f-4af9-b045-5295fe67b37d",
|
| 120 |
"question": "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",
|
| 121 |
+
"submitted_answer": "",
|
| 122 |
"ground_truth": "CUB",
|
| 123 |
"correct": false
|
| 124 |
},
|
| 125 |
{
|
| 126 |
"task_id": "a0c07678-e491-4bbc-8f0b-07405144218f",
|
| 127 |
"question": "Who are the pitchers with the number before and after Taish\u014d Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.",
|
| 128 |
+
"submitted_answer": "",
|
| 129 |
"ground_truth": "Yoshida, Uehara",
|
| 130 |
"correct": false
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"task_id": "7bd855d8-463d-4ed5-93ca-5fe35145f733",
|
| 134 |
"question": "The attached Excel file contains the sales of menu items for a local fast-food chain. What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.",
|
| 135 |
+
"submitted_answer": "89706.00",
|
| 136 |
"ground_truth": "89706.00",
|
| 137 |
+
"correct": true
|
| 138 |
},
|
| 139 |
{
|
| 140 |
"task_id": "5a0c1adf-205e-4841-a666-7c3ef95def9d",
|
| 141 |
"question": "What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?",
|
| 142 |
+
"submitted_answer": "",
|
| 143 |
"ground_truth": "Claus",
|
| 144 |
"correct": false
|
| 145 |
}
|
llm/client.py
CHANGED
|
@@ -1,66 +1,50 @@
|
|
| 1 |
import os
|
| 2 |
-
|
| 3 |
|
| 4 |
from langchain_core.messages import AIMessage
|
| 5 |
from llm.providers import PROVIDERS
|
| 6 |
|
| 7 |
-
PROVIDER_ORDER = os.getenv("LLM_PROVIDER_ORDER", "
|
| 8 |
|
| 9 |
-
_degraded_providers = {}
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
if name not in _degraded_providers:
|
| 16 |
-
yield name
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
def invoke_llm(messages: List, tools: List, fallback_count: int = 0) -> AIMessage:
|
| 20 |
-
"""Invoke LLM with provider fallback.
|
| 21 |
-
|
| 22 |
-
Args:
|
| 23 |
-
messages: Chat messages to send to LLM
|
| 24 |
-
tools: List of tools to bind
|
| 25 |
-
fallback_count: Current retry attempt
|
| 26 |
-
|
| 27 |
-
Returns:
|
| 28 |
-
AIMessage response from successful provider
|
| 29 |
-
"""
|
| 30 |
-
provider_name = None
|
| 31 |
-
provider = None
|
| 32 |
-
|
| 33 |
-
for name in _get_next_provider():
|
| 34 |
-
provider_name = name
|
| 35 |
-
provider = PROVIDERS.get(name)
|
| 36 |
-
if provider:
|
| 37 |
-
break
|
| 38 |
-
|
| 39 |
-
if not provider:
|
| 40 |
-
return AIMessage(content="ERROR: No available LLM providers")
|
| 41 |
-
|
| 42 |
-
try:
|
| 43 |
models = provider.get_models()
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import time
|
| 3 |
|
| 4 |
from langchain_core.messages import AIMessage
|
| 5 |
from llm.providers import PROVIDERS
|
| 6 |
|
| 7 |
+
PROVIDER_ORDER = [p.strip() for p in os.getenv("LLM_PROVIDER_ORDER", "opencode_zen, groq").split(",")]
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
+
def invoke_llm(messages, tools, fallback_count=0, _degraded=None):
    """Try each configured provider and model in order until one responds.

    Args:
        messages: Chat messages, forwarded unchanged to ``provider.invoke``.
        tools: Tools, forwarded unchanged to ``provider.invoke``.
        fallback_count: Accepted for call compatibility; not read here.
        _degraded: Optional dict of provider names to skip. Every provider
            whose models are all exhausted by this call is added to it.

    Returns:
        The result of the first ``provider.invoke`` call that succeeds, or an
        ``AIMessage`` with an error string when every provider has failed.
    """
    rate_limit_markers = ("rate limit", "429", "quota", "resource ex")
    skip_markers = ("payment required", "402", "tool_use_failed", "model_not_found", "too large", "413")
    max_rate_limit_attempts = 2

    if _degraded is None:
        _degraded = {}

    for provider_name in PROVIDER_ORDER:
        if provider_name in _degraded:
            continue

        provider = PROVIDERS.get(provider_name)
        if not provider:
            # Unknown or unregistered provider names are ignored.
            continue

        for model_name in provider.get_models():
            print(f"Invoking {provider_name} with {model_name}", flush=True)

            for _attempt in range(max_rate_limit_attempts):
                try:
                    return provider.invoke(messages, tools, model_name)
                except Exception as exc:
                    detail = str(exc)
                    lowered = detail.lower()

                    if any(marker in lowered for marker in rate_limit_markers):
                        # Rate limited: wait, then retry the same model.
                        print(f"{provider_name}/{model_name} rate limited, waiting...", flush=True)
                        time.sleep(65)
                        continue

                    if any(marker in lowered for marker in skip_markers):
                        print(f"{provider_name}/{model_name} skip, trying next", flush=True)
                    else:
                        print(f"{provider_name}/{model_name} error: {type(exc).__name__}: {detail[:150]}", flush=True)
                    # Any non-rate-limit failure moves on to the next model.
                    break

        # Every model of this provider was tried without success.
        _degraded[provider_name] = True

    return AIMessage(content="ERROR: All LLM providers failed")
|
llm/providers/__init__.py
CHANGED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
from llm.providers import gemini, gemini_gemma, groq
|
| 2 |
|
| 3 |
PROVIDERS = {
|
| 4 |
"gemini": gemini,
|
| 5 |
"gemini_gemma": gemini_gemma,
|
| 6 |
"groq": groq,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
}
|
| 8 |
|
| 9 |
-
__all__ = ["PROVIDERS", "gemini", "gemini_gemma", "groq"]
|
|
|
|
| 1 |
+
from llm.providers import gemini, gemini_gemma, groq, openrouter, together, zai, hf_inference, opencode_zen
|
| 2 |
|
| 3 |
PROVIDERS = {
|
| 4 |
"gemini": gemini,
|
| 5 |
"gemini_gemma": gemini_gemma,
|
| 6 |
"groq": groq,
|
| 7 |
+
"openrouter": openrouter,
|
| 8 |
+
"together": together,
|
| 9 |
+
"zai": zai,
|
| 10 |
+
"hf_inference": hf_inference,
|
| 11 |
+
"opencode_zen": opencode_zen,
|
| 12 |
}
|
| 13 |
|
| 14 |
+
__all__ = ["PROVIDERS", "gemini", "gemini_gemma", "groq", "openrouter", "together", "zai", "hf_inference", "opencode_zen"]
|
llm/providers/groq.py
CHANGED
|
@@ -10,4 +10,4 @@ def invoke(messages, tools, model_name: str = "llama-3.3-70b-versatile"):
|
|
| 10 |
|
| 11 |
def get_models():
|
| 12 |
"""List available Groq models for fallback."""
|
| 13 |
-
return ["llama-3.
|
|
|
|
| 10 |
|
| 11 |
def get_models():
    """Return the Groq model names offered for fallback, in list order."""
    fallback_models = ("llama-3.1-8b-instant", "llama-3.3-70b-versatile")
    return list(fallback_models)
|
llm/providers/hf_inference.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
from huggingface_hub import InferenceClient
|
| 5 |
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
token = os.getenv("HF_TOKEN")
|
| 10 |
+
client = InferenceClient(token=token)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _convert_message(msg):
|
| 14 |
+
role_map = {
|
| 15 |
+
"HumanMessage": "user",
|
| 16 |
+
"AIMessage": "assistant",
|
| 17 |
+
"SystemMessage": "system",
|
| 18 |
+
"ToolMessage": "tool",
|
| 19 |
+
}
|
| 20 |
+
role = role_map.get(type(msg).__name__, "user")
|
| 21 |
+
d = {"role": role, "content": msg.content if msg.content else ""}
|
| 22 |
+
|
| 23 |
+
if role == "tool":
|
| 24 |
+
d["tool_call_id"] = getattr(msg, "tool_call_id", "")
|
| 25 |
+
d["name"] = getattr(msg, "name", "")
|
| 26 |
+
|
| 27 |
+
if role == "assistant" and hasattr(msg, "tool_calls") and msg.tool_calls:
|
| 28 |
+
d["tool_calls"] = []
|
| 29 |
+
for tc in msg.tool_calls:
|
| 30 |
+
d["tool_calls"].append({
|
| 31 |
+
"id": tc.get("id", ""),
|
| 32 |
+
"type": "function",
|
| 33 |
+
"function": {
|
| 34 |
+
"name": tc["name"],
|
| 35 |
+
"arguments": json.dumps(tc["args"]) if isinstance(tc["args"], dict) else tc["args"],
|
| 36 |
+
},
|
| 37 |
+
})
|
| 38 |
+
return d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _convert_tools(tools):
|
| 42 |
+
result = []
|
| 43 |
+
for t in tools:
|
| 44 |
+
result.append({
|
| 45 |
+
"type": "function",
|
| 46 |
+
"function": {
|
| 47 |
+
"name": t.name,
|
| 48 |
+
"description": t.description,
|
| 49 |
+
"parameters": t.args_schema.schema() if hasattr(t, "args_schema") and t.args_schema else {},
|
| 50 |
+
},
|
| 51 |
+
})
|
| 52 |
+
return result
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def invoke(messages, tools, model_name: str = "deepseek-ai/DeepSeek-V3-0324"):
    """Send the conversation to ``client.chat_completion`` and wrap the reply.

    Args:
        messages: Messages, each converted with ``_convert_message``.
        tools: Tools converted with ``_convert_tools``; a falsy value sends
            no tools and no ``tool_choice``.
        model_name: Model identifier passed to the client.

    Returns:
        An ``AIMessage`` built from the first choice of the response. When
        the reply contains tool calls they are attached both as parsed
        ``tool_calls`` and in raw form under ``additional_kwargs``.
    """
    payload_messages = [_convert_message(m) for m in messages]
    payload_tools = _convert_tools(tools) if tools else None

    resp = client.chat_completion(
        model=model_name,
        messages=payload_messages,
        tools=payload_tools,
        tool_choice="auto" if payload_tools else None,
        max_tokens=2048,
        temperature=0,
    )

    reply = resp.choices[0].message
    response_kwargs = {"content": reply.content or ""}

    if not reply.tool_calls:
        return AIMessage(**response_kwargs)

    # Parsed form: arguments decoded from their JSON string ({} when empty).
    response_kwargs["tool_calls"] = [
        {
            "id": call.id,
            "name": call.function.name,
            "args": json.loads(call.function.arguments) if call.function.arguments else {},
        }
        for call in reply.tool_calls
    ]
    # Raw form: arguments kept exactly as returned by the client.
    response_kwargs["additional_kwargs"] = {
        "tool_calls": [
            {
                "id": call.id,
                "type": "function",
                "function": {"name": call.function.name, "arguments": call.function.arguments},
            }
            for call in reply.tool_calls
        ]
    }

    return AIMessage(**response_kwargs)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def get_models():
    """Return a fresh list of the model identifiers this provider offers."""
    models = ["deepseek-ai/DeepSeek-V3-0324"]
    return models
|
llm/providers/opencode_zen.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import requests
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
# SECURITY: the key is read from the environment only. Never hard-code a
# fallback credential here; any key that has ever been committed to version
# control must be treated as compromised and rotated.
# When OPENCODE_ZEN_API_KEY is unset the key is empty and requests are expected
# to be rejected by the server rather than silently using a shared secret.
API_KEY = os.getenv("OPENCODE_ZEN_API_KEY", "")
BASE_URL = "https://opencode.ai/zen/v1"
# Headers sent with every request made by this module.
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _convert_message(msg):
|
| 18 |
+
role_map = {
|
| 19 |
+
"HumanMessage": "user",
|
| 20 |
+
"AIMessage": "assistant",
|
| 21 |
+
"SystemMessage": "system",
|
| 22 |
+
"ToolMessage": "tool",
|
| 23 |
+
}
|
| 24 |
+
role = role_map.get(type(msg).__name__, "user")
|
| 25 |
+
d = {"role": role, "content": msg.content if msg.content else ""}
|
| 26 |
+
|
| 27 |
+
if role == "tool":
|
| 28 |
+
d["tool_call_id"] = getattr(msg, "tool_call_id", "")
|
| 29 |
+
d["name"] = getattr(msg, "name", "")
|
| 30 |
+
|
| 31 |
+
if role == "assistant":
|
| 32 |
+
rc = None
|
| 33 |
+
if hasattr(msg, "additional_kwargs") and msg.additional_kwargs:
|
| 34 |
+
rc = msg.additional_kwargs.get("reasoning_content")
|
| 35 |
+
if rc:
|
| 36 |
+
d["reasoning_content"] = rc
|
| 37 |
+
|
| 38 |
+
if hasattr(msg, "tool_calls") and msg.tool_calls:
|
| 39 |
+
d["tool_calls"] = []
|
| 40 |
+
for tc in msg.tool_calls:
|
| 41 |
+
d["tool_calls"].append({
|
| 42 |
+
"id": tc.get("id", ""),
|
| 43 |
+
"type": "function",
|
| 44 |
+
"function": {
|
| 45 |
+
"name": tc["name"],
|
| 46 |
+
"arguments": json.dumps(tc["args"]) if isinstance(tc["args"], dict) else tc["args"],
|
| 47 |
+
},
|
| 48 |
+
})
|
| 49 |
+
return d
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _convert_tools(tools):
|
| 53 |
+
result = []
|
| 54 |
+
for t in tools:
|
| 55 |
+
result.append({
|
| 56 |
+
"type": "function",
|
| 57 |
+
"function": {
|
| 58 |
+
"name": t.name,
|
| 59 |
+
"description": t.description,
|
| 60 |
+
"parameters": t.args_schema.schema() if hasattr(t, "args_schema") and t.args_schema else {},
|
| 61 |
+
},
|
| 62 |
+
})
|
| 63 |
+
return result
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def invoke(messages, tools, model_name: str = None):
    """POST one chat completion to the Opencode Zen endpoint and return an ``AIMessage``.

    Args:
        messages: Message objects; each is passed through ``_convert_message``.
        tools: Tool objects to expose to the model. When empty or ``None`` the
            request carries neither ``tools`` nor ``tool_choice``.
        model_name: Model identifier; defaults to ``"deepseek-v4-flash-free"``
            when ``None``.

    Returns:
        An ``AIMessage`` with the reply text (``""`` if absent). A truthy
        ``reasoning_content`` and any tool calls (in wire format) are stored in
        ``additional_kwargs``; parsed tool calls are also set as ``tool_calls``.

    Raises:
        ``requests`` exceptions for transport failures and, via
        ``raise_for_status``, for HTTP error statuses; ``json.JSONDecodeError``
        if a tool call carries a string that is not valid JSON.
    """
    if model_name is None:
        model_name = "deepseek-v4-flash-free"

    hf_messages = [_convert_message(m) for m in messages]
    hf_tools = _convert_tools(tools) if tools else None

    data = {
        "model": model_name,
        "messages": hf_messages,
        "max_tokens": 4096,
        "temperature": 0,
    }
    if hf_tools:
        data["tools"] = hf_tools
        data["tool_choice"] = "auto"

    resp = requests.post(f"{BASE_URL}/chat/completions", headers=HEADERS, json=data, timeout=120)
    if resp.status_code != 200:
        # Report the real status: this branch runs for every non-200 response,
        # not only for 400.
        print(f"opencode_zen {resp.status_code} body: {resp.text[:300]}", flush=True)
        print(f"opencode_zen request model={model_name} tools={bool(hf_tools)} msgs={len(hf_messages)}", flush=True)
    resp.raise_for_status()

    # Only the first choice is used.
    msg = resp.json()["choices"][0]["message"]

    response_kwargs = {"content": msg.get("content") or ""}
    additional_kwargs = {}

    reasoning = msg.get("reasoning_content")
    if reasoning:
        additional_kwargs["reasoning_content"] = reasoning

    tool_calls_data = msg.get("tool_calls")
    if tool_calls_data:
        parsed_calls = []
        wire_calls = []
        for tc in tool_calls_data:
            function = tc["function"]
            # Read "arguments" once with .get so a call without the key is
            # handled the same way in both representations (no KeyError).
            raw_args = function.get("arguments")
            if raw_args is None or isinstance(raw_args, str):
                parsed_args = json.loads(raw_args) if raw_args else {}
                wire_args = raw_args
            else:
                # Already-decoded arguments: use as-is, serialize for the wire copy.
                parsed_args = raw_args
                wire_args = json.dumps(raw_args)

            parsed_calls.append({
                "id": tc["id"],
                "name": function["name"],
                "args": parsed_args,
            })
            wire_calls.append({
                "id": tc["id"],
                "type": "function",
                "function": {"name": function["name"], "arguments": wire_args},
            })
        response_kwargs["tool_calls"] = parsed_calls
        additional_kwargs["tool_calls"] = wire_calls

    if additional_kwargs:
        response_kwargs["additional_kwargs"] = additional_kwargs

    return AIMessage(**response_kwargs)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def get_models():
    """Return a fresh list of the model identifiers this provider offers, in order."""
    models = [
        "deepseek-v4-flash-free",
        "nemotron-3-super-free",
        "big-pickle",
    ]
    return models
|
llm/providers/openrouter.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def invoke(messages, tools, model_name: str = "deepseek/deepseek-chat"):
    """Invoke an OpenRouter-hosted chat model and return its response message.

    Args:
        messages: Conversation messages passed straight to the model.
        tools: Tools to bind to the model. When empty or ``None`` the model is
            invoked without any tool binding.
        model_name: OpenRouter model identifier.

    Returns:
        Whatever ``ChatOpenAI(...).invoke`` returns for ``messages``.

    The API key is read from ``OPENROUTER_API_KEY`` at call time.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY"),
    )
    # Bind tools only when some were supplied: binding None fails while
    # iterating, and an empty tool list is presumably rejected by
    # OpenAI-compatible endpoints.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_models():
    """Return the OpenRouter model identifiers offered by this provider.

    These are intended to be free-tier models (not verifiable from this file).
    """
    models = [
        "deepseek/deepseek-chat",
        "meta-llama/llama-3.2-3b-instruct",
    ]
    return models
|
llm/providers/together.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def invoke(messages, tools, model_name: str = "meta-llama/Llama-4-Scout-17B-16E-Instruct"):
    """Invoke a Together AI chat model and return its response message.

    Args:
        messages: Conversation messages passed straight to the model.
        tools: Tools to bind to the model. When empty or ``None`` the model is
            invoked without any tool binding.
        model_name: Together AI model identifier.

    Returns:
        Whatever ``ChatOpenAI(...).invoke`` returns for ``messages``.

    The API key is read from ``TOGETHER_API_KEY`` at call time.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://api.together.xyz/v1",
        api_key=os.getenv("TOGETHER_API_KEY"),
    )
    # Bind tools only when some were supplied: binding None fails while
    # iterating, and an empty tool list is presumably rejected by
    # OpenAI-compatible endpoints.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_models():
    """Return the Together AI model identifiers offered by this provider.

    These are intended to be free-tier models (not verifiable from this file).
    """
    models = []
    models.append("meta-llama/Llama-4-Scout-17B-16E-Instruct")
    models.append("mistralai/Mixtral-8x7B-Instruct-v0.1")
    return models
|
llm/providers/zai.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def invoke(messages, tools, model_name: str = "z-ai/glm-5"):
    """Invoke a ZAI chat model and return its response message.

    Args:
        messages: Conversation messages passed straight to the model.
        tools: Tools to bind to the model. When empty or ``None`` the model is
            invoked without any tool binding.
        model_name: ZAI model identifier.

    Returns:
        Whatever ``ChatOpenAI(...).invoke`` returns for ``messages``.

    The API key is read from ``ZAI_API_KEY`` at call time.
    """
    model = ChatOpenAI(
        model=model_name,
        temperature=0,
        base_url="https://api.z.ai/api/paas/v4",
        api_key=os.getenv("ZAI_API_KEY"),
    )
    # Bind tools only when some were supplied: binding None fails while
    # iterating, and an empty tool list is presumably rejected by
    # OpenAI-compatible endpoints.
    runnable = model.bind_tools(tools) if tools else model
    return runnable.invoke(messages)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_models():
    """Return a fresh list of the ZAI model identifiers offered by this provider."""
    models = ["z-ai/glm-5", "z-ai/glm-5.1"]
    return models
|
run_local.py
CHANGED
|
@@ -16,10 +16,26 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
| 16 |
|
| 17 |
def extract_answer(content) -> str:
|
| 18 |
if isinstance(content, str):
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
if match:
|
| 21 |
return match.group(1).strip()
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return str(content)
|
| 24 |
|
| 25 |
class BasicAgent:
|
|
@@ -29,9 +45,16 @@ class BasicAgent:
|
|
| 29 |
|
| 30 |
def __call__(self, question: str) -> str:
|
| 31 |
messages = [HumanMessage(content=question)]
|
| 32 |
-
result = self.graph.invoke(
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def file_extract(local_file_path, task_id):
|
| 37 |
if not local_file_path:
|
|
@@ -107,8 +130,9 @@ def main():
|
|
| 107 |
})
|
| 108 |
|
| 109 |
status = "OK" if is_correct else "FAIL"
|
| 110 |
-
|
| 111 |
-
print(f"
|
|
|
|
| 112 |
|
| 113 |
time.sleep(1.5)
|
| 114 |
|
|
|
|
| 16 |
|
| 17 |
def extract_answer(content) -> str:
    """Pull the short final answer out of a model reply.

    For string input the following are tried in order, each using the first
    match, and the captured text is returned stripped:

    1. ``FINAL ANSWER: <text>`` up to the end of that line.
    2. ``Answer: <text>`` up to the end of that line.
    3. ``[the] answer is[:] <text>`` up to a sentence-ending period (a period
       followed by whitespace or the end of the text) or the end of the line.
    4. The last non-empty line.

    All patterns are case-insensitive. Blank or whitespace-only strings give
    ``""``. Non-string input is returned as ``str(content)``.
    """
    if not isinstance(content, str):
        return str(content)

    cleaned = content.strip()
    if not cleaned:
        return ""

    patterns = (
        # Most specific marker first.
        r'FINAL ANSWER:\s*(.+?)(?:\n|$)',
        r'Answer:\s*(.+?)(?:\n|$)',
        # Stop only at a period that ends a sentence, so decimals and
        # abbreviations such as "3.14" or "U.S.A" are kept whole, and also stop
        # at a newline so an answer followed by more lines is still found.
        r'(?:the\s+)?answer\s+is\s*:?\s*(.+?)(?:\.(?=\s|$)|\n|$)',
    )
    for pattern in patterns:
        match = re.search(pattern, cleaned, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # No marker found: fall back to the last non-empty line.
    lines = [line.strip() for line in cleaned.split('\n') if line.strip()]
    if lines:
        return lines[-1]
    return cleaned
|
| 40 |
|
| 41 |
class BasicAgent:
|
|
|
|
| 45 |
|
| 46 |
def __call__(self, question: str) -> str:
    """Run the agent graph on ``question`` and return the extracted answer.

    The graph is invoked with a fresh state (one human message, zeroed
    counters, empty tool-call history) and a recursion limit of 50. The answer
    is extracted from the most recent message whose class name is
    ``AIMessage`` and whose content is non-empty; ``""`` is returned when no
    such message exists.
    """
    initial_state = {
        "messages": [HumanMessage(content=question)],
        "tool_call_count": 0,
        "reflection_count": 0,
        "tool_call_history": [],
    }
    result = self.graph.invoke(initial_state, config={"recursion_limit": 50})

    # Walk backwards, skipping tool messages and AI messages that only carry
    # tool calls (empty content).
    final_text = next(
        (
            m.content
            for m in reversed(result['messages'])
            if type(m).__name__ == 'AIMessage' and m.content
        ),
        None,
    )
    if final_text is None:
        return ""
    return extract_answer(final_text)
|
| 58 |
|
| 59 |
def file_extract(local_file_path, task_id):
|
| 60 |
if not local_file_path:
|
|
|
|
| 130 |
})
|
| 131 |
|
| 132 |
status = "OK" if is_correct else "FAIL"
|
| 133 |
+
def safe(s): return str(s).encode('utf-8', errors='replace').decode('utf-8', errors='replace')[:40]
|
| 134 |
+
print(f" {status} Submitted: {safe(answer)}")
|
| 135 |
+
print(f" Ground: {safe(ground_truth)}")
|
| 136 |
|
| 137 |
time.sleep(1.5)
|
| 138 |
|
test_env.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
# Debug helper: load .env (its values take precedence over the existing
# environment) and report which provider settings are present.
load_dotenv(override=True)

# Only presence is reported for secrets, never their values.
for var_name in ("GROQ_API_KEY", "GOOGLE_API_KEY", "HF_TOKEN"):
    print(f"{var_name} exists:", var_name in os.environ)

print("LLM_PROVIDER_ORDER:", os.getenv("LLM_PROVIDER_ORDER"))
|
test_gemini.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 4 |
+
from langchain_core.messages import HumanMessage
|
| 5 |
+
|
| 6 |
+
# Smoke test: load .env (overriding the existing environment) and send one
# prompt to Gemini, printing either the reply or the failure.
load_dotenv(override=True)
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Show only a short prefix of the key, or None when it is not set.
if GOOGLE_API_KEY:
    key_preview = GOOGLE_API_KEY[:10] + "..."
else:
    key_preview = None
print("GOOGLE_API_KEY:", key_preview)

try:
    model = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash",
        temperature=0,
        google_api_key=GOOGLE_API_KEY,
    )
    prompt = [HumanMessage(content="Hello, who are you?")]
    response = model.invoke(prompt)
    print("Gemini response:", response.content)
except Exception as e:
    # Best-effort diagnostic script: report any failure instead of crashing.
    print("Gemini failed:", e)
|
tools/__init__.py
CHANGED
|
@@ -1,19 +1,21 @@
|
|
| 1 |
from tools.web.search import web_search
|
| 2 |
from tools.web.wiki import wiki_search
|
|
|
|
| 3 |
from tools.web.browse import browse_url
|
| 4 |
from tools.file.reader import read_file
|
|
|
|
| 5 |
from tools.python import python_repl
|
| 6 |
-
from tools.reverse import reverse_text
|
| 7 |
from tools.youtube import get_youtube_transcript
|
| 8 |
from tools.audio import transcribe_audio
|
| 9 |
|
| 10 |
__all__ = [
|
| 11 |
web_search,
|
| 12 |
wiki_search,
|
|
|
|
| 13 |
browse_url,
|
| 14 |
read_file,
|
|
|
|
| 15 |
python_repl,
|
| 16 |
-
reverse_text,
|
| 17 |
get_youtube_transcript,
|
| 18 |
transcribe_audio,
|
| 19 |
]
|
|
|
|
| 1 |
from tools.web.search import web_search
|
| 2 |
from tools.web.wiki import wiki_search
|
| 3 |
+
from tools.web.wiki_page import wiki_page
|
| 4 |
from tools.web.browse import browse_url
|
| 5 |
from tools.file.reader import read_file
|
| 6 |
+
from tools.file.spreadsheet import parse_spreadsheet
|
| 7 |
from tools.python import python_repl
|
|
|
|
| 8 |
from tools.youtube import get_youtube_transcript
|
| 9 |
from tools.audio import transcribe_audio
|
| 10 |
|
| 11 |
__all__ = [
|
| 12 |
web_search,
|
| 13 |
wiki_search,
|
| 14 |
+
wiki_page,
|
| 15 |
browse_url,
|
| 16 |
read_file,
|
| 17 |
+
parse_spreadsheet,
|
| 18 |
python_repl,
|
|
|
|
| 19 |
get_youtube_transcript,
|
| 20 |
transcribe_audio,
|
| 21 |
]
|
tools/file/reader.py
CHANGED
|
@@ -20,6 +20,15 @@ def read_file(path: str) -> str:
|
|
| 20 |
loader = UnstructuredImageLoader(path)
|
| 21 |
docs = loader.load()
|
| 22 |
content = "\n\n".join([doc.page_content for doc in docs])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
elif ext == ".pdf":
|
| 24 |
try:
|
| 25 |
doc = fitz.open(path)
|
|
|
|
| 20 |
loader = UnstructuredImageLoader(path)
|
| 21 |
docs = loader.load()
|
| 22 |
content = "\n\n".join([doc.page_content for doc in docs])
|
| 23 |
+
elif ext in (".xlsx", ".xls", ".csv"):
|
| 24 |
+
import pandas as pd
|
| 25 |
+
df = pd.read_excel(path) if ext != ".csv" else pd.read_csv(path)
|
| 26 |
+
buf = [f"Rows: {len(df)}, Columns: {list(df.columns)}"]
|
| 27 |
+
buf.append(" | ".join(str(c) for c in df.columns))
|
| 28 |
+
buf.append("-" * min(200, 10 + 12 * len(df.columns)))
|
| 29 |
+
for i, (_, row) in enumerate(df.iterrows()):
|
| 30 |
+
buf.append(f"{i} | " + " | ".join(str(v) if pd.notna(v) else "" for v in row))
|
| 31 |
+
content = "\n".join(buf)
|
| 32 |
elif ext == ".pdf":
|
| 33 |
try:
|
| 34 |
doc = fitz.open(path)
|
tools/file/spreadsheet.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.tools import tool
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@tool
def parse_spreadsheet(path: str) -> str:
    """Read an Excel (.xlsx) or CSV file and return its contents as a formatted text table.
    Use this instead of read_file for spreadsheet files to get properly structured data.
    Returns all rows with column headers and row numbers."""
    # The docstring above presumably doubles as the tool description shown to
    # the model (langchain `@tool`), so its wording is left unchanged.
    #
    # Failures are reported as sentinel strings rather than raised:
    # FILE_NOT_FOUND, UNSUPPORTED_FORMAT, or SPREADSHEET_ERROR for anything else.
    try:
        import os
        import pandas as pd

        if not os.path.exists(path):
            return f"FILE_NOT_FOUND: {path}"

        ext = os.path.splitext(path)[1].lower()
        if ext == ".csv":
            df = pd.read_csv(path)
        elif ext == ".xlsx":
            df = pd.read_excel(path, engine="openpyxl")
        elif ext == ".xls":
            df = pd.read_excel(path, engine="xlrd")
        else:
            return f"UNSUPPORTED_FORMAT: {ext}"

        column_count = len(df.columns)
        summary = f"Sheet: {os.path.basename(path)} | Rows: {len(df)} | Columns: {column_count}"
        header = " | " + " | ".join(str(c) for c in df.columns)
        # Separator width grows with the column count, capped at 200 characters.
        separator = "-" * min(200, 10 + 12 * column_count)

        # One line per row, prefixed with its positional index; missing values
        # are rendered as empty cells.
        body = [
            f"{i} | " + " | ".join(str(v) if pd.notna(v) else "" for v in row)
            for i, (_, row) in enumerate(df.iterrows())
        ]

        result = "\n".join([summary, header, separator] + body)
        if len(result) > 25000:
            # Cap the output length and mark the cut.
            result = result[:25000] + "\n... [TRUNCATED]"

        return result
    except Exception as e:
        return f"SPREADSHEET_ERROR: {e}"
|
tools/python.py
CHANGED
|
@@ -8,6 +8,7 @@ def python_repl(code: str) -> str:
|
|
| 8 |
"""Execute python code and return the output. Use this for calculations, data analysis, or processing files.
|
| 9 |
The code should be a valid python script that prints the final result.
|
| 10 |
You can use libraries like pandas, numpy, PIL, etc.
|
|
|
|
| 11 |
Example: print(df.head()) or print(2 + 2)"""
|
| 12 |
try:
|
| 13 |
old_stdout = sys.stdout
|
|
|
|
| 8 |
"""Execute python code and return the output. Use this for calculations, data analysis, or processing files.
|
| 9 |
The code should be a valid python script that prints the final result.
|
| 10 |
You can use libraries like pandas, numpy, PIL, etc.
|
| 11 |
+
IMPORTANT: Variables persist between calls to this tool (same Python process). You can define a variable in one call and use it in the next.
|
| 12 |
Example: print(df.head()) or print(2 + 2)"""
|
| 13 |
try:
|
| 14 |
old_stdout = sys.stdout
|
tools/web/search.py
CHANGED
|
@@ -1,18 +1,20 @@
|
|
| 1 |
-
from langchain_tavily import TavilySearch
|
| 2 |
from langchain_core.tools import tool
|
| 3 |
|
| 4 |
|
| 5 |
@tool
|
| 6 |
def web_search(keywords: str) -> str:
|
| 7 |
-
"""Search the web
|
| 8 |
-
For deeper research or browsing specific URLs, additional tools may be required.
|
| 9 |
-
"""
|
| 10 |
try:
|
| 11 |
-
|
| 12 |
-
results =
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
for r in results:
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
except Exception as e:
|
| 18 |
return f"SEARCH_ERROR: {e}"
|
|
|
|
|
|
|
| 1 |
from langchain_core.tools import tool
|
| 2 |
|
| 3 |
|
| 4 |
@tool
def web_search(keywords: str) -> str:
    """Search the web. Use this for finding information, facts, URLs, and general web content. Returns title, URL, and snippet for each result."""
    # The docstring above presumably doubles as the tool description shown to
    # the model (langchain `@tool`), so its wording is left unchanged.
    #
    # Returns up to five results separated by blank lines, "NO_RESULTS" when
    # the search is empty, or "SEARCH_ERROR: ..." on any failure.
    try:
        from ddgs import DDGS

        hits = list(DDGS().text(keywords, max_results=5))
        if not hits:
            return "NO_RESULTS"

        # Snippets are cut to 300 characters to keep the output compact.
        return "\n\n".join(
            "Title: {}\nURL: {}\nContent: {}".format(
                hit.get("title", ""),
                hit.get("href", ""),
                hit.get("body", "")[:300],
            )
            for hit in hits
        )
    except Exception as e:
        return f"SEARCH_ERROR: {e}"
|
tools/web/wiki_page.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_core.tools import tool
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@tool
def wiki_page(title: str) -> str:
    """Fetch the full text content of a Wikipedia page by its exact title.
    Use this when wiki_search snippets are insufficient and you need the complete article.
    Provide the exact Wikipedia page title (case-sensitive, spaces allowed).
    Returns the first 25000 characters of the article."""
    # The docstring above presumably doubles as the tool description shown to
    # the model (langchain `@tool`), so its wording is left unchanged.
    #
    # Results are reported as strings: the article text (capped at 25000
    # characters), or one of NO_RESULTS, PAGE_NOT_FOUND, NO_CONTENT,
    # WIKI_PAGE_ERROR: <reason>.
    try:
        import requests

        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
            "exlimit": 1,
            # Follow redirects so that a title which redirects to the real
            # article yields that article's text instead of an empty extract.
            "redirects": 1,
        }
        resp = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params=params,
            headers={"User-Agent": "GAIA-Benchmark-Agent/1.0"},
            timeout=15,
        )
        resp.raise_for_status()

        pages = resp.json().get("query", {}).get("pages", {})
        if not pages:
            return "NO_RESULTS"

        # A single title was requested, so only the first entry is inspected.
        page_id = next(iter(pages))
        if page_id == "-1":
            # "-1" is the key the API uses for a title that does not exist.
            return "PAGE_NOT_FOUND"

        extract = pages[page_id].get("extract", "")
        if not extract:
            return "NO_CONTENT"
        if len(extract) > 25000:
            extract = extract[:25000] + "\n... [TRUNCATED]"
        return extract
    except Exception as e:
        return f"WIKI_PAGE_ERROR: {e}"
|