Update Groq model to openai/gpt-oss-120b
Browse files
output/gaia_results_20260104_222540.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"generated": "2026-01-04 22:25:40",
|
| 4 |
+
"timestamp": "20260104_222540",
|
| 5 |
+
"total_questions": 8,
|
| 6 |
+
"execution_time_seconds": 57.18,
|
| 7 |
+
"execution_time_formatted": "0m 57s",
|
| 8 |
+
"score_percent": 5.0,
|
| 9 |
+
"correct_count": 1,
|
| 10 |
+
"total_attempted": 8
|
| 11 |
+
},
|
| 12 |
+
"submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/8 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (8 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
|
| 13 |
+
"results": [
|
| 14 |
+
{
|
| 15 |
+
"task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 16 |
+
"question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 17 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 18 |
+
"correct": false
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 22 |
+
"question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
|
| 23 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 24 |
+
"correct": false
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 28 |
+
"question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
|
| 29 |
+
"submitted_answer": "right",
|
| 30 |
+
"correct": true
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
|
| 34 |
+
"question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
|
| 35 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
|
| 36 |
+
"correct": false
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
|
| 40 |
+
"question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
|
| 41 |
+
"submitted_answer": "Unable to answer",
|
| 42 |
+
"correct": false
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
|
| 46 |
+
"question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
|
| 47 |
+
"submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: FileNotFoundError: Text file not found: path/to/operation_table.csv",
|
| 48 |
+
"correct": false
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 52 |
+
"question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
|
| 53 |
+
"submitted_answer": "Unable to answer",
|
| 54 |
+
"correct": false
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 58 |
+
"question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 59 |
+
"submitted_answer": "Unable to answer",
|
| 60 |
+
"correct": false
|
| 61 |
+
}
|
| 62 |
+
]
|
| 63 |
+
}
|
src/agent/llm_client.py
CHANGED
|
@@ -38,7 +38,7 @@ HF_MODEL = "Qwen/Qwen2.5-72B-Instruct" # Excellent for function calling and rea
|
|
| 38 |
# Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"
|
| 39 |
|
| 40 |
# Groq Configuration
|
| 41 |
-
GROQ_MODEL = "
|
| 42 |
# Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"
|
| 43 |
|
| 44 |
# Shared Configuration
|
|
@@ -184,7 +184,9 @@ def _call_with_fallback(function_name: str, *args, **kwargs) -> Any:
|
|
| 184 |
logger.info(f"[{function_name}] Using primary provider: {primary_provider}")
|
| 185 |
return retry_with_backoff(lambda: primary_func(*args, **kwargs))
|
| 186 |
except Exception as primary_error:
|
| 187 |
-
logger.warning(
|
|
|
|
|
|
|
| 188 |
|
| 189 |
# If fallback disabled, raise immediately
|
| 190 |
if not enable_fallback:
|
|
@@ -199,11 +201,15 @@ def _call_with_fallback(function_name: str, *args, **kwargs) -> Any:
|
|
| 199 |
for fallback_provider in fallback_providers:
|
| 200 |
try:
|
| 201 |
fallback_func = _get_provider_function(function_name, fallback_provider)
|
| 202 |
-
logger.info(
|
|
|
|
|
|
|
| 203 |
return retry_with_backoff(lambda: fallback_func(*args, **kwargs))
|
| 204 |
except Exception as fallback_error:
|
| 205 |
errors[fallback_provider] = fallback_error
|
| 206 |
-
logger.warning(
|
|
|
|
|
|
|
| 207 |
continue
|
| 208 |
|
| 209 |
# All providers failed
|
|
|
|
| 38 |
# Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"
|
| 39 |
|
| 40 |
# Groq Configuration
|
| 41 |
+
GROQ_MODEL = "openai/gpt-oss-120b"
|
| 42 |
# Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"
|
| 43 |
|
| 44 |
# Shared Configuration
|
|
|
|
| 184 |
logger.info(f"[{function_name}] Using primary provider: {primary_provider}")
|
| 185 |
return retry_with_backoff(lambda: primary_func(*args, **kwargs))
|
| 186 |
except Exception as primary_error:
|
| 187 |
+
logger.warning(
|
| 188 |
+
f"[{function_name}] Primary provider {primary_provider} failed: {primary_error}"
|
| 189 |
+
)
|
| 190 |
|
| 191 |
# If fallback disabled, raise immediately
|
| 192 |
if not enable_fallback:
|
|
|
|
| 201 |
for fallback_provider in fallback_providers:
|
| 202 |
try:
|
| 203 |
fallback_func = _get_provider_function(function_name, fallback_provider)
|
| 204 |
+
logger.info(
|
| 205 |
+
f"[{function_name}] Trying fallback provider: {fallback_provider}"
|
| 206 |
+
)
|
| 207 |
return retry_with_backoff(lambda: fallback_func(*args, **kwargs))
|
| 208 |
except Exception as fallback_error:
|
| 209 |
errors[fallback_provider] = fallback_error
|
| 210 |
+
logger.warning(
|
| 211 |
+
f"[{function_name}] Fallback provider {fallback_provider} failed: {fallback_error}"
|
| 212 |
+
)
|
| 213 |
continue
|
| 214 |
|
| 215 |
# All providers failed
|