mangubee committed on
Commit
44d1862
·
1 Parent(s): dc583a7

Update Groq model to openai/gpt-oss-120b

Browse files
output/gaia_results_20260104_222540.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "generated": "2026-01-04 22:25:40",
4
+ "timestamp": "20260104_222540",
5
+ "total_questions": 8,
6
+ "execution_time_seconds": 57.18,
7
+ "execution_time_formatted": "0m 57s",
8
+ "score_percent": 5.0,
9
+ "correct_count": 1,
10
+ "total_attempted": 8
11
+ },
12
+ "submission_status": "Submission Successful!\nUser: mangoobee\nOverall Score: 5.0% (1/8 correct)\nMessage: Score calculated successfully: 1/20 total questions answered correctly (8 valid tasks attempted). Score did not improve previous record, leaderboard not updated.",
13
+ "results": [
14
+ {
15
+ "task_id": "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
16
+ "question": "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
17
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
18
+ "correct": false
19
+ },
20
+ {
21
+ "task_id": "cca530fc-4052-43b2-b130-b30968d8aa44",
22
+ "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
23
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
24
+ "correct": false
25
+ },
26
+ {
27
+ "task_id": "2d83110e-a098-4ebb-9987-066c06fa42d0",
28
+ "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
29
+ "submitted_answer": "right",
30
+ "correct": true
31
+ },
32
+ {
33
+ "task_id": "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
34
+ "question": "Examine the video at https://www.youtube.com/watch?v=1htKBjuUWec.\n\nWhat does Teal'c say in response to the question \"Isn't that hot?\"",
35
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool vision failed: Exception: Vision analysis failed - Gemini and Claude both failed",
36
+ "correct": false
37
+ },
38
+ {
39
+ "task_id": "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
40
+ "question": "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?",
41
+ "submitted_answer": "Unable to answer",
42
+ "correct": false
43
+ },
44
+ {
45
+ "task_id": "6f37996b-2ac7-44b0-8e68-6d28256631b4",
46
+ "question": "Given this table defining * on the set S = {a, b, c, d, e}\n\n|*|a|b|c|d|e|\n|---|---|---|---|---|---|\n|a|a|b|c|b|d|\n|b|b|c|a|e|c|\n|c|c|a|b|b|a|\n|d|b|e|b|e|d|\n|e|d|b|a|d|c|\n\nprovide the subset of S involved in any possible counter-examples that prove * is not commutative. Provide your answer as a comma separated list of the elements in the set in alphabetical order.",
47
+ "submitted_answer": "ERROR: No evidence collected. Details: Tool parse_file failed: FileNotFoundError: Text file not found: path/to/operation_table.csv",
48
+ "correct": false
49
+ },
50
+ {
51
+ "task_id": "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
52
+ "question": "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.",
53
+ "submitted_answer": "Unable to answer",
54
+ "correct": false
55
+ },
56
+ {
57
+ "task_id": "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
58
+ "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
59
+ "submitted_answer": "Unable to answer",
60
+ "correct": false
61
+ }
62
+ ]
63
+ }
src/agent/llm_client.py CHANGED
@@ -38,7 +38,7 @@ HF_MODEL = "Qwen/Qwen2.5-72B-Instruct" # Excellent for function calling and rea
38
  # Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"
39
 
40
  # Groq Configuration
41
- GROQ_MODEL = "qwen/qwen3-32b" # Free tier: 60 req/min, fast inference
42
  # Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"
43
 
44
  # Shared Configuration
@@ -184,7 +184,9 @@ def _call_with_fallback(function_name: str, *args, **kwargs) -> Any:
184
  logger.info(f"[{function_name}] Using primary provider: {primary_provider}")
185
  return retry_with_backoff(lambda: primary_func(*args, **kwargs))
186
  except Exception as primary_error:
187
- logger.warning(f"[{function_name}] Primary provider {primary_provider} failed: {primary_error}")
 
 
188
 
189
  # If fallback disabled, raise immediately
190
  if not enable_fallback:
@@ -199,11 +201,15 @@ def _call_with_fallback(function_name: str, *args, **kwargs) -> Any:
199
  for fallback_provider in fallback_providers:
200
  try:
201
  fallback_func = _get_provider_function(function_name, fallback_provider)
202
- logger.info(f"[{function_name}] Trying fallback provider: {fallback_provider}")
 
 
203
  return retry_with_backoff(lambda: fallback_func(*args, **kwargs))
204
  except Exception as fallback_error:
205
  errors[fallback_provider] = fallback_error
206
- logger.warning(f"[{function_name}] Fallback provider {fallback_provider} failed: {fallback_error}")
 
 
207
  continue
208
 
209
  # All providers failed
 
38
  # Alternatives: "meta-llama/Llama-3.1-70B-Instruct", "NousResearch/Hermes-3-Llama-3.1-70B"
39
 
40
  # Groq Configuration
41
+ GROQ_MODEL = "openai/gpt-oss-120b"
42
  # Alternatives: "llama-3.1-8b-instant", "mixtral-8x7b-32768"
43
 
44
  # Shared Configuration
 
184
  logger.info(f"[{function_name}] Using primary provider: {primary_provider}")
185
  return retry_with_backoff(lambda: primary_func(*args, **kwargs))
186
  except Exception as primary_error:
187
+ logger.warning(
188
+ f"[{function_name}] Primary provider {primary_provider} failed: {primary_error}"
189
+ )
190
 
191
  # If fallback disabled, raise immediately
192
  if not enable_fallback:
 
201
  for fallback_provider in fallback_providers:
202
  try:
203
  fallback_func = _get_provider_function(function_name, fallback_provider)
204
+ logger.info(
205
+ f"[{function_name}] Trying fallback provider: {fallback_provider}"
206
+ )
207
  return retry_with_backoff(lambda: fallback_func(*args, **kwargs))
208
  except Exception as fallback_error:
209
  errors[fallback_provider] = fallback_error
210
+ logger.warning(
211
+ f"[{function_name}] Fallback provider {fallback_provider} failed: {fallback_error}"
212
+ )
213
  continue
214
 
215
  # All providers failed