akseljoonas HF Staff commited on
Commit
9fe493b
·
1 Parent(s): 9de209d

Switch rubric judge default model to gpt-5-nano

Browse files
Files changed (3) hide show
  1. agent/main.py +11 -9
  2. eval/rubric_eval.py +1 -1
  3. eval/task.py +4 -2
agent/main.py CHANGED
@@ -16,6 +16,17 @@ from agent.core.agent_loop import submission_loop
16
  from agent.core.session import OpType
17
  from agent.core.tools import ToolRouter
18
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  @dataclass
21
  class Operation:
@@ -98,15 +109,6 @@ async def main():
98
  print("=" * 60)
99
  print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
100
 
101
- lmnr_api_key = os.environ.get("LMNR_API_KEY")
102
- if lmnr_api_key:
103
- try:
104
- Laminar.initialize(project_api_key=lmnr_api_key)
105
- litellm.callbacks = [LaminarLiteLLMCallback()]
106
- print("✅ Laminar initialized")
107
- except Exception as e:
108
- print(f"⚠️ Failed to initialize Laminar: {e}")
109
-
110
  # Create queues for communication
111
  submission_queue = asyncio.Queue()
112
  event_queue = asyncio.Queue()
 
16
  from agent.core.session import OpType
17
  from agent.core.tools import ToolRouter
18
 
19
+ litellm.drop_params = True
20
+
21
+ lmnr_api_key = os.environ.get("LMNR_API_KEY")
22
+ if lmnr_api_key:
23
+ try:
24
+ Laminar.initialize(project_api_key=lmnr_api_key)
25
+ litellm.callbacks = [LaminarLiteLLMCallback()]
26
+ print("✅ Laminar initialized")
27
+ except Exception as e:
28
+ print(f"⚠️ Failed to initialize Laminar: {e}")
29
+
30
 
31
  @dataclass
32
  class Operation:
 
109
  print("=" * 60)
110
  print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
111
 
 
 
 
 
 
 
 
 
 
112
  # Create queues for communication
113
  submission_queue = asyncio.Queue()
114
  event_queue = asyncio.Queue()
eval/rubric_eval.py CHANGED
@@ -97,7 +97,7 @@ def evaluate_with_rubrics(
97
  question: str,
98
  response: str,
99
  rubrics: List[RubricData],
100
- model: str = "gpt-4o-mini",
101
  ) -> RubricEvaluation:
102
  """
103
  Evaluate response using RaR-Explicit method (weighted sum).
 
97
  question: str,
98
  response: str,
99
  rubrics: List[RubricData],
100
+ model: str = "gpt-5-nano",
101
  ) -> RubricEvaluation:
102
  """
103
  Evaluate response using RaR-Explicit method (weighted sum).
eval/task.py CHANGED
@@ -14,6 +14,7 @@ from inspect_ai import Task, task
14
  from inspect_ai.dataset import Sample, hf_dataset
15
  from inspect_ai.scorer import Score, Target, mean, scorer
16
  from inspect_ai.solver._task_state import TaskState
 
17
 
18
  PROJECT_ROOT = Path(__file__).resolve().parents[1]
19
  if str(PROJECT_ROOT) not in sys.path:
@@ -56,7 +57,7 @@ def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
56
 
57
 
58
  @scorer(metrics=[mean()], name="rubric_scorer")
59
- def rubric_scorer(judge_model: str = "gpt-4o-mini"):
60
  async def score(state: TaskState, target: Target) -> Score:
61
  response_text = state.output.completion or state.output.message.text
62
  question = state.metadata.get("question", state.input_text)
@@ -99,8 +100,9 @@ def hf_benchmark_with_rubrics(
99
  },
100
  dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
101
  limit: int | None = None,
102
- judge_model: str = "gpt-4o-mini",
103
  ) -> Task:
 
104
  if "@" not in dataset_name:
105
  raise ValueError("Dataset name must be in the format 'author/dataset@split'")
106
  dataset_name, dataset_split = dataset_name.split("@")
 
14
  from inspect_ai.dataset import Sample, hf_dataset
15
  from inspect_ai.scorer import Score, Target, mean, scorer
16
  from inspect_ai.solver._task_state import TaskState
17
+ import litellm
18
 
19
  PROJECT_ROOT = Path(__file__).resolve().parents[1]
20
  if str(PROJECT_ROOT) not in sys.path:
 
57
 
58
 
59
  @scorer(metrics=[mean()], name="rubric_scorer")
60
+ def rubric_scorer(judge_model: str = "gpt-5-mini"):
61
  async def score(state: TaskState, target: Target) -> Score:
62
  response_text = state.output.completion or state.output.message.text
63
  question = state.metadata.get("question", state.input_text)
 
100
  },
101
  dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
102
  limit: int | None = None,
103
+ judge_model: str = "gpt-5-mini",
104
  ) -> Task:
105
+ litellm.drop_params = True
106
  if "@" not in dataset_name:
107
  raise ValueError("Dataset name must be in the format 'author/dataset@split'")
108
  dataset_name, dataset_split = dataset_name.split("@")