Spaces:
Running
Running
Commit
·
9fe493b
1
Parent(s):
9de209d
gpt 5 nano judge
Browse files
- agent/main.py +11 -9
- eval/rubric_eval.py +1 -1
- eval/task.py +4 -2
agent/main.py
CHANGED
|
@@ -16,6 +16,17 @@ from agent.core.agent_loop import submission_loop
|
|
| 16 |
from agent.core.session import OpType
|
| 17 |
from agent.core.tools import ToolRouter
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
@dataclass
|
| 21 |
class Operation:
|
|
@@ -98,15 +109,6 @@ async def main():
|
|
| 98 |
print("=" * 60)
|
| 99 |
print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
|
| 100 |
|
| 101 |
-
lmnr_api_key = os.environ.get("LMNR_API_KEY")
|
| 102 |
-
if lmnr_api_key:
|
| 103 |
-
try:
|
| 104 |
-
Laminar.initialize(project_api_key=lmnr_api_key)
|
| 105 |
-
litellm.callbacks = [LaminarLiteLLMCallback()]
|
| 106 |
-
print("✅ Laminar initialized")
|
| 107 |
-
except Exception as e:
|
| 108 |
-
print(f"⚠️ Failed to initialize Laminar: {e}")
|
| 109 |
-
|
| 110 |
# Create queues for communication
|
| 111 |
submission_queue = asyncio.Queue()
|
| 112 |
event_queue = asyncio.Queue()
|
|
|
|
| 16 |
from agent.core.session import OpType
|
| 17 |
from agent.core.tools import ToolRouter
|
| 18 |
|
| 19 |
+
litellm.drop_params = True
|
| 20 |
+
|
| 21 |
+
lmnr_api_key = os.environ.get("LMNR_API_KEY")
|
| 22 |
+
if lmnr_api_key:
|
| 23 |
+
try:
|
| 24 |
+
Laminar.initialize(project_api_key=lmnr_api_key)
|
| 25 |
+
litellm.callbacks = [LaminarLiteLLMCallback()]
|
| 26 |
+
print("✅ Laminar initialized")
|
| 27 |
+
except Exception as e:
|
| 28 |
+
print(f"⚠️ Failed to initialize Laminar: {e}")
|
| 29 |
+
|
| 30 |
|
| 31 |
@dataclass
|
| 32 |
class Operation:
|
|
|
|
| 109 |
print("=" * 60)
|
| 110 |
print("Type your messages below. Type 'exit', 'quit', or '/quit' to end.\n")
|
| 111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
# Create queues for communication
|
| 113 |
submission_queue = asyncio.Queue()
|
| 114 |
event_queue = asyncio.Queue()
|
eval/rubric_eval.py
CHANGED
|
@@ -97,7 +97,7 @@ def evaluate_with_rubrics(
|
|
| 97 |
question: str,
|
| 98 |
response: str,
|
| 99 |
rubrics: List[RubricData],
|
| 100 |
-
model: str = "gpt-
|
| 101 |
) -> RubricEvaluation:
|
| 102 |
"""
|
| 103 |
Evaluate response using RaR-Explicit method (weighted sum).
|
|
|
|
| 97 |
question: str,
|
| 98 |
response: str,
|
| 99 |
rubrics: List[RubricData],
|
| 100 |
+
model: str = "gpt-5-nano",
|
| 101 |
) -> RubricEvaluation:
|
| 102 |
"""
|
| 103 |
Evaluate response using RaR-Explicit method (weighted sum).
|
eval/task.py
CHANGED
|
@@ -14,6 +14,7 @@ from inspect_ai import Task, task
|
|
| 14 |
from inspect_ai.dataset import Sample, hf_dataset
|
| 15 |
from inspect_ai.scorer import Score, Target, mean, scorer
|
| 16 |
from inspect_ai.solver._task_state import TaskState
|
|
|
|
| 17 |
|
| 18 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 19 |
if str(PROJECT_ROOT) not in sys.path:
|
|
@@ -56,7 +57,7 @@ def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
|
|
| 56 |
|
| 57 |
|
| 58 |
@scorer(metrics=[mean()], name="rubric_scorer")
|
| 59 |
-
def rubric_scorer(judge_model: str = "gpt-
|
| 60 |
async def score(state: TaskState, target: Target) -> Score:
|
| 61 |
response_text = state.output.completion or state.output.message.text
|
| 62 |
question = state.metadata.get("question", state.input_text)
|
|
@@ -99,8 +100,9 @@ def hf_benchmark_with_rubrics(
|
|
| 99 |
},
|
| 100 |
dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
|
| 101 |
limit: int | None = None,
|
| 102 |
-
judge_model: str = "gpt-
|
| 103 |
) -> Task:
|
|
|
|
| 104 |
if "@" not in dataset_name:
|
| 105 |
raise ValueError("Dataset name must be in the format 'author/dataset@split'")
|
| 106 |
dataset_name, dataset_split = dataset_name.split("@")
|
|
|
|
| 14 |
from inspect_ai.dataset import Sample, hf_dataset
|
| 15 |
from inspect_ai.scorer import Score, Target, mean, scorer
|
| 16 |
from inspect_ai.solver._task_state import TaskState
|
| 17 |
+
import litellm
|
| 18 |
|
| 19 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 20 |
if str(PROJECT_ROOT) not in sys.path:
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
@scorer(metrics=[mean()], name="rubric_scorer")
|
| 60 |
+
def rubric_scorer(judge_model: str = "gpt-5-mini"):
|
| 61 |
async def score(state: TaskState, target: Target) -> Score:
|
| 62 |
response_text = state.output.completion or state.output.message.text
|
| 63 |
question = state.metadata.get("question", state.input_text)
|
|
|
|
| 100 |
},
|
| 101 |
dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
|
| 102 |
limit: int | None = None,
|
| 103 |
+
judge_model: str = "gpt-5-mini",
|
| 104 |
) -> Task:
|
| 105 |
+
litellm.drop_params = True
|
| 106 |
if "@" not in dataset_name:
|
| 107 |
raise ValueError("Dataset name must be in the format 'author/dataset@split'")
|
| 108 |
dataset_name, dataset_split = dataset_name.split("@")
|