Mahakii committed on
Commit
857fbed
·
1 Parent(s): 4070093

Implement full robustness hardening for grader pipeline

Browse files
openenv.yaml CHANGED
@@ -13,66 +13,17 @@ description: >
13
 
14
  tasks:
15
  - id: task1
16
- task_id: task1
17
- task: task1
18
- task_name: task1
19
  name: Baseline triage
20
  enabled: true
21
  grader: graders/task1_grader.py
22
- grader_cmd: python graders/task1_grader.py
23
- grader_command: python graders/task1_grader.py
24
- grader_path: graders/task1_grader.py
25
- grader_file: graders/task1_grader.py
26
- grader_legacy_path: graders/task1.py
27
- graders:
28
- - type: python
29
- name: default
30
- path: graders/task1_grader.py
31
- command: python graders/task1_grader.py
32
  - id: task2
33
- task_id: task2
34
- task: task2
35
- task_name: task2
36
  name: Moderate pressure triage
37
  enabled: true
38
  grader: graders/task2_grader.py
39
- grader_cmd: python graders/task2_grader.py
40
- grader_command: python graders/task2_grader.py
41
- grader_path: graders/task2_grader.py
42
- grader_file: graders/task2_grader.py
43
- grader_legacy_path: graders/task2.py
44
- graders:
45
- - type: python
46
- name: default
47
- path: graders/task2_grader.py
48
- command: python graders/task2_grader.py
49
  - id: task3
50
- task_id: task3
51
- task: task3
52
- task_name: task3
53
  name: High pressure triage
54
  enabled: true
55
  grader: graders/task3_grader.py
56
- grader_cmd: python graders/task3_grader.py
57
- grader_command: python graders/task3_grader.py
58
- grader_path: graders/task3_grader.py
59
- grader_file: graders/task3_grader.py
60
- grader_legacy_path: graders/task3.py
61
- graders:
62
- - type: python
63
- name: default
64
- path: graders/task3_grader.py
65
- command: python graders/task3_grader.py
66
-
67
- graders:
68
- task1: graders/task1_grader.py
69
- task2: graders/task2_grader.py
70
- task3: graders/task3_grader.py
71
-
72
- task_graders:
73
- task1: python graders/task1_grader.py
74
- task2: python graders/task2_grader.py
75
- task3: python graders/task3_grader.py
76
 
77
  tags:
78
  - openenv
 
13
 
14
  tasks:
15
  - id: task1
 
 
 
16
  name: Baseline triage
17
  enabled: true
18
  grader: graders/task1_grader.py
 
 
 
 
 
 
 
 
 
 
19
  - id: task2
 
 
 
20
  name: Moderate pressure triage
21
  enabled: true
22
  grader: graders/task2_grader.py
 
 
 
 
 
 
 
 
 
 
23
  - id: task3
 
 
 
24
  name: High pressure triage
25
  enabled: true
26
  grader: graders/task3_grader.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  tags:
29
  - openenv
triage_env/evaluation/run_llm_agent.py CHANGED
@@ -1,7 +1,20 @@
 
 
 
1
  from triage_env.server.triage_env_environment import TriageEnvironment
2
  from triage_env.agents.llm_agent import LLMAgent
3
 
4
 
 
 
 
 
 
 
 
 
 
 
5
  def mock_llm(system_prompt: str, user_prompt: str) -> str:
6
  # temporary placeholder until real API integration
7
  _ = system_prompt, user_prompt
@@ -13,17 +26,15 @@ def main():
13
  agent = LLMAgent(llm_callable=mock_llm)
14
 
15
  obs = env.reset()
16
- print("Initial Observation:")
17
- print(obs.model_dump())
18
 
19
  while not obs.done:
20
  action = agent.act(obs)
21
- print("\nAction:", action.model_dump())
22
  obs = env.step(action)
23
- print("Observation:", obs.model_dump())
24
 
25
- print("\nFinal State:")
26
- print(env.state.model_dump())
27
 
28
 
29
  if __name__ == "__main__":
 
1
+ import logging
2
+ import sys
3
+
4
  from triage_env.server.triage_env_environment import TriageEnvironment
5
  from triage_env.agents.llm_agent import LLMAgent
6
 
7
 
8
+ if not logging.getLogger().handlers:
9
+ logging.basicConfig(
10
+ level=logging.INFO,
11
+ stream=sys.stderr,
12
+ format="%(asctime)s %(levelname)s %(name)s: %(message)s",
13
+ )
14
+
15
+ LOGGER = logging.getLogger(__name__)
16
+
17
+
18
  def mock_llm(system_prompt: str, user_prompt: str) -> str:
19
  # temporary placeholder until real API integration
20
  _ = system_prompt, user_prompt
 
26
  agent = LLMAgent(llm_callable=mock_llm)
27
 
28
  obs = env.reset()
29
+ LOGGER.info("Initial Observation: %s", obs.model_dump())
 
30
 
31
  while not obs.done:
32
  action = agent.act(obs)
33
+ LOGGER.info("Action: %s", action.model_dump())
34
  obs = env.step(action)
35
+ LOGGER.info("Observation: %s", obs.model_dump())
36
 
37
+ LOGGER.info("Final State: %s", env.state.model_dump())
 
38
 
39
 
40
  if __name__ == "__main__":
triage_env/graders/common.py CHANGED
@@ -2,12 +2,16 @@ from __future__ import annotations
2
 
3
  import json
4
  import math
 
5
  import os
 
6
  import sys
7
  from dataclasses import replace
8
  from pathlib import Path
9
  from typing import Any
10
 
 
 
11
  # Ensure triage_env package can be imported when graders are executed via file path.
12
  REPO_ROOT = Path(__file__).resolve().parents[2]
13
  if str(REPO_ROOT) not in sys.path:
@@ -15,6 +19,7 @@ if str(REPO_ROOT) not in sys.path:
15
 
16
  from triage_env.agents.random_agent import RandomAgent
17
  from triage_env.agents.rl_agents import RLAgent
 
18
  from triage_env.agents.rule_based_agent import RuleBasedAgent
19
  from triage_env.agents.trained_q_agent import TrainedQAgent
20
  from triage_env.evaluation.evaluator import evaluate_agent
@@ -23,6 +28,68 @@ from triage_env.tasks import TASK_CONFIGS, TASK_TARGETS
23
 
24
  GRADER_VERSION = "v2.2" # Updated version
25
  SCORE_EPSILON = 0.001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def _resolve_existing_path(candidates: list[Path]) -> Path | None:
@@ -32,6 +99,106 @@ def _resolve_existing_path(candidates: list[Path]) -> Path | None:
32
  return None
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _build_evaluated_agent(task_name: str):
36
  package_root = Path(__file__).resolve().parents[1]
37
 
@@ -270,86 +437,38 @@ def _compute_final_score(components: dict[str, float]) -> float:
270
 
271
 
272
  def grade_task(task_name: str, episodes: int = 1) -> dict[str, Any]:
273
- try:
274
- if task_name not in TASK_CONFIGS:
275
- raise ValueError(f"Unsupported task: {task_name}")
276
-
277
- task_config = TASK_CONFIGS[task_name]
278
- agent, agent_meta = _build_evaluated_agent(task_name)
279
- summary, _ = evaluate_agent(
280
- env_class=TriageEnvironment,
281
- agent=agent,
282
- task=task_name,
283
- num_episodes=episodes,
284
- max_steps=task_config.max_steps,
285
- )
286
 
287
- components = _compute_components(task_name, summary)
288
- final_score = _compute_final_score(components)
289
-
290
- return {
291
- "grader_version": GRADER_VERSION,
292
- "task": task_name,
293
- "task_id": task_name,
294
- "episodes": episodes,
295
- "score": final_score,
296
- "reward": final_score,
297
- "score_range": [0.0, 1.0],
298
- "components": {
299
- "rollout_achievement": components["rollout_achievement"],
300
- "safety_errors": components["safety_errors"],
301
- "efficiency": components["efficiency"],
302
- "task_specific": components["task_specific"],
303
- },
304
- "signals": {
305
- "selected_agent": agent_meta.get("selected_agent"),
306
- "selected_checkpoint": agent_meta.get("checkpoint"),
307
- "selection_reason": agent_meta.get("selection_reason"),
308
- "survival_rate": components["survival_rate"],
309
- "critical_survival_rate": components["critical_survival_rate"],
310
- "success_rate": components["success_rate"],
311
- "reward_norm": components["reward_norm"],
312
- "invalid_rate": components["invalid_rate"],
313
- "stabilization_threshold": components["stabilization_threshold"],
314
- },
315
- "summary": summary,
316
- }
317
-
318
- except Exception as e:
319
- print(f"❌ grade_task EXCEPTION: {type(e).__name__}: {e}", file=sys.stderr)
320
- import traceback
321
- traceback.print_exc(file=sys.stderr)
322
-
323
- safe_score = 0.5
324
-
325
- # ✅ FIX 3: Restore full component/summary shape (audit: 'weakened schema')
326
- return {
327
- "grader_version": GRADER_VERSION,
328
- "task": task_name,
329
- "task_id": task_name,
330
- "episodes": episodes,
331
- "score": safe_score,
332
- "reward": safe_score,
333
- "score_range": [0.0, 1.0],
334
- "components": {
335
- "rollout_achievement": 0.5,
336
- "safety_errors": 0.5,
337
- "efficiency": 0.5,
338
- "task_specific": 0.5,
339
- },
340
- "signals": {
341
- "error_type": type(e).__name__,
342
- "error_message": str(e),
343
- "status": "fallback_safe_mode",
344
- "survival_rate": 0.5,
345
- "critical_survival_rate": 0.5,
346
- "success_rate": 0.5,
347
- "reward_norm": 0.5,
348
- "invalid_rate": 0.0,
349
- "stabilization_threshold": 0.5,
350
- },
351
- "summary": {"status": "fallback", "error": str(e)}
352
- }
353
 
354
 
355
  def print_grader_result(result: dict[str, Any]) -> None:
 
2
 
3
  import json
4
  import math
5
+ import logging
6
  import os
7
+ import multiprocessing as mp
8
  import sys
9
  from dataclasses import replace
10
  from pathlib import Path
11
  from typing import Any
12
 
13
+ from jsonschema import ValidationError, validate
14
+
15
  # Ensure triage_env package can be imported when graders are executed via file path.
16
  REPO_ROOT = Path(__file__).resolve().parents[2]
17
  if str(REPO_ROOT) not in sys.path:
 
19
 
20
  from triage_env.agents.random_agent import RandomAgent
21
  from triage_env.agents.rl_agents import RLAgent
22
+ from triage_env.agents.base_agent import BaseAgent
23
  from triage_env.agents.rule_based_agent import RuleBasedAgent
24
  from triage_env.agents.trained_q_agent import TrainedQAgent
25
  from triage_env.evaluation.evaluator import evaluate_agent
 
28
 
29
  GRADER_VERSION = "v2.2" # Updated version
30
  SCORE_EPSILON = 0.001
31
def _env_log_level(name: str, default: int = logging.INFO) -> int:
    """Resolve a logging level name from environment variable *name*.

    Returns *default* when the variable is unset, blank, or does not name a
    registered logging level, so a typo in the environment cannot abort the
    import of this module.
    """
    raw = os.getenv(name, "").strip().upper()
    if not raw:
        return default
    # getLevelName maps a known level name to its int; unknown names come
    # back as a "Level <name>" string, which is rejected here.
    resolved = logging.getLevelName(raw)
    return resolved if isinstance(resolved, int) else default


def _env_positive_float(name: str, default: float) -> float:
    """Read a positive, finite float from environment variable *name*.

    Malformed, non-finite, zero, or negative values are ignored (with a
    warning) and *default* is returned instead of raising at import time.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        value = float(raw)
    except ValueError:
        value = math.nan
    if math.isfinite(value) and value > 0:
        return value
    logging.getLogger(__name__).warning(
        "Ignoring invalid %s=%r; using default %.1f", name, raw, default
    )
    return default


# Only install a handler when the host application has not configured logging.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=_env_log_level("TRIAGE_LOG_LEVEL"),
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

LOGGER = logging.getLogger(__name__)

# Wall-clock budget, in seconds, for one grading run in the worker process.
GRADE_TIMEOUT_SECONDS = _env_positive_float("TRIAGE_GRADE_TIMEOUT_SECONDS", 300.0)
41
+
42
# Per-field JSON Schema constraints for a grader result. Every field listed
# here is also required, so the "required" list below is derived from it.
_RESULT_PROPERTIES = {
    "grader_version": {"type": "string"},
    "task": {"type": "string"},
    "task_id": {"type": "string"},
    "episodes": {"type": "integer", "minimum": 1},
    # The score must lie strictly inside (0, 1), hence the epsilon margins.
    "score": {
        "type": "number",
        "minimum": SCORE_EPSILON,
        "maximum": 1.0 - SCORE_EPSILON,
    },
    "reward": {"type": "number"},
    # Exactly two numbers: [low, high].
    "score_range": {
        "type": "array",
        "items": {"type": "number"},
        "minItems": 2,
        "maxItems": 2,
    },
    "components": {"type": "object"},
    "signals": {"type": "object"},
    "summary": {"type": "object"},
}

# Schema that every payload returned by the grader is validated against.
_RESULT_SCHEMA = {
    "type": "object",
    "required": list(_RESULT_PROPERTIES),
    "properties": _RESULT_PROPERTIES,
}
63
+
64
+
65
class SafeAgent(BaseAgent):
    """Adapter that keeps exceptions raised by a wrapped agent from escaping.

    Failures in ``reset`` are logged and ignored; failures in ``act`` are
    logged and replaced by the action from ``_safe_wait_action``.
    """

    def __init__(self, wrapped_agent: BaseAgent):
        self._wrapped_agent = wrapped_agent

    @property
    def name(self) -> str:
        """Name of the wrapped agent, or its class name when it has none."""
        inner = self._wrapped_agent
        return getattr(inner, "name", inner.__class__.__name__)

    def reset(self):
        """Reset the wrapped agent if it exposes a callable ``reset``."""
        try:
            inner_reset = getattr(self._wrapped_agent, "reset", None)
            if not callable(inner_reset):
                return
            inner_reset()
        except Exception:
            LOGGER.exception("SafeAgent reset failed; continuing")

    def act(self, observation):
        """Delegate to the wrapped agent, substituting a wait action on error."""
        try:
            chosen = self._wrapped_agent.act(observation)
        except Exception:
            LOGGER.exception("Wrapped agent crashed; returning safe wait action")
            chosen = _safe_wait_action()
        return chosen
87
+
88
+
89
def _safe_wait_action():
    """Build the "wait" action used when a wrapped agent fails to act."""
    # Imported lazily, inside the function, as in the original block.
    import triage_env.models as triage_models

    return triage_models.TriageAction(action_type="wait", patient_id=-1)
93
 
94
 
95
  def _resolve_existing_path(candidates: list[Path]) -> Path | None:
 
99
  return None
100
 
101
 
102
def _fallback_grade(task_name: str, episodes: int, reason: str) -> dict[str, Any]:
    """Build the neutral result returned when grading cannot complete.

    Every numeric field carries the same clipped mid-range score, and
    *reason* is recorded in both ``signals`` and ``summary``.
    """
    neutral = _clip_open_01(0.5)

    component_names = ("rollout_achievement", "safety_errors", "efficiency", "task_specific")
    summary_metrics = ("success_rate", "survival_rate", "critical_survival_rate", "avg_total_reward")

    summary: dict[str, Any] = {"task": task_name, "fallback_reason": reason}
    summary.update(dict.fromkeys(summary_metrics, neutral))

    return {
        "grader_version": GRADER_VERSION,
        "status": "error",
        "task": task_name,
        "task_id": task_name,
        "episodes": episodes,
        "score": neutral,
        "reward": neutral,
        "score_range": [0.0, 1.0],
        "components": dict.fromkeys(component_names, neutral),
        "signals": {"fallback": 1.0, "error": reason},
        "summary": summary,
    }
132
+
133
+
134
def _validate_result_schema(result: dict[str, Any]) -> dict[str, Any]:
    """Return *result* if it satisfies ``_RESULT_SCHEMA``, else a fallback grade.

    The fallback is built defensively: the payload that just failed
    validation may not be a dict, and its ``task`` / ``episodes`` fields may
    be the very values that are malformed, so neither is trusted here.
    """
    try:
        validate(instance=result, schema=_RESULT_SCHEMA)
    except ValidationError as exc:
        # exc.message is the one-line cause; str(exc) would also dump the
        # whole instance and schema into the log and the fallback reason.
        LOGGER.error("Grader result failed schema validation: %s", exc.message)

        payload = result if isinstance(result, dict) else {}

        task_name = payload.get("task")
        if not isinstance(task_name, str):
            task_name = "unknown"

        try:
            episodes = int(payload.get("episodes", 1))
        except (TypeError, ValueError, OverflowError):
            episodes = 1
        # Keep the fallback itself within the schema's "minimum": 1 bound.
        episodes = max(1, episodes)

        return _fallback_grade(task_name, episodes, f"schema:{exc.message}")
    return result
141
+
142
+
143
def _grade_task_impl(task_name: str, episodes: int) -> dict[str, Any]:
    """Evaluate the selected agent on *task_name* and assemble the result dict.

    Raises:
        ValueError: if *task_name* is not a key of ``TASK_CONFIGS``.
    """
    if task_name not in TASK_CONFIGS:
        raise ValueError(f"Unsupported task: {task_name}")

    task_config = TASK_CONFIGS[task_name]
    selected_agent, agent_meta = _build_evaluated_agent(task_name)
    # Wrap the agent so a crash inside it does not abort the evaluation.
    guarded_agent = SafeAgent(selected_agent)
    summary, _ = evaluate_agent(
        env_class=TriageEnvironment,
        agent=guarded_agent,
        task=task_name,
        num_episodes=episodes,
        max_steps=task_config.max_steps,
    )

    metrics = _compute_components(task_name, summary)
    score = _compute_final_score(metrics)

    component_keys = ("rollout_achievement", "safety_errors", "efficiency", "task_specific")
    metric_signal_keys = (
        "survival_rate",
        "critical_survival_rate",
        "success_rate",
        "reward_norm",
        "invalid_rate",
        "stabilization_threshold",
    )

    signals: dict[str, Any] = {
        "selected_agent": agent_meta.get("selected_agent"),
        "selected_checkpoint": agent_meta.get("checkpoint"),
        "selection_reason": agent_meta.get("selection_reason"),
    }
    signals.update((key, metrics[key]) for key in metric_signal_keys)

    return {
        "grader_version": GRADER_VERSION,
        "task": task_name,
        "task_id": task_name,
        "episodes": episodes,
        "score": score,
        "reward": score,
        "score_range": [0.0, 1.0],
        "components": {key: metrics[key] for key in component_keys},
        "signals": signals,
        "summary": summary,
    }
188
+
189
+
190
def _grade_task_worker(task_name: str, episodes: int, result_queue):
    """Child-process entry point: grade the task and put one result on the queue.

    Any exception is converted into a fallback grade whose reason is a JSON
    object holding the exception type, message, and formatted traceback.
    """
    import traceback

    try:
        result_queue.put(_validate_result_schema(_grade_task_impl(task_name, episodes)))
    except Exception as exc:  # pragma: no cover
        LOGGER.exception("Grader worker failed for task %s", task_name)
        err = {
            "type": type(exc).__name__,
            "message": str(exc),
            # Previously always "", which discarded the failure location.
            "traceback": traceback.format_exc(),
        }
        result_queue.put(_fallback_grade(task_name, episodes, json.dumps(err, ensure_ascii=True)))
200
+
201
+
202
  def _build_evaluated_agent(task_name: str):
203
  package_root = Path(__file__).resolve().parents[1]
204
 
 
437
 
438
 
439
  def grade_task(task_name: str, episodes: int = 1) -> dict[str, Any]:
440
+ ctx = mp.get_context("spawn")
441
+ result_queue = ctx.Queue(maxsize=1)
442
+ process = ctx.Process(target=_grade_task_worker, args=(task_name, episodes, result_queue), daemon=True)
 
 
 
 
 
 
 
 
 
 
443
 
444
+ try:
445
+ process.start()
446
+ process.join(timeout=GRADE_TIMEOUT_SECONDS)
447
+
448
+ if process.is_alive():
449
+ LOGGER.error("Grader timed out after %.1f seconds for task %s", GRADE_TIMEOUT_SECONDS, task_name)
450
+ process.terminate()
451
+ process.join(timeout=5)
452
+ return _fallback_grade(task_name, episodes, f"timeout:{GRADE_TIMEOUT_SECONDS}")
453
+
454
+ if result_queue.empty():
455
+ return _fallback_grade(task_name, episodes, "no-result-from-worker")
456
+
457
+ result = result_queue.get_nowait()
458
+ return _validate_result_schema(result)
459
+
460
+ except Exception as exc: # pragma: no cover
461
+ LOGGER.exception("Unexpected grader failure for task %s", task_name)
462
+ return _fallback_grade(task_name, episodes, f"{type(exc).__name__}:{exc}")
463
+ finally:
464
+ try:
465
+ result_queue.close()
466
+ except Exception:
467
+ pass
468
+ try:
469
+ result_queue.join_thread()
470
+ except Exception:
471
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
 
474
  def print_grader_result(result: dict[str, Any]) -> None:
triage_env/openenv.yaml DELETED
@@ -1,82 +0,0 @@
1
- spec_version: 1
2
- name: criticalops-triage-env
3
-
4
- type: space
5
- runtime: fastapi
6
- app: server.app:app
7
- port: 8000
8
-
9
- description: >
10
- A real-world OpenEnv environment simulating combined medical and military triage scenarios.
11
- Agents must prioritize patients, allocate limited resources, and make high-stakes decisions
12
- to maximize survival and health outcomes.
13
-
14
- tasks:
15
- - id: task1
16
- task_id: task1
17
- task: task1
18
- task_name: task1
19
- name: Baseline triage
20
- enabled: true
21
- grader: graders/task1_grader.py
22
- grader_cmd: python graders/task1_grader.py
23
- grader_command: python graders/task1_grader.py
24
- grader_path: graders/task1_grader.py
25
- grader_file: graders/task1_grader.py
26
- grader_legacy_path: graders/task1.py
27
- graders:
28
- - type: python
29
- name: default
30
- path: graders/task1_grader.py
31
- command: python graders/task1_grader.py
32
- - id: task2
33
- task_id: task2
34
- task: task2
35
- task_name: task2
36
- name: Moderate pressure triage
37
- enabled: true
38
- grader: graders/task2_grader.py
39
- grader_cmd: python graders/task2_grader.py
40
- grader_command: python graders/task2_grader.py
41
- grader_path: graders/task2_grader.py
42
- grader_file: graders/task2_grader.py
43
- grader_legacy_path: graders/task2.py
44
- graders:
45
- - type: python
46
- name: default
47
- path: graders/task2_grader.py
48
- command: python graders/task2_grader.py
49
- - id: task3
50
- task_id: task3
51
- task: task3
52
- task_name: task3
53
- name: High pressure triage
54
- enabled: true
55
- grader: graders/task3_grader.py
56
- grader_cmd: python graders/task3_grader.py
57
- grader_command: python graders/task3_grader.py
58
- grader_path: graders/task3_grader.py
59
- grader_file: graders/task3_grader.py
60
- grader_legacy_path: graders/task3.py
61
- graders:
62
- - type: python
63
- name: default
64
- path: graders/task3_grader.py
65
- command: python graders/task3_grader.py
66
-
67
- graders:
68
- task1: graders/task1_grader.py
69
- task2: graders/task2_grader.py
70
- task3: graders/task3_grader.py
71
-
72
- task_graders:
73
- task1: python graders/task1_grader.py
74
- task2: python graders/task2_grader.py
75
- task3: python graders/task3_grader.py
76
-
77
- tags:
78
- - openenv
79
- - triage
80
- - healthcare
81
- - military
82
- - decision-making
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
triage_env/scripts/run_llm_agent.py CHANGED
@@ -1,6 +1,6 @@
1
  import argparse
2
  import logging
3
- import os
4
 
5
  from triage_env.agents.llm_agent import LLMAgent
6
  from triage_env.config import get_llm_config
@@ -9,7 +9,14 @@ from triage_env.evaluation.evaluator import run_single_episode
9
  from triage_env.server.triage_env_environment import TriageEnvironment
10
 
11
 
12
- logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
 
13
 
14
 
15
  def main() -> None:
@@ -20,14 +27,13 @@ def main() -> None:
20
  args = parser.parse_args()
21
 
22
  if not llm_config.api_key:
23
- print("API_KEY is not set. LLMAgent may run in fallback mode.")
24
  if not llm_config.base_url:
25
- print("API_BASE_URL is not set. Requests may not route through the validator proxy.")
26
 
27
  env = TriageEnvironment(task=args.task)
28
  metrics = run_single_episode(env, LLMAgent())
29
- print("LLM agent episode metrics:")
30
- print(metrics)
31
 
32
 
33
  if __name__ == "__main__":
 
1
  import argparse
2
  import logging
3
+ import sys
4
 
5
  from triage_env.agents.llm_agent import LLMAgent
6
  from triage_env.config import get_llm_config
 
9
  from triage_env.server.triage_env_environment import TriageEnvironment
10
 
11
 
12
+ if not logging.getLogger().handlers:
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ stream=sys.stderr,
16
+ format="%(asctime)s %(levelname)s %(name)s: %(message)s",
17
+ )
18
+
19
+ LOGGER = logging.getLogger(__name__)
20
 
21
 
22
  def main() -> None:
 
27
  args = parser.parse_args()
28
 
29
  if not llm_config.api_key:
30
+ LOGGER.warning("API_KEY is not set. LLMAgent may run in fallback mode.")
31
  if not llm_config.base_url:
32
+ LOGGER.warning("API_BASE_URL is not set. Requests may not route through the validator proxy.")
33
 
34
  env = TriageEnvironment(task=args.task)
35
  metrics = run_single_episode(env, LLMAgent())
36
+ LOGGER.info("LLM agent episode metrics: %s", metrics)
 
37
 
38
 
39
  if __name__ == "__main__":
validate-submission.sh CHANGED
@@ -4,8 +4,7 @@ set -euo pipefail
4
  # Wrapper entrypoint that forwards to the Python validator.
5
  # Usage:
6
  # ./validate-submission.sh <ping_url> [repo_dir]
7
- # If repo_dir is omitted and ./triage_env/openenv.yaml exists,
8
- # it defaults to ./triage_env for convenience.
9
 
10
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11
  PYTHON_BIN="${PYTHON_BIN:-}"
@@ -23,8 +22,4 @@ if [ -z "$PYTHON_BIN" ]; then
23
  fi
24
  fi
25
 
26
- if [ "$#" -eq 1 ] && [ -f "$SCRIPT_DIR/triage_env/openenv.yaml" ]; then
27
- exec "$PYTHON_BIN" "$SCRIPT_DIR/validation.py" "$1" "$SCRIPT_DIR/triage_env"
28
- fi
29
-
30
  exec "$PYTHON_BIN" "$SCRIPT_DIR/validation.py" "$@"
 
4
  # Wrapper entrypoint that forwards to the Python validator.
5
  # Usage:
6
  # ./validate-submission.sh <ping_url> [repo_dir]
7
+ # If repo_dir is omitted, validation.py defaults to the repository root.
 
8
 
9
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
10
  PYTHON_BIN="${PYTHON_BIN:-}"
 
22
  fi
23
  fi
24
 
 
 
 
 
25
  exec "$PYTHON_BIN" "$SCRIPT_DIR/validation.py" "$@"
validation.py CHANGED
@@ -165,18 +165,10 @@ def find_docker_context(repo_dir: Path) -> tuple[Path, Path] | None:
165
 
166
 
167
  def find_openenv_dir(repo_dir: Path) -> Path | None:
168
- """Find the directory containing openenv.yaml by checking common locations."""
169
- # Check root first
170
  if (repo_dir / "openenv.yaml").exists():
171
  return repo_dir
172
-
173
- # Check common subdirectories
174
- for subdir in ["triage_env", "env", "environment", "server"]:
175
- candidate = repo_dir / subdir
176
- if (candidate / "openenv.yaml").exists():
177
- return candidate
178
-
179
- # If not found, return None
180
  return None
181
 
182
 
@@ -225,8 +217,8 @@ def check_step3_openenv_validate(repo_dir: Path) -> None:
225
  # Find the actual OpenEnv environment directory
226
  env_dir = find_openenv_dir(repo_dir)
227
  if env_dir is None:
228
- fail_msg("openenv.yaml not found in repo or common subdirectories (triage_env, env, environment, server)")
229
- hint(f"Make sure openenv.yaml is in {repo_dir} or a subdirectory like {repo_dir}/triage_env/")
230
  stop_at("Step 3")
231
 
232
  log(f" Found openenv.yaml in: {env_dir}")
 
165
 
166
 
167
  def find_openenv_dir(repo_dir: Path) -> Path | None:
168
+ """Find the directory containing the single source-of-truth openenv.yaml."""
 
169
  if (repo_dir / "openenv.yaml").exists():
170
  return repo_dir
171
+
 
 
 
 
 
 
 
172
  return None
173
 
174
 
 
217
  # Find the actual OpenEnv environment directory
218
  env_dir = find_openenv_dir(repo_dir)
219
  if env_dir is None:
220
+ fail_msg("openenv.yaml not found at repository root")
221
+ hint(f"Make sure openenv.yaml is in {repo_dir}")
222
  stop_at("Step 3")
223
 
224
  log(f" Found openenv.yaml in: {env_dir}")