Addy897 committed
Commit a2abcaa · 1 Parent(s): dae9663

inference update

Files changed (2)
  1. inference.py +49 -9
  2. support_ops_env/graders/common.py +12 -2
inference.py CHANGED
@@ -20,6 +20,8 @@ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.1"))
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "220"))
 SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.8"))
 
+# Minimum number of tasks required by the grader
+MIN_TASKS = 3
 
 SYSTEM_PROMPT = textwrap.dedent(
     """
@@ -99,17 +101,31 @@ def get_model_action(client: OpenAI, observation: Observation, step: int, reward
         return fallback, str(exc).replace("\n", " ")
 
 
-def ensure_known_task(task_name: str) -> str:
-    if task_name in list_task_ids():
-        return task_name
-    return list_task_ids()[0]
+def clamp_score(score: float) -> float:
+    """Clamp score to strictly open interval (0, 1) as required by the grader."""
+    _EPSILON = 1e-6
+    return min(max(float(score), _EPSILON), 1.0 - _EPSILON)
 
 
-def main() -> None:
-    task_name = ensure_known_task(TASK_NAME)
-    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
-    env = SupportOpsEnv(task_id=task_name)
+def select_tasks(requested: str) -> List[str]:
+    """
+    Return at least MIN_TASKS task IDs.
+    Always includes the requested task; pads with other available tasks if needed.
+    """
+    available = list_task_ids()
+    if not available:
+        raise RuntimeError("No tasks available in the environment.")
 
+    # Start with the requested task (validated), then fill up to MIN_TASKS
+    primary = requested if requested in available else available[0]
+    others = [t for t in available if t != primary]
+    task_list = [primary] + others
+    return task_list[:max(MIN_TASKS, 1)]
+
+
+def run_task(client: OpenAI, task_name: str) -> dict:
+    """Run a single task and return a result dict."""
+    env = SupportOpsEnv(task_id=task_name)
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
@@ -141,11 +157,35 @@ def main() -> None:
             if done:
                 break
 
-        score = min(max(score, 0.0), 1.0)
+        # Fix 1: clamp to strictly open (0, 1) — grader rejects 0.0 and 1.0
+        score = clamp_score(score)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
+    return {"task": task_name, "success": success, "steps": steps_taken, "score": score}
+
+
+def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+    # Fix 2: run at least MIN_TASKS tasks so the grader has enough scored entries
+    tasks = select_tasks(TASK_NAME)
+
+    all_results = []
+    for task_name in tasks:
+        result = run_task(client, task_name)
+        all_results.append(result)
+
+    # Summary across all tasks
+    total = len(all_results)
+    passed = sum(1 for r in all_results if r["success"])
+    avg_score = sum(r["score"] for r in all_results) / total if total else 0.0
+    print(
+        f"[SUMMARY] tasks={total} passed={passed} avg_score={avg_score:.3f}",
+        flush=True,
+    )
+
 
 if __name__ == "__main__":
     main()
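For context, the effect of the new clamp_score() helper can be sketched in isolation, assuming the same 1e-6 epsilon used in the diff (purely illustrative, not part of the commit):

    _EPSILON = 1e-6

    def clamp_score(score: float) -> float:
        # Keep scores strictly inside (0, 1); exact 0.0 and 1.0 are pushed off the bounds.
        return min(max(float(score), _EPSILON), 1.0 - _EPSILON)

    assert clamp_score(0.0) == 1e-6          # a zero run no longer reports exactly 0.0
    assert clamp_score(1.0) == 1.0 - 1e-6    # a perfect run no longer reports exactly 1.0
    assert clamp_score(0.85) == 0.85         # mid-range scores pass through unchanged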
support_ops_env/graders/common.py CHANGED
@@ -4,6 +4,16 @@ from typing import Dict, List
 
 from ..models import StateModel, TaskGrade, TaskSpec, TicketSpec
 
+# Scores must be strictly within (0, 1) — the submission grader rejects
+# exact 0.0 and 1.0. Because all non-context components are binary,
+# a perfect or zero run would otherwise produce exactly 0.0 or 1.0.
+_SCORE_MIN = 1e-6
+_SCORE_MAX = 1.0 - 1e-6
+
+
+def _clamp(score: float) -> float:
+    return min(max(score, _SCORE_MIN), _SCORE_MAX)
+
 
 def _ticket_component(
     ticket: TicketSpec,
@@ -36,7 +46,7 @@ def grade_single_ticket(
 ) -> TaskGrade:
     ticket = task.tickets[0]
     weighted = _ticket_component(ticket, state, weights)
-    score = round(sum(weighted.values()), 4)
+    score = _clamp(round(sum(weighted.values()), 4))
     notes = _notes_for_ticket(ticket, state)
     return TaskGrade(
         task_id=task.task_id,
@@ -79,7 +89,7 @@ def grade_queue_task(
     ranking_score = round((matches / len(task.gold_queue_order)) * weights.get("ranking", 0.0), 4)
 
     averaged["ranking"] = ranking_score
-    score = round(sum(averaged.values()), 4)
+    score = _clamp(round(sum(averaged.values()), 4))
     return TaskGrade(
         task_id=task.task_id,
         score=score,
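The grader-side clamp addresses the same edge case from the other direction: with binary components, a perfect run sums to exactly 1.0 before clamping. A rough illustration (the component names and weights below are made up, not taken from the repo):

    _SCORE_MIN = 1e-6
    _SCORE_MAX = 1.0 - 1e-6

    # Hypothetical weighted components for a perfect single-ticket run.
    weighted = {"status": 0.4, "assignee": 0.3, "reply": 0.3}

    raw = round(sum(weighted.values()), 4)           # 1.0, which the submission grader rejects
    score = min(max(raw, _SCORE_MIN), _SCORE_MAX)    # 0.999999, which it accepts
    print(raw, score)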