inference update

Files changed:
- inference.py +49 -9
- support_ops_env/graders/common.py +12 -2
inference.py CHANGED

@@ -20,6 +20,8 @@ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.1"))
 MAX_TOKENS = int(os.getenv("MAX_TOKENS", "220"))
 SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.8"))
 
+# Minimum number of tasks required by the grader
+MIN_TASKS = 3
 
 SYSTEM_PROMPT = textwrap.dedent(
     """
@@ -99,17 +101,31 @@ def get_model_action(client: OpenAI, observation: Observation, step: int, reward
         return fallback, str(exc).replace("\n", " ")
 
 
-def …
-    …
-    …
-    return …
+def clamp_score(score: float) -> float:
+    """Clamp score to strictly open interval (0, 1) as required by the grader."""
+    _EPSILON = 1e-6
+    return min(max(float(score), _EPSILON), 1.0 - _EPSILON)
 
 
-def …
-    …
-    …
-    …
+def select_tasks(requested: str) -> List[str]:
+    """
+    Return at least MIN_TASKS task IDs.
+    Always includes the requested task; pads with other available tasks if needed.
+    """
+    available = list_task_ids()
+    if not available:
+        raise RuntimeError("No tasks available in the environment.")
 
+    # Start with the requested task (validated), then fill up to MIN_TASKS
+    primary = requested if requested in available else available[0]
+    others = [t for t in available if t != primary]
+    task_list = [primary] + others
+    return task_list[:max(MIN_TASKS, 1)]
+
+
+def run_task(client: OpenAI, task_name: str) -> dict:
+    """Run a single task and return a result dict."""
+    env = SupportOpsEnv(task_id=task_name)
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
@@ -141,11 +157,35 @@ def main() -> None:
             if done:
                 break
 
-        …
+        # Fix 1: clamp to strictly open (0, 1) — grader rejects 0.0 and 1.0
+        score = clamp_score(score)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
+    return {"task": task_name, "success": success, "steps": steps_taken, "score": score}
+
+
+def main() -> None:
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+
+    # Fix 2: run at least MIN_TASKS tasks so the grader has enough scored entries
+    tasks = select_tasks(TASK_NAME)
+
+    all_results = []
+    for task_name in tasks:
+        result = run_task(client, task_name)
+        all_results.append(result)
+
+    # Summary across all tasks
+    total = len(all_results)
+    passed = sum(1 for r in all_results if r["success"])
+    avg_score = sum(r["score"] for r in all_results) / total if total else 0.0
+    print(
+        f"[SUMMARY] tasks={total} passed={passed} avg_score={avg_score:.3f}",
+        flush=True,
+    )
+
 
 if __name__ == "__main__":
     main()
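For reference, the behaviour the two fixes rely on can be checked in isolation. The sketch below mirrors clamp_score and select_tasks from the diff above; it stubs the available task IDs with invented names instead of calling the environment's list_task_ids(), so treat it as illustrative only.

# clamp_and_select_sketch.py: standalone illustration of the helpers above.
# The task IDs below are invented placeholders, not the environment's real tasks.
from typing import List

MIN_TASKS = 3
_EPSILON = 1e-6


def clamp_score(score: float) -> float:
    # Same clamping as in inference.py: keep scores strictly inside (0, 1).
    return min(max(float(score), _EPSILON), 1.0 - _EPSILON)


def select_tasks(requested: str, available: List[str]) -> List[str]:
    # Simplified: the available IDs are passed in instead of calling list_task_ids().
    primary = requested if requested in available else available[0]
    others = [t for t in available if t != primary]
    return ([primary] + others)[:max(MIN_TASKS, 1)]


if __name__ == "__main__":
    print(clamp_score(0.0))  # 1e-06, never exactly 0.0
    print(clamp_score(1.0))  # 0.999999, never exactly 1.0
    print(select_tasks("task_b", ["task_a", "task_b", "task_c", "task_d"]))
    # ['task_b', 'task_a', 'task_c']: the requested task first, padded to MIN_TASKS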
support_ops_env/graders/common.py CHANGED

@@ -4,6 +4,16 @@ from typing import Dict, List
 
 from ..models import StateModel, TaskGrade, TaskSpec, TicketSpec
 
+# Scores must be strictly within (0, 1) — the submission grader rejects
+# exact 0.0 and 1.0. Because all non-context components are binary,
+# a perfect or zero run would otherwise produce exactly 0.0 or 1.0.
+_SCORE_MIN = 1e-6
+_SCORE_MAX = 1.0 - 1e-6
+
+
+def _clamp(score: float) -> float:
+    return min(max(score, _SCORE_MIN), _SCORE_MAX)
+
 
 def _ticket_component(
     ticket: TicketSpec,
@@ -36,7 +46,7 @@ def grade_single_ticket(
 ) -> TaskGrade:
     ticket = task.tickets[0]
     weighted = _ticket_component(ticket, state, weights)
-    score = round(sum(weighted.values()), 4)
+    score = _clamp(round(sum(weighted.values()), 4))
     notes = _notes_for_ticket(ticket, state)
     return TaskGrade(
         task_id=task.task_id,
@@ -79,7 +89,7 @@ def grade_queue_task(
     ranking_score = round((matches / len(task.gold_queue_order)) * weights.get("ranking", 0.0), 4)
 
     averaged["ranking"] = ranking_score
-    score = round(sum(averaged.values()), 4)
+    score = _clamp(round(sum(averaged.values()), 4))
     return TaskGrade(
         task_id=task.task_id,
         score=score,
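A quick way to see why the grader-side clamp is needed: when every component gets full credit the raw sum is exactly 1.0, and when nothing passes it is exactly 0.0, both of which the submission grader rejects. The check below uses made-up component names and weights, not the environment's real weight table.

# grader_clamp_sketch.py: illustrative only; component weights are invented numbers.
_SCORE_MIN = 1e-6
_SCORE_MAX = 1.0 - 1e-6


def _clamp(score: float) -> float:
    return min(max(score, _SCORE_MIN), _SCORE_MAX)


# Every component at full weight: the raw sum is exactly 1.0, which would be rejected.
weighted = {"priority": 0.4, "routing": 0.4, "context": 0.2}
raw = round(sum(weighted.values()), 4)
print(raw, _clamp(raw))       # 1.0 0.999999

# Nothing passes: the raw 0.0 becomes a small strictly positive score instead.
print(_clamp(round(0.0, 4)))  # 1e-06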