Spaces:
Sleeping
Sleeping
Grader Fix (#2)
Browse files- inference.py +48 -46
inference.py
CHANGED
|
@@ -1,19 +1,10 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
-
# ───────────────────────────────────────────────────────────────────────────
|
| 3 |
-
# inference.py — OpenEnv Evaluation Script (Hackathon Submission)
|
| 4 |
-
#
|
| 5 |
-
# STDOUT FORMAT (strict regex compliance):
|
| 6 |
-
# [START] task=<task_name> env=<benchmark> model=<model_name>
|
| 7 |
-
# [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 8 |
-
# [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
|
| 9 |
-
#
|
| 10 |
-
# ALL debug output goes to stderr. NO JSON on stdout. NO extra whitespace.
|
| 11 |
-
# ───────────────────────────────────────────────────────────────────────────
|
| 12 |
|
| 13 |
import os
|
| 14 |
import sys
|
| 15 |
import json
|
| 16 |
import re
|
|
|
|
| 17 |
import traceback
|
| 18 |
import time
|
| 19 |
from typing import List
|
|
@@ -36,10 +27,6 @@ except ImportError:
|
|
| 36 |
|
| 37 |
from models import AuditorAction
|
| 38 |
|
| 39 |
-
# ── Debug logger: ONLY to stderr ─────────────────────────────────────────────
|
| 40 |
-
def _dbg(msg: str) -> None:
|
| 41 |
-
print(msg, file=sys.stderr, flush=True)
|
| 42 |
-
|
| 43 |
class LLMResponse(BaseModel):
|
| 44 |
reasoning: str
|
| 45 |
decisions: List[int]
|
|
@@ -52,9 +39,7 @@ if not HF_TOKEN:
|
|
| 52 |
raise ValueError("CRITICAL: HF_TOKEN environment variable is missing.")
|
| 53 |
|
| 54 |
TASK_ID: str = os.getenv("TASK_ID", "anomaly_detection_hard")
|
| 55 |
-
ENV_NAME: str = "fin_auditor"
|
| 56 |
|
| 57 |
-
# FIX: Sync the inference max_steps default with the active task
|
| 58 |
if "easy" in TASK_ID.lower():
|
| 59 |
_DEFAULT_MAX = 5
|
| 60 |
elif "medium" in TASK_ID.lower():
|
|
@@ -86,6 +71,9 @@ Example:
|
|
| 86 |
{"reasoning": "Trade 1 has high risk. Trade 2 is safe.", "decisions": [1, 0, 1]}
|
| 87 |
"""
|
| 88 |
|
|
|
|
|
|
|
|
|
|
| 89 |
def _build_user_prompt(step: int, features: list[list[float]]) -> str:
|
| 90 |
lines = [
|
| 91 |
f"Step {step}: You have {len(features)} flagged trades to audit.",
|
|
@@ -165,10 +153,8 @@ def _call_llm(step: int, features: list[list[float]]) -> list[int]:
|
|
| 165 |
content = response.choices[0].message.content or ""
|
| 166 |
return _parse_llm_decisions(content, len(features))
|
| 167 |
except Exception as e:
|
| 168 |
-
_dbg(f"[LLM RETRY {attempt+1}/{max_retries}] {e}")
|
| 169 |
time.sleep(1)
|
| 170 |
|
| 171 |
-
_dbg("[LLM] All retries exhausted, using risk_score fallback")
|
| 172 |
fallback_decisions = []
|
| 173 |
for row in features:
|
| 174 |
if len(row) >= 4:
|
|
@@ -179,59 +165,75 @@ def _call_llm(step: int, features: list[list[float]]) -> list[int]:
|
|
| 179 |
return fallback_decisions
|
| 180 |
|
| 181 |
def run_inference() -> None:
|
|
|
|
|
|
|
| 182 |
steps_completed: int = 0
|
| 183 |
-
|
| 184 |
-
success: bool = False
|
| 185 |
-
error_msg: str | None = None
|
| 186 |
-
|
| 187 |
-
# ── [START] — always emitted ──────────────────────────────────────────
|
| 188 |
-
print(f"[START] task={TASK_ID} env={ENV_NAME} model={MODEL_NAME}", flush=True)
|
| 189 |
|
| 190 |
try:
|
| 191 |
env = FinAuditorEnvironment()
|
| 192 |
obs = env.reset()
|
|
|
|
| 193 |
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
for step_num in range(1, MAX_STEPS + 1):
|
| 197 |
-
step_reward = 0.0
|
| 198 |
features = obs.features
|
| 199 |
|
| 200 |
if not features:
|
| 201 |
action = AuditorAction(decisions=[])
|
| 202 |
-
|
| 203 |
else:
|
| 204 |
decisions = _call_llm(step_num, features)
|
| 205 |
action = AuditorAction(decisions=decisions)
|
| 206 |
|
| 207 |
obs = env.step(action)
|
| 208 |
-
|
| 209 |
-
|
|
|
|
| 210 |
steps_completed = step_num
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
if obs.done:
|
| 220 |
break
|
| 221 |
|
| 222 |
-
success = True
|
| 223 |
-
|
| 224 |
except KeyboardInterrupt:
|
| 225 |
-
|
| 226 |
-
_dbg("[DBG] Interrupted by user")
|
| 227 |
except Exception as exc:
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
if __name__ == "__main__":
|
| 237 |
run_inference()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import os
|
| 4 |
import sys
|
| 5 |
import json
|
| 6 |
import re
|
| 7 |
+
import datetime
|
| 8 |
import traceback
|
| 9 |
import time
|
| 10 |
from typing import List
|
|
|
|
| 27 |
|
| 28 |
from models import AuditorAction
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
class LLMResponse(BaseModel):
|
| 31 |
reasoning: str
|
| 32 |
decisions: List[int]
|
|
|
|
| 39 |
raise ValueError("CRITICAL: HF_TOKEN environment variable is missing.")
|
| 40 |
|
| 41 |
TASK_ID: str = os.getenv("TASK_ID", "anomaly_detection_hard")
|
|
|
|
| 42 |
|
|
|
|
| 43 |
if "easy" in TASK_ID.lower():
|
| 44 |
_DEFAULT_MAX = 5
|
| 45 |
elif "medium" in TASK_ID.lower():
|
|
|
|
| 71 |
{"reasoning": "Trade 1 has high risk. Trade 2 is safe.", "decisions": [1, 0, 1]}
|
| 72 |
"""
|
| 73 |
|
| 74 |
+
def _ts() -> str:
|
| 75 |
+
return datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
|
| 76 |
+
|
| 77 |
def _build_user_prompt(step: int, features: list[list[float]]) -> str:
|
| 78 |
lines = [
|
| 79 |
f"Step {step}: You have {len(features)} flagged trades to audit.",
|
|
|
|
| 153 |
content = response.choices[0].message.content or ""
|
| 154 |
return _parse_llm_decisions(content, len(features))
|
| 155 |
except Exception as e:
|
|
|
|
| 156 |
time.sleep(1)
|
| 157 |
|
|
|
|
| 158 |
fallback_decisions = []
|
| 159 |
for row in features:
|
| 160 |
if len(row) >= 4:
|
|
|
|
| 165 |
return fallback_decisions
|
| 166 |
|
| 167 |
def run_inference() -> None:
|
| 168 |
+
episode_id: str = "unknown"
|
| 169 |
+
total_reward: float = 0.0
|
| 170 |
steps_completed: int = 0
|
| 171 |
+
status: str = "SUCCESS"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
try:
|
| 174 |
env = FinAuditorEnvironment()
|
| 175 |
obs = env.reset()
|
| 176 |
+
episode_id = getattr(env.state, 'episode_id', "test_run")
|
| 177 |
|
| 178 |
+
start_payload = {
|
| 179 |
+
"episode_id": episode_id,
|
| 180 |
+
"model": MODEL_NAME,
|
| 181 |
+
"difficulty": TASK_ID,
|
| 182 |
+
"max_steps": MAX_STEPS
|
| 183 |
+
}
|
| 184 |
+
print(f"[START] {json.dumps(start_payload)}", flush=True)
|
| 185 |
|
| 186 |
for step_num in range(1, MAX_STEPS + 1):
|
| 187 |
+
step_reward = 0.0
|
| 188 |
features = obs.features
|
| 189 |
|
| 190 |
if not features:
|
| 191 |
action = AuditorAction(decisions=[])
|
| 192 |
+
_last_reasoning = "Empty matrix."
|
| 193 |
else:
|
| 194 |
decisions = _call_llm(step_num, features)
|
| 195 |
action = AuditorAction(decisions=decisions)
|
| 196 |
|
| 197 |
obs = env.step(action)
|
| 198 |
+
# Apply safe floor fallback in inference just in case
|
| 199 |
+
step_reward = obs.reward if obs.reward is not None else 0.001
|
| 200 |
+
total_reward += step_reward
|
| 201 |
steps_completed = step_num
|
| 202 |
|
| 203 |
+
# FIX: Used round(..., 4) to prevent collapsing small fractions into 0.00
|
| 204 |
+
step_payload = {
|
| 205 |
+
"step": step_num,
|
| 206 |
+
"anomalies": len(features),
|
| 207 |
+
"reward": round(float(step_reward), 4),
|
| 208 |
+
"cumulative_reward": round(float(total_reward), 4),
|
| 209 |
+
"done": bool(obs.done),
|
| 210 |
+
"error": None,
|
| 211 |
+
"reasoning": _last_reasoning[:120].replace('\n', ' ') + "...",
|
| 212 |
+
"tp": getattr(env.state, 'last_tp', 0),
|
| 213 |
+
"tn": getattr(env.state, 'last_tn', 0),
|
| 214 |
+
"fp": getattr(env.state, 'last_fp', 0),
|
| 215 |
+
"fn": getattr(env.state, 'last_fn', 0)
|
| 216 |
+
}
|
| 217 |
+
print(f"[STEP] {json.dumps(step_payload)}", flush=True)
|
| 218 |
|
| 219 |
if obs.done:
|
| 220 |
break
|
| 221 |
|
|
|
|
|
|
|
| 222 |
except KeyboardInterrupt:
|
| 223 |
+
status = "INTERRUPTED"
|
|
|
|
| 224 |
except Exception as exc:
|
| 225 |
+
status = "ERROR"
|
| 226 |
+
traceback.print_exc(file=sys.stderr)
|
| 227 |
+
|
| 228 |
+
avg_reward = total_reward / max(steps_completed, 1)
|
| 229 |
+
|
| 230 |
+
# FIX: Used round(..., 4) for terminal payload outputs as well
|
| 231 |
+
end_payload = {
|
| 232 |
+
"total_reward": round(float(total_reward), 4),
|
| 233 |
+
"avg_reward": round(float(avg_reward), 4),
|
| 234 |
+
"status": status
|
| 235 |
+
}
|
| 236 |
+
print(f"[END] {json.dumps(end_payload)}", flush=True)
|
| 237 |
|
| 238 |
if __name__ == "__main__":
|
| 239 |
run_inference()
|