Commit ·
8242dc3
1
Parent(s): b93cee3
fix: reward system
Browse files- inference.py +24 -5
inference.py
CHANGED
|
@@ -64,6 +64,10 @@ EPISODE_STEPS = 96
|
|
| 64 |
LAST_STEP_INDEX = EPISODE_STEPS - 1
|
| 65 |
SCORE_EPSILON = 1e-6
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
SYSPROMPT = """You are GridMind, an expert industrial energy management controller.
|
| 68 |
You control a building's HVAC, thermal storage, batch job scheduling, and load shedding.
|
| 69 |
Your goal is to minimize electricity costs while maintaining comfort and meeting grid demand-response signals.
|
|
@@ -113,6 +117,12 @@ def clamp_open_score(score: float) -> float:
|
|
| 113 |
return score
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
# ── Environment client ───────────────────────────────────────────────────────
|
| 117 |
|
| 118 |
|
|
@@ -372,9 +382,10 @@ def run_episode(
|
|
| 372 |
llm_reuse_remaining -= 1
|
| 373 |
|
| 374 |
obs = step_resp["observation"]
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
|
|
|
| 378 |
total_steps += 1
|
| 379 |
done = bool(step_resp.get("done", False))
|
| 380 |
|
|
@@ -418,7 +429,15 @@ def run_episode(
|
|
| 418 |
elapsed = time.time() - start_time
|
| 419 |
grade = env_client.grade()
|
| 420 |
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
print(
|
| 423 |
f"[END] success={'true' if success else 'false'} steps={total_steps} rewards={rewards_str}",
|
| 424 |
flush=True
|
|
@@ -430,7 +449,7 @@ def run_episode(
|
|
| 430 |
"total_reward": total_reward,
|
| 431 |
"total_steps": total_steps,
|
| 432 |
"elapsed_sec": elapsed,
|
| 433 |
-
"score":
|
| 434 |
"sub_scores": grade.get("sub_scores", {}),
|
| 435 |
"exploit_detected": grade.get("exploit_detected", False),
|
| 436 |
}
|
|
|
|
| 64 |
LAST_STEP_INDEX = EPISODE_STEPS - 1
|
| 65 |
SCORE_EPSILON = 1e-6
|
| 66 |
|
| 67 |
+
REW_MIN = -8.0
|
| 68 |
+
REW_MAX = 6.0
|
| 69 |
+
REW_RANGE = REW_MAX - REW_MIN
|
| 70 |
+
|
| 71 |
SYSPROMPT = """You are GridMind, an expert industrial energy management controller.
|
| 72 |
You control a building's HVAC, thermal storage, batch job scheduling, and load shedding.
|
| 73 |
Your goal is to minimize electricity costs while maintaining comfort and meeting grid demand-response signals.
|
|
|
|
| 117 |
return score
|
| 118 |
|
| 119 |
|
| 120 |
+
def normalize_reward(raw_reward: float) -> float:
    """Rescale a raw step reward linearly and clamp it into (0, 1).

    The reward is shifted by ``REW_MIN`` and divided by ``REW_RANGE`` (the
    fixed, theoretical reward span), then handed to ``clamp_open_score`` so
    the returned value stays strictly inside the unit interval.

    Args:
        raw_reward: Unscaled reward as reported by the environment.

    Returns:
        The rescaled reward after ``clamp_open_score`` has been applied.
    """
    offset_from_floor = raw_reward - REW_MIN
    return clamp_open_score(offset_from_floor / REW_RANGE)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
# ── Environment client ───────────────────────────────────────────────────────
|
| 127 |
|
| 128 |
|
|
|
|
| 382 |
llm_reuse_remaining -= 1
|
| 383 |
|
| 384 |
obs = step_resp["observation"]
|
| 385 |
+
raw_reward = float(step_resp["reward"])
|
| 386 |
+
reward = normalize_reward(raw_reward)
|
| 387 |
+
total_reward += raw_reward
|
| 388 |
+
step_rewards.append(raw_reward)
|
| 389 |
total_steps += 1
|
| 390 |
done = bool(step_resp.get("done", False))
|
| 391 |
|
|
|
|
| 429 |
elapsed = time.time() - start_time
|
| 430 |
grade = env_client.grade()
|
| 431 |
|
| 432 |
+
if step_rewards:
|
| 433 |
+
normalized_rewards = [normalize_reward(r) for r in step_rewards]
|
| 434 |
+
else:
|
| 435 |
+
normalized_rewards = []
|
| 436 |
+
|
| 437 |
+
episode_score = sum(normalized_rewards) / len(normalized_rewards) if normalized_rewards else SCORE_EPSILON
|
| 438 |
+
episode_score = clamp_open_score(episode_score)
|
| 439 |
+
|
| 440 |
+
rewards_str = ",".join(f"{r:.2f}" for r in normalized_rewards)
|
| 441 |
print(
|
| 442 |
f"[END] success={'true' if success else 'false'} steps={total_steps} rewards={rewards_str}",
|
| 443 |
flush=True
|
|
|
|
| 449 |
"total_reward": total_reward,
|
| 450 |
"total_steps": total_steps,
|
| 451 |
"elapsed_sec": elapsed,
|
| 452 |
+
"score": episode_score,
|
| 453 |
"sub_scores": grade.get("sub_scores", {}),
|
| 454 |
"exploit_detected": grade.get("exploit_detected", False),
|
| 455 |
}
|