ShreeshantXD committed on
Commit
8242dc3
·
1 Parent(s): b93cee3

fix: reward system

Browse files
Files changed (1) hide show
  1. inference.py +24 -5
inference.py CHANGED
@@ -64,6 +64,10 @@ EPISODE_STEPS = 96
64
  LAST_STEP_INDEX = EPISODE_STEPS - 1
65
  SCORE_EPSILON = 1e-6
66
 
 
 
 
 
67
  SYSPROMPT = """You are GridMind, an expert industrial energy management controller.
68
  You control a building's HVAC, thermal storage, batch job scheduling, and load shedding.
69
  Your goal is to minimize electricity costs while maintaining comfort and meeting grid demand-response signals.
@@ -113,6 +117,12 @@ def clamp_open_score(score: float) -> float:
113
  return score
114
 
115
 
 
 
 
 
 
 
116
  # ── Environment client ───────────────────────────────────────────────────────
117
 
118
 
@@ -372,9 +382,10 @@ def run_episode(
372
  llm_reuse_remaining -= 1
373
 
374
  obs = step_resp["observation"]
375
- reward = float(step_resp["reward"])
376
- total_reward += reward
377
- step_rewards.append(reward)
 
378
  total_steps += 1
379
  done = bool(step_resp.get("done", False))
380
 
@@ -418,7 +429,15 @@ def run_episode(
418
  elapsed = time.time() - start_time
419
  grade = env_client.grade()
420
 
421
- rewards_str = ",".join(f"{r:.2f}" for r in step_rewards)
 
 
 
 
 
 
 
 
422
  print(
423
  f"[END] success={'true' if success else 'false'} steps={total_steps} rewards={rewards_str}",
424
  flush=True
@@ -430,7 +449,7 @@ def run_episode(
430
  "total_reward": total_reward,
431
  "total_steps": total_steps,
432
  "elapsed_sec": elapsed,
433
- "score": clamp_open_score(float(grade.get("score", SCORE_EPSILON))),
434
  "sub_scores": grade.get("sub_scores", {}),
435
  "exploit_detected": grade.get("exploit_detected", False),
436
  }
 
64
  LAST_STEP_INDEX = EPISODE_STEPS - 1
65
  SCORE_EPSILON = 1e-6
66
 
67
+ REW_MIN = -8.0
68
+ REW_MAX = 6.0
69
+ REW_RANGE = REW_MAX - REW_MIN
70
+
71
  SYSPROMPT = """You are GridMind, an expert industrial energy management controller.
72
  You control a building's HVAC, thermal storage, batch job scheduling, and load shedding.
73
  Your goal is to minimize electricity costs while maintaining comfort and meeting grid demand-response signals.
 
117
  return score
118
 
119
 
120
+ def normalize_reward(raw_reward: float) -> float:
121
+ """Normalize raw reward to (0, 1) using fixed theoretical range."""
122
+ normalized = (raw_reward - REW_MIN) / REW_RANGE
123
+ return clamp_open_score(normalized)
124
+
125
+
126
  # ── Environment client ───────────────────────────────────────────────────────
127
 
128
 
 
382
  llm_reuse_remaining -= 1
383
 
384
  obs = step_resp["observation"]
385
+ raw_reward = float(step_resp["reward"])
386
+ reward = normalize_reward(raw_reward)
387
+ total_reward += raw_reward
388
+ step_rewards.append(raw_reward)
389
  total_steps += 1
390
  done = bool(step_resp.get("done", False))
391
 
 
429
  elapsed = time.time() - start_time
430
  grade = env_client.grade()
431
 
432
+ if step_rewards:
433
+ normalized_rewards = [normalize_reward(r) for r in step_rewards]
434
+ else:
435
+ normalized_rewards = []
436
+
437
+ episode_score = sum(normalized_rewards) / len(normalized_rewards) if normalized_rewards else SCORE_EPSILON
438
+ episode_score = clamp_open_score(episode_score)
439
+
440
+ rewards_str = ",".join(f"{r:.2f}" for r in normalized_rewards)
441
  print(
442
  f"[END] success={'true' if success else 'false'} steps={total_steps} rewards={rewards_str}",
443
  flush=True
 
449
  "total_reward": total_reward,
450
  "total_steps": total_steps,
451
  "elapsed_sec": elapsed,
452
+ "score": episode_score,
453
  "sub_scores": grade.get("sub_scores", {}),
454
  "exploit_detected": grade.get("exploit_detected", False),
455
  }