feat: implement core environment simulation logic and update baseline scores
Browse files
- baseline_scores.json +19 -19
- env/environment.go +18 -6
- env/rewards.go +28 -36
baseline_scores.json
CHANGED
|
@@ -7,50 +7,50 @@
|
|
| 7 |
"llm_every": 4,
|
| 8 |
"max_steps": null,
|
| 9 |
"task_averages": {
|
| 10 |
-
"1": 0.
|
| 11 |
-
"2": 0.
|
| 12 |
-
"3": 0.
|
| 13 |
},
|
| 14 |
-
"overall_average": 0.
|
| 15 |
"all_results": [
|
| 16 |
{
|
| 17 |
"task_id": 1,
|
| 18 |
"seed": 1100,
|
| 19 |
-
"total_reward":
|
| 20 |
"total_steps": 96,
|
| 21 |
-
"elapsed_sec":
|
| 22 |
-
"score": 0.
|
| 23 |
"sub_scores": {
|
| 24 |
-
"cost": 0.
|
| 25 |
},
|
| 26 |
"exploit_detected": false
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"task_id": 2,
|
| 30 |
"seed": 1200,
|
| 31 |
-
"total_reward":
|
| 32 |
"total_steps": 96,
|
| 33 |
-
"elapsed_sec":
|
| 34 |
-
"score": 0.
|
| 35 |
"sub_scores": {
|
| 36 |
-
"cost": 0.
|
| 37 |
-
"temperature": 0.
|
| 38 |
},
|
| 39 |
"exploit_detected": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": 3,
|
| 43 |
"seed": 1300,
|
| 44 |
-
"total_reward":
|
| 45 |
"total_steps": 96,
|
| 46 |
-
"elapsed_sec":
|
| 47 |
-
"score": 0.
|
| 48 |
"sub_scores": {
|
| 49 |
"batch_deadline": 1,
|
| 50 |
-
"carbon": 0.
|
| 51 |
-
"cost": 0.
|
| 52 |
"grid_response": 0.21428571428571427,
|
| 53 |
-
"temperature": 0.
|
| 54 |
},
|
| 55 |
"exploit_detected": false
|
| 56 |
}
|
|
|
|
| 7 |
"llm_every": 4,
|
| 8 |
"max_steps": null,
|
| 9 |
"task_averages": {
|
| 10 |
+
"1": 0.7063,
|
| 11 |
+
"2": 0.6333,
|
| 12 |
+
"3": 0.5966
|
| 13 |
},
|
| 14 |
+
"overall_average": 0.6454,
|
| 15 |
"all_results": [
|
| 16 |
{
|
| 17 |
"task_id": 1,
|
| 18 |
"seed": 1100,
|
| 19 |
+
"total_reward": 251.40178983938813,
|
| 20 |
"total_steps": 96,
|
| 21 |
+
"elapsed_sec": 0.14183712005615234,
|
| 22 |
+
"score": 0.7063,
|
| 23 |
"sub_scores": {
|
| 24 |
+
"cost": 0.7063441549865395
|
| 25 |
},
|
| 26 |
"exploit_detected": false
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"task_id": 2,
|
| 30 |
"seed": 1200,
|
| 31 |
+
"total_reward": 246.40262234598185,
|
| 32 |
"total_steps": 96,
|
| 33 |
+
"elapsed_sec": 0.11959218978881836,
|
| 34 |
+
"score": 0.6333,
|
| 35 |
"sub_scores": {
|
| 36 |
+
"cost": 0.7014155357169216,
|
| 37 |
+
"temperature": 0.53125
|
| 38 |
},
|
| 39 |
"exploit_detected": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": 3,
|
| 43 |
"seed": 1300,
|
| 44 |
+
"total_reward": 255.60231973463087,
|
| 45 |
"total_steps": 96,
|
| 46 |
+
"elapsed_sec": 0.12531447410583496,
|
| 47 |
+
"score": 0.5966,
|
| 48 |
"sub_scores": {
|
| 49 |
"batch_deadline": 1,
|
| 50 |
+
"carbon": 0.6574530318382599,
|
| 51 |
+
"cost": 0.670084941969173,
|
| 52 |
"grid_response": 0.21428571428571427,
|
| 53 |
+
"temperature": 0.5729166666666666
|
| 54 |
},
|
| 55 |
"exploit_detected": false
|
| 56 |
}
|
env/environment.go
CHANGED
|
@@ -418,18 +418,30 @@ func (e *Environment) stepBuilding(b *BuildingState, act ActionModel, idx int) S
|
|
| 418 |
batchCompleted, batchMissed := e.updateBatchJobs(b, act.BatchJobSlot, s)
|
| 419 |
|
| 420 |
// ----- Thermal dynamics -----
|
| 421 |
-
//
|
| 422 |
-
//
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
infiltration := (b.OutdoorTemperature - b.IndoorTemperature) * 0.03
|
|
|
|
|
|
|
| 425 |
storageEffect := 0.0
|
| 426 |
-
if act.ThermalChargeRate < 0 {
|
| 427 |
-
storageEffect = math.Abs(act.ThermalChargeRate) * 0.
|
| 428 |
}
|
|
|
|
|
|
|
| 429 |
processHeat := b.ProcessDemand * 0.002 // kW→°C rough factor
|
| 430 |
-
|
|
|
|
| 431 |
b.IndoorTemperature += deltaT
|
| 432 |
|
|
|
|
|
|
|
|
|
|
| 433 |
// ----- Energy & cost accounting -----
|
| 434 |
batchPowerDraw := e.batchRunningPower(b)
|
| 435 |
totalKW := hvacPower + math.Max(0, chargeKW) + batchPowerDraw - shedKW
|
|
|
|
| 418 |
batchCompleted, batchMissed := e.updateBatchJobs(b, act.BatchJobSlot, s)
|
| 419 |
|
| 420 |
// ----- Thermal dynamics -----
|
| 421 |
+
// First-order setpoint-driven model:
|
| 422 |
+
// HVAC drives temperature toward setpoint; higher power = stronger effect.
|
| 423 |
+
// At HVACPowerLevel=1.0, HVAC strongly pushes toward setpoint.
|
| 424 |
+
// At HVACPowerLevel=0.0, HVAC is off — temp drifts with environment.
|
| 425 |
+
hvacEffect := (b.SetpointTemperature - b.IndoorTemperature) * act.HVACPowerLevel * 0.15
|
| 426 |
+
|
| 427 |
+
// Outdoor infiltration: building slowly equilibrates with outside
|
| 428 |
infiltration := (b.OutdoorTemperature - b.IndoorTemperature) * 0.03
|
| 429 |
+
|
| 430 |
+
// Thermal storage discharge provides supplemental conditioning toward setpoint
|
| 431 |
storageEffect := 0.0
|
| 432 |
+
if act.ThermalChargeRate < 0 {
|
| 433 |
+
storageEffect = (b.SetpointTemperature - b.IndoorTemperature) * math.Abs(act.ThermalChargeRate) * 0.05
|
| 434 |
}
|
| 435 |
+
|
| 436 |
+
// Process equipment waste heat (always warms the building)
|
| 437 |
processHeat := b.ProcessDemand * 0.002 // kW→°C rough factor
|
| 438 |
+
|
| 439 |
+
deltaT := hvacEffect + infiltration + storageEffect + processHeat
|
| 440 |
b.IndoorTemperature += deltaT
|
| 441 |
|
| 442 |
+
// Clamp to physically reasonable indoor range
|
| 443 |
+
b.IndoorTemperature = math.Max(10.0, math.Min(40.0, b.IndoorTemperature))
|
| 444 |
+
|
| 445 |
// ----- Energy & cost accounting -----
|
| 446 |
batchPowerDraw := e.batchRunningPower(b)
|
| 447 |
totalKW := hvacPower + math.Max(0, chargeKW) + batchPowerDraw - shedKW
|
env/rewards.go
CHANGED
|
@@ -25,8 +25,8 @@ type ComputeRewardInput struct {
|
|
| 25 |
}
|
| 26 |
|
| 27 |
// ComputeReward returns a dense RewardComponents struct from the current step inputs.
|
| 28 |
-
//
|
| 29 |
-
//
|
| 30 |
func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
| 31 |
rc := RewardComponents{}
|
| 32 |
|
|
@@ -36,38 +36,32 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
|
| 36 |
rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0
|
| 37 |
|
| 38 |
// ── 2. Temperature Constraint ────────────────────────────────────────────
|
| 39 |
-
//
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)
|
| 43 |
-
}
|
| 44 |
|
| 45 |
// ── 3. Grid Stress Response ──────────────────────────────────────────────
|
| 46 |
-
//
|
| 47 |
-
|
| 48 |
-
rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)
|
| 49 |
-
}
|
| 50 |
|
| 51 |
// ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
|
| 52 |
-
//
|
| 53 |
-
if inp.
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
onTrackJobs++
|
| 62 |
-
}
|
| 63 |
-
if job.Completed && !job.MissedDeadline {
|
| 64 |
-
onTrackJobs++ // completed on time is even better
|
| 65 |
-
}
|
| 66 |
}
|
| 67 |
-
if
|
| 68 |
-
|
| 69 |
}
|
| 70 |
}
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
// ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
|
| 73 |
// Rewards smart storage use: arbitrage + maintaining useful storage levels.
|
|
@@ -100,15 +94,13 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
|
| 100 |
}
|
| 101 |
|
| 102 |
// ── 7. Carbon Reward ─────────────────────────────────────────────────────
|
| 103 |
-
//
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
rc.CarbonReward += 0.15
|
| 111 |
-
}
|
| 112 |
}
|
| 113 |
|
| 114 |
// ── Aggregate ────────────────────────────────────────────────────────────
|
|
|
|
| 25 |
}
|
| 26 |
|
| 27 |
// ComputeReward returns a dense RewardComponents struct from the current step inputs.
|
| 28 |
+
// All 7 reward components are always computed for rich per-step signal.
|
| 29 |
+
// Task-specific weighting is handled by the GRADING system (tasks.go), not here.
|
| 30 |
func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
| 31 |
rc := RewardComponents{}
|
| 32 |
|
|
|
|
| 36 |
rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0
|
| 37 |
|
| 38 |
// ── 2. Temperature Constraint ────────────────────────────────────────────
|
| 39 |
+
// Gaussian bonus for being near setpoint; penalty outside comfort bounds.
|
| 40 |
+
temp := inp.B.IndoorTemperature
|
| 41 |
+
rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)
|
|
|
|
|
|
|
| 42 |
|
| 43 |
// ── 3. Grid Stress Response ──────────────────────────────────────────────
|
| 44 |
+
// Rewards proactive grid awareness and demand-response compliance.
|
| 45 |
+
rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)
|
|
|
|
|
|
|
| 46 |
|
| 47 |
// ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
|
| 48 |
+
// Penalise missed batch jobs, reward on-track pending jobs.
|
| 49 |
+
if inp.BatchMissed > 0 {
|
| 50 |
+
rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
|
| 51 |
+
}
|
| 52 |
+
// Positive signal: reward for jobs still on track (not missed yet)
|
| 53 |
+
onTrackJobs := 0
|
| 54 |
+
for _, job := range inp.B.Jobs {
|
| 55 |
+
if !job.Completed && !job.MissedDeadline {
|
| 56 |
+
onTrackJobs++
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
| 58 |
+
if job.Completed && !job.MissedDeadline {
|
| 59 |
+
onTrackJobs++ // completed on time is even better
|
| 60 |
}
|
| 61 |
}
|
| 62 |
+
if onTrackJobs > 0 && inp.BatchMissed == 0 {
|
| 63 |
+
rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
|
| 64 |
+
}
|
| 65 |
|
| 66 |
// ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
|
| 67 |
// Rewards smart storage use: arbitrage + maintaining useful storage levels.
|
|
|
|
| 94 |
}
|
| 95 |
|
| 96 |
// ── 7. Carbon Reward ─────────────────────────────────────────────────────
|
| 97 |
+
// Rewards low-carbon operation based on grid carbon intensity.
|
| 98 |
+
carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
|
| 99 |
+
// Baseline bonus, reduced by carbon-heavy consumption
|
| 100 |
+
rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
|
| 101 |
+
// Extra bonus for operating during genuinely clean grid periods
|
| 102 |
+
if carbonNorm < 0.3 {
|
| 103 |
+
rc.CarbonReward += 0.15
|
|
|
|
|
|
|
| 104 |
}
|
| 105 |
|
| 106 |
// ── Aggregate ────────────────────────────────────────────────────────────
|