adityss committed on
Commit
5569b4d
·
1 Parent(s): 0361922

feat: implement core environment simulation logic and update baseline scores

Browse files
Files changed (3) hide show
  1. baseline_scores.json +19 -19
  2. env/environment.go +18 -6
  3. env/rewards.go +28 -36
baseline_scores.json CHANGED
@@ -7,50 +7,50 @@
7
  "llm_every": 4,
8
  "max_steps": null,
9
  "task_averages": {
10
- "1": 0.2776,
11
- "2": 0.2182,
12
- "3": 0.3115
13
  },
14
- "overall_average": 0.2691,
15
  "all_results": [
16
  {
17
  "task_id": 1,
18
  "seed": 1100,
19
- "total_reward": 114.28893759320243,
20
  "total_steps": 96,
21
- "elapsed_sec": 1.3370721340179443,
22
- "score": 0.2776,
23
  "sub_scores": {
24
- "cost": 0.277555958007489
25
  },
26
  "exploit_detected": false
27
  },
28
  {
29
  "task_id": 2,
30
  "seed": 1200,
31
- "total_reward": -625.6665397814021,
32
  "total_steps": 96,
33
- "elapsed_sec": 1.229602336883545,
34
- "score": 0.2182,
35
  "sub_scores": {
36
- "cost": 0.2595566056450961,
37
- "temperature": 0.15625
38
  },
39
  "exploit_detected": false
40
  },
41
  {
42
  "task_id": 3,
43
  "seed": 1300,
44
- "total_reward": -639.8462871515986,
45
  "total_steps": 96,
46
- "elapsed_sec": 1.1910581588745117,
47
- "score": 0.3115,
48
  "sub_scores": {
49
  "batch_deadline": 1,
50
- "carbon": 0.24377839161166936,
51
- "cost": 0.25263438913936676,
52
  "grid_response": 0.21428571428571427,
53
- "temperature": 0.14583333333333334
54
  },
55
  "exploit_detected": false
56
  }
 
7
  "llm_every": 4,
8
  "max_steps": null,
9
  "task_averages": {
10
+ "1": 0.7063,
11
+ "2": 0.6333,
12
+ "3": 0.5966
13
  },
14
+ "overall_average": 0.6454,
15
  "all_results": [
16
  {
17
  "task_id": 1,
18
  "seed": 1100,
19
+ "total_reward": 251.40178983938813,
20
  "total_steps": 96,
21
+ "elapsed_sec": 0.14183712005615234,
22
+ "score": 0.7063,
23
  "sub_scores": {
24
+ "cost": 0.7063441549865395
25
  },
26
  "exploit_detected": false
27
  },
28
  {
29
  "task_id": 2,
30
  "seed": 1200,
31
+ "total_reward": 246.40262234598185,
32
  "total_steps": 96,
33
+ "elapsed_sec": 0.11959218978881836,
34
+ "score": 0.6333,
35
  "sub_scores": {
36
+ "cost": 0.7014155357169216,
37
+ "temperature": 0.53125
38
  },
39
  "exploit_detected": false
40
  },
41
  {
42
  "task_id": 3,
43
  "seed": 1300,
44
+ "total_reward": 255.60231973463087,
45
  "total_steps": 96,
46
+ "elapsed_sec": 0.12531447410583496,
47
+ "score": 0.5966,
48
  "sub_scores": {
49
  "batch_deadline": 1,
50
+ "carbon": 0.6574530318382599,
51
+ "cost": 0.670084941969173,
52
  "grid_response": 0.21428571428571427,
53
+ "temperature": 0.5729166666666666
54
  },
55
  "exploit_detected": false
56
  }
env/environment.go CHANGED
@@ -418,18 +418,30 @@ func (e *Environment) stepBuilding(b *BuildingState, act ActionModel, idx int) S
418
  batchCompleted, batchMissed := e.updateBatchJobs(b, act.BatchJobSlot, s)
419
 
420
  // ----- Thermal dynamics -----
421
- // Simple first-order thermal model:
422
- // ΔT per step = (HVAC effect + outdoor infiltration + storage discharge effect - process demand)
423
- hvacEffect := (act.HVACPowerLevel - 0.5) * 2.0 * 1.5 // ±3°C max swing per step
 
 
 
 
424
  infiltration := (b.OutdoorTemperature - b.IndoorTemperature) * 0.03
 
 
425
  storageEffect := 0.0
426
- if act.ThermalChargeRate < 0 { // discharging storage = provides cooling/heating
427
- storageEffect = math.Abs(act.ThermalChargeRate) * 0.5
428
  }
 
 
429
  processHeat := b.ProcessDemand * 0.002 // kW→°C rough factor
430
- deltaT := hvacEffect + infiltration + storageEffect - processHeat
 
431
  b.IndoorTemperature += deltaT
432
 
 
 
 
433
  // ----- Energy & cost accounting -----
434
  batchPowerDraw := e.batchRunningPower(b)
435
  totalKW := hvacPower + math.Max(0, chargeKW) + batchPowerDraw - shedKW
 
418
  batchCompleted, batchMissed := e.updateBatchJobs(b, act.BatchJobSlot, s)
419
 
420
  // ----- Thermal dynamics -----
421
+ // First-order setpoint-driven model:
422
+ // HVAC drives temperature toward setpoint; higher power = stronger effect.
423
+ // At HVACPowerLevel=1.0, HVAC strongly pushes toward setpoint.
424
+ // At HVACPowerLevel=0.0, HVAC is off — temp drifts with environment.
425
+ hvacEffect := (b.SetpointTemperature - b.IndoorTemperature) * act.HVACPowerLevel * 0.15
426
+
427
+ // Outdoor infiltration: building slowly equilibrates with outside
428
  infiltration := (b.OutdoorTemperature - b.IndoorTemperature) * 0.03
429
+
430
+ // Thermal storage discharge provides supplemental conditioning toward setpoint
431
  storageEffect := 0.0
432
+ if act.ThermalChargeRate < 0 {
433
+ storageEffect = (b.SetpointTemperature - b.IndoorTemperature) * math.Abs(act.ThermalChargeRate) * 0.05
434
  }
435
+
436
+ // Process equipment waste heat (always warms the building)
437
  processHeat := b.ProcessDemand * 0.002 // kW→°C rough factor
438
+
439
+ deltaT := hvacEffect + infiltration + storageEffect + processHeat
440
  b.IndoorTemperature += deltaT
441
 
442
+ // Clamp to physically reasonable indoor range
443
+ b.IndoorTemperature = math.Max(10.0, math.Min(40.0, b.IndoorTemperature))
444
+
445
  // ----- Energy & cost accounting -----
446
  batchPowerDraw := e.batchRunningPower(b)
447
  totalKW := hvacPower + math.Max(0, chargeKW) + batchPowerDraw - shedKW
env/rewards.go CHANGED
@@ -25,8 +25,8 @@ type ComputeRewardInput struct {
25
  }
26
 
27
  // ComputeReward returns a dense RewardComponents struct from the current step inputs.
28
- // The reward is task-aware: task 1 only cares about cost, task 2 adds temperature,
29
- // task 3 adds grid response, batch deadlines, and carbon.
30
  func ComputeReward(inp ComputeRewardInput) RewardComponents {
31
  rc := RewardComponents{}
32
 
@@ -36,38 +36,32 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
36
  rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0
37
 
38
  // ── 2. Temperature Constraint ────────────────────────────────────────────
39
- // Active for task 2 and 3. Gaussian bonus for being near setpoint.
40
- if inp.TaskID >= 2 {
41
- temp := inp.B.IndoorTemperature
42
- rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)
43
- }
44
 
45
  // ── 3. Grid Stress Response ──────────────────────────────────────────────
46
- // Active for task 3. Rewards proactive grid awareness, not just reactive shedding.
47
- if inp.TaskID >= 3 {
48
- rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)
49
- }
50
 
51
  // ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
52
- // Task 2+: penalise missed jobs, reward on-track pending jobs.
53
- if inp.TaskID >= 2 {
54
- if inp.BatchMissed > 0 {
55
- rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
56
- }
57
- // Positive signal: reward for jobs still on track (not missed yet)
58
- onTrackJobs := 0
59
- for _, job := range inp.B.Jobs {
60
- if !job.Completed && !job.MissedDeadline {
61
- onTrackJobs++
62
- }
63
- if job.Completed && !job.MissedDeadline {
64
- onTrackJobs++ // completed on time is even better
65
- }
66
  }
67
- if onTrackJobs > 0 && inp.BatchMissed == 0 {
68
- rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
69
  }
70
  }
 
 
 
71
 
72
  // ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
73
  // Rewards smart storage use: arbitrage + maintaining useful storage levels.
@@ -100,15 +94,13 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
100
  }
101
 
102
  // ── 7. Carbon Reward ─────────────────────────────────────────────────────
103
- // Active for task 3. Rewards low-carbon operation.
104
- if inp.TaskID >= 3 {
105
- carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
106
- // Baseline bonus, reduced by carbon-heavy consumption
107
- rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
108
- // Extra bonus for operating during genuinely clean grid periods
109
- if carbonNorm < 0.3 {
110
- rc.CarbonReward += 0.15
111
- }
112
  }
113
 
114
  // ── Aggregate ────────────────────────────────────────────────────────────
 
25
  }
26
 
27
  // ComputeReward returns a dense RewardComponents struct from the current step inputs.
28
+ // All 7 reward components are always computed for rich per-step signal.
29
+ // Task-specific weighting is handled by the GRADING system (tasks.go), not here.
30
  func ComputeReward(inp ComputeRewardInput) RewardComponents {
31
  rc := RewardComponents{}
32
 
 
36
  rc.CostSavings = 1.5 - (inp.StepCost/typicalCost)*2.0
37
 
38
  // ── 2. Temperature Constraint ────────────────────────────────────────────
39
+ // Gaussian bonus for being near setpoint; penalty outside comfort bounds.
40
+ temp := inp.B.IndoorTemperature
41
+ rc.TempConstraint = computeTempReward(temp, inp.B.SetpointTemperature, inp.TMin, inp.TMax)
 
 
42
 
43
  // ── 3. Grid Stress Response ──────────────────────────────────────────────
44
+ // Rewards proactive grid awareness and demand-response compliance.
45
+ rc.GridResponse = computeGridResponse(inp.GridStress, inp.ShedFraction)
 
 
46
 
47
  // ── 4. Deadline Penalty / Bonus ──────────────────────────────────────────
48
+ // Penalise missed batch jobs, reward on-track pending jobs.
49
+ if inp.BatchMissed > 0 {
50
+ rc.DeadlinePenalty = -float64(inp.BatchMissed) * 1.5
51
+ }
52
+ // Positive signal: reward for jobs still on track (not missed yet)
53
+ onTrackJobs := 0
54
+ for _, job := range inp.B.Jobs {
55
+ if !job.Completed && !job.MissedDeadline {
56
+ onTrackJobs++
 
 
 
 
 
57
  }
58
+ if job.Completed && !job.MissedDeadline {
59
+ onTrackJobs++ // completed on time is even better
60
  }
61
  }
62
+ if onTrackJobs > 0 && inp.BatchMissed == 0 {
63
+ rc.DeadlinePenalty += float64(onTrackJobs) * 0.08
64
+ }
65
 
66
  // ── 5. Efficiency Bonus (thermal storage utilization) ─────────────────────
67
  // Rewards smart storage use: arbitrage + maintaining useful storage levels.
 
94
  }
95
 
96
  // ── 7. Carbon Reward ─────────────────────────────────────────────────────
97
+ // Rewards low-carbon operation based on grid carbon intensity.
98
+ carbonNorm := math.Max(0, (inp.B.CarbonIntensity-100.0)/600.0)
99
+ // Baseline bonus, reduced by carbon-heavy consumption
100
+ rc.CarbonReward = 0.6 - (inp.EnergyKWh * carbonNorm * 0.25)
101
+ // Extra bonus for operating during genuinely clean grid periods
102
+ if carbonNorm < 0.3 {
103
+ rc.CarbonReward += 0.15
 
 
104
  }
105
 
106
  // ── Aggregate ────────────────────────────────────────────────────────────