Spaces:
Running
Running
feat: implement reward system logic in new rewards module
Browse files- env/rewards.go +8 -8
env/rewards.go
CHANGED
|
@@ -31,10 +31,10 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
|
| 31 |
rc := RewardComponents{}
|
| 32 |
|
| 33 |
// ── 1. Cost Savings ─────────────────────────────────────────────────────
|
| 34 |
-
//
|
| 35 |
-
//
|
| 36 |
typicalCost := 4.0
|
| 37 |
-
rc.CostSavings = -(inp.StepCost / typicalCost) * 2.0
|
| 38 |
|
| 39 |
// ── 2. Temperature Constraint ────────────────────────────────────────────
|
| 40 |
// Only active for tasks 2 and 3.
|
|
@@ -75,12 +75,12 @@ func ComputeReward(inp ComputeRewardInput) RewardComponents {
|
|
| 75 |
}
|
| 76 |
|
| 77 |
// ── 7. Carbon Reward ─────────────────────────────────────────────────────
|
| 78 |
-
// Low-carbon bonus: active for task 3
|
| 79 |
if inp.TaskID >= 3 {
|
| 80 |
// Normalise carbon: the ISO-NE range is roughly 100–700 gCO2/kWh
|
| 81 |
carbonNorm := (inp.B.CarbonIntensity - 100.0) / 600.0
|
| 82 |
-
//
|
| 83 |
-
rc.CarbonReward = -inp.EnergyKWh * carbonNorm * 0.3
|
| 84 |
}
|
| 85 |
|
| 86 |
// ── Aggregate ────────────────────────────────────────────────────────────
|
|
@@ -97,11 +97,11 @@ func computeTempReward(temp, setpoint, tMin, tMax float64) float64 {
|
|
| 97 |
// Gaussian-shaped bonus: maximum at setpoint, degrades toward bounds
|
| 98 |
deviation := math.Abs(temp - setpoint)
|
| 99 |
sigma := (tMax - tMin) / 4.0
|
| 100 |
-
return math.Exp(-0.5*(deviation/sigma)*(deviation/sigma)) *
|
| 101 |
}
|
| 102 |
// Outside bounds: proportional penalty
|
| 103 |
excess := math.Max(temp-tMax, tMin-temp)
|
| 104 |
-
return -excess * 0.
|
| 105 |
}
|
| 106 |
|
| 107 |
// computeGridResponse returns a bonus for shedding load during high grid stress,
|
|
|
|
| 31 |
rc := RewardComponents{}
|
| 32 |
|
| 33 |
// ── 1. Cost Savings ─────────────────────────────────────────────────────
|
| 34 |
+
// Shift from a pure penalty to a positive baseline, so that low-cost steps earn a positive reward.
|
| 35 |
+
// Baseline reward of 1.5, minus twice the step cost relative to typicalCost (positive while StepCost < 3.0).
|
| 36 |
typicalCost := 4.0
|
| 37 |
+
rc.CostSavings = 1.5 - (inp.StepCost / typicalCost) * 2.0
|
| 38 |
|
| 39 |
// ── 2. Temperature Constraint ────────────────────────────────────────────
|
| 40 |
// Only active for tasks 2 and 3.
|
|
|
|
| 75 |
}
|
| 76 |
|
| 77 |
// ── 7. Carbon Reward ─────────────────────────────────────────────────────
|
| 78 |
+
// Low-carbon bonus: active for task 3 and above (TaskID >= 3).
|
| 79 |
if inp.TaskID >= 3 {
|
| 80 |
// Normalise carbon: the ISO-NE range is roughly 100–700 gCO2/kWh
|
| 81 |
carbonNorm := (inp.B.CarbonIntensity - 100.0) / 600.0
|
| 82 |
+
// Provide a baseline positive score, reduced by carbon footprint
|
| 83 |
+
rc.CarbonReward = 0.5 - (inp.EnergyKWh * carbonNorm * 0.3)
|
| 84 |
}
|
| 85 |
|
| 86 |
// ── Aggregate ────────────────────────────────────────────────────────────
|
|
|
|
| 97 |
// Gaussian-shaped bonus: maximum at setpoint, degrades toward bounds
|
| 98 |
deviation := math.Abs(temp - setpoint)
|
| 99 |
sigma := (tMax - tMin) / 4.0
|
| 100 |
+
return math.Exp(-0.5*(deviation/sigma)*(deviation/sigma)) * 1.5 // Increased positive reward
|
| 101 |
}
|
| 102 |
// Outside bounds: proportional penalty
|
| 103 |
excess := math.Max(temp-tMax, tMin-temp)
|
| 104 |
+
return -excess * 0.6
|
| 105 |
}
|
| 106 |
|
| 107 |
// computeGridResponse returns a bonus for shedding load during high grid stress,
|