File size: 17,555 Bytes
0af208b
1875b13
 
0af208b
 
 
 
 
1875b13
ef0556b
 
 
 
 
 
 
 
 
 
 
 
 
1875b13
 
ef0556b
 
 
 
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d74982
1875b13
0af208b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1875b13
 
 
 
 
ef0556b
 
 
 
 
 
1875b13
c009bc5
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c009bc5
 
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58b5ec
ef0556b
e58b5ec
 
 
 
 
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58b5ec
 
 
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e58b5ec
 
 
 
 
1875b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d74982
 
 
 
 
 
 
 
 
 
 
 
e58b5ec
 
 
 
 
 
1875b13
e58b5ec
 
1875b13
 
 
 
 
6d74982
 
1875b13
 
c009bc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
// Package env defines the four GridMind-RL tasks and their deterministic graders.
package env

import (
	"fmt"
	"math"
	"math/rand"
)

// clampOpenInterval maps a score into the open interval (0, 1), so that the
// result always satisfies 0 < score < 1.
//
// Values at or below 0 become epsilon and values at or above 1 become
// 1 - epsilon, where epsilon is 1e-6. Values already inside the interval are
// returned unchanged. NaN compares false against every bound and would
// otherwise pass through untouched, so it is mapped to epsilon (the lowest
// valid score) instead of being reported as a score.
func clampOpenInterval(score float64) float64 {
	const epsilon = 1e-6
	switch {
	case math.IsNaN(score), score <= 0.0:
		return epsilon
	case score >= 1.0:
		return 1.0 - epsilon
	default:
		return score
	}
}

// TaskConfig describes a single task as returned by AllTasks.
//
// ID is the task number (AllTasks uses 1 through 4, the same values
// GradeEpisode switches on). Name, Description and Difficulty are
// human-readable labels. Weights maps a sub-score name such as "cost" or
// "temperature" to its nominal share of the task's final score; for tasks 1-3
// the graders in this file hard-code the same numbers rather than reading this
// map. All fields are serialised to JSON under the lower-case keys in the
// struct tags.
type TaskConfig struct {
	ID          int                `json:"id"`
	Name        string             `json:"name"`
	Description string             `json:"description"`
	Difficulty  string             `json:"difficulty"`
	Weights     map[string]float64 `json:"weights"`
}

// AllTasks returns the task catalogue in ascending ID order (1 through 4).
// A fresh slice with fresh weight maps is built on every call, so callers may
// modify the result without affecting later calls.
func AllTasks() []TaskConfig {
	costOnly := TaskConfig{
		ID:          1,
		Name:        "Cost Minimization",
		Description: "Minimize total energy cost over a 24-hour episode with no process constraints. Beat the always-on flat policy baseline.",
		Difficulty:  "easy",
		Weights:     map[string]float64{"cost": 1.0},
	}

	tempConstrained := TaskConfig{
		ID:          2,
		Name:        "Constrained Temperature Management",
		Description: "Minimize cost while keeping indoor temperature within ±2°C of setpoint at all times.",
		Difficulty:  "medium",
		Weights: map[string]float64{
			"cost":        0.6,
			"temperature": 0.4,
		},
	}

	demandResponse := TaskConfig{
		ID:          3,
		Name:        "Full Demand-Response with Batch Scheduling",
		Description: "Minimize cost, maintain temperature, respond to grid stress events, schedule all batch jobs before their deadlines, and minimize carbon emissions.",
		Difficulty:  "hard",
		Weights: map[string]float64{
			"cost":           0.28,
			"temperature":    0.20,
			"grid_response":  0.20,
			"batch_deadline": 0.12,
			"carbon":         0.20,
		},
	}

	instructionFollowing := TaskConfig{
		ID:          4,
		Name:        "Instruction-Following Operator",
		Description: "Complete a randomly sampled natural-language objective card. The agent must parse the instruction, plan accordingly, and satisfy all stated KPI targets.",
		Difficulty:  "hard",
		Weights: map[string]float64{
			"task_completion": 0.50,
			"cost":            0.30,
			"temperature":     0.20,
		},
	}

	return []TaskConfig{costOnly, tempConstrained, demandResponse, instructionFollowing}
}

// instructionTemplate is a parameterised instruction card template used by
// GenerateInstructionCard.
//
// makeText renders the human-readable instruction from a map of sampled
// numeric parameters (keys such as "cost_cap" or "t_min"). targets names the
// machine-readable targets the resulting card will carry; the final value of
// each target is decided by GenerateInstructionCard. weights holds the
// per-sub-score weights that are copied onto the card.
type instructionTemplate struct {
	makeText    func(params map[string]float64) string
	targets     map[string]float64
	weights     map[string]float64
}

// GenerateInstructionCard samples a random instruction card for Task 4.
//
// One of five templates is picked with rng, then five numeric parameters are
// drawn in a fixed order (cost cap, t_min, t_max, minimum shed fraction,
// carbon reduction), so a seeded rng yields a reproducible card. The returned
// card carries the rendered instruction in Text, the machine-readable
// thresholds in Targets and a copy of the template's weights in Weights.
//
// Every parameter is rounded to the precision at which the templates print it
// (cents, whole degrees, whole percent) before it is used, so the number the
// agent reads in Text is exactly the threshold stored in Targets. A zero
// target in a template is a placeholder that is filled from the drawn
// parameters; a non-zero target is a fixed requirement and is kept as written.
//
// rng must be non-nil.
func GenerateInstructionCard(rng *rand.Rand) *InstructionCard {
	// Pool of parameterised templates.
	templates := []instructionTemplate{
		{
			// Template 1: hard energy cap
			makeText: func(p map[string]float64) string {
				return fmt.Sprintf("Keep total energy cost under $%.2f for this 24-hour episode while maintaining comfort.", p["cost_cap"])
			},
			targets: map[string]float64{"max_cost": 0.0}, // placeholder, filled in below
			weights: map[string]float64{"task_completion": 0.5, "cost": 0.3, "temperature": 0.2},
		},
		{
			// Template 2: aggressive temperature constraint
			makeText: func(p map[string]float64) string {
				return fmt.Sprintf("Never allow indoor temperature to exceed %.0f°C or drop below %.0f°C at any point during the episode.", p["t_max"], p["t_min"])
			},
			targets: map[string]float64{"t_min": 0.0, "t_max": 0.0},
			weights: map[string]float64{"task_completion": 0.5, "temperature": 0.4, "cost": 0.1},
		},
		{
			// Template 3: grid response SLA
			makeText: func(p map[string]float64) string {
				return fmt.Sprintf("Respond to all grid stress events (signal > 0.7) by shedding at least %.0f%% of non-critical load.", p["min_shed_pct"]*100)
			},
			targets: map[string]float64{"min_shed_fraction": 0.0},
			weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.3},
		},
		{
			// Template 4: carbon reduction
			makeText: func(p map[string]float64) string {
				return fmt.Sprintf("Reduce carbon emissions to at least %.0f%% below the always-on baseline policy.", p["carbon_reduction_pct"]*100)
			},
			targets: map[string]float64{"carbon_reduction": 0.0},
			weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.2, "carbon": 0.1},
		},
		{
			// Template 5: combined cost + temperature + grid. The shed
			// fraction is not stated in the text, so it is fixed here
			// rather than sampled.
			makeText: func(p map[string]float64) string {
				return fmt.Sprintf("Keep energy cost under $%.2f, temperature between %.0f–%.0f°C, and respond to all grid stress events.", p["cost_cap"], p["t_min"], p["t_max"])
			},
			targets: map[string]float64{"max_cost": 0.0, "t_min": 0.0, "t_max": 0.0, "min_shed_fraction": 0.25},
			weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.2, "grid_response": 0.1},
		},
	}

	// Pick a random template.
	tmpl := templates[rng.Intn(len(templates))]

	// roundTo rounds v to a multiple of 1/scale (scale 100 keeps two decimals).
	roundTo := func(v, scale float64) float64 {
		return math.Round(v*scale) / scale
	}

	// Randomise numeric parameters. The draws happen in this exact order so
	// that a seeded rng stays reproducible, and each value is rounded to the
	// precision the templates print so text and targets cannot disagree.
	costCap := roundTo(1.5+rng.Float64()*2.0, 100)    // $1.50 – $3.50, whole cents
	tMin := roundTo(18.0+rng.Float64()*2.0, 1)        // 18–20 °C, whole degrees
	tMax := roundTo(23.0+rng.Float64()*2.0, 1)        // 23–25 °C, whole degrees
	minShed := roundTo(0.2+rng.Float64()*0.2, 100)    // 20–40 %, whole percent
	carbonCut := roundTo(0.15+rng.Float64()*0.2, 100) // 15–35 %, whole percent

	params := map[string]float64{
		"cost_cap":             costCap,
		"t_min":                tMin,
		"t_max":                tMax,
		"min_shed_pct":         minShed,
		"carbon_reduction_pct": carbonCut,
	}

	// Fill targets from params. Non-zero template values are explicit
	// requirements and are preserved; zero values are placeholders.
	targets := make(map[string]float64, len(tmpl.targets))
	for k, preset := range tmpl.targets {
		if preset != 0 {
			targets[k] = preset
			continue
		}
		switch k {
		case "max_cost":
			targets[k] = params["cost_cap"]
		case "t_min":
			targets[k] = params["t_min"]
		case "t_max":
			targets[k] = params["t_max"]
		case "min_shed_fraction":
			targets[k] = params["min_shed_pct"]
		case "carbon_reduction":
			targets[k] = params["carbon_reduction_pct"]
		}
	}

	// Copy the weights so callers cannot mutate the template's map.
	weights := make(map[string]float64, len(tmpl.weights))
	for k, v := range tmpl.weights {
		weights[k] = v
	}

	return &InstructionCard{
		Text:    tmpl.makeText(params),
		Targets: targets,
		Weights: weights,
	}
}

// GradeEpisodeInput collects all data needed to score a completed episode.
//
// TaskID selects the grader inside GradeEpisode (1-4; any other value is
// graded as Task 1). Buildings supplies the per-building cumulative and
// baseline cost and carbon totals plus the batch jobs. Replay is scanned for
// grid-stress observations and the load-shed action taken on those steps.
// TempHistory holds the recorded indoor temperatures, and TMin/TMax are the
// inclusive bounds used by the Task 2 temperature check. ExploitPenalties is
// read by building index: entries beyond len(Buildings) are ignored, and the
// summed penalty is capped at 0.3 by GradeEpisode. InstructionCard is only
// consulted for Task 4; when it is nil, Task 4 falls back to Task 3 grading.
type GradeEpisodeInput struct {
	TaskID           int
	Buildings        []*BuildingState
	Replay           []ReplayEntry
	TempHistory      [][]float64 // per building, per step
	TMin             float64
	TMax             float64
	ExploitPenalties []float64
	InstructionCard  *InstructionCard // set for Task 4 episodes
}

// GradeEpisode scores a completed episode and returns the grade with its
// sub-scores and details filled in.
//
// The task-specific grader is chosen from inp.TaskID; IDs outside 1-4 are
// graded with the Task 1 rubric. Exploit penalties are then subtracted (at
// most 0.3 in total), and the final score and every sub-score are rounded to
// four decimal places and pushed into the open interval (0, 1). The function
// uses no randomness, so identical input always yields an identical grade.
func GradeEpisode(inp GradeEpisodeInput) EpisodeGrade {
	result := EpisodeGrade{
		TaskID:    inp.TaskID,
		SubScores: map[string]float64{},
		Details:   map[string]interface{}{},
	}

	// Select the rubric; Task 1 doubles as the fallback for unknown IDs.
	grader := gradeTask1
	switch inp.TaskID {
	case 2:
		grader = gradeTask2
	case 3:
		grader = gradeTask3
	case 4:
		grader = gradeTask4
	}
	result = grader(inp, result)

	// Exploit detection: only penalties that line up with a building count.
	counted := len(inp.ExploitPenalties)
	if nBuildings := len(inp.Buildings); nBuildings < counted {
		counted = nBuildings
	}
	penaltySum := 0.0
	for _, p := range inp.ExploitPenalties[:counted] {
		penaltySum += p
	}
	if penaltySum > 0 {
		result.ExploitDetected = true
		result.PenaltyApplied = math.Min(penaltySum, 0.3) // max 30% penalty
		result.Score = math.Max(0, result.Score-result.PenaltyApplied)
	}

	// Round to 4 decimal places first, then clamp, so that values which
	// round onto 0 or 1 are still moved inside the open interval.
	roundThenClamp := func(v float64) float64 {
		return clampOpenInterval(math.Round(v*10000) / 10000)
	}
	result.Score = roundThenClamp(result.Score)
	for name, sub := range result.SubScores {
		result.SubScores[name] = roundThenClamp(sub)
	}
	return result
}

// ── Task 1: Cost Minimization ───────────────────────────────────────────────

// gradeTask1 scores cost savings against the baseline policy.
//
// Costs are summed over all buildings. The raw score is the fraction of the
// baseline bill that was saved (1 - agent/baseline), floored at 0 and capped
// at 1; it stays 0 when the baseline total is not positive. The value is then
// rounded to four decimals and clamped into (0, 1), stored as the "cost"
// sub-score and used as the overall score. The summed costs and their ratio
// are recorded in Details.
func gradeTask1(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade {
	var spent, reference float64
	for _, bld := range inp.Buildings {
		spent += bld.CumulativeCost
		reference += bld.BaselineCost
	}

	saving := 0.0
	if reference > 0 {
		// Spending as much as (or more than) the baseline earns nothing.
		saving = math.Max(0, 1.0-spent/reference)
	}

	// Cap at 1, clamp, round to 4 decimals, and clamp again because rounding
	// can land exactly on a boundary.
	capped := clampOpenInterval(math.Min(1.0, saving))
	final := clampOpenInterval(math.Round(capped*10000) / 10000)

	grade.SubScores["cost"] = final
	grade.Score = final

	grade.Details["agent_cost"] = spent
	grade.Details["baseline_cost"] = reference
	// The denominator is floored at 0.01 to avoid dividing by zero.
	grade.Details["cost_ratio"] = spent / math.Max(reference, 0.01)
	return grade
}

// ── Task 2: Constrained Temperature Management ──────────────────────────────

// gradeTask2 combines the Task 1 cost score with a temperature-compliance
// score.
//
// The temperature score is the share of all recorded readings, across every
// history in inp.TempHistory, that lie within [inp.TMin, inp.TMax] inclusive;
// it is 0 when there are no readings. The final score weights cost at 0.6 and
// temperature at 0.4. Sub-scores and the final score are rounded to four
// decimals and clamped into (0, 1).
func gradeTask2(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade {
	// Cost part: delegate to the Task 1 grader and pick up its sub-score.
	grade = gradeTask1(inp, grade)
	costScore := grade.SubScores["cost"]

	// Temperature part: count readings inside the allowed band.
	var readings, compliant int
	for _, series := range inp.TempHistory {
		readings += len(series)
		for _, reading := range series {
			if reading >= inp.TMin && reading <= inp.TMax {
				compliant++
			}
		}
	}

	var constraintScore float64
	if readings > 0 {
		constraintScore = float64(compliant) / float64(readings)
	}

	finalScore := costScore*0.6 + constraintScore*0.4

	grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000)
	grade.SubScores["temperature"] = clampOpenInterval(math.Round(constraintScore*10000) / 10000)
	grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000)

	grade.Details["within_bounds_steps"] = compliant
	grade.Details["total_steps"] = readings
	return grade
}

// ── Task 3: Full Demand-Response with Batch Scheduling ──────────────────────

// gradeTask3 scores the full demand-response task.
//
// It reuses gradeTask2 for the cost and temperature sub-scores and adds three
// more:
//   - grid_response: share of steps with grid stress signal > 0.7 on which the
//     agent shed more than 15% of load (0.5 when no stress steps occurred);
//   - batch_deadline: share of batch jobs completed without missing their
//     deadline (0 when there are no jobs);
//   - carbon: fraction of baseline carbon avoided, bounded to [0, 1] (0 when
//     the baseline total is not positive).
//
// The final score is the weighted sum 0.28 cost + 0.20 temperature +
// 0.20 grid_response + 0.12 batch_deadline + 0.20 carbon. Sub-scores and the
// final score are rounded to four decimals and clamped into (0, 1); the raw
// counts and carbon totals are recorded in Details.
func gradeTask3(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade {
	// Reuse task 2 for cost + temperature scores.
	grade = gradeTask2(inp, grade)
	costScore := grade.SubScores["cost"]
	tempScore := grade.SubScores["temperature"]

	// Grid response sub-score: count steps where stress > 0.7 and, of those,
	// the ones where shed_fraction > 0.15.
	gridStressSteps := 0
	gridResponseSteps := 0
	for _, entry := range inp.Replay {
		if entry.Observation.GridStressSignal > 0.7 {
			gridStressSteps++
			if entry.Action.LoadShedFraction > 0.15 {
				gridResponseSteps++
			}
		}
	}
	gridScore := 0.5 // default neutral if no stress events
	if gridStressSteps > 0 {
		gridScore = float64(gridResponseSteps) / float64(gridStressSteps)
	}

	// Batch deadline sub-score.
	totalJobs := 0
	completedOnTime := 0
	for _, b := range inp.Buildings {
		for _, job := range b.Jobs {
			totalJobs++
			if job.Completed && !job.MissedDeadline {
				completedOnTime++
			}
		}
	}
	batchScore := 0.0
	if totalJobs > 0 {
		batchScore = float64(completedOnTime) / float64(totalJobs)
	}

	// Carbon sub-score vs baseline always-on policy (same spirit as cost).
	agentCarbon := 0.0
	baselineCarbon := 0.0
	for _, b := range inp.Buildings {
		agentCarbon += b.CumulativeCarbon
		baselineCarbon += b.BaselineCarbon
	}
	carbonScore := 0.0
	if baselineCarbon > 0 {
		// Bound to [0, 1] here, once, so the reported sub-score and the
		// value fed into the weighted total are always the same number
		// (a negative agent total would otherwise push the latter above 1).
		carbonScore = math.Min(1.0, math.Max(0, 1.0-agentCarbon/baselineCarbon))
	}

	// Clamp all sub-scores after rounding.
	grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000)
	grade.SubScores["temperature"] = clampOpenInterval(math.Round(tempScore*10000) / 10000)
	grade.SubScores["grid_response"] = clampOpenInterval(math.Round(gridScore*10000) / 10000)
	grade.SubScores["batch_deadline"] = clampOpenInterval(math.Round(batchScore*10000) / 10000)
	grade.SubScores["carbon"] = clampOpenInterval(math.Round(carbonScore*10000) / 10000)

	finalScore := costScore*0.28 + tempScore*0.20 + gridScore*0.20 + batchScore*0.12 + carbonScore*0.20
	grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000)

	grade.Details["grid_stress_steps"] = gridStressSteps
	grade.Details["grid_response_steps"] = gridResponseSteps
	grade.Details["total_jobs"] = totalJobs
	grade.Details["completed_on_time"] = completedOnTime
	grade.Details["agent_carbon"] = agentCarbon
	grade.Details["baseline_carbon"] = baselineCarbon
	return grade
}

// ── Task 4: Instruction-Following Operator ───────────────────────────────────

// gradeTask4 evaluates how well the agent satisfied the natural-language
// instruction card issued at reset, which the environment passes in as
// inp.InstructionCard. When no card is available it falls back to Task 3
// grading.
//
// The base cost, temperature, grid_response and carbon sub-scores come from
// gradeTask3. Each target present on the card is then checked as a KPI:
//   - max_cost: 1 when total cost is within the cap, otherwise reduced in
//     proportion to the relative overshoot;
//   - t_min/t_max: share of recorded temperatures inside the inclusive band;
//   - min_shed_fraction: share of stress steps (signal > 0.7) on which at
//     least that fraction of load was shed;
//   - carbon_reduction: 1 when the reduction versus baseline meets the
//     target, otherwise the achieved share of the target.
//
// task_completion is the mean of all KPIs that could be evaluated, so a card
// with several targets is judged on every one of them; it is 0.5 when none
// could be evaluated. The final score is the weighted average of
// task_completion and the four base sub-scores. Weights come from the card,
// with per-key defaults for missing entries, and the sum is divided by the
// total weight so that defaults added on top of the card's own weights cannot
// push the score above a true average.
func gradeTask4(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade {
	// Require an instruction card — passed from the environment at grade time.
	if inp.InstructionCard == nil {
		// Fallback: grade as Task 3 (no card to evaluate).
		return gradeTask3(inp, grade)
	}

	card := inp.InstructionCard
	weights := card.Weights
	targets := card.Targets

	// Always compute base sub-scores — reuse existing graders. A scratch
	// grade is used so Task 3's details do not leak into this grade.
	base := gradeTask3(inp, EpisodeGrade{
		TaskID:    inp.TaskID,
		SubScores: map[string]float64{},
		Details:   map[string]interface{}{},
	})
	costScore := base.SubScores["cost"]
	tempScore := base.SubScores["temperature"]
	gridScore := base.SubScores["grid_response"]
	carbonScore := base.SubScores["carbon"]

	// ── Card-specific KPI checks ─────────────────────────────────────────────
	// Every KPI that can be evaluated appends one score; they are averaged
	// below instead of overwriting one another.
	var kpiScores []float64

	// KPI 1: Cost cap — did the agent stay under max_cost?
	if maxCost, ok := targets["max_cost"]; ok && maxCost > 0 {
		agentCost := 0.0
		for _, b := range inp.Buildings {
			agentCost += b.CumulativeCost
		}
		var kpi float64
		if agentCost <= maxCost {
			kpi = 1.0
		} else {
			// Partial credit: how close were they?
			kpi = math.Max(0, 1.0-(agentCost-maxCost)/maxCost)
		}
		kpiScores = append(kpiScores, kpi)
		grade.Details["target_max_cost"] = maxCost
		grade.Details["actual_cost"] = agentCost
	}

	// KPI 2: Temperature bounds — only checked when both bounds are given.
	if tMin, hasTMin := targets["t_min"]; hasTMin {
		if tMax, hasTMax := targets["t_max"]; hasTMax {
			totalSteps := 0
			withinBounds := 0
			for _, history := range inp.TempHistory {
				for _, temp := range history {
					totalSteps++
					if temp >= tMin && temp <= tMax {
						withinBounds++
					}
				}
			}
			if totalSteps > 0 {
				// Strict: full credit only if ALWAYS within bounds.
				kpiScores = append(kpiScores, float64(withinBounds)/float64(totalSteps))
			}
			grade.Details["target_t_min"] = tMin
			grade.Details["target_t_max"] = tMax
		}
	}

	// KPI 3: Grid response SLA — shed >= min_shed_fraction when stress > 0.7.
	if minShed, ok := targets["min_shed_fraction"]; ok {
		stressSteps := 0
		compliantSteps := 0
		for _, entry := range inp.Replay {
			if entry.Observation.GridStressSignal > 0.7 {
				stressSteps++
				if entry.Action.LoadShedFraction >= minShed {
					compliantSteps++
				}
			}
		}
		if stressSteps > 0 {
			kpiScores = append(kpiScores, float64(compliantSteps)/float64(stressSteps))
		}
		grade.Details["target_min_shed"] = minShed
		grade.Details["stress_steps"] = stressSteps
		grade.Details["compliant_steps"] = compliantSteps
	}

	// KPI 4: Carbon reduction — did the agent beat baseline by the target?
	if carbonTarget, ok := targets["carbon_reduction"]; ok {
		agentCarbon := 0.0
		baselineCarbon := 0.0
		for _, b := range inp.Buildings {
			agentCarbon += b.CumulativeCarbon
			baselineCarbon += b.BaselineCarbon
		}
		if baselineCarbon > 0 {
			actualReduction := 1.0 - agentCarbon/baselineCarbon
			var kpi float64
			if actualReduction >= carbonTarget {
				kpi = 1.0
			} else {
				kpi = math.Max(0, actualReduction/carbonTarget)
			}
			kpiScores = append(kpiScores, kpi)
		}
		grade.Details["target_carbon_reduction"] = carbonTarget
	}

	taskCompletionScore := 0.5 // default partial credit when nothing could be evaluated
	if len(kpiScores) > 0 {
		sum := 0.0
		for _, s := range kpiScores {
			sum += s
		}
		taskCompletionScore = sum / float64(len(kpiScores))
	}

	// ── Weighted final score ─────────────────────────────────────────────────
	// Use weights from the card; fall back to Task 4 defaults if missing.
	wTask := getWeight(weights, "task_completion", 0.50)
	wCost := getWeight(weights, "cost", 0.20)
	wTemp := getWeight(weights, "temperature", 0.20)
	wGrid := getWeight(weights, "grid_response", 0.05)
	wCarbon := getWeight(weights, "carbon", 0.05)

	finalScore := taskCompletionScore*wTask +
		costScore*wCost +
		tempScore*wTemp +
		gridScore*wGrid +
		carbonScore*wCarbon

	// Card weights usually sum to 1 on their own, so the defaults for keys
	// the card omits push the total above 1. Normalise to keep the result a
	// weighted average.
	if totalWeight := wTask + wCost + wTemp + wGrid + wCarbon; totalWeight > 0 {
		finalScore /= totalWeight
	}

	grade.SubScores["task_completion"] = clampOpenInterval(math.Round(taskCompletionScore*10000) / 10000)
	grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000)
	grade.SubScores["temperature"] = clampOpenInterval(math.Round(tempScore*10000) / 10000)
	grade.SubScores["grid_response"] = clampOpenInterval(math.Round(gridScore*10000) / 10000)
	grade.SubScores["carbon"] = clampOpenInterval(math.Round(carbonScore*10000) / 10000)
	grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000)

	grade.Details["instruction_card_text"] = card.Text
	return grade
}

// getWeight looks up key in weights and returns the stored value, or
// defaultVal when the key is absent. A nil map is treated as empty, and a key
// that is present with value 0 returns 0 rather than the default.
func getWeight(weights map[string]float64, key string, defaultVal float64) float64 {
	stored, present := weights[key]
	if !present {
		return defaultVal
	}
	return stored
}