// Package env defines the four GridMind-RL tasks and their deterministic graders. package env import ( "fmt" "math" "math/rand" ) // clampOpenInterval clamps a score to the open interval (0, 1), strictly excluding 0.0 and 1.0. // This ensures all scores satisfy the requirement: 0 < score < 1 func clampOpenInterval(score float64) float64 { const epsilon = 1e-6 if score <= 0.0 { return epsilon } if score >= 1.0 { return 1.0 - epsilon } return score } // TaskConfig describes a single task. type TaskConfig struct { ID int `json:"id"` Name string `json:"name"` Description string `json:"description"` Difficulty string `json:"difficulty"` Weights map[string]float64 `json:"weights"` } // AllTasks returns the ordered list of task configurations. func AllTasks() []TaskConfig { return []TaskConfig{ { ID: 1, Name: "Cost Minimization", Description: "Minimize total energy cost over a 24-hour episode with no process constraints. Beat the always-on flat policy baseline.", Difficulty: "easy", Weights: map[string]float64{"cost": 1.0}, }, { ID: 2, Name: "Constrained Temperature Management", Description: "Minimize cost while keeping indoor temperature within ±2°C of setpoint at all times.", Difficulty: "medium", Weights: map[string]float64{"cost": 0.6, "temperature": 0.4}, }, { ID: 3, Name: "Full Demand-Response with Batch Scheduling", Description: "Minimize cost, maintain temperature, respond to grid stress events, schedule all batch jobs before their deadlines, and minimize carbon emissions.", Difficulty: "hard", Weights: map[string]float64{"cost": 0.28, "temperature": 0.20, "grid_response": 0.20, "batch_deadline": 0.12, "carbon": 0.20}, }, { ID: 4, Name: "Instruction-Following Operator", Description: "Complete a randomly sampled natural-language objective card. The agent must parse the instruction, plan accordingly, and satisfy all stated KPI targets.", Difficulty: "hard", Weights: map[string]float64{"task_completion": 0.50, "cost": 0.30, "temperature": 0.20}, }, } } // instructionTemplate is a parameterised instruction card template. type instructionTemplate struct { makeText func(params map[string]float64) string targets map[string]float64 weights map[string]float64 } // GenerateInstructionCard samples a random instruction card for Task 4. // The card contains a human-readable text objective plus machine-readable targets. func GenerateInstructionCard(rng *rand.Rand) *InstructionCard { // Pool of parameterised templates templates := []instructionTemplate{ { // Template 1: hard energy cap makeText: func(p map[string]float64) string { return fmt.Sprintf("Keep total energy cost under $%.2f for this 24-hour episode while maintaining comfort.", p["cost_cap"]) }, targets: map[string]float64{"max_cost": 0.0}, // filled in below weights: map[string]float64{"task_completion": 0.5, "cost": 0.3, "temperature": 0.2}, }, { // Template 2: aggressive temperature constraint makeText: func(p map[string]float64) string { return fmt.Sprintf("Never allow indoor temperature to exceed %.0f°C or drop below %.0f°C at any point during the episode.", p["t_max"], p["t_min"]) }, targets: map[string]float64{"t_min": 0.0, "t_max": 0.0}, weights: map[string]float64{"task_completion": 0.5, "temperature": 0.4, "cost": 0.1}, }, { // Template 3: grid response SLA makeText: func(p map[string]float64) string { return fmt.Sprintf("Respond to all grid stress events (signal > 0.7) by shedding at least %.0f%% of non-critical load.", p["min_shed_pct"]*100) }, targets: map[string]float64{"min_shed_fraction": 0.0}, weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.3}, }, { // Template 4: carbon reduction makeText: func(p map[string]float64) string { return fmt.Sprintf("Reduce carbon emissions to at least %.0f%% below the always-on baseline policy.", p["carbon_reduction_pct"]*100) }, targets: map[string]float64{"carbon_reduction": 0.0}, weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.2, "carbon": 0.1}, }, { // Template 5: combined cost + temperature + grid makeText: func(p map[string]float64) string { return fmt.Sprintf("Keep energy cost under $%.2f, temperature between %.0f–%.0f°C, and respond to all grid stress events.", p["cost_cap"], p["t_min"], p["t_max"]) }, targets: map[string]float64{"max_cost": 0.0, "t_min": 0.0, "t_max": 0.0, "min_shed_fraction": 0.25}, weights: map[string]float64{"task_completion": 0.5, "cost": 0.2, "temperature": 0.2, "grid_response": 0.1}, }, } // Pick a random template tmpl := templates[rng.Intn(len(templates))] // Randomise numeric parameters params := map[string]float64{ "cost_cap": 1.5 + rng.Float64()*2.0, // $1.50 – $3.50 "t_min": 18.0 + rng.Float64()*2.0, // 18–20 °C "t_max": 23.0 + rng.Float64()*2.0, // 23–25 °C "min_shed_pct": 0.2 + rng.Float64()*0.2, // 20–40 % "carbon_reduction_pct": 0.15 + rng.Float64()*0.2, // 15–35 % } // Fill targets from params targets := make(map[string]float64) for k := range tmpl.targets { switch k { case "max_cost": targets[k] = params["cost_cap"] case "t_min": targets[k] = params["t_min"] case "t_max": targets[k] = params["t_max"] case "min_shed_fraction": targets[k] = params["min_shed_pct"] case "carbon_reduction": targets[k] = params["carbon_reduction_pct"] } } weights := make(map[string]float64) for k, v := range tmpl.weights { weights[k] = v } return &InstructionCard{ Text: tmpl.makeText(params), Targets: targets, Weights: weights, } } // GradeEpisodeInput collects all data needed to score a completed episode. type GradeEpisodeInput struct { TaskID int Buildings []*BuildingState Replay []ReplayEntry TempHistory [][]float64 // per building, per step TMin float64 TMax float64 ExploitPenalties []float64 InstructionCard *InstructionCard // set for Task 4 episodes } // GradeEpisode computes a deterministic 0.0–1.0 score for a completed episode. // Given a fixed random seed, this function is fully deterministic. func GradeEpisode(inp GradeEpisodeInput) EpisodeGrade { grade := EpisodeGrade{ TaskID: inp.TaskID, SubScores: map[string]float64{}, Details: map[string]interface{}{}, } switch inp.TaskID { case 1: grade = gradeTask1(inp, grade) case 2: grade = gradeTask2(inp, grade) case 3: grade = gradeTask3(inp, grade) case 4: grade = gradeTask4(inp, grade) default: grade = gradeTask1(inp, grade) } // Exploit detection: reduce score by penalty totalPenalty := 0.0 for i, b := range inp.Buildings { _ = b if i < len(inp.ExploitPenalties) { totalPenalty += inp.ExploitPenalties[i] } } if totalPenalty > 0 { grade.ExploitDetected = true grade.PenaltyApplied = math.Min(totalPenalty, 0.3) // max 30% penalty grade.Score = math.Max(0, grade.Score-grade.PenaltyApplied) } // Clamp AFTER rounding to ensure boundary values are handled grade.Score = clampOpenInterval(math.Round(grade.Score*10000) / 10000) // 4 decimal places // Also ensure all sub-scores are properly clamped after rounding for key, val := range grade.SubScores { grade.SubScores[key] = clampOpenInterval(math.Round(val*10000) / 10000) } return grade } // ── Task 1: Cost Minimization ─────────────────────────────────────────────── func gradeTask1(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade { agentCost := 0.0 baselineCost := 0.0 for _, b := range inp.Buildings { agentCost += b.CumulativeCost baselineCost += b.BaselineCost } var costScore float64 if baselineCost > 0 { // score = max(0, 1 - agent_cost / baseline_cost) // 0.0 if agent costs same or more, 1.0 if agent costs nothing ratio := agentCost / baselineCost costScore = math.Max(0, 1.0-ratio) } // Clamp after min operation clamped := clampOpenInterval(math.Min(1.0, costScore)) grade.SubScores["cost"] = clampOpenInterval(math.Round(clamped*10000) / 10000) grade.Score = grade.SubScores["cost"] grade.Details["agent_cost"] = agentCost grade.Details["baseline_cost"] = baselineCost grade.Details["cost_ratio"] = agentCost / math.Max(baselineCost, 0.01) return grade } // ── Task 2: Constrained Temperature Management ────────────────────────────── func gradeTask2(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade { // Cost sub-score (same as task 1) grade = gradeTask1(inp, grade) costScore := grade.SubScores["cost"] // Temperature constraint sub-score totalSteps := 0 withinBounds := 0 for i, history := range inp.TempHistory { _ = i for _, temp := range history { totalSteps++ if temp >= inp.TMin && temp <= inp.TMax { withinBounds++ } } } constraintScore := 0.0 if totalSteps > 0 { constraintScore = float64(withinBounds) / float64(totalSteps) } // Clamp sub-scores and final score after rounding grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000) grade.SubScores["temperature"] = clampOpenInterval(math.Round(constraintScore*10000) / 10000) finalScore := costScore*0.6 + constraintScore*0.4 grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000) grade.Details["within_bounds_steps"] = withinBounds grade.Details["total_steps"] = totalSteps return grade } // ── Task 3: Full Demand-Response with Batch Scheduling ────────────────────── func gradeTask3(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade { // Reuse task 2 for cost + temperature scores grade = gradeTask2(inp, grade) costScore := grade.SubScores["cost"] tempScore := grade.SubScores["temperature"] // Grid response sub-score // Count steps where stress > 0.7 and shed_fraction > 0.15 gridStressSteps := 0 gridResponseSteps := 0 for _, entry := range inp.Replay { if entry.Observation.GridStressSignal > 0.7 { gridStressSteps++ if entry.Action.LoadShedFraction > 0.15 { gridResponseSteps++ } } } gridScore := 0.5 // default neutral if no stress events if gridStressSteps > 0 { gridScore = float64(gridResponseSteps) / float64(gridStressSteps) } // Batch deadline sub-score totalJobs := 0 completedOnTime := 0 for _, b := range inp.Buildings { for _, job := range b.Jobs { totalJobs++ if job.Completed && !job.MissedDeadline { completedOnTime++ } } } batchScore := 0.0 if totalJobs > 0 { batchScore = float64(completedOnTime) / float64(totalJobs) } // Carbon sub-score vs baseline always-on policy (same spirit as cost) agentCarbon := 0.0 baselineCarbon := 0.0 for _, b := range inp.Buildings { agentCarbon += b.CumulativeCarbon baselineCarbon += b.BaselineCarbon } carbonScore := 0.0 if baselineCarbon > 0 { carbonScore = math.Max(0, 1.0-agentCarbon/baselineCarbon) } // Clamp all sub-scores after rounding grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000) grade.SubScores["temperature"] = clampOpenInterval(math.Round(tempScore*10000) / 10000) grade.SubScores["grid_response"] = clampOpenInterval(math.Round(gridScore*10000) / 10000) grade.SubScores["batch_deadline"] = clampOpenInterval(math.Round(batchScore*10000) / 10000) grade.SubScores["carbon"] = clampOpenInterval(math.Round(math.Min(1.0, carbonScore)*10000) / 10000) finalScore := costScore*0.28 + tempScore*0.20 + gridScore*0.20 + batchScore*0.12 + carbonScore*0.20 grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000) grade.Details["grid_stress_steps"] = gridStressSteps grade.Details["grid_response_steps"] = gridResponseSteps grade.Details["total_jobs"] = totalJobs grade.Details["completed_on_time"] = completedOnTime grade.Details["agent_carbon"] = agentCarbon grade.Details["baseline_carbon"] = baselineCarbon return grade } // ── Task 4: Instruction-Following Operator ─────────────────────────────────── // gradeTask4 evaluates how well the agent satisfied the natural-language // instruction card issued at reset. It reads the InstructionCard from Building 0, // checks each target that appears in the card, and computes a weighted score. // Falls back to Task 3 grading when no instruction card is available. func gradeTask4(inp GradeEpisodeInput, grade EpisodeGrade) EpisodeGrade { // Require an instruction card — passed from the environment at grade time if inp.InstructionCard == nil { // Fallback: grade as Task 3 (no card to evaluate) return gradeTask3(inp, grade) } card := inp.InstructionCard weights := card.Weights targets := card.Targets // Always compute base sub-scores — reuse existing graders base := gradeTask3(inp, EpisodeGrade{ TaskID: inp.TaskID, SubScores: map[string]float64{}, Details: map[string]interface{}{}, }) costScore := base.SubScores["cost"] tempScore := base.SubScores["temperature"] gridScore := base.SubScores["grid_response"] carbonScore := base.SubScores["carbon"] // ── Card-specific KPI checks ───────────────────────────────────────────── // KPI 1: Cost cap — did the agent stay under max_cost? taskCompletionScore := 0.5 // default partial credit if maxCost, ok := targets["max_cost"]; ok && maxCost > 0 { agentCost := 0.0 for _, b := range inp.Buildings { agentCost += b.CumulativeCost } if agentCost <= maxCost { taskCompletionScore = 1.0 } else { // Partial credit: how close were they? taskCompletionScore = math.Max(0, 1.0-(agentCost-maxCost)/maxCost) } grade.Details["target_max_cost"] = maxCost grade.Details["actual_cost"] = agentCost } // KPI 2: Temperature bounds — never violated t_min / t_max if tMin, hasTMin := targets["t_min"]; hasTMin { tMax, hasTMax := targets["t_max"] if hasTMax { totalSteps := 0 withinBounds := 0 for _, history := range inp.TempHistory { for _, temp := range history { totalSteps++ if temp >= tMin && temp <= tMax { withinBounds++ } } } if totalSteps > 0 { adherence := float64(withinBounds) / float64(totalSteps) // Strict: full credit only if ALWAYS within bounds taskCompletionScore = adherence } grade.Details["target_t_min"] = tMin grade.Details["target_t_max"] = tMax } } // KPI 3: Grid response SLA — shed >= min_shed_fraction when stress > 0.7 if minShed, ok := targets["min_shed_fraction"]; ok { stressSteps := 0 compliantSteps := 0 for _, entry := range inp.Replay { if entry.Observation.GridStressSignal > 0.7 { stressSteps++ if entry.Action.LoadShedFraction >= minShed { compliantSteps++ } } } if stressSteps > 0 { taskCompletionScore = float64(compliantSteps) / float64(stressSteps) } grade.Details["target_min_shed"] = minShed grade.Details["stress_steps"] = stressSteps grade.Details["compliant_steps"] = compliantSteps } // KPI 4: Carbon reduction — did agent beat baseline by carbon_reduction target? if carbonTarget, ok := targets["carbon_reduction"]; ok { agentCarbon := 0.0 baselineCarbon := 0.0 for _, b := range inp.Buildings { agentCarbon += b.CumulativeCarbon baselineCarbon += b.BaselineCarbon } if baselineCarbon > 0 { actualReduction := 1.0 - agentCarbon/baselineCarbon if actualReduction >= carbonTarget { taskCompletionScore = 1.0 } else { taskCompletionScore = math.Max(0, actualReduction/carbonTarget) } } grade.Details["target_carbon_reduction"] = carbonTarget } // ── Weighted final score ───────────────────────────────────────────────── // Use weights from the card; fall back to Task 4 defaults if missing wTask := getWeight(weights, "task_completion", 0.50) wCost := getWeight(weights, "cost", 0.20) wTemp := getWeight(weights, "temperature", 0.20) wGrid := getWeight(weights, "grid_response", 0.05) wCarbon := getWeight(weights, "carbon", 0.05) finalScore := taskCompletionScore*wTask + costScore*wCost + tempScore*wTemp + gridScore*wGrid + carbonScore*wCarbon grade.SubScores["task_completion"] = clampOpenInterval(math.Round(taskCompletionScore*10000) / 10000) grade.SubScores["cost"] = clampOpenInterval(math.Round(costScore*10000) / 10000) grade.SubScores["temperature"] = clampOpenInterval(math.Round(tempScore*10000) / 10000) grade.SubScores["grid_response"] = clampOpenInterval(math.Round(gridScore*10000) / 10000) grade.SubScores["carbon"] = clampOpenInterval(math.Round(carbonScore*10000) / 10000) grade.Score = clampOpenInterval(math.Round(finalScore*10000) / 10000) grade.Details["instruction_card_text"] = card.Text return grade } // getWeight safely retrieves a weight from a map, returning defaultVal if missing. func getWeight(weights map[string]float64, key string, defaultVal float64) float64 { if v, ok := weights[key]; ok { return v } return defaultVal }