Commit ·
b054ef7
1
Parent(s): 926806a
feat: add example environment variables, update README, and enhance inference script for better error handling
Browse files- .env.example +21 -0
- README.md +367 -149
- baseline_scores.json +20 -20
- inference.py +32 -0
- python/inference.py +84 -40
.env.example
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example environment variables for GridMind-RL
|
| 2 |
+
# Copy this to .env and fill in real keys for local testing.
|
| 3 |
+
|
| 4 |
+
# Mandatory hackathon secret (set this in HF Space secrets too)
|
| 5 |
+
HF_TOKEN=your_provider_api_key_here
|
| 6 |
+
|
| 7 |
+
# OpenAI-compatible endpoint (default: OpenRouter free-tier)
|
| 8 |
+
API_BASE_URL=https://openrouter.ai/api/v1
|
| 9 |
+
|
| 10 |
+
# Model to use (change to smaller model if you need lower latency/cost)
|
| 11 |
+
MODEL_NAME=your_chosen_model_name_here
|
| 12 |
+
|
| 13 |
+
# Optional: provider-specific API key fallback for development
|
| 14 |
+
OPENAI_API_KEY=your_api_key_here
|
| 15 |
+
|
| 16 |
+
# Environment server URL (local Docker)
|
| 17 |
+
ENV_URL=http://localhost:7860
|
| 18 |
+
|
| 19 |
+
# Inference script flags
|
| 20 |
+
# --fast-mode : run heuristic (no LLM calls) for deterministic, instant runs
|
| 21 |
+
# --episodes N : number of episodes per task
|
README.md
CHANGED
|
@@ -1,165 +1,241 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
**A real-world RL environment for building energy
|
| 4 |
|
| 5 |
-
Built on the [OpenEnv](https://github.com/meta-pytorch/OpenEnv) specification. Containerized. Ready for Hugging Face Spaces.
|
| 6 |
|
| 7 |
---
|
| 8 |
|
| 9 |
-
##
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
- **Comfort** — keep indoor temperature within comfortable bounds
|
| 15 |
-
- **Grid compliance** — shed load when the grid signals demand-response events
|
| 16 |
-
- **Scheduling** — complete batch processing jobs before their deadlines
|
| 17 |
-
- **Carbon** — minimize carbon emissions by timing consumption to clean-grid periods
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
---
|
| 22 |
|
| 23 |
-
##
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
| Field | Type | Range | Description |
|
| 28 |
-
|-------|------|-------|-------------|
|
| 29 |
-
| `indoor_temperature` | float | 10–40 °C | Current building temperature |
|
| 30 |
-
| `thermal_storage_level` | float | 0.0–1.0 | Thermal tank
|
| 31 |
-
| `process_demand` | float | ≥ 0 kW | Current industrial power
|
| 32 |
-
| `current_price` | float | > 0 $/kWh | Real-time electricity price |
|
| 33 |
-
| `grid_stress_signal` | float | 0.0–1.0 | Utility demand-response urgency (
|
| 34 |
-
| `carbon_intensity` | float | ≥ 0 gCO₂/kWh |
|
| 35 |
-
| `hour_of_day` | int | 0–23 |
|
| 36 |
-
| `batch_queue` | int
|
| 37 |
-
| `cumulative_cost` | float | ≥ 0 $ |
|
| 38 |
-
| `step` | int | 0–95 | Current timestep (96
|
| 39 |
-
| `building_id` | int | 0+ | Building
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
| 58 |
-
|
|
| 59 |
-
|
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
---
|
| 68 |
|
| 69 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
| 78 |
|
| 79 |
-
**
|
|
|
|
|
|
|
| 80 |
|
| 81 |
---
|
| 82 |
|
| 83 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
### Prerequisites
|
| 86 |
|
| 87 |
-
- **Docker** — [
|
| 88 |
- **Python 3.9+** — [Download Python](https://www.python.org/downloads/)
|
| 89 |
- **Git** — [Download Git](https://git-scm.com/downloads)
|
| 90 |
|
| 91 |
-
###
|
|
|
|
|
|
|
| 92 |
|
| 93 |
```bash
|
| 94 |
-
git clone https://github.com/LO-Kyu/gridmind.git
|
| 95 |
-
cd gridmind
|
| 96 |
```
|
| 97 |
|
| 98 |
-
###
|
| 99 |
|
| 100 |
```bash
|
| 101 |
docker build -t gridmind-rl .
|
| 102 |
docker run --rm -d -p 7860:7860 -p 7861:7861 --name gridmind gridmind-rl
|
| 103 |
```
|
| 104 |
|
| 105 |
-
|
| 106 |
|
| 107 |
```bash
|
| 108 |
-
#
|
| 109 |
curl http://localhost:7860/health
|
| 110 |
-
|
| 111 |
-
# Windows (PowerShell)
|
| 112 |
-
Invoke-RestMethod -Uri http://localhost:7860/health
|
| 113 |
```
|
| 114 |
|
| 115 |
-
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
Open a **new terminal** (keep Docker running) and install:
|
| 120 |
|
| 121 |
```bash
|
| 122 |
pip install -r python/requirements.txt
|
| 123 |
```
|
| 124 |
|
| 125 |
-
###
|
| 126 |
-
|
| 127 |
-
The inference script uses an LLM to make decisions. You need a **free** API key:
|
| 128 |
-
|
| 129 |
-
1. Go to [openrouter.ai/keys](https://openrouter.ai/keys)
|
| 130 |
-
2. Sign in with Google or GitHub (free)
|
| 131 |
-
3. Click **"Create Key"** and copy it
|
| 132 |
-
|
| 133 |
-
### Step 5: Configure Your API Key
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
-
```
|
| 138 |
-
|
| 139 |
-
MODEL_NAME=meta-llama/llama-3.1-8b-instruct:free
|
| 140 |
-
OPENAI_API_KEY=sk-or-v1-paste-your-actual-key-here
|
| 141 |
-
ENV_URL=http://localhost:7860
|
| 142 |
```
|
| 143 |
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
-
|
| 147 |
|
| 148 |
-
|
| 149 |
-
# Run LLM agent on all 3 tasks
|
| 150 |
-
python inference.py --episodes 1
|
| 151 |
|
| 152 |
-
|
| 153 |
-
python inference.py --fast-mode --episodes 1
|
| 154 |
-
```
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
###
|
| 163 |
|
| 164 |
```bash
|
| 165 |
docker stop gridmind
|
|
@@ -167,56 +243,149 @@ docker stop gridmind
|
|
| 167 |
|
| 168 |
---
|
| 169 |
|
| 170 |
-
##
|
| 171 |
|
| 172 |
-
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|------|:----------:|:-----:|---------|
|
| 176 |
-
| 1 — Cost Minimization | 🟢 Easy | **0.7063** | cost: 0.706 |
|
| 177 |
-
| 2 — Temperature Management | 🟡 Medium | **0.6333** | cost: 0.701, temperature: 0.531 |
|
| 178 |
-
| 3 — Full Demand Response | 🔴 Hard | **0.5966** | cost: 0.670, temp: 0.573, grid: 0.214, batch: 1.000, carbon: 0.657 |
|
| 179 |
-
| **Overall Average** | | **0.6454** | |
|
| 180 |
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
---
|
| 184 |
|
| 185 |
-
##
|
| 186 |
|
| 187 |
-
Base URL: `http://localhost:7860`
|
| 188 |
|
| 189 |
-
|
|
| 190 |
-
|--------|------|---------|
|
| 191 |
-
| `
|
| 192 |
-
| `
|
| 193 |
-
| `
|
| 194 |
-
| `
|
| 195 |
-
| `
|
| 196 |
-
| `
|
| 197 |
-
| `
|
| 198 |
-
| `
|
| 199 |
-
| `
|
| 200 |
|
| 201 |
-
|
| 202 |
|
| 203 |
```bash
|
| 204 |
-
# Reset to Task 1
|
| 205 |
curl -X POST http://localhost:7860/reset \
|
| 206 |
-H "Content-Type: application/json" \
|
| 207 |
-d '{"task_id": 1, "seed": 42}'
|
| 208 |
|
| 209 |
-
#
|
|
|
|
|
|
|
|
|
|
| 210 |
curl -X POST http://localhost:7860/step \
|
| 211 |
-H "Content-Type: application/json" \
|
| 212 |
-
-d '{
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
curl http://localhost:7860/grade
|
| 216 |
```
|
| 217 |
|
| 218 |
---
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
## 🏗️ Architecture
|
| 221 |
|
| 222 |
```
|
|
@@ -385,21 +554,70 @@ Each episode emits structured markers for automated evaluation:
|
|
| 385 |
|
| 386 |
---
|
| 387 |
|
| 388 |
-
##
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
|
| 393 |
-
|
|
| 394 |
-
|
|
| 395 |
-
|
|
| 396 |
-
|
|
| 397 |
-
|
|
| 398 |
-
|
|
| 399 |
-
|
|
| 400 |
-
|
|
| 401 |
-
|
|
| 402 |
-
| Exploit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
---
|
| 405 |
|
|
|
|
| 1 |
+
# 🏢 GridMind-RL — Energy Management Reinforcement Learning Environment
|
| 2 |
|
| 3 |
+
**A real-world RL environment for intelligent building energy optimization.** Control HVAC systems, thermal storage, batch job scheduling, and demand-response under stochastic electricity prices and grid stress events.
|
| 4 |
|
| 5 |
+
Built on the [OpenEnv](https://github.com/meta-pytorch/OpenEnv) specification. Containerized. Ready for Hugging Face Spaces deployment.
|
| 6 |
|
| 7 |
---
|
| 8 |
|
| 9 |
+
## 📖 Overview & Motivation
|
| 10 |
|
| 11 |
+
Building energy management is a **real-world optimization problem** facing utilities, facility operators, and industrial sites globally. Traditional rule-based controls waste billions in energy costs and miss opportunities for grid participation.
|
| 12 |
|
| 13 |
+
**GridMind-RL** simulates decisions that facility operators must make daily:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
- **Cost Optimization** — Buy electricity when prices are low, avoid peak surcharges
|
| 16 |
+
- **Comfort & Safety** — Maintain indoor temperature within acceptable ranges while managing thermal inertia
|
| 17 |
+
- **Grid Participation** — Respond to demand-response signals and grid stress events
|
| 18 |
+
- **Batch Scheduling** — Coordinate industrial process timings to meet deadlines and minimize energy cost
|
| 19 |
+
- **Carbon Minimization** — Shift consumption to periods when grid carbon intensity is low
|
| 20 |
+
|
| 21 |
+
**Why this matters:** An RL agent trained in this environment can learn strategies that would be difficult or impossible for humans to hand-craft. The combination of continuous control (HVAC power, thermal storage), discrete decisions (batch scheduling), and multiple simultaneous objectives (cost, comfort, grid, deadlines, carbon) creates a realistic, challenging benchmark.
|
| 22 |
+
|
| 23 |
+
**Episode Length:** 96 steps = 24 hours at 15-minute resolution. A complete episode requires strategic decision-making across a full day-night cycle.
|
| 24 |
|
| 25 |
---
|
| 26 |
|
| 27 |
+
## 📊 Observation Space
|
| 28 |
+
|
| 29 |
+
At each timestep, the environment provides the following observations. **Episode length: 96 steps** (15-minute intervals = 24 hours).
|
| 30 |
+
|
| 31 |
+
| Field | Data Type | Range / Values | Description |
|
| 32 |
+
|-------|-----------|-----------------|-------------|
|
| 33 |
+
| `indoor_temperature` | float | 10–40 °C | Current building interior temperature |
|
| 34 |
+
| `thermal_storage_level` | float | 0.0–1.0 | Thermal tank charge state (0 = empty, 1 = full) |
|
| 35 |
+
| `process_demand` | float | ≥ 0 kW | Current industrial batch process power draw |
|
| 36 |
+
| `current_price` | float | > 0 $/kWh | Real-time spot electricity price |
|
| 37 |
+
| `grid_stress_signal` | float | 0.0–1.0 | Utility demand-response urgency (0.7+ = critical) |
|
| 38 |
+
| `carbon_intensity` | float | ≥ 0 gCO₂/kWh | Current grid carbon intensity |
|
| 39 |
+
| `hour_of_day` | int | 0–23 | Time-of-day context |
|
| 40 |
+
| `batch_queue` | int array | — | Pending batch jobs with deadline slots |
|
| 41 |
+
| `cumulative_cost` | float | ≥ 0 $ | Energy cost accumulated in current episode so far |
|
| 42 |
+
| `step` | int | 0–95 | Current timestep (96 total = 24 hours) |
|
| 43 |
+
| `building_id` | int | 0+ | Building identifier (for multi-building scenarios) |
|
| 44 |
+
|
| 45 |
+
**Observation Properties:**
|
| 46 |
+
- Observations are **deterministic** given the seed — same seed produces identical sequences
|
| 47 |
+
- All fields are **normalized or bounded** for stable learning
|
| 48 |
+
- Prices follow realistic time-of-use patterns; carbon intensity varies with grid mix
|
| 49 |
+
- Batch queue starts empty; jobs appear stochastically based on the task/seed
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 🎮 Action Space
|
| 54 |
+
|
| 55 |
+
At each step, the agent sends an action controlling four independent subsystems:
|
| 56 |
+
|
| 57 |
+
| Field | Data Type | Range | Description |
|
| 58 |
+
|-------|-----------|-------|-------------|
|
| 59 |
+
| `hvac_power_level` | float | 0.0–1.0 | HVAC system power (0 = off, 1 = full) |
|
| 60 |
+
| `thermal_charge_rate` | float | -1.0–1.0 | Thermal storage control (+charge, -discharge) |
|
| 61 |
+
| `batch_job_slot` | int | 0–4 | Schedule next batch job: 0=immediate, 1–4=defer |
|
| 62 |
+
| `load_shed_fraction` | float | 0.0–0.5 | Non-critical load reduction (0–50%) for demand-response |
|
| 63 |
+
| `building_id` | int | 0+ | Building identifier (routing) |
|
| 64 |
+
|
| 65 |
+
**Action Space Properties:**
|
| 66 |
+
- **Continuous** (HVAC, thermal charging, load shedding) + **discrete** (batch scheduling) → hybrid control
|
| 67 |
+
- Actions are applied every 15-minute step
|
| 68 |
+
- Load shedding is capped at 50% to ensure safety/habitability
|
| 69 |
+
- Batch scheduling decisions affect energy cost and deadline compliance
|
| 70 |
|
| 71 |
---
|
| 72 |
|
| 73 |
+
## 💡 Reward Function
|
| 74 |
+
|
| 75 |
+
The environment provides **dense rewards every step** (not sparse, not binary). Each step returns:
|
| 76 |
+
- A scalar reward (sum of components)
|
| 77 |
+
- A dictionary of 7 weighted sub-components for transparency
|
| 78 |
|
| 79 |
+
| Component | Purpose | Possible Values |
|
| 80 |
+
|-----------|---------|-----------------|
|
| 81 |
+
| **cost_savings** | Minimize energy bill | Negative (cost increases) to positive (savings vs baseline) |
|
| 82 |
+
| **temp_constraint** | Maintain comfort | Gaussian bonus near 21°C, penalty outside 19–23°C bounds |
|
| 83 |
+
| **grid_response** | Shift load during stress | Bonus proportional to shed fraction when grid signal > 0.7 |
|
| 84 |
+
| **efficiency_bonus** | Exploit thermal storage | Reward charge/discharge timing and thermal arbitrage |
|
| 85 |
+
| **stability_penalty** | Smooth control | Small penalty for rapid oscillations in HVAC/storage |
|
| 86 |
+
| **deadline_penalty** | Meet job deadlines | Large penalty if batch job finishes after deadline |
|
| 87 |
+
| **carbon_reward** | Low-carbon consumption | Bonus for consuming during low-carbon grid periods |
|
| 88 |
|
| 89 |
+
**Example Reward Calculation:**
|
| 90 |
+
If an agent takes a well-timed action during high-price, high-stress period:
|
| 91 |
+
- Large positive `cost_savings` (avoided expensive hour)
|
| 92 |
+
- Positive `grid_response` (shed load successfully)
|
| 93 |
+
- Possible positive `carbon_reward` (if grid is clean)
|
| 94 |
+
- **Total step reward** = weighted sum of all components
|
| 95 |
|
| 96 |
+
This multi-objective reward structure encourages **learning tradeoffs** between cost, comfort, grid support, and carbon efficiency.
|
| 97 |
+
|
| 98 |
+
---
|
| 99 |
|
| 100 |
---
|
| 101 |
|
| 102 |
+
## 📋 Tasks & Difficulty Levels
|
| 103 |
+
|
| 104 |
+
Three independent tasks with **deterministic programmatic graders**. Scores range **0.0–1.0**; higher is better.
|
| 105 |
+
|
| 106 |
+
### Task 1 — Cost Minimization (🟢 Easy)
|
| 107 |
+
|
| 108 |
+
**Objective:** Minimize total energy cost in 24 hours with no other constraints.
|
| 109 |
+
|
| 110 |
+
**Difficulty Rationale:** Only one objective (cost) to optimize; temperature and grid constraints are relaxed.
|
| 111 |
+
|
| 112 |
+
**Grader Metrics:**
|
| 113 |
+
- **Cost score (100%)** — Compares total episode energy cost to a deterministic baseline. Higher savings → higher score.
|
| 114 |
+
|
| 115 |
+
**Baseline Score:** **0.7063**
|
| 116 |
+
|
| 117 |
+
---
|
| 118 |
+
|
| 119 |
+
### Task 2 — Constrained Temperature Control (🟡 Medium)
|
| 120 |
+
|
| 121 |
+
**Objective:** Minimize cost while maintaining indoor temperature between **19–23°C** throughout the episode.
|
| 122 |
+
|
| 123 |
+
**Difficulty Rationale:** Introduces a hard constraint (temperature bounds). Agent must use thermal storage strategically to meet both cost and comfort goals.
|
| 124 |
+
|
| 125 |
+
**Grader Metrics:**
|
| 126 |
+
- **Cost score (60%)** — Total energy cost vs baseline
|
| 127 |
+
- **Temperature score (40%)** — Fraction of steps within bounds (hard penalty for violations)
|
| 128 |
+
|
| 129 |
+
**Notes:** A naive agent might achieve low cost by disabling HVAC, but then temperatures drift out of bounds (0 score). Trade-off learning is required.
|
| 130 |
+
|
| 131 |
+
**Baseline Score:** **0.6333**
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
### Task 3 — Full Demand Response (🔴 Hard)
|
| 136 |
+
|
| 137 |
+
**Objective:** Minimize cost, maintain temperature, respond to grid events, complete batch jobs on time, and minimize carbon emissions. This is a **multi-objective constraint satisfaction** problem.
|
| 138 |
+
|
| 139 |
+
**Difficulty Rationale:** Most realistic. Agent must balance five competing objectives simultaneously; any single failure is costly.
|
| 140 |
+
|
| 141 |
+
**Grader Metrics:**
|
| 142 |
+
- **Cost score (28%)** — Energy cost
|
| 143 |
+
- **Temperature score (20%)** — Time within comfort bounds
|
| 144 |
+
- **Grid response score (20%)** — Load shed during demand-response events (signal > 0.7)
|
| 145 |
+
- **Batch deadline score (12%)** — Fraction of jobs completed before deadline
|
| 146 |
+
- **Carbon reward score (20%)** — Shift load to low-carbon periods
|
| 147 |
+
|
| 148 |
+
**Baseline Breakdown:**
|
| 149 |
+
- Cost: 0.670, Temperature: 0.573, Grid: 0.214, Batch: 1.000, Carbon: 0.657
|
| 150 |
+
- **Overall: 0.5966**
|
| 151 |
+
|
| 152 |
+
**Challenge:** Grid response score (~0.21) shows that the baseline heuristic rarely sheds load opportunistically. Learning agents should discover that quick load shedding during high-price, high-stress periods yields significant cost savings.
|
| 153 |
+
|
| 154 |
+
**Grader Determinism:** Same seed always produces identical evaluations. Episodes are seeded internally; reproducible batches of evaluations can be generated for benchmark comparisons.
|
| 155 |
+
|
| 156 |
+
---
|
| 157 |
+
|
| 158 |
+
## 🚀 Setup & Usage
|
| 159 |
|
| 160 |
### Prerequisites
|
| 161 |
|
| 162 |
+
- **Docker** — [Download Docker Desktop](https://www.docker.com/products/docker-desktop/)
|
| 163 |
- **Python 3.9+** — [Download Python](https://www.python.org/downloads/)
|
| 164 |
- **Git** — [Download Git](https://git-scm.com/downloads)
|
| 165 |
|
| 166 |
+
### Quick Start (5 minutes)
|
| 167 |
+
|
| 168 |
+
#### 1. Clone the Repository
|
| 169 |
|
| 170 |
```bash
|
| 171 |
+
git clone https://github.com/LO-Kyu/gridmind-rl.git
|
| 172 |
+
cd gridmind-rl
|
| 173 |
```
|
| 174 |
|
| 175 |
+
#### 2. Build and Start the Environment Server
|
| 176 |
|
| 177 |
```bash
|
| 178 |
docker build -t gridmind-rl .
|
| 179 |
docker run --rm -d -p 7860:7860 -p 7861:7861 --name gridmind gridmind-rl
|
| 180 |
```
|
| 181 |
|
| 182 |
+
Verify the server is running:
|
| 183 |
|
| 184 |
```bash
|
| 185 |
+
# Check health endpoint
|
| 186 |
curl http://localhost:7860/health
|
| 187 |
+
# Expected: {"status":"ok","version":"1.0.0"}
|
|
|
|
|
|
|
| 188 |
```
|
| 189 |
|
| 190 |
+
#### 3. Install Python Dependencies
|
| 191 |
|
| 192 |
+
Open a **new terminal** and install:
|
|
|
|
|
|
|
| 193 |
|
| 194 |
```bash
|
| 195 |
pip install -r python/requirements.txt
|
| 196 |
```
|
| 197 |
|
| 198 |
+
#### 4. Run Inference (No LLM — Fast)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
+
Run a fast, deterministic baseline using heuristic policy:
|
| 201 |
|
| 202 |
+
```bash
|
| 203 |
+
python inference.py --fast-mode --episodes 1
|
|
|
|
|
|
|
|
|
|
| 204 |
```
|
| 205 |
|
| 206 |
+
Expected output (sample):
|
| 207 |
+
```
|
| 208 |
+
[START] task=Cost_Minimization env=gridmind model=heuristic
|
| 209 |
+
[STEP1] step=1 action={...} reward=10.5 done=false
|
| 210 |
+
[STEP2] step=2 action={...} reward=12.3 done=false
|
| 211 |
+
...
|
| 212 |
+
[STEP96] step=96 action={...} reward=8.9 done=true
|
| 213 |
+
[END] success=true steps=96 rewards=[10.5, 12.3, ..., 8.9]
|
| 214 |
+
```
|
| 215 |
|
| 216 |
+
Results saved to: `baseline_scores.json`
|
| 217 |
|
| 218 |
+
#### 5. (Optional) Run with LLM
|
|
|
|
|
|
|
| 219 |
|
| 220 |
+
To use an LLM agent for decision-making:
|
|
|
|
|
|
|
| 221 |
|
| 222 |
+
1. Get a **free API key** from [openrouter.ai/keys](https://openrouter.ai/keys) (no credit card needed)
|
| 223 |
+
2. Create `.env` file (copy from `.env.example`):
|
| 224 |
+
```bash
|
| 225 |
+
cp .env.example .env
|
| 226 |
+
```
|
| 227 |
+
3. Edit `.env` and add your API key:
|
| 228 |
+
```env
|
| 229 |
+
HF_TOKEN=sk-or-v1-your-key-here
|
| 230 |
+
# or
|
| 231 |
+
OPENAI_API_KEY=sk-or-v1-your-key-here
|
| 232 |
+
```
|
| 233 |
+
4. Run with LLM:
|
| 234 |
+
```bash
|
| 235 |
+
python inference.py --episodes 1
|
| 236 |
+
```
|
| 237 |
|
| 238 |
+
#### 6. Stop the Server (When Done)
|
| 239 |
|
| 240 |
```bash
|
| 241 |
docker stop gridmind
|
|
|
|
| 243 |
|
| 244 |
---
|
| 245 |
|
| 246 |
+
### Inference Script Reference
|
| 247 |
|
| 248 |
+
The `inference.py` script (project root) is the **hackathon submission entrypoint**.
|
| 249 |
|
| 250 |
+
**Environment Variables:**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
+
| Variable | Default | Description |
|
| 253 |
+
|----------|---------|-------------|
|
| 254 |
+
| `HF_TOKEN` | (required for submission) | API key for LLM provider or HF Spaces |
|
| 255 |
+
| `OPENAI_API_KEY` | (optional fallback) | Alternative OpenAI-compatible key |
|
| 256 |
+
| `API_BASE_URL` | `https://openrouter.ai/api/v1` | LLM endpoint URL |
|
| 257 |
+
| `MODEL_NAME` | `meta-llama/llama-3.3-70b-instruct:free` | Model identifier |
|
| 258 |
+
| `ENV_URL` | `http://localhost:7860` | Environment server address |
|
| 259 |
+
|
| 260 |
+
**Command-Line Flags:**
|
| 261 |
+
|
| 262 |
+
| Flag | Default | Description |
|
| 263 |
+
|------|---------|-------------|
|
| 264 |
+
| `--episodes N` | 1 | Episodes per task (runs tasks 1, 2, 3 in sequence) |
|
| 265 |
+
| `--fast-mode` | off | Don't call LLM; use heuristic policy only (reproducible, no API calls) |
|
| 266 |
+
| `--llm-every N` | 4 | Reuse each LLM decision for N steps (reduces API calls) |
|
| 267 |
+
| `--max-steps N` | 96 | Stop episode early after N steps |
|
| 268 |
+
| `--env-url URL` | from env var | Override environment server URL |
|
| 269 |
+
| `--output FILE` | `baseline_scores.json` | Output results filename |
|
| 270 |
+
| `--verbose` | off | Print detailed logs for each step |
|
| 271 |
+
|
| 272 |
+
**Examples:**
|
| 273 |
+
|
| 274 |
+
```bash
|
| 275 |
+
# Run all 3 tasks with LLM (1 episode each)
|
| 276 |
+
python inference.py --episodes 1
|
| 277 |
+
|
| 278 |
+
# Reproduce baseline fast (no LLM)
|
| 279 |
+
python inference.py --fast-mode --episodes 1
|
| 280 |
+
|
| 281 |
+
# Heuristic run with verbose step-by-step output
|
| 282 |
+
python inference.py --fast-mode --episodes 1 --verbose
|
| 283 |
+
|
| 284 |
+
# Run 5 episodes per task with custom environment
|
| 285 |
+
python inference.py --episodes 5 --env-url http://my-server:7860
|
| 286 |
+
```
|
| 287 |
|
| 288 |
---
|
| 289 |
|
| 290 |
+
### HTTP API Reference
|
| 291 |
|
| 292 |
+
**Base URL:** `http://localhost:7860`
|
| 293 |
|
| 294 |
+
| Endpoint | Method | Purpose | Example Body |
|
| 295 |
+
|----------|--------|---------|---------------|
|
| 296 |
+
| `/health` | GET | Liveness check | — |
|
| 297 |
+
| `/ping` | GET | Lightweight ping | — |
|
| 298 |
+
| `/reset` | POST | Reset episode for a task | `{"task_id": 1, "seed": 42}` |
|
| 299 |
+
| `/step` | POST | Apply action, get next observation | `{"hvac_power_level": 0.5, "thermal_charge_rate": 0.1, ...}` |
|
| 300 |
+
| `/state` | GET | Current full state snapshot | — |
|
| 301 |
+
| `/grade` | GET | Episode score (0.0–1.0) with sub-scores | — |
|
| 302 |
+
| `/replay` | GET | Full step-by-step trajectory | — |
|
| 303 |
+
| `/tasks` | GET | Task definitions and grader weights | — |
|
| 304 |
+
| `/metrics` | GET | Prometheus-format metrics | — |
|
| 305 |
|
| 306 |
+
**Example Workflow:**
|
| 307 |
|
| 308 |
```bash
|
| 309 |
+
# 1. Reset to Task 1 with seed 42
|
| 310 |
curl -X POST http://localhost:7860/reset \
|
| 311 |
-H "Content-Type: application/json" \
|
| 312 |
-d '{"task_id": 1, "seed": 42}'
|
| 313 |
|
| 314 |
+
# 2. Get initial observation
|
| 315 |
+
curl http://localhost:7860/state
|
| 316 |
+
|
| 317 |
+
# 3. Take an action
|
| 318 |
curl -X POST http://localhost:7860/step \
|
| 319 |
-H "Content-Type: application/json" \
|
| 320 |
+
-d '{
|
| 321 |
+
"hvac_power_level": 0.5,
|
| 322 |
+
"thermal_charge_rate": 0.1,
|
| 323 |
+
"batch_job_slot": 1,
|
| 324 |
+
"load_shed_fraction": 0.0
|
| 325 |
+
}'
|
| 326 |
+
|
| 327 |
+
# 4. Check final score after episode completes
|
| 328 |
curl http://localhost:7860/grade
|
| 329 |
```
|
| 330 |
|
| 331 |
---
|
| 332 |
|
| 333 |
+
## 📊 Baseline Performance Scores
|
| 334 |
+
|
| 335 |
+
The baseline is a **heuristic policy** (rule-based, no LLM) representing a reasonable but non-optimized control strategy. Your RL agent should aim to exceed these scores.
|
| 336 |
+
|
| 337 |
+
**Baseline Run:** `python inference.py --fast-mode --episodes 1`
|
| 338 |
+
|
| 339 |
+
### Summary Scores
|
| 340 |
+
|
| 341 |
+
| Task | Difficulty | Score | Model |
|
| 342 |
+
|------|:----------:|:-----:|-------|
|
| 343 |
+
| Task 1 — Cost Minimization | 🟢 Easy | **0.7063** | Heuristic |
|
| 344 |
+
| Task 2 — Temperature Control | 🟡 Medium | **0.6333** | Heuristic |
|
| 345 |
+
| Task 3 — Full Demand Response | 🔴 Hard | **0.5966** | Heuristic |
|
| 346 |
+
| **Overall Average** | — | **0.6454** | Heuristic |
|
| 347 |
+
|
| 348 |
+
### Detailed Breakdown
|
| 349 |
+
|
| 350 |
+
#### Task 1 Results
|
| 351 |
+
- **Task:** Cost minimization (96 steps × 15 min = 24 hours)
|
| 352 |
+
- **Score:** 0.7063
|
| 353 |
+
- **Sub-score:** Cost = 0.706
|
| 354 |
+
- **Interpretation:** Heuristic achieves ~70% of optimal cost reduction vs baseline
|
| 355 |
+
|
| 356 |
+
#### Task 2 Results
|
| 357 |
+
- **Task:** Minimize cost while maintaining temperature 19–23°C
|
| 358 |
+
- **Score:** 0.6333
|
| 359 |
+
- **Sub-scores:**
|
| 360 |
+
- Cost: 0.701
|
| 361 |
+
- Temperature constraint: 0.531 (agent violated comfort bounds ~47% of the time)
|
| 362 |
+
- **Interpretation:** Temperature management is challenging for the heuristic. Tighter thermal control could improve this score significantly.
|
| 363 |
+
|
| 364 |
+
#### Task 3 Results (Most Interesting)
|
| 365 |
+
- **Task:** Multi-objective: cost, temperature, grid response, batch deadlines, carbon
|
| 366 |
+
- **Score:** 0.5966
|
| 367 |
+
- **Sub-scores:**
|
| 368 |
+
- Cost: 0.670
|
| 369 |
+
- Temperature: 0.573 (similar temperature control challenge as Task 2)
|
| 370 |
+
- **Grid response: 0.214** ← Heuristic rarely participates in demand-response
|
| 371 |
+
- Batch deadline: 1.000 (heuristic always completes jobs on time)
|
| 372 |
+
- Carbon: 0.657
|
| 373 |
+
|
| 374 |
+
**Key Insight:** The heuristic's low grid response score (0.21) suggests that learned agents have significant room for improvement by:
|
| 375 |
+
1. Recognizing high-price + high-stress periods
|
| 376 |
+
2. Proactively shedding load to reduce cost
|
| 377 |
+
3. Using thermal storage to recover comfort afterward
|
| 378 |
+
|
| 379 |
+
This multi-objective setting is where RL agents typically exceed heuristic baselines.
|
| 380 |
+
|
| 381 |
+
### Reproducibility & Evaluation
|
| 382 |
+
|
| 383 |
+
- **Deterministic:** Baseline scores are **deterministic** — same seed always produces identical actions and rewards
|
| 384 |
+
- **Seeding:** Each task uses a fixed base seed (1100, 1200, 1300) for reproducible evaluation
|
| 385 |
+
- **Your Submissions:** Your agent will be evaluated on the same seed distribution; compare your scores directly to baseline
|
| 386 |
+
|
| 387 |
+
---
|
| 388 |
+
|
| 389 |
## 🏗️ Architecture
|
| 390 |
|
| 391 |
```
|
|
|
|
| 554 |
|
| 555 |
---
|
| 556 |
|
| 557 |
+
## ✅ OpenEnv Specification Compliance
|
| 558 |
+
|
| 559 |
+
GridMind-RL fully implements the OpenEnv specification for standardized RL environments. All components are present and tested:
|
| 560 |
+
|
| 561 |
+
| Requirement | Status | Notes |
|
| 562 |
+
|-------------|:------:|-------|
|
| 563 |
+
| Manifest (`openenv.yaml`) | ✅ | All metadata, schema definitions, and version info |
|
| 564 |
+
| Observation Schema | ✅ | 11-field object: temperature, storage, price, grid signal, carbon, hour, batch queue, cost, step, building_id |
|
| 565 |
+
| Action Schema | ✅ | 5-field object: HVAC, thermal rate, batch slot, load shed, building_id |
|
| 566 |
+
| HTTP Endpoints | ✅ | `/reset`, `/step`, `/state`, `/grade`, `/replay`, `/tasks`, `/health`, `/metrics` |
|
| 567 |
+
| Determinism | ✅ | Seeded episode generation; identical seeds produce identical trajectories |
|
| 568 |
+
| Typed Models | ✅ | Pydantic models (Python) mirror Go structs exactly |
|
| 569 |
+
| Dense Rewards | ✅ | 7-component reward breakdown every step |
|
| 570 |
+
| Graders | ✅ | 3 tasks with programmatic, deterministic graders (0.0–1.0 range) |
|
| 571 |
+
| Exploit Detection | ✅ | Built into grading pipeline to flag unrealistic scores |
|
| 572 |
+
|
| 573 |
+
---
|
| 574 |
+
|
| 575 |
+
## ❓ FAQ
|
| 576 |
+
|
| 577 |
+
**Q: Can I use a different model?**
|
| 578 |
+
A: Yes. Set `MODEL_NAME` environment variable to any OpenAI-compatible model. The default (`meta-llama/llama-3.3-70b-instruct:free`) is free on OpenRouter with no credit card.
|
| 579 |
+
|
| 580 |
+
**Q: How do I avoid rate limiting?**
|
| 581 |
+
A: (1) Use `--fast-mode` for local testing (no API calls), (2) Set `--llm-every 4` to reuse decisions, (3) Use a paid API tier for submission, or (4) Train & submit an offline policy.
|
| 582 |
+
|
| 583 |
+
**Q: Will my API key be exposed in submissions?**
|
| 584 |
+
A: No. Store your API key in `.env` (git-ignored). On HF Spaces, set secrets via the Space settings UI; keys are never committed to the repo.
|
| 585 |
+
|
| 586 |
+
**Q: What's the difference between `HF_TOKEN` and `OPENAI_API_KEY`?**
|
| 587 |
+
A: `HF_TOKEN` is used in HF Space deployments and external evaluations. `OPENAI_API_KEY` is a fallback for local development. The code tries `HF_TOKEN` first, then `OPENAI_API_KEY`. At least one must be set.
|
| 588 |
+
|
| 589 |
+
**Q: Can I submit an offline/trained policy?**
|
| 590 |
+
A: Yes. Modify `python/inference.py` to use your trained agent instead of LLM calls. Ensure you still output the required `[START]`, `[STEP]`, `[END]` format.
|
| 591 |
+
|
| 592 |
+
**Q: What if my submission times out?**
|
| 593 |
+
A: Each episode is 96 steps. The environment runs 3 episodes (one per task). Optimize for latency: reduce LLM calls (use `--llm-every`), use a faster model, or submit a heuristic/trained offline policy.
|
| 594 |
+
|
| 595 |
+
---
|
| 596 |
+
|
| 597 |
+
## 🎯 Submission Checklist
|
| 598 |
+
|
| 599 |
+
Before submitting, verify:
|
| 600 |
+
|
| 601 |
+
- [ ] Clone repo, build Docker, run `docker run -p 7860:7860 -p 7861:7861 gridmind-rl`
|
| 602 |
+
- [ ] Run `python inference.py --fast-mode --episodes 1` locally — should produce `baseline_scores.json`
|
| 603 |
+
- [ ] Check `[START]`, `[STEP]`, `[END]` markers in stdout
|
| 604 |
+
- [ ] Set `HF_TOKEN` or `OPENAI_API_KEY` in `.env` for LLM runs
|
| 605 |
+
- [ ] Test with LLM: `python inference.py --episodes 1`
|
| 606 |
+
- [ ] Verify Dockerfile builds without errors: `docker build -t gridmind-rl .`
|
| 607 |
+
- [ ] Create HF Space (Docker SDK, CPU Basic)
|
| 608 |
+
- [ ] Push repo to HF Space: `git push hf main`
|
| 609 |
+
- [ ] Set secrets in HF Space UI: `HF_TOKEN`, `API_BASE_URL` (optional), `MODEL_NAME` (optional)
|
| 610 |
+
- [ ] Verify Space is running: `curl https://YOUR_USERNAME-gridmind-rl.hf.space/health`
|
| 611 |
+
- [ ] Submit Space URL to hackathon organizers
|
| 612 |
+
|
| 613 |
+
---
|
| 614 |
+
|
| 615 |
+
## 📚 Additional Resources
|
| 616 |
+
|
| 617 |
+
- **OpenEnv Spec:** https://github.com/meta-pytorch/OpenEnv
|
| 618 |
+
- **OpenRouter Free Models:** https://openrouter.ai/models
|
| 619 |
+
- **HF Spaces Docs:** https://huggingface.co/docs/hub/spaces
|
| 620 |
+
- **GridMind Repository:** https://github.com/LO-Kyu/gridmind-rl
|
| 621 |
|
| 622 |
---
|
| 623 |
|
baseline_scores.json
CHANGED
|
@@ -3,54 +3,54 @@
|
|
| 3 |
"api_base": "https://openrouter.ai/api/v1",
|
| 4 |
"episodes_per_task": 1,
|
| 5 |
"seed_base": 1000,
|
| 6 |
-
"fast_mode":
|
| 7 |
"llm_every": 4,
|
| 8 |
"max_steps": null,
|
| 9 |
"task_averages": {
|
| 10 |
-
"1": 0.
|
| 11 |
-
"2": 0.
|
| 12 |
-
"3": 0.
|
| 13 |
},
|
| 14 |
-
"overall_average": 0.
|
| 15 |
"all_results": [
|
| 16 |
{
|
| 17 |
"task_id": 1,
|
| 18 |
"seed": 1100,
|
| 19 |
-
"total_reward":
|
| 20 |
"total_steps": 96,
|
| 21 |
-
"elapsed_sec":
|
| 22 |
-
"score": 0.
|
| 23 |
"sub_scores": {
|
| 24 |
-
"cost": 0.
|
| 25 |
},
|
| 26 |
"exploit_detected": false
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"task_id": 2,
|
| 30 |
"seed": 1200,
|
| 31 |
-
"total_reward":
|
| 32 |
"total_steps": 96,
|
| 33 |
-
"elapsed_sec": 1.
|
| 34 |
-
"score": 0.
|
| 35 |
"sub_scores": {
|
| 36 |
-
"cost": 0.
|
| 37 |
-
"temperature": 0.
|
| 38 |
},
|
| 39 |
"exploit_detected": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": 3,
|
| 43 |
"seed": 1300,
|
| 44 |
-
"total_reward":
|
| 45 |
"total_steps": 96,
|
| 46 |
-
"elapsed_sec":
|
| 47 |
-
"score": 0.
|
| 48 |
"sub_scores": {
|
| 49 |
"batch_deadline": 1,
|
| 50 |
-
"carbon": 0.
|
| 51 |
-
"cost": 0.
|
| 52 |
"grid_response": 0.21428571428571427,
|
| 53 |
-
"temperature": 0.
|
| 54 |
},
|
| 55 |
"exploit_detected": false
|
| 56 |
}
|
|
|
|
| 3 |
"api_base": "https://openrouter.ai/api/v1",
|
| 4 |
"episodes_per_task": 1,
|
| 5 |
"seed_base": 1000,
|
| 6 |
+
"fast_mode": true,
|
| 7 |
"llm_every": 4,
|
| 8 |
"max_steps": null,
|
| 9 |
"task_averages": {
|
| 10 |
+
"1": 0.7063,
|
| 11 |
+
"2": 0.6333,
|
| 12 |
+
"3": 0.5966
|
| 13 |
},
|
| 14 |
+
"overall_average": 0.6454,
|
| 15 |
"all_results": [
|
| 16 |
{
|
| 17 |
"task_id": 1,
|
| 18 |
"seed": 1100,
|
| 19 |
+
"total_reward": 251.40178983938813,
|
| 20 |
"total_steps": 96,
|
| 21 |
+
"elapsed_sec": 1.8465147018432617,
|
| 22 |
+
"score": 0.7063,
|
| 23 |
"sub_scores": {
|
| 24 |
+
"cost": 0.7063441549865395
|
| 25 |
},
|
| 26 |
"exploit_detected": false
|
| 27 |
},
|
| 28 |
{
|
| 29 |
"task_id": 2,
|
| 30 |
"seed": 1200,
|
| 31 |
+
"total_reward": 246.40262234598185,
|
| 32 |
"total_steps": 96,
|
| 33 |
+
"elapsed_sec": 1.826324224472046,
|
| 34 |
+
"score": 0.6333,
|
| 35 |
"sub_scores": {
|
| 36 |
+
"cost": 0.7014155357169216,
|
| 37 |
+
"temperature": 0.53125
|
| 38 |
},
|
| 39 |
"exploit_detected": false
|
| 40 |
},
|
| 41 |
{
|
| 42 |
"task_id": 3,
|
| 43 |
"seed": 1300,
|
| 44 |
+
"total_reward": 255.60231973463087,
|
| 45 |
"total_steps": 96,
|
| 46 |
+
"elapsed_sec": 1.8300776481628418,
|
| 47 |
+
"score": 0.5966,
|
| 48 |
"sub_scores": {
|
| 49 |
"batch_deadline": 1,
|
| 50 |
+
"carbon": 0.6574530318382599,
|
| 51 |
+
"cost": 0.670084941969173,
|
| 52 |
"grid_response": 0.21428571428571427,
|
| 53 |
+
"temperature": 0.5729166666666666
|
| 54 |
},
|
| 55 |
"exploit_detected": false
|
| 56 |
}
|
inference.py
CHANGED
|
@@ -1,11 +1,43 @@
|
|
| 1 |
"""
|
| 2 |
Hackathon entrypoint: run from repo root with:
|
| 3 |
python inference.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
Delegates to python/inference.py (single source of truth).
|
| 5 |
"""
|
|
|
|
|
|
|
| 6 |
import runpy
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
impl = Path(__file__).resolve().parent / "python" / "inference.py"
|
| 11 |
runpy.run_path(str(impl), run_name="__main__")
|
|
|
|
| 1 |
"""
|
| 2 |
Hackathon entrypoint: run from repo root with:
|
| 3 |
python inference.py
|
| 4 |
+
|
| 5 |
+
Reads environment variables:
|
| 6 |
+
- API_BASE_URL (default: https://openrouter.ai/api/v1)
|
| 7 |
+
- MODEL_NAME (default: meta-llama/llama-3.3-70b-instruct:free)
|
| 8 |
+
- HF_TOKEN (mandatory, no default)
|
| 9 |
+
|
| 10 |
+
Emits hackathon-compliant stdout format:
|
| 11 |
+
[START] task=<name> env=gridmind model=<model>
|
| 12 |
+
[STEP] step=<n> action=<json> reward=<0.00> done=<true|false> error=<msg|null>
|
| 13 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...>
|
| 14 |
+
|
| 15 |
Delegates to python/inference.py (single source of truth).
|
| 16 |
"""
|
| 17 |
+
import os
|
| 18 |
+
import sys
|
| 19 |
import runpy
|
| 20 |
from pathlib import Path
|
| 21 |
|
| 22 |
if __name__ == "__main__":
|
| 23 |
+
# Load .env file FIRST (if present)
|
| 24 |
+
try:
|
| 25 |
+
from dotenv import load_dotenv
|
| 26 |
+
load_dotenv() # reads .env from current directory or project root
|
| 27 |
+
except ImportError:
|
| 28 |
+
pass # python-dotenv not installed — env vars must be set manually
|
| 29 |
+
|
| 30 |
+
# Now validate HF_TOKEN after .env is loaded
|
| 31 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 32 |
+
if not hf_token:
|
| 33 |
+
# Allow OPENAI_API_KEY as fallback for development
|
| 34 |
+
if not os.getenv("OPENAI_API_KEY"):
|
| 35 |
+
print(
|
| 36 |
+
"[ERROR] HF_TOKEN environment variable is required "
|
| 37 |
+
"(or OPENAI_API_KEY for development)",
|
| 38 |
+
file=sys.stderr
|
| 39 |
+
)
|
| 40 |
+
sys.exit(1)
|
| 41 |
+
|
| 42 |
impl = Path(__file__).resolve().parent / "python" / "inference.py"
|
| 43 |
runpy.run_path(str(impl), run_name="__main__")
|
python/inference.py
CHANGED
|
@@ -47,8 +47,9 @@ except ImportError:
|
|
| 47 |
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
|
| 48 |
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/llama-3.3-70b-instruct:free")
|
| 49 |
API_BASE_URL = os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1")
|
| 50 |
-
# Accept OPENAI_API_KEY
|
| 51 |
-
|
|
|
|
| 52 |
DEFAULT_EPISODES = 1
|
| 53 |
DEFAULT_SEED_BASE = 1000
|
| 54 |
MAX_RETRIES = 3
|
|
@@ -277,60 +278,103 @@ def run_episode(
|
|
| 277 |
max_steps: int | None,
|
| 278 |
verbose: bool = False,
|
| 279 |
) -> dict[str, Any]:
|
| 280 |
-
"""Run a single episode and
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
reset_resp = env_client.reset(task_id=task_id, seed=seed)
|
| 282 |
obs = reset_resp["observations"][0]
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
| 286 |
total_reward = 0.0
|
| 287 |
total_steps = 0
|
| 288 |
start_time = time.time()
|
| 289 |
step_resp: dict[str, Any] = {}
|
| 290 |
step_limit = EPISODE_STEPS if max_steps is None else min(max_steps, EPISODE_STEPS)
|
| 291 |
-
|
| 292 |
llm_reuse_remaining = 0
|
| 293 |
cached_action = agent._default_action()
|
| 294 |
-
|
|
|
|
|
|
|
|
|
|
| 295 |
while not step_resp.get("done", False):
|
| 296 |
if total_steps >= step_limit:
|
| 297 |
break
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
print(
|
| 322 |
-
f"
|
| 323 |
-
f"
|
| 324 |
-
|
| 325 |
-
f"cost=${obs['cumulative_cost']:.2f} "
|
| 326 |
-
f"reward={step_resp['reward']:.3f}",
|
| 327 |
-
flush=True,
|
| 328 |
)
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
elapsed = time.time() - start_time
|
| 331 |
grade = env_client.grade()
|
| 332 |
-
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
return {
|
| 335 |
"task_id": task_id,
|
| 336 |
"seed": seed,
|
|
|
|
| 47 |
# --- Configuration (environment-driven) ---------------------------------
# URL of the GridMind environment server (local Docker by default).
ENV_URL = os.getenv("ENV_URL", "http://localhost:7860")
# OpenAI-compatible model identifier used for action selection.
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/llama-3.3-70b-instruct:free")
# OpenAI-compatible chat endpoint (default: OpenRouter).
API_BASE_URL = os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1")
# Hackathon spec: HF_TOKEN is mandatory. Accept OPENAI_API_KEY as secondary fallback for dev.
# HF_TOKEN takes precedence when both are set, so the mandatory hackathon
# secret always wins over the development fallback.
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY", "")
# Episodes per task when --episodes is not supplied on the CLI.
DEFAULT_EPISODES = 1
# Base value from which per-task episode seeds are derived.
DEFAULT_SEED_BASE = 1000
# Retry budget for calls made elsewhere in this module (see call sites).
MAX_RETRIES = 3
|
|
|
|
| 278 |
max_steps: int | None,
|
| 279 |
verbose: bool = False,
|
| 280 |
) -> dict[str, Any]:
|
| 281 |
+
"""Run a single episode and emit hackathon-compliant stdout format.
|
| 282 |
+
|
| 283 |
+
Emits:
|
| 284 |
+
[START] task=<name> env=gridmind model=<model>
|
| 285 |
+
[STEP] step=<n> action=<json> reward=<0.00> done=<true|false> error=<msg|null>
|
| 286 |
+
...
|
| 287 |
+
[END] success=<true|false> steps=<n> rewards=<r1,r2,...>
|
| 288 |
+
"""
|
| 289 |
reset_resp = env_client.reset(task_id=task_id, seed=seed)
|
| 290 |
obs = reset_resp["observations"][0]
|
| 291 |
+
|
| 292 |
+
task_name = f"gridmind-task-{task_id}"
|
| 293 |
+
|
| 294 |
+
# Emit [START] with required fields
|
| 295 |
+
print(f"[START] task={task_name} env=gridmind model={MODEL_NAME}", flush=True)
|
| 296 |
+
|
| 297 |
total_reward = 0.0
|
| 298 |
total_steps = 0
|
| 299 |
start_time = time.time()
|
| 300 |
step_resp: dict[str, Any] = {}
|
| 301 |
step_limit = EPISODE_STEPS if max_steps is None else min(max_steps, EPISODE_STEPS)
|
| 302 |
+
|
| 303 |
llm_reuse_remaining = 0
|
| 304 |
cached_action = agent._default_action()
|
| 305 |
+
|
| 306 |
+
step_rewards: list[float] = []
|
| 307 |
+
last_error: str | None = None
|
| 308 |
+
|
| 309 |
while not step_resp.get("done", False):
|
| 310 |
if total_steps >= step_limit:
|
| 311 |
break
|
| 312 |
+
|
| 313 |
+
try:
|
| 314 |
+
if fast_mode:
|
| 315 |
+
action = agent._heuristic_action(obs)
|
| 316 |
+
else:
|
| 317 |
+
if llm_reuse_remaining <= 0:
|
| 318 |
+
cached_action = agent.choose_action(obs, task_id)
|
| 319 |
+
llm_reuse_remaining = max(1, llm_every)
|
| 320 |
+
action = cached_action
|
| 321 |
+
|
| 322 |
+
step_resp = env_client.step(action)
|
| 323 |
+
if step_resp is None or "observation" not in step_resp:
|
| 324 |
+
last_error = "invalid step response"
|
| 325 |
+
break
|
| 326 |
+
|
| 327 |
+
if not fast_mode:
|
| 328 |
+
llm_reuse_remaining -= 1
|
| 329 |
+
|
| 330 |
+
obs = step_resp["observation"]
|
| 331 |
+
reward = float(step_resp["reward"])
|
| 332 |
+
total_reward += reward
|
| 333 |
+
step_rewards.append(reward)
|
| 334 |
+
total_steps += 1
|
| 335 |
+
done = bool(step_resp.get("done", False))
|
| 336 |
+
|
| 337 |
+
# Emit [STEP] with required fields (action as compact JSON, reward to 2 decimals)
|
| 338 |
+
action_json = json.dumps(action, separators=(',', ':'))
|
| 339 |
+
error_field = "null" if last_error is None else f'"{last_error}"'
|
| 340 |
print(
|
| 341 |
+
f"[STEP] step={total_steps} action={action_json} "
|
| 342 |
+
f"reward={reward:.2f} done={'true' if done else 'false'} error={error_field}",
|
| 343 |
+
flush=True
|
|
|
|
|
|
|
|
|
|
| 344 |
)
|
| 345 |
+
|
| 346 |
+
last_error = None # Clear error after successful step
|
| 347 |
+
|
| 348 |
+
if verbose and total_steps % 16 == 0:
|
| 349 |
+
print(
|
| 350 |
+
f" step={total_steps:02d} price=${obs['current_price']:.3f} "
|
| 351 |
+
f"temp={obs['indoor_temperature']:.1f}°C "
|
| 352 |
+
f"stress={obs['grid_stress_signal']:.2f} "
|
| 353 |
+
f"cost=${obs['cumulative_cost']:.2f}",
|
| 354 |
+
flush=True,
|
| 355 |
+
)
|
| 356 |
+
|
| 357 |
+
except Exception as e:
|
| 358 |
+
last_error = str(e)
|
| 359 |
+
print(
|
| 360 |
+
f"[STEP] step={total_steps + 1} action=null "
|
| 361 |
+
f"reward=0.00 done=true error=\"{last_error}\"",
|
| 362 |
+
flush=True
|
| 363 |
+
)
|
| 364 |
+
break
|
| 365 |
+
|
| 366 |
elapsed = time.time() - start_time
|
| 367 |
grade = env_client.grade()
|
| 368 |
+
|
| 369 |
+
success = (total_steps > 0 and total_steps >= step_limit) or last_error is None
|
| 370 |
+
rewards_str = ",".join(f"{r:.2f}" for r in step_rewards)
|
| 371 |
+
|
| 372 |
+
# Emit [END] with required fields
|
| 373 |
+
print(
|
| 374 |
+
f"[END] success={'true' if success else 'false'} steps={total_steps} rewards={rewards_str}",
|
| 375 |
+
flush=True
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
return {
|
| 379 |
"task_id": task_id,
|
| 380 |
"seed": seed,
|