after transfer

#1
Files changed (3) hide show
  1. .gitattributes +10 -0
  2. .gitignore +0 -0
  3. scripts/train_unsloth.py +68 -37
.gitattributes ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ *.png filter=xet merge=xet
2
+ *.jpg filter=xet merge=xet
3
+ *.jpeg filter=xet merge=xet
4
+ *.gif filter=xet merge=xet
5
+ *.pdf filter=xet merge=xet
6
+ *.zip filter=xet merge=xet
7
+ *.pth filter=xet merge=xet
8
+ *.pt filter=xet merge=xet
9
+ *.ckpt filter=xet merge=xet
10
+ *.safetensors filter=xet merge=xet
.gitignore CHANGED
Binary files a/.gitignore and b/.gitignore differ
 
scripts/train_unsloth.py CHANGED
@@ -1,9 +1,11 @@
1
  #!/usr/bin/env python3
2
  """
3
  GridMind-RL Unsloth GRPO Training Script
4
- ----------------------------------------
5
- Fine-tunes Qwen2.5-0.5B-Instruct using Unsloth's 4-bit LoRA and TRL's GRPOTrainer.
6
  The environment rewards are gathered by hitting the OpenEnv HTTP server directly.
 
 
7
  """
8
 
9
  import argparse
@@ -13,16 +15,16 @@ import re
13
  import sys
14
  import requests
15
  import pandas as pd
 
 
16
  from datasets import Dataset
17
  from trl import GRPOTrainer, GRPOConfig
18
  from unsloth import FastLanguageModel
19
  from transformers import TrainerCallback
20
 
21
- # Ensure results directory exists
22
  os.makedirs("results", exist_ok=True)
23
 
24
- SYSTEM_PROMPT = """\
25
- You are an expert industrial building energy controller.
26
  Each turn you receive the current building state and must respond with
27
  ONLY a valid JSON action object.
28
 
@@ -31,24 +33,24 @@ Action format:
31
  "batch_job_slot": <0-4>, "load_shed_fraction": <0.0-0.5>, "building_id": 0}
32
 
33
  Strategy:
34
- - Charge storage when price < $0.08/kWh (positive thermal_charge_rate)
35
- - Discharge storage when price > $0.15/kWh (negative thermal_charge_rate)
36
- - Shed load 0.3-0.5 when grid_stress_signal > 0.7
37
- - Reduce HVAC during peak hours (8-12, 17-21)
38
- - Keep temperature between 19-23°C"""
39
 
40
- def make_prompt(i):
 
 
 
 
41
  return [{
42
- "role": "system", "content": SYSTEM_PROMPT
43
  }, {
44
  "role": "user",
45
- "content": f"Episode {i+1}: The building simulation is starting. "
46
- "You will receive the state each step. "
47
- "Output your first action as JSON now."
48
  }]
49
 
50
  def reward_valid_json(completions, **kwargs):
51
- """Reward 0.3 for any valid JSON output."""
52
  rewards = []
53
  for completion in completions:
54
  text = completion[0]["content"] if isinstance(completion, list) else completion
@@ -56,7 +58,7 @@ def reward_valid_json(completions, **kwargs):
56
  match = re.search(r'\{.*?\}', text, re.DOTALL)
57
  if match:
58
  json.loads(match.group())
59
- rewards.append(0.3)
60
  else:
61
  rewards.append(0.0)
62
  except Exception:
@@ -64,7 +66,7 @@ def reward_valid_json(completions, **kwargs):
64
  return rewards
65
 
66
  def reward_has_required_keys(completions, **kwargs):
67
- """Reward 0.3 if JSON has all 4 required action keys."""
68
  required = {"hvac_power_level", "thermal_charge_rate", "batch_job_slot", "load_shed_fraction"}
69
  rewards = []
70
  for completion in completions:
@@ -74,7 +76,7 @@ def reward_has_required_keys(completions, **kwargs):
74
  if match:
75
  action = json.loads(match.group())
76
  if required.issubset(action.keys()):
77
- rewards.append(0.3)
78
  else:
79
  rewards.append(0.1)
80
  else:
@@ -84,14 +86,17 @@ def reward_has_required_keys(completions, **kwargs):
84
  return rewards
85
 
86
  def get_reward_env_interaction(env_url):
87
- """Episode-level reward from /grade endpoint with seed variation.
88
-
89
- Uses 8-step rollouts with varied seeds to prevent mode collapse.
90
- The /grade endpoint returns the true episode score (0.0-1.0 clamped),
91
- which we use directly as the primary learning signal.
92
  """
 
 
93
  def reward_env_interaction(completions, **kwargs):
 
94
  rewards = []
 
95
  for i, completion in enumerate(completions):
96
  text = completion[0]["content"] if isinstance(completion, list) else completion
97
  try:
@@ -105,9 +110,9 @@ def get_reward_env_interaction(env_url):
105
  "building_id": 0
106
  }
107
 
108
- # Vary seed to prevent mode collapse — each rollout sees a different episode
109
- seed = 1000 + i
110
- task_id = (i % 3) + 1 # Cycle through tasks 1, 2, 3
111
 
112
  reset_resp = requests.post(
113
  f"{env_url}/reset",
@@ -118,7 +123,11 @@ def get_reward_env_interaction(env_url):
118
  rewards.append(0.0)
119
  continue
120
 
121
- for _ in range(8):
 
 
 
 
122
  step_resp = requests.post(
123
  f"{env_url}/step",
124
  json=[step_action],
@@ -130,10 +139,7 @@ def get_reward_env_interaction(env_url):
130
  grade_resp = requests.get(f"{env_url}/grade", timeout=30)
131
  if grade_resp.status_code == 200:
132
  episode_score = float(grade_resp.json().get("score", 0.5))
133
- # Normalize: heuristic baseline ≈ 0.5, zero-shot ≈ 0.65, trained ≈ 0.72
134
- # Map to 0.0-1.0 where 0.5 is the floor (heuristic), 0.72 is the ceiling (trained target)
135
- normalized = (episode_score - 0.4) / 0.32 # maps 0.4→0.0, 0.72→1.0
136
- rewards.append(max(0.0, min(1.0, normalized)))
137
  else:
138
  rewards.append(0.0)
139
 
@@ -143,6 +149,30 @@ def get_reward_env_interaction(env_url):
143
  return rewards
144
  return reward_env_interaction
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  class CSVLogCallback(TrainerCallback):
147
  """Custom callback to continuously log training metrics to a CSV file."""
148
  def __init__(self, output_path):
@@ -159,7 +189,7 @@ class CSVLogCallback(TrainerCallback):
159
  def main():
160
  parser = argparse.ArgumentParser(description="Train GridMind-RL agent with Unsloth GRPO")
161
  parser.add_argument("--env-url", type=str, default="http://localhost:7860", help="OpenEnv server URL")
162
- parser.add_argument("--model-name", type=str, default="unsloth/Qwen2.5-0.5B-Instruct", help="Base model")
163
  parser.add_argument("--prompts", type=int, default=300, help="Number of training prompts")
164
  parser.add_argument("--epochs", type=int, default=1, help="Training epochs")
165
  parser.add_argument("--max-steps", type=int, default=-1, help="Max steps (overrides epochs if > 0)")
@@ -169,7 +199,7 @@ def main():
169
 
170
  print(f"🚀 Loading model: {args.model_name}")
171
  max_seq_length = 512
172
- lora_rank = 8
173
 
174
  model, tokenizer = FastLanguageModel.from_pretrained(
175
  model_name=args.model_name,
@@ -199,16 +229,16 @@ def main():
199
  max_steps=args.max_steps,
200
  per_device_train_batch_size=1,
201
  gradient_accumulation_steps=4,
202
- num_generations=4, # GRPO group size
203
  max_prompt_length=256,
204
  max_completion_length=128,
205
- learning_rate=5e-6,
206
  lr_scheduler_type="cosine",
207
  warmup_ratio=0.1,
208
  logging_steps=5,
209
  save_steps=100,
210
  fp16=True,
211
- report_to="none", # We use our CSV callback instead
212
  seed=42,
213
  )
214
 
@@ -221,6 +251,7 @@ def main():
221
  reward_valid_json,
222
  reward_has_required_keys,
223
  get_reward_env_interaction(args.env_url),
 
224
  ],
225
  callbacks=[CSVLogCallback(args.output_csv)]
226
  )
 
1
  #!/usr/bin/env python3
2
  """
3
  GridMind-RL Unsloth GRPO Training Script
4
+ ----------------------------------------------
5
+ Fine-tunes Qwen2.5-1.5B-Instruct using Unsloth's 4-bit LoRA and TRL's GRPOTrainer.
6
  The environment rewards are gathered by hitting the OpenEnv HTTP server directly.
7
+
8
+ FIXED: Removed reward hacking (the /grade score is now used raw, without rescaling), added an entropy bonus and diverse seeds.
9
  """
10
 
11
  import argparse
 
15
  import sys
16
  import requests
17
  import pandas as pd
18
+ import random
19
+ from collections import Counter
20
  from datasets import Dataset
21
  from trl import GRPOTrainer, GRPOConfig
22
  from unsloth import FastLanguageModel
23
  from transformers import TrainerCallback
24
 
 
25
  os.makedirs("results", exist_ok=True)
26
 
27
+ SYSTEM_PROMPT = """You are an expert industrial building energy controller.
 
28
  Each turn you receive the current building state and must respond with
29
  ONLY a valid JSON action object.
30
 
 
33
  "batch_job_slot": <0-4>, "load_shed_fraction": <0.0-0.5>, "building_id": 0}
34
 
35
  Strategy:
36
+ - Always respond with valid JSON containing all required keys
37
+ - Vary your actions - don't repeat the same pattern
38
+ - Optimize for low cost + comfort maintenance + grid response"""
 
 
39
 
40
def make_prompt(i, obs=None, task_desc=""):
    """Build the two-message chat prompt (system + user) for one episode.

    Args:
        i: Zero-based episode index; shown to the model as ``i + 1``.
        obs: Optional observation object exposing ``.get``. Its fields are
            appended to the system message only when ``task_desc`` is also
            non-empty.
        task_desc: Task description inserted into the user message.

    Returns:
        A list of two dicts with ``role``/``content`` keys: the system
        message followed by the user message.
    """
    if obs and task_desc:
        # Missing observation fields fall back to the defaults given in each .get().
        observation_block = f"\n\nCurrent observation:\n- Temperature: {obs.get('indoor_temperature', 21):.1f}°C\n- Price: ${obs.get('current_price', 0.10):.3f}/kWh\n- Grid stress: {obs.get('grid_stress_signal', 0):.2f}\n- Hour: {obs.get('hour_of_day', 12)}\n- Storage: {obs.get('thermal_storage_level', 0.5):.1%}"
    else:
        observation_block = ""

    system_message = {"role": "system", "content": SYSTEM_PROMPT + observation_block}
    user_message = {
        "role": "user",
        "content": f"Episode {i+1}: {task_desc}\nOutput action as JSON.",
    }
    return [system_message, user_message]
51
 
52
  def reward_valid_json(completions, **kwargs):
53
+ """Reward 0.25 for any valid JSON output."""
54
  rewards = []
55
  for completion in completions:
56
  text = completion[0]["content"] if isinstance(completion, list) else completion
 
58
  match = re.search(r'\{.*?\}', text, re.DOTALL)
59
  if match:
60
  json.loads(match.group())
61
+ rewards.append(0.25)
62
  else:
63
  rewards.append(0.0)
64
  except Exception:
 
66
  return rewards
67
 
68
  def reward_has_required_keys(completions, **kwargs):
69
+ """Reward 0.25 if JSON has all 4 required action keys."""
70
  required = {"hvac_power_level", "thermal_charge_rate", "batch_job_slot", "load_shed_fraction"}
71
  rewards = []
72
  for completion in completions:
 
76
  if match:
77
  action = json.loads(match.group())
78
  if required.issubset(action.keys()):
79
+ rewards.append(0.25)
80
  else:
81
  rewards.append(0.1)
82
  else:
 
86
  return rewards
87
 
88
  def get_reward_env_interaction(env_url):
89
+ """Episode-level reward from /grade endpoint with diverse seeds.
90
+
91
+ FIXED: Uses raw /grade score directly (0.0-1.0), no normalization that causes reward hacking.
92
+ Each sample gets a different seed/task to prevent mode collapse.
 
93
  """
94
+ last_observations = []
95
+
96
  def reward_env_interaction(completions, **kwargs):
97
+ nonlocal last_observations
98
  rewards = []
99
+
100
  for i, completion in enumerate(completions):
101
  text = completion[0]["content"] if isinstance(completion, list) else completion
102
  try:
 
110
  "building_id": 0
111
  }
112
 
113
+ # Diverse seeds to prevent mode collapse
114
+ seed = 2000 + (i * 17) % 500
115
+ task_id = (i % 3) + 1
116
 
117
  reset_resp = requests.post(
118
  f"{env_url}/reset",
 
123
  rewards.append(0.0)
124
  continue
125
 
126
+ obs = reset_resp.json().get("observations", [{}])[0] if reset_resp.json().get("observations") else {}
127
+ last_observations.append(obs)
128
+
129
+ # 4-step mini-rollout for faster training
130
+ for _ in range(4):
131
  step_resp = requests.post(
132
  f"{env_url}/step",
133
  json=[step_action],
 
139
  grade_resp = requests.get(f"{env_url}/grade", timeout=30)
140
  if grade_resp.status_code == 200:
141
  episode_score = float(grade_resp.json().get("score", 0.5))
142
+ rewards.append(episode_score)
 
 
 
143
  else:
144
  rewards.append(0.0)
145
 
 
149
  return rewards
150
  return reward_env_interaction
151
 
152
def reward_entropy_bonus(completions, **kwargs):
    """Reward action diversity across the batch to discourage mode collapse.

    The first ``{...}`` span of each completion is parsed as JSON and
    canonicalised with sorted keys, so key order does not count as a
    difference. The bonus is the same for every completion in the batch.

    Args:
        completions: Sequence of completions. Each is either a plain string
            or a chat-style list whose first element has a ``"content"`` key.
        **kwargs: Ignored; accepted so extra keyword arguments from the
            caller do not raise.

    Returns:
        A list with exactly one float per completion:
        ``0.1 * (unique parsed actions / parsed actions)`` when at least two
        completions parse, otherwise a flat ``0.05``.
    """
    canonical_actions = []

    for completion in completions:
        text = completion[0]["content"] if isinstance(completion, list) else completion
        try:
            match = re.search(r'\{.*?\}', text, re.DOTALL)
            if match:
                action = json.loads(match.group())
                canonical_actions.append(json.dumps(action, sort_keys=True))
        except Exception:
            # Best-effort: an unparseable completion simply does not count
            # towards diversity. Unlike a bare ``except``, this lets
            # KeyboardInterrupt/SystemExit propagate.
            continue

    if len(canonical_actions) > 1:
        diversity_ratio = len(set(canonical_actions)) / len(canonical_actions)
        bonus = 0.1 * diversity_ratio
    else:
        # Diversity is undefined with fewer than two parsed actions.
        bonus = 0.05

    # Size by completions, not by parsed actions, so the result always has
    # one reward per completion even when some failed to parse.
    return [bonus] * len(completions)
175
+
176
  class CSVLogCallback(TrainerCallback):
177
  """Custom callback to continuously log training metrics to a CSV file."""
178
  def __init__(self, output_path):
 
189
  def main():
190
  parser = argparse.ArgumentParser(description="Train GridMind-RL agent with Unsloth GRPO")
191
  parser.add_argument("--env-url", type=str, default="http://localhost:7860", help="OpenEnv server URL")
192
+ parser.add_argument("--model-name", type=str, default="unsloth/Qwen2.5-1.5B-Instruct", help="Base model")
193
  parser.add_argument("--prompts", type=int, default=300, help="Number of training prompts")
194
  parser.add_argument("--epochs", type=int, default=1, help="Training epochs")
195
  parser.add_argument("--max-steps", type=int, default=-1, help="Max steps (overrides epochs if > 0)")
 
199
 
200
  print(f"🚀 Loading model: {args.model_name}")
201
  max_seq_length = 512
202
+ lora_rank = 16 # Increased for better learning capacity
203
 
204
  model, tokenizer = FastLanguageModel.from_pretrained(
205
  model_name=args.model_name,
 
229
  max_steps=args.max_steps,
230
  per_device_train_batch_size=1,
231
  gradient_accumulation_steps=4,
232
+ num_generations=4,
233
  max_prompt_length=256,
234
  max_completion_length=128,
235
+ learning_rate=3e-6, # Lower LR for stability
236
  lr_scheduler_type="cosine",
237
  warmup_ratio=0.1,
238
  logging_steps=5,
239
  save_steps=100,
240
  fp16=True,
241
+ report_to="none",
242
  seed=42,
243
  )
244
 
 
251
  reward_valid_json,
252
  reward_has_required_keys,
253
  get_reward_env_interaction(args.env_url),
254
+ reward_entropy_bonus,
255
  ],
256
  callbacks=[CSVLogCallback(args.output_csv)]
257
  )