oozan committed (verified)
Commit a36c04c · Parent(s): ee44412

Upload folder using huggingface_hub

Files changed (2):
  1. requirements.txt +1 -0
  2. train_evchargeenv_pg.py +119 -0
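
The commit message indicates the files were pushed with huggingface_hub. For reference, a commit like this is typically produced with HfApi.upload_folder; a minimal sketch, where the repo id and local path are placeholders rather than values taken from this commit:

    from huggingface_hub import HfApi

    api = HfApi()  # uses the token cached by `huggingface-cli login` by default
    api.upload_folder(
        folder_path=".",                # local folder to push (placeholder)
        repo_id="oozan/ev-charge-rl",   # hypothetical repo id, not from this commit
        commit_message="Upload folder using huggingface_hub",
    )

A single call uploads every file in the folder and records them in one commit, which matches the two files listed above.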
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 gymnasium
 numpy
+torch
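
With torch added, all three unpinned dependencies can be smoke-tested in one step; a minimal check (nothing here comes from the commit itself):

    # Verify the three requirements import cleanly and report their versions.
    import gymnasium, numpy, torch
    print(gymnasium.__version__, numpy.__version__, torch.__version__)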
train_evchargeenv_pg.py ADDED
@@ -0,0 +1,119 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+from env.ev_charge_env import EVChargeEnv
+
+
+class ActorCritic(nn.Module):
+    def __init__(self, obs_dim: int, act_dim: int):
+        super().__init__()
+        self.shared = nn.Sequential(
+            nn.Linear(obs_dim, 64),
+            nn.ReLU(),
+            nn.Linear(64, 64),
+            nn.ReLU(),
+        )
+        self.policy_head = nn.Linear(64, act_dim)
+        self.value_head = nn.Linear(64, 1)
+
+    def forward(self, x):
+        h = self.shared(x)
+        logits = self.policy_head(h)
+        value = self.value_head(h).squeeze(-1)
+        return logits, value
+
+
+def make_env():
+    # You can change the scenario here: "easy", "medium", "hard"
+    return EVChargeEnv(scenario="medium")
+
+
+def run_episode(env, model, device, gamma=0.99):
+    obs, _ = env.reset()
+    obs = torch.tensor(obs, dtype=torch.float32, device=device)
+
+    log_probs = []
+    values = []
+    rewards = []
+
+    done = False
+    while not done:
+        logits, value = model(obs.unsqueeze(0))  # input batched to [1, obs_dim]
+        # Gaussian policy for continuous action in [0, 1]
+        mean = torch.sigmoid(logits.squeeze(0))  # [act_dim]
+        std = torch.ones_like(mean) * 0.2  # fixed std
+
+        dist = torch.distributions.Normal(mean, std)
+        action = dist.sample()
+        action_clipped = torch.clamp(action, 0.0, 1.0)
+
+        log_prob = dist.log_prob(action).sum()
+
+        np_action = action_clipped.detach().cpu().numpy()
+        next_obs, reward, terminated, truncated, _ = env.step(np_action)
+
+        log_probs.append(log_prob)
+        values.append(value)
+        rewards.append(torch.tensor(reward, dtype=torch.float32, device=device))
+
+        done = terminated or truncated
+        obs = torch.tensor(next_obs, dtype=torch.float32, device=device)
+
+    # Compute discounted returns, working backwards from the final step
+    returns = []
+    G = torch.tensor(0.0, device=device)
+    for r in reversed(rewards):
+        G = r + gamma * G
+        returns.insert(0, G)
+
+    returns = torch.stack(returns)
+    values = torch.stack(values).squeeze(-1)
+    log_probs = torch.stack(log_probs)
+
+    advantages = returns - values.detach()
+
+    policy_loss = -(log_probs * advantages).mean()
+    value_loss = (returns - values).pow(2).mean()
+
+    total_reward = float(sum(r.item() for r in rewards))
+
+    return policy_loss, value_loss, total_reward, len(rewards)
+
+
+def train(num_episodes=200):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    env = make_env()
+    obs_dim = env.observation_space.shape[0]
+    act_dim = env.action_space.shape[0]
+
+    model = ActorCritic(obs_dim, act_dim).to(device)
+    optimizer = optim.Adam(model.parameters(), lr=3e-4)
+
+    reward_history = []
+
+    for episode in range(1, num_episodes + 1):
+        policy_loss, value_loss, total_reward, steps = run_episode(env, model, device)
+
+        loss = policy_loss + 0.5 * value_loss
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        reward_history.append(total_reward)
+
+        if episode % 10 == 0:
+            avg_last = np.mean(reward_history[-10:])
+            print(
+                f"Episode {episode:4d} | "
+                f"ep_reward={total_reward:.2f} | "
+                f"avg_last10={avg_last:.2f} | steps={steps}"
+            )
+
+    print("Training finished.")
+    print(f"Average reward over last 20 episodes: {np.mean(reward_history[-20:]):.2f}")
+
+
+if __name__ == "__main__":
+    train(num_episodes=200)
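
The script imports EVChargeEnv from env/ev_charge_env.py, which is not part of this commit. Judging only from how it is used above, the environment appears to be a Gymnasium Env taking a scenario keyword, with a Box observation space and a Box action space in [0, 1]. A minimal stand-in for smoke-testing the trainer might look like the following; every shape, bound, and dynamic below is an assumption, not the actual environment:

    # Hypothetical stand-in for env/ev_charge_env.py; the real EVChargeEnv is
    # not in this commit, so the details below are assumptions for testing only.
    import numpy as np
    import gymnasium as gym
    from gymnasium import spaces

    class EVChargeEnv(gym.Env):
        def __init__(self, scenario: str = "medium"):
            super().__init__()
            self.scenario = scenario
            # Assumed: 4-dim observation, 1-dim charging-rate action in [0, 1].
            self.observation_space = spaces.Box(
                -np.inf, np.inf, shape=(4,), dtype=np.float32
            )
            self.action_space = spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32)
            self._t = 0

        def reset(self, seed=None, options=None):
            super().reset(seed=seed)
            self._t = 0
            return self.observation_space.sample(), {}

        def step(self, action):
            self._t += 1
            obs = self.observation_space.sample()
            reward = -float(np.abs(action).sum())  # placeholder reward
            terminated = False
            truncated = self._t >= 96  # placeholder horizon
            return obs, reward, terminated, truncated, {}

With a stub like that saved as env/ev_charge_env.py, the trainer runs end to end with: python train_evchargeenv_pg.py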