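"""Monte Carlo actor-critic (REINFORCE with a learned value baseline) on EVChargeEnv.

A shared two-layer MLP feeds a Gaussian policy head (sigmoid-squashed mean,
fixed std) and a value head; each episode produces one gradient step on the
policy loss plus a 0.5-weighted value loss, with discounted returns as targets.
"""
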
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from env.ev_charge_env import EVChargeEnv


class ActorCritic(nn.Module):
    def __init__(self, obs_dim: int, act_dim: int):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(obs_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
        )
        self.policy_head = nn.Linear(64, act_dim)
        self.value_head = nn.Linear(64, 1)

    def forward(self, x):
        # x: [batch, obs_dim] -> logits: [batch, act_dim], value: [batch]
        h = self.shared(x)
        logits = self.policy_head(h)
        value = self.value_head(h).squeeze(-1)
        return logits, value


def make_env():
    # Scenario can be "easy", "medium", or "hard".
    return EVChargeEnv(scenario="medium")
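

# Reproducibility sketch (optional; train() does not call it by default).
# These are the standard NumPy/PyTorch seeding calls; seeding CUDA
# explicitly is harmless when no GPU is present.
def set_seed(seed: int = 0):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)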


def run_episode(env, model, device, gamma=0.99):
    obs, _ = env.reset()
    obs = torch.tensor(obs, dtype=torch.float32, device=device)

    log_probs = []
    values = []
    rewards = []

    done = False
    while not done:
        logits, value = model(obs.unsqueeze(0))  # obs: [1, obs_dim]
        # Gaussian policy for a continuous action in [0, 1]: squash the mean
        # into (0, 1) and explore with a fixed standard deviation.
        mean = torch.sigmoid(logits.squeeze(0))  # [act_dim]
        std = torch.ones_like(mean) * 0.2  # fixed exploration std

        dist = torch.distributions.Normal(mean, std)
        action = dist.sample()
        action_clipped = torch.clamp(action, 0.0, 1.0)

        # Note: log_prob is computed for the unclipped sample, while the
        # clipped action is executed; this common simplification slightly
        # biases the gradient whenever the sample falls outside [0, 1].
        log_prob = dist.log_prob(action).sum()

        np_action = action_clipped.detach().cpu().numpy()
        next_obs, reward, terminated, truncated, _ = env.step(np_action)

        log_probs.append(log_prob)
        values.append(value)
        rewards.append(torch.tensor(reward, dtype=torch.float32, device=device))

        done = terminated or truncated
        obs = torch.tensor(next_obs, dtype=torch.float32, device=device)

    # Discounted Monte Carlo returns, computed backwards:
    # G_t = r_t + gamma * G_{t+1}, with G after the final step equal to 0.
    returns = []
    G = torch.tensor(0.0, device=device)
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)

    returns = torch.stack(returns)
    values = torch.stack(values).squeeze(-1)
    log_probs = torch.stack(log_probs)

    # Detach the critic in the advantage so the policy gradient does not
    # flow through the value head; the critic is trained separately with
    # an MSE loss against the Monte Carlo returns.
    advantages = returns - values.detach()

    policy_loss = -(log_probs * advantages).mean()
    value_loss = (returns - values).pow(2).mean()

    total_reward = float(sum(r.item() for r in rewards))

    return policy_loss, value_loss, total_reward, len(rewards)
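

# Optional evaluation helper: a sketch, not called by train() below. It acts
# with the deterministic policy mean instead of sampling, and assumes
# EVChargeEnv follows the same Gymnasium-style reset()/step() API that
# run_episode() relies on.
def evaluate(env, model, device, num_episodes=5):
    episode_rewards = []
    with torch.no_grad():
        for _ in range(num_episodes):
            obs, _ = env.reset()
            done = False
            total = 0.0
            while not done:
                obs_t = torch.tensor(obs, dtype=torch.float32, device=device)
                logits, _ = model(obs_t.unsqueeze(0))
                # Deterministic action: squashed mean, clipped to [0, 1].
                action = torch.clamp(torch.sigmoid(logits.squeeze(0)), 0.0, 1.0)
                obs, reward, terminated, truncated, _ = env.step(action.cpu().numpy())
                total += float(reward)
                done = terminated or truncated
            episode_rewards.append(total)
    return float(np.mean(episode_rewards))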


def train(num_episodes=200):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = make_env()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = ActorCritic(obs_dim, act_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=3e-4)

    reward_history = []

    for episode in range(1, num_episodes + 1):
        policy_loss, value_loss, total_reward, steps = run_episode(env, model, device)

        # Joint update: actor loss plus the critic's MSE loss, weighted 0.5.
        loss = policy_loss + 0.5 * value_loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        reward_history.append(total_reward)

        if episode % 10 == 0:
            avg_last = np.mean(reward_history[-10:])
            print(
                f"Episode {episode:4d} | "
                f"ep_reward={total_reward:.2f} | "
                f"avg_last10={avg_last:.2f} | steps={steps}"
            )

    print("Training finished.")
    print(f"Average reward over last 20 episodes: {np.mean(reward_history[-20:]):.2f}")


if __name__ == "__main__":
    train(num_episodes=200)