File size: 4,100 Bytes
a063d15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import gymnasium as gym
import numpy as np
import tensorflow as tf
import config
from reward import shape_reward_vectorized

class PPOTrainer:
    """Collects rollouts from vectorized environments and drives PPO updates.

    The trainer owns a ``gym.vector.AsyncVectorEnv`` with ``config.NUM_ENVS``
    copies of ``config.ENV_NAME`` and keeps the latest batch of observations
    in ``self.states`` so consecutive rollouts continue where the previous
    one stopped.

    The rollout buffers are allocated for 2-dimensional observations and
    1-dimensional actions.
    """

    def __init__(self, agent):
        """Create the vectorized environments and take the initial observations.

        Args:
            agent: Object providing ``get_vector_actions(states)`` returning
                ``(actions, log_probs, values)`` and
                ``train_step(states, actions, log_probs, returns, advantages)``
                returning two tensors that expose ``.numpy()``.
        """
        self.agent = agent
        # Create config.NUM_ENVS asynchronous environments running in parallel processes
        self.envs = gym.vector.AsyncVectorEnv([
            lambda: gym.make(config.ENV_NAME) for _ in range(config.NUM_ENVS)
        ])
        self.states, _ = self.envs.reset()

    def collect_rollouts(self):
        """Collects trajectories across all parallel environments.

        Runs ``config.ROLLOUT_STEPS`` steps in every environment, then computes
        discounted returns and one-step TD advantages. Both are bootstrapped
        with the critic's value of the observation that follows the last
        collected step, so a rollout that stops mid-episode is not treated as
        if the episode had ended there.

        Returns:
            tuple: ``(states, actions, log_probs, returns, advantages,
            finished_ep_scores)``. The first five are flattened over the
            (step, env) axes; ``finished_ep_scores`` is a list holding the raw
            (unshaped) reward sum of every episode that finished during the
            rollout.
        """
        mb_states = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS, 2), dtype=np.float32)
        mb_actions = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS, 1), dtype=np.float32)
        mb_log_probs = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS, 1), dtype=np.float32)
        mb_rewards = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS), dtype=np.float32)
        mb_values = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS), dtype=np.float32)
        mb_masks = np.zeros((config.ROLLOUT_STEPS, config.NUM_ENVS), dtype=np.float32)

        # Raw reward accumulated by the episode currently running in each env.
        # NOTE(review): this restarts at zero on every call, so an episode that
        # spans two rollouts is scored only from the start of the second one.
        ep_raw_rewards = np.zeros(config.NUM_ENVS)
        finished_ep_scores = []

        for step in range(config.ROLLOUT_STEPS):
            mb_states[step] = self.states

            # Request continuous actions for all environments at once
            actions, log_probs, values = self.agent.get_vector_actions(self.states)

            mb_actions[step] = actions
            mb_log_probs[step] = log_probs
            mb_values[step] = values

            next_states, rewards_raw, terminated, truncated, _ = self.envs.step(actions)
            dones = terminated | truncated

            mb_rewards[step] = shape_reward_vectorized(next_states, rewards_raw)
            # Mask is 0 where the episode ended on this step, which stops
            # return/value propagation across the episode boundary.
            # NOTE(review): truncation is treated like termination here.
            mb_masks[step] = 1.0 - dones.astype(np.float32)

            ep_raw_rewards += rewards_raw
            # Record the score of every finished episode, then reset its counter.
            finished_ep_scores.extend(ep_raw_rewards[dones])
            ep_raw_rewards[dones] = 0.0

            self.states = next_states

        # Value of the observation that follows the last collected step. The
        # sampled action and log-prob of this extra query are discarded.
        _, _, last_values = self.agent.get_vector_actions(self.states)
        bootstrap_value = np.zeros(config.NUM_ENVS, dtype=np.float32)
        bootstrap_value[:] = last_values

        mb_returns, mb_advantages = self._returns_and_advantages(
            mb_rewards, mb_values, mb_masks, bootstrap_value, config.GAMMA
        )

        # Flatten environment data dimensions for matrix training compatibility
        return (
            mb_states.reshape(-1, 2),
            mb_actions.reshape(-1, 1),
            mb_log_probs.reshape(-1, 1),
            mb_returns.flatten(),
            mb_advantages.flatten(),
            finished_ep_scores
        )

    @staticmethod
    def _returns_and_advantages(rewards, values, masks, bootstrap_value, gamma):
        """Backward pass computing discounted returns and one-step TD advantages.

        Args:
            rewards: ``(steps, envs)`` array of rewards.
            values: ``(steps, envs)`` array of critic values for each state.
            masks: ``(steps, envs)`` array, 0 where the episode ended on that
                step and 1 otherwise.
            bootstrap_value: ``(envs,)`` critic value of the state reached
                after the final step.
            gamma: Discount factor.

        Returns:
            tuple: ``(returns, advantages)``, each shaped like ``rewards``.
        """
        returns = np.zeros_like(rewards)
        advantages = np.zeros_like(rewards)
        running_return = np.array(bootstrap_value, dtype=rewards.dtype)
        next_value = np.array(bootstrap_value, dtype=rewards.dtype)

        for t in reversed(range(rewards.shape[0])):
            # masks[t] removes the future term when the episode ended at step t,
            # which also discards the bootstrap for envs that finished last step.
            running_return = rewards[t] + gamma * running_return * masks[t]
            returns[t] = running_return

            # One-step TD error: r_t + gamma * V(s_{t+1}) - V(s_t)
            advantages[t] = rewards[t] + gamma * next_value * masks[t] - values[t]
            next_value = values[t]

        return returns, advantages

    def train_epoch(self, states, actions, log_probs, returns, advantages):
        """Run ``config.TRAIN_EPOCHS`` passes of minibatch updates over one rollout.

        Advantages are normalized to zero mean and unit standard deviation
        before the data is shuffled and split into batches of
        ``config.BATCH_SIZE``.

        Args:
            states, actions, log_probs, returns, advantages: Flattened rollout
                arrays as returned by ``collect_rollouts``.

        Returns:
            tuple: Mean of the two values returned by ``agent.train_step``
            across all processed batches (presumably actor and critic loss —
            confirm against the agent). ``(0.0, 0.0)`` when no batch was
            processed.
        """
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        dataset = tf.data.Dataset.from_tensor_slices((states, actions, log_probs, returns, advantages))
        dataset = dataset.shuffle(buffer_size=len(states)).batch(config.BATCH_SIZE)

        total_al, total_cl = 0, 0
        steps = 0
        for _ in range(config.TRAIN_EPOCHS):
            for batch in dataset:
                b_states, b_actions, b_log_probs, b_returns, b_advantages = batch
                al, cl = self.agent.train_step(b_states, b_actions, b_log_probs, b_returns, b_advantages)
                total_al += al.numpy()
                total_cl += cl.numpy()
                steps += 1

        if steps == 0:
            # Empty rollout or TRAIN_EPOCHS == 0: nothing to average.
            return 0.0, 0.0

        return total_al / steps, total_cl / steps

    def close(self):
        """Shut down the vectorized environments."""
        self.envs.close()