File size: 18,701 Bytes
9523beb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
# ============================================================
# Unit 4: Policy Gradient (REINFORCE) for Pixelcopter-PLE-v0
# Deep Reinforcement Learning Course - Hugging Face
# 支持继续训练版本
# ============================================================

import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import gymnasium as gym
from gymnasium import spaces
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
import os
from datetime import datetime

# ===== 修正的PLE环境导入和Wrapper =====
from ple.games.pixelcopter import Pixelcopter
from ple import PLE

class PLEWrapper(gym.Env):
    """Gymnasium-style adapter around the PLE Pixelcopter game.

    Observations are the values of PLE's game-state dictionary packed into a
    float32 vector (in the dictionary's iteration order).  Actions are integer
    indices into the action set reported by PLE.
    """

    def __init__(self):
        super().__init__()
        self.game = Pixelcopter()
        # Only the fps argument is passed; the display argument is left out on purpose.
        self.env = PLE(self.game, fps=30)
        self.env.init()

        # Observation and action spaces are sized from the initialised game.
        state_dim = len(self.env.getGameState())
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(state_dim,), dtype=np.float32)
        self.action_space = spaces.Discrete(len(self.env.getActionSet()))
        self.actions = self.env.getActionSet()

    def _observation(self):
        """Return the current PLE game state as a float32 vector."""
        return np.array(list(self.env.getGameState().values()), dtype=np.float32)

    def reset(self, seed=None, options=None):
        """Start a new game and return ``(observation, info)``.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``.
                NOTE: the seed is not passed to PLE here, so the game's own
                randomness is not re-seeded by this call.
            options: Accepted for compatibility with the Gymnasium ``reset``
                signature; currently unused.
        """
        super().reset(seed=seed)
        self.env.reset_game()
        return self._observation(), {}

    def step(self, action):
        """Apply the action with index ``action`` and advance the game.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``terminated`` is PLE's game-over flag and ``truncated`` is always
            ``False``.
        """
        reward = self.env.act(self.actions[action])
        state = self._observation()
        terminated = self.env.game_over()
        return state, reward, terminated, False, {}

# ============================================================
# 设备配置
# ============================================================
# Use the first CUDA device when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# ============================================================
# 环境配置
# ============================================================
# Two independent game instances: `env` is stepped by the training loop,
# `eval_env` is the one handed to the evaluation routine.
env, eval_env = PLEWrapper(), PLEWrapper()

# Network input/output sizes are read from the environment's spaces.
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

print(f"Environment: Pixelcopter-PLE")
print(f"Observation Space: {s_size}, Action Space: {a_size}")

# ============================================================
# 策略网络定义
# ============================================================
class Policy(nn.Module):
    """Policy network mapping a state vector to a distribution over actions.

    Architecture: ``s_size -> h_size -> 2 * h_size -> a_size`` with ReLU
    activations, dropout (p=0.1) after each hidden layer and a softmax output.
    """

    def __init__(self, s_size, a_size, h_size=128):
        """Build the layers.

        Args:
            s_size: Dimension of the state vector.
            a_size: Number of discrete actions.
            h_size: Width of the first hidden layer (the second is twice as wide).
        """
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)

        # Dropout is only active in train mode; it is disabled by ``eval()``.
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: State tensor whose last dimension is ``s_size``; may be a single
                state or a batch.

        Returns:
            Tensor of probabilities that sums to 1 along the last dimension.
        """
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        # Softmax over the last axis: identical to dim=1 for batched 2-D input,
        # and additionally valid for an un-batched 1-D state.
        return F.softmax(self.fc3(x), dim=-1)

    def act(self, state):
        """Sample an action for a single state from the current policy.

        Args:
            state: One state as a NumPy array (or any array-like of floats).

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a tensor of shape ``(1,)`` that keeps its autograd
            graph so it can be used in the policy-gradient loss.
        """
        # Place the input on whatever device the network itself lives on, so
        # the method does not depend on a module-level device variable.
        net_device = self.fc1.weight.device
        state = torch.as_tensor(state, dtype=torch.float32, device=net_device).unsqueeze(0)

        probs = self(state)
        dist = torch.distributions.Categorical(probs)

        # Stochastic sampling (not argmax) is what gives REINFORCE its exploration.
        action = dist.sample()
        return action.item(), dist.log_prob(action)

# ============================================================
# 学习率调度器
# ============================================================
class LearningRateScheduler:
    """Step-wise exponential learning-rate decay keyed on the episode number.

    Whenever ``step`` is called with a positive episode number that is an exact
    multiple of ``decay_episodes``, every parameter group of the optimizer gets
    ``initial_lr * decay_rate ** (episode // decay_episodes)``.
    """

    def __init__(self, optimizer, initial_lr, decay_rate=0.95, decay_episodes=5000):
        self.optimizer = optimizer
        self.initial_lr = initial_lr
        self.decay_rate = decay_rate
        self.decay_episodes = decay_episodes

    def step(self, episode):
        """Apply the decayed rate if ``episode`` falls on a decay boundary."""
        # Nothing to do before the first episode or between boundaries.
        if episode <= 0 or episode % self.decay_episodes != 0:
            return

        completed_periods = episode // self.decay_episodes
        new_lr = self.initial_lr * (self.decay_rate ** completed_periods)
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr
        print(f"📉 Learning rate decayed to: {new_lr:.2e}")

# ============================================================
# 改进的REINFORCE算法实现
# ============================================================
def _best_running_average(scores, window=100):
    """Return the highest trailing-``window`` mean found anywhere in ``scores``.

    For each position the mean of the scores ending there (at most ``window``
    of them) is taken; the maximum of those means is returned, or ``-inf``
    when ``scores`` is empty.
    """
    if not scores:
        return -float('inf')
    return max(
        np.mean(scores[max(0, end - window):end])
        for end in range(1, len(scores) + 1)
    )


def _normalized_returns(rewards, gamma, device):
    """Return the standardized discounted returns G_t for one episode.

    G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ... is accumulated from
    the last step backwards.  The result is a float32 tensor on ``device``.
    """
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()

    # An explicit dtype keeps mean()/std() valid even if the rewards are ints.
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    # Standardizing needs at least two values: the default (unbiased) std of a
    # single element is NaN.
    if len(returns) > 1:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns


def _with_suffix(model_path, suffix):
    """Insert ``suffix`` between the file name stem and its extension."""
    root, ext = os.path.splitext(model_path)
    return f"{root}{suffix}{ext}"


def _save_training_state(path, policy, optimizer, scores, episode, **extra):
    """Write a resumable checkpoint to ``path``, creating its directory if needed."""
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    state = {
        'policy_state_dict': policy.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Layer sizes are read from the network being saved.
        's_size': policy.fc1.in_features,
        'a_size': policy.fc3.out_features,
        'hidden_size': policy.fc1.out_features,
        'scores': scores,
        'episode': episode,
    }
    state.update(extra)
    state['timestamp'] = datetime.now().isoformat()
    torch.save(state, path)


def reinforce_continued(policy, optimizer, n_training_episodes, max_t, gamma, print_every,
                        previous_scores=None, model_path=None, lr_scheduler=None,
                        train_env=None):
    """Run REINFORCE, optionally continuing from an earlier score history.

    Args:
        policy: Policy network exposing ``act(state) -> (action, log_prob)``.
        optimizer: Optimizer over ``policy``'s parameters.
        n_training_episodes: Number of additional episodes to train.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor.
        print_every: Progress is printed (and the best model checked) every
            this many episodes; checkpoints are written every
            ``2 * print_every`` episodes.
        previous_scores: Episode scores from earlier training, or ``None``.
            The sequence is copied, never modified.
        model_path: Base path for saved models.  ``_best`` and
            ``_checkpoint_<episode>`` are inserted before the extension.
            Nothing is saved when this is ``None``.
        lr_scheduler: Optional object with a ``step(episode)`` method.
        train_env: Environment to train on.  Defaults to the module-level
            ``env`` when omitted.

    Returns:
        List with all episode scores (earlier ones followed by the new ones).
    """
    scores = list(previous_scores) if previous_scores is not None else []
    # Rolling window of the 100 most recent scores, pre-filled from the history.
    scores_deque = deque(scores[-100:], maxlen=100)

    if scores:
        print(f"📈 Resuming with recent average score: {np.mean(scores_deque):.2f}")
        print(f"📊 Previous best score: {max(scores):.2f}")

    start_episode = len(scores) + 1
    best_avg_score = _best_running_average(scores)

    print(f"🚀 Starting continued training from episode {start_episode}")
    print(f"🎯 Target: Beat previous best average score of {best_avg_score:.2f}")
    print()

    if train_env is None:
        train_env = env

    for i_episode in range(start_episode, start_episode + n_training_episodes):
        saved_log_probs = []
        rewards = []
        state, _ = train_env.reset()

        # --- 1. Collect one trajectory with the current policy ---
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)

            state, reward, terminated, truncated, _ = train_env.step(action)
            rewards.append(reward)

            if terminated or truncated:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # An episode without any step (max_t == 0) has nothing to learn from.
        if saved_log_probs:
            # --- 2. Discounted, standardized returns on the log-probs' device ---
            returns = _normalized_returns(rewards, gamma, saved_log_probs[0].device)

            # --- 3. Policy-gradient loss: L = -sum(log_prob * G_t) ---
            # The minus sign turns return maximization into loss minimization.
            policy_loss = torch.stack(
                [-log_prob * g for log_prob, g in zip(saved_log_probs, returns)]
            ).sum()

            # --- 4. Gradient step, with norm clipping for stability ---
            optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), max_norm=1.0)
            optimizer.step()

        if lr_scheduler:
            lr_scheduler.step(i_episode)

        if i_episode % print_every == 0:
            current_avg = np.mean(scores_deque)
            current_lr = optimizer.param_groups[0]['lr']
            print(f'Episode {i_episode:6d} | Avg Score: {current_avg:7.2f} | Last Score: {episode_score:7.2f} | Steps: {len(rewards):4d} | LR: {current_lr:.2e}')

            # New record for the rolling average: remember it and save the model.
            if current_avg > best_avg_score:
                best_avg_score = current_avg
                print(f"🎉 New best average score: {best_avg_score:.2f}")

                if model_path:
                    best_model_path = _with_suffix(model_path, '_best')
                    _save_training_state(best_model_path, policy, optimizer, scores, i_episode,
                                         best_avg_score=best_avg_score)
                    print(f"💾 Best model saved: {best_model_path}")

            # Periodic checkpoint, independent of whether a record was set.
            if model_path and i_episode % (print_every * 2) == 0:
                checkpoint_path = _with_suffix(model_path, f'_checkpoint_{i_episode}')
                _save_training_state(checkpoint_path, policy, optimizer, scores, i_episode)
                print(f"💾 Checkpoint saved: {checkpoint_path}")

            print()

    return scores

# ============================================================
# 评估函数
# ============================================================
def evaluate_policy(policy, eval_env, n_eval_episodes=10, max_steps=10000):
    """Evaluate ``policy`` greedily and report reward statistics.

    The policy is put into eval mode for the rollouts and afterwards restored
    to whatever mode (train or eval) it was in when the function was called,
    even if a rollout raises.

    Args:
        policy: Network whose forward pass returns action probabilities for a
            batch of states.
        eval_env: Environment with Gymnasium-style ``reset()`` and ``step()``.
        n_eval_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the number of steps per episode.

    Returns:
        ``(episode_rewards, mean_reward, std_reward)``: the per-episode total
        rewards, their mean and their standard deviation.
    """
    # Run inputs on the device the network lives on (CPU if it has no parameters).
    first_param = next(policy.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    episode_rewards = []
    was_training = policy.training
    policy.eval()
    try:
        for i in range(n_eval_episodes):
            state, _ = eval_env.reset()
            episode_reward = 0
            steps = 0
            done = False

            while not done and steps < max_steps:
                # Deterministic evaluation: take the most probable action
                # instead of sampling, and skip gradient tracking.
                with torch.no_grad():
                    state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(eval_device)
                    probs = policy(state_tensor)
                    action = torch.argmax(probs, dim=1).item()

                state, reward, terminated, truncated, _ = eval_env.step(action)
                episode_reward += reward
                done = terminated or truncated
                steps += 1

            episode_rewards.append(episode_reward)
            print(f"Eval Episode {i+1:2d}: Reward = {episode_reward:7.2f} | Steps = {steps:4d}")
    finally:
        policy.train(was_training)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    print(f"\n{'='*50}")
    print(f"Evaluation Results:")
    print(f"Mean Reward: {mean_reward:.2f}")
    print(f"Std Reward: {std_reward:.2f}")
    print(f"Score (mean - std): {mean_reward - std_reward:.2f}")
    print(f"Required for Pixelcopter-PLE-v0: 5.0")
    print(f"{'='*50}")

    return episode_rewards, mean_reward, std_reward

# ============================================================
# 模型加载函数
# ============================================================
def load_model(model_path, policy, optimizer):
    """Restore policy/optimizer state and training history from a checkpoint.

    Args:
        model_path: Path of the checkpoint file.
        policy: Network that receives the saved ``policy_state_dict``.
        optimizer: Optimizer that receives the saved ``optimizer_state_dict``.

    Returns:
        ``(previous_scores, episode, best_avg_score)``.  When no file exists at
        ``model_path`` nothing is loaded and ``([], 0, -inf)`` is returned;
        keys missing from the checkpoint fall back to those same defaults.
    """
    if not os.path.exists(model_path):
        print("🆕 No existing model found, starting fresh training")
        return [], 0, -float('inf')

    print(f"🔄 Loading existing model from {model_path}")
    # weights_only=False unpickles arbitrary Python objects, so this must only
    # ever be pointed at checkpoint files from a trusted source.
    ckpt = torch.load(model_path, map_location=device, weights_only=False)

    # Network and optimizer state.
    policy.load_state_dict(ckpt['policy_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])

    # Training history; each entry is optional in the file.
    history = ckpt.get('scores', [])
    last_episode = ckpt.get('episode', 0)
    best_avg = ckpt.get('best_avg_score', -float('inf'))

    print(f"✅ Model loaded successfully!")
    print(f"📊 Loaded {len(history)} previous training episodes")
    if history:
        print(f"🎯 Previous best score: {max(history):.2f}")
        print(f"🏆 Previous best average score: {best_avg:.2f}")

    return history, last_episode, best_avg

# ============================================================
# 主训练流程
# ============================================================
if __name__ == "__main__":
    # Hyperparameters, chosen for continuing training from an existing model.
    HIDDEN_SIZE = 256
    INITIAL_LEARNING_RATE = 2e-5  # small learning rate for continued training
    N_TRAINING_EPISODES = 20000   # number of additional episodes
    MAX_T = 10000
    GAMMA = 0.995                 # slightly raised discount factor
    PRINT_EVERY = 1000

    # Where the model is loaded from and saved to.
    MODEL_PATH = "/home/eason/Workspace/Result_DRL/reinforce_pixelcopter.pth"

    # Create the output directory up front: a missing directory then fails
    # immediately instead of at the first save, after training has already run.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    print("="*60)
    print("REINFORCE Continued Training for Pixelcopter-PLE-v0")
    print("="*60)
    print(f"📅 Training started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    # Policy network and optimizer.
    policy = Policy(s_size, a_size, HIDDEN_SIZE).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=INITIAL_LEARNING_RATE)

    # Learning-rate schedule.
    lr_scheduler = LearningRateScheduler(optimizer, INITIAL_LEARNING_RATE, decay_rate=0.95, decay_episodes=5000)

    print(f"🧠 Policy Network: {policy}")
    print(f"⚙️  Optimizer: Adam (initial_lr={INITIAL_LEARNING_RATE:.2e})")
    print(f"📈 Training Episodes: {N_TRAINING_EPISODES}")
    print(f"⏱️  Max Steps per Episode: {MAX_T}")
    print(f"💰 Discount Factor: {GAMMA}")
    print()

    # Resume from an existing model if there is one.
    previous_scores, last_episode, best_avg_score = load_model(MODEL_PATH, policy, optimizer)

    # Loading the optimizer state also restores its old learning rate, so the
    # rate configured above is re-applied explicitly.
    for param_group in optimizer.param_groups:
        param_group['lr'] = INITIAL_LEARNING_RATE

    print(f"📚 Current learning rate: {INITIAL_LEARNING_RATE:.2e}")
    print()

    print("🚀 Starting training...")
    print("-" * 80)

    scores = reinforce_continued(
        policy=policy,
        optimizer=optimizer,
        n_training_episodes=N_TRAINING_EPISODES,
        max_t=MAX_T,
        gamma=GAMMA,
        print_every=PRINT_EVERY,
        previous_scores=previous_scores,
        model_path=MODEL_PATH,
        lr_scheduler=lr_scheduler
    )

    print("\n" + "="*60)
    print("Training Completed!")
    print("="*60)

    # Slicing already copes with fewer than 100 scores, so no length check is needed.
    final_avg_score = np.mean(scores[-100:])
    # Best trailing-100 average over the whole history.
    best_window_avg = float(max(
        np.mean(scores[max(0, end - 100):end]) for end in range(1, len(scores) + 1)
    ))

    # Final model. 'episode' and 'best_avg_score' are included because
    # load_model reads them when this file is used to resume training.
    final_model_data = {
        'policy_state_dict': policy.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        's_size': s_size,
        'a_size': a_size,
        'hidden_size': HIDDEN_SIZE,
        'scores': scores,
        'episode': len(scores),
        'best_avg_score': best_window_avg,
        'total_episodes': len(scores),
        'final_avg_score': final_avg_score,
        'training_completed_at': datetime.now().isoformat(),
        'hyperparameters': {
            'learning_rate': INITIAL_LEARNING_RATE,
            'gamma': GAMMA,
            'hidden_size': HIDDEN_SIZE,
            'max_t': MAX_T
        }
    }

    torch.save(final_model_data, MODEL_PATH)
    print(f"✅ Final model saved to {MODEL_PATH}")

    # Evaluate the trained policy.
    print("\n" + "="*60)
    print("Evaluating Final Policy")
    print("="*60)

    episode_rewards, mean_reward, std_reward = evaluate_policy(policy, eval_env, n_eval_episodes=10)

    # Summary of the run.
    print(f"\n🎉 Final Training Results:")
    print(f"   Total Episodes Trained: {len(scores)}")
    print(f"   Final Average Score (last 100): {final_avg_score:.2f}")
    print(f"   Best Single Episode Score: {max(scores):.2f}")
    print(f"   Evaluation Mean Reward: {mean_reward:.2f}")
    print(f"   Evaluation Std Reward: {std_reward:.2f}")
    print(f"   Final Score (mean - std): {mean_reward - std_reward:.2f}")
    print(f"   Required for Pixelcopter-PLE-v0: 5.0")

    if mean_reward - std_reward >= 5.0:
        print(f"   Status: ✅ PASSED! Congratulations!")
    else:
        needed_improvement = 5.0 - (mean_reward - std_reward)
        print(f"   Status: ❌ Need {needed_improvement:.2f} more points")
        print(f"   Suggestion: Continue training with lower learning rate or adjust network architecture")

    print(f"\n📅 Training completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")