File size: 4,740 Bytes
0642513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
简单训练脚本 - 直接训练并保存模型
"""
import os
import sys
import time
import torch
import numpy as np

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from game import Game2048
from model import Game2048Transformer
from trainer import PPOTrainer, RolloutBuffer

# Divisors applied to the two score features before they are capped at 1.0.
_ACCUMULATED_SCORE_CAP = 50000
_SITUATIONAL_SCORE_CAP = 200
# Number of buffered transitions that triggers a trainer update.
_UPDATE_BATCH_SIZE = 64
# Capacity handed to RolloutBuffer; the buffer is cleared long before this.
_BUFFER_CAPACITY = 10000


def _score_features(game):
    """Return the two capped score features of ``game`` as a float32 array.

    Reads ``game.accumulated_score`` and ``game.situational_score``, divides
    each by its cap constant and limits the result to at most 1.0.
    """
    return np.array([
        min(game.accumulated_score / _ACCUMULATED_SCORE_CAP, 1.0),
        min(game.situational_score / _SITUATIONAL_SCORE_CAP, 1.0),
    ], dtype=np.float32)


def _mean_or_zero(values):
    """Return the mean of ``values`` as a builtin float, or 0.0 if empty."""
    return float(np.mean(values)) if values else 0.0


def _save_checkpoint(model, save_path, game_idx, best_score, avg_score):
    """Write the model weights and summary statistics to ``save_path``."""
    directory = os.path.dirname(save_path)
    # dirname() is "" for a bare filename and os.makedirs("") raises, so only
    # create the directory when the path actually has one.
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'game_idx': game_idx,
        'best_score': best_score,
        # np.mean yields a NumPy scalar; store a builtin float so the
        # checkpoint does not carry a pickled NumPy object (which
        # torch.load(weights_only=True) may refuse to load).
        'avg_score': float(avg_score),
    }, save_path)


def train_simple(
    num_games: int = 1000,
    save_path: str = "checkpoints/model.pt",
    print_interval: int = 10
):
    """Train a 2048 agent on the CPU and save the model to ``save_path``.

    Plays ``num_games`` games with actions sampled from the model
    (``deterministic=False``), stores every step as a ``Transition`` and
    calls ``PPOTrainer.update`` each time 64 transitions have accumulated.
    A checkpoint is written every ``print_interval`` games and once more
    when training finishes.

    Args:
        num_games: Number of games to play. With 0, no game is played and
            only the final checkpoint and summary are produced.
        save_path: Destination file for the checkpoint. Its parent directory
            is created when missing; a bare filename is accepted.
        print_interval: Print progress and save a checkpoint every this many
            games. Must be at least 1.

    Returns:
        The trained ``Game2048Transformer`` instance.

    Raises:
        ValueError: If ``print_interval`` is smaller than 1.
    """
    # Function-scoped import (the file's top-level imports do not include
    # Transition); done once here instead of on every environment step.
    from trainer import Transition

    if print_interval < 1:
        raise ValueError(
            f"print_interval must be >= 1, got {print_interval}"
        )

    print("=" * 50)
    print("2048 AI Simple Training")
    print("=" * 50)

    device = "cpu"
    model = Game2048Transformer().to(device)
    trainer = PPOTrainer(model, lr=3e-4, device=device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Training for {num_games} games...")
    print("-" * 50)

    # Running statistics, one entry per finished game.
    scores = []
    max_tiles = []
    best_score = 0

    # One buffer shared by all games: the last (< 64) transitions of a game,
    # including its terminal one, carry over into the next update instead of
    # being thrown away with a per-game buffer.
    # NOTE(review): assumes PPOTrainer.update honours Transition.done at
    # episode boundaries - confirm against trainer.py.
    buffer = RolloutBuffer(capacity=_BUFFER_CAPACITY)

    start_time = time.time()

    for game_idx in range(num_games):
        # Play one full game.
        game = Game2048()
        game.reset()

        while not game.game_over:
            state = game.get_state()
            scores_feat = _score_features(game)
            valid = game.get_valid_actions()

            # Add a leading batch dimension of size 1 for the model.
            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
            scores_t = torch.FloatTensor(scores_feat).unsqueeze(0).to(device)
            valid_t = torch.BoolTensor(valid).unsqueeze(0).to(device)

            # Sample an action; no gradients are needed while collecting.
            with torch.no_grad():
                action, log_prob, value = model.get_action(
                    state_t, scores_t, valid_t, deterministic=False
                )

            # Snapshot the pre-move state before stepping the game
            # (presumably get_state may expose the live board - verify).
            old_state = state.copy()

            next_state, reward, _moved, done = game.move(action)

            buffer.push(Transition(
                state=old_state,
                scores=scores_feat,
                action=action,
                reward=reward,
                next_state=next_state.copy(),
                # Features are recomputed after the move.
                next_scores=_score_features(game),
                done=done,
                log_prob=log_prob,
                value=value,
                valid_actions=valid,
            ))

            # Update once enough transitions have been collected.
            if len(buffer) >= _UPDATE_BATCH_SIZE:
                trainer.update(buffer)
                buffer.clear()

        # Record the result of the finished game.
        scores.append(game.accumulated_score)
        max_tiles.append(game.get_max_tile())

        if game.accumulated_score > best_score:
            best_score = game.accumulated_score

        # Periodic progress report and checkpoint.
        if (game_idx + 1) % print_interval == 0:
            elapsed = time.time() - start_time
            avg_score = _mean_or_zero(scores[-print_interval:])
            avg_max_tile = _mean_or_zero(max_tiles[-print_interval:])
            speed = (game_idx + 1) / elapsed

            print(
                f"Game {game_idx + 1}/{num_games} | "
                f"Avg Score: {avg_score:.0f} | "
                f"Best: {best_score} | "
                f"Max Tile: {avg_max_tile:.0f} | "
                f"Speed: {speed:.2f} games/s"
            )

            # 'game_idx' here is the zero-based index of the last finished
            # game; the final checkpoint below stores num_games instead.
            _save_checkpoint(model, save_path, game_idx, best_score, avg_score)

    # Final checkpoint, averaged over the last 100 games.
    final_avg = _mean_or_zero(scores[-100:])
    _save_checkpoint(model, save_path, num_games, best_score, final_avg)

    elapsed = time.time() - start_time
    print("\n" + "=" * 50)
    print("Training Complete!")
    print("=" * 50)
    print(f"Total games: {num_games}")
    print(f"Total time: {elapsed:.1f}s")
    print(f"Average score (last 100): {final_avg:.0f}")
    print(f"Best score: {best_score}")
    # default=0 keeps the summary working when no game was played.
    print(f"Best max tile: {max(max_tiles, default=0)}")
    print(f"Model saved to: {save_path}")

    return model

# Script entry point: run a 500-game training session, leaving the
# checkpoint path and print interval at their defaults.
if __name__ == "__main__":
    train_simple(num_games=500)