# File size: 4,740 Bytes
"""
简单训练脚本 - 直接训练并保存模型
"""
import os
import sys
import time
import torch
import numpy as np

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from game import Game2048
from model import Game2048Transformer
from trainer import PPOTrainer, RolloutBuffer

def _score_features(game: "Game2048") -> np.ndarray:
    """Build the two-element auxiliary score feature vector for ``game``.

    Each score is divided by a fixed constant and capped at 1.0:
    ``accumulated_score / 50000`` and ``situational_score / 200``.
    """
    return np.array([
        min(game.accumulated_score / 50000, 1.0),
        min(game.situational_score / 200, 1.0)
    ], dtype=np.float32)


def _save_checkpoint(model, save_path: str, game_idx: int, best_score, avg_score) -> None:
    """Write the model weights and summary statistics to ``save_path``."""
    # os.path.dirname() returns "" for a bare filename such as "model.pt",
    # and os.makedirs("") raises FileNotFoundError, so only create the
    # directory when the path actually has one.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'game_idx': game_idx,
        'best_score': best_score,
        # Stored as a plain Python float: np.mean() yields a NumPy scalar,
        # which restricted (weights_only) checkpoint loading can reject.
        'avg_score': float(avg_score)
    }, save_path)


def train_simple(
    num_games: int = 1000,
    save_path: str = "checkpoints/model.pt",
    print_interval: int = 10
) -> "Game2048Transformer":
    """Train a 2048 model on CPU by playing games and save it to disk.

    A fresh ``Game2048Transformer`` is created and optimized with
    ``PPOTrainer``. Actions are sampled from the model (non-deterministic),
    every transition is pushed into a rollout buffer, and the trainer is
    updated each time the buffer holds at least 64 transitions.

    Args:
        num_games: Number of games to play.
        save_path: Checkpoint file path. Its parent directory is created if
            the path has one; a bare filename is written to the current
            working directory.
        print_interval: Print progress and write a checkpoint every this many
            games. A value of zero or less disables the periodic
            progress/checkpoint step; the final checkpoint is still written.

    Returns:
        The trained model.
    """
    # Imported once here rather than on every environment step.
    from trainer import Transition

    print("=" * 50)
    print("2048 AI Simple Training")
    print("=" * 50)

    device = "cpu"
    model = Game2048Transformer().to(device)
    trainer = PPOTrainer(model, lr=3e-4, device=device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Training for {num_games} games...")
    print("-" * 50)

    # Per-game statistics.
    scores = []
    max_tiles = []
    best_score = 0

    # Number of buffered transitions that triggers a trainer update.
    rollout_size = 64

    # The buffer lives across games. Recreating it per game threw away the
    # trailing (< rollout_size) transitions of every game, which almost
    # always included the terminal transition with done=True.
    # NOTE(review): this relies on PPOTrainer.update using each transition's
    # `done` flag to separate episodes - confirm against trainer.py.
    # Transitions still buffered when training ends are not used.
    buffer = RolloutBuffer(capacity=10000)

    start_time = time.time()

    for game_idx in range(num_games):
        # Play one full game.
        game = Game2048()
        game.reset()

        while not game.game_over:
            state = game.get_state()
            scores_feat = _score_features(game)
            valid = game.get_valid_actions()

            # Convert the observation to batched tensors (batch size 1).
            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
            scores_t = torch.FloatTensor(scores_feat).unsqueeze(0).to(device)
            valid_t = torch.BoolTensor(valid).unsqueeze(0).to(device)

            # Sample an action; no gradients are needed while collecting data.
            with torch.no_grad():
                action, log_prob, value = model.get_action(
                    state_t, scores_t, valid_t, deterministic=False
                )

            # Snapshot the pre-move state before the game is advanced.
            old_state = state.copy()

            next_state, reward, moved, done = game.move(action)

            # Store the transition; next_scores is computed after the move.
            buffer.push(Transition(
                state=old_state,
                scores=scores_feat,
                action=action,
                reward=reward,
                next_state=next_state.copy(),
                next_scores=_score_features(game),
                done=done,
                log_prob=log_prob,
                value=value,
                valid_actions=valid
            ))

            # Update the policy once enough transitions have been collected.
            if len(buffer) >= rollout_size:
                trainer.update(buffer)
                buffer.clear()

        # Record the result of this game.
        scores.append(game.accumulated_score)
        max_tiles.append(game.get_max_tile())

        if game.accumulated_score > best_score:
            best_score = game.accumulated_score

        # Periodic progress report and checkpoint.
        if print_interval > 0 and (game_idx + 1) % print_interval == 0:
            elapsed = time.time() - start_time
            avg_score = np.mean(scores[-print_interval:])
            avg_max_tile = np.mean(max_tiles[-print_interval:])
            speed = (game_idx + 1) / elapsed

            print(
                f"Game {game_idx + 1}/{num_games} | "
                f"Avg Score: {avg_score:.0f} | "
                f"Best: {best_score} | "
                f"Max Tile: {avg_max_tile:.0f} | "
                f"Speed: {speed:.2f} games/s"
            )

            # Periodic checkpoints record the zero-based index of the game
            # that just finished.
            _save_checkpoint(model, save_path, game_idx, best_score, avg_score)

    # Average over the last (up to) 100 games; 0.0 when no game was played,
    # which avoids np.mean() of an empty list.
    recent_scores = scores[-100:]
    final_avg = float(np.mean(recent_scores)) if recent_scores else 0.0

    # Final checkpoint; here 'game_idx' holds the total number of games.
    _save_checkpoint(model, save_path, num_games, best_score, final_avg)

    elapsed = time.time() - start_time
    print("\n" + "=" * 50)
    print("Training Complete!")
    print("=" * 50)
    print(f"Total games: {num_games}")
    print(f"Total time: {elapsed:.1f}s")
    print(f"Average score (last 100): {final_avg:.0f}")
    print(f"Best score: {best_score}")
    # default=0 keeps the summary from raising when num_games is 0.
    print(f"Best max tile: {max(max_tiles, default=0)}")
    print(f"Model saved to: {save_path}")

    return model

# Script entry point: train for 500 games, using train_simple's default
# checkpoint path and print interval.
if __name__ == "__main__":
    train_simple(num_games=500)