# vAIbe_2048 / train_simple.py
# Uploaded by forthezero ("Upload 13 files", commit 0642513, verified)
"""
Simple training script - trains the model directly and saves it.
"""
import os
import sys
import time
import torch
import numpy as np
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from game import Game2048
from model import Game2048Transformer
from trainer import PPOTrainer, RolloutBuffer
def _score_features(game) -> np.ndarray:
    """Return the two normalized score features passed to the model.

    The accumulated score is divided by 50000 and the situational score by
    200; each value is capped at 1.0. The result is a float32 array of
    length 2.
    """
    return np.array(
        [
            min(game.accumulated_score / 50000, 1.0),
            min(game.situational_score / 200, 1.0),
        ],
        dtype=np.float32,
    )


def _save_checkpoint(model, save_path: str, game_idx: int, best_score, avg_score) -> None:
    """Save the model weights and summary statistics to ``save_path``."""
    parent_dir = os.path.dirname(save_path)
    # os.path.dirname() returns "" for a bare file name such as "model.pt",
    # and os.makedirs("") raises FileNotFoundError, so only create the
    # directory when there actually is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save({
        'model_state_dict': model.state_dict(),
        'game_idx': game_idx,
        'best_score': best_score,
        'avg_score': avg_score
    }, save_path)


def train_simple(
    num_games: int = 1000,
    save_path: str = "checkpoints/model.pt",
    print_interval: int = 10
):
    """Train a Game2048Transformer by self-play and save it to disk.

    Plays ``num_games`` games one after another on the CPU. Every step is
    stored as a ``Transition`` in a per-game ``RolloutBuffer``; whenever the
    buffer holds at least 64 transitions, ``PPOTrainer.update`` is called and
    the buffer is cleared. Every ``print_interval`` games a progress line is
    printed and a checkpoint is written to ``save_path``; a final checkpoint
    is written after the last game.

    Args:
        num_games: Number of games to play. Values <= 0 play no games and
            only write the final checkpoint.
        save_path: Destination file of the checkpoint. May be a bare file
            name (no directory component).
        print_interval: Report progress and checkpoint every this many
            games. Must be at least 1.

    Returns:
        The trained model instance.

    Raises:
        ValueError: If ``print_interval`` is smaller than 1.
    """
    if print_interval < 1:
        raise ValueError(f"print_interval must be >= 1, got {print_interval}")

    # Imported once here instead of on every environment step.
    from trainer import Transition

    # Number of buffered transitions that triggers a trainer update.
    update_threshold = 64

    print("=" * 50)
    print("2048 AI Simple Training")
    print("=" * 50)

    device = "cpu"
    model = Game2048Transformer().to(device)
    trainer = PPOTrainer(model, lr=3e-4, device=device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Training for {num_games} games...")
    print("-" * 50)

    # Running statistics.
    scores = []
    max_tiles = []
    best_score = 0
    start_time = time.time()

    for game_idx in range(num_games):
        # Play one full game.
        game = Game2048()
        game.reset()
        # NOTE(review): a fresh buffer is created for every game, so any
        # transitions still buffered when the game ends (fewer than
        # `update_threshold`) are never passed to the trainer. Confirm this
        # is intended.
        buffer = RolloutBuffer(capacity=10000)

        while not game.game_over:
            state = game.get_state()
            scores_feat = _score_features(game)
            valid = game.get_valid_actions()

            # Convert to batched tensors (batch size 1).
            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
            scores_t = torch.FloatTensor(scores_feat).unsqueeze(0).to(device)
            valid_t = torch.BoolTensor(valid).unsqueeze(0).to(device)

            # Sample an action from the current policy.
            with torch.no_grad():
                action, log_prob, value = model.get_action(
                    state_t, scores_t, valid_t, deterministic=False
                )

            # Snapshot the pre-move state before the game is advanced.
            old_state = state.copy()
            next_state, reward, _moved, done = game.move(action)

            # Store the transition; the "next" score features are computed
            # from the game after the move.
            buffer.push(Transition(
                state=old_state,
                scores=scores_feat,
                action=action,
                reward=reward,
                next_state=next_state.copy(),
                next_scores=_score_features(game),
                done=done,
                log_prob=log_prob,
                value=value,
                valid_actions=valid
            ))

            # Update the policy as soon as enough transitions are collected.
            if len(buffer) >= update_threshold:
                trainer.update(buffer)
                buffer.clear()

        # Record the result of this game.
        scores.append(game.accumulated_score)
        max_tiles.append(game.get_max_tile())
        best_score = max(best_score, game.accumulated_score)

        # Periodic progress report and checkpoint.
        if (game_idx + 1) % print_interval == 0:
            elapsed = time.time() - start_time
            avg_score = np.mean(scores[-print_interval:])
            avg_max_tile = np.mean(max_tiles[-print_interval:])
            # Guard against a zero reading from a coarse clock.
            speed = (game_idx + 1) / elapsed if elapsed > 0 else float("inf")
            print(
                f"Game {game_idx + 1}/{num_games} | "
                f"Avg Score: {avg_score:.0f} | "
                f"Best: {best_score} | "
                f"Max Tile: {avg_max_tile:.0f} | "
                f"Speed: {speed:.2f} games/s"
            )
            # The periodic checkpoint stores the zero-based index of the
            # game that just finished (the final one stores `num_games`).
            _save_checkpoint(model, save_path, game_idx, best_score, avg_score)

    # Final checkpoint. With no games played there is nothing to average.
    final_avg = np.mean(scores[-100:]) if scores else 0.0
    _save_checkpoint(model, save_path, num_games, best_score, final_avg)

    elapsed = time.time() - start_time
    print("\n" + "=" * 50)
    print("Training Complete!")
    print("=" * 50)
    print(f"Total games: {num_games}")
    print(f"Total time: {elapsed:.1f}s")
    print(f"Average score (last 100): {final_avg:.0f}")
    print(f"Best score: {best_score}")
    print(f"Best max tile: {max(max_tiles, default=0)}")
    print(f"Model saved to: {save_path}")

    return model
# Script entry point: when this file is executed directly, train for 500
# games using the default checkpoint path and print interval.
if __name__ == "__main__":
    train_simple(num_games=500)