forthezero
/

vAIbe_2048

Model card Files Files and versions

vAIbe_2048 / train_simple.py

forthezero's picture

Upload 13 files

0642513 verified 2 months ago

history blame contribute delete

4.74 kB

	"""
	Simple training script - trains the model directly and saves it.
	"""
	import os
	import sys
	import time
	import torch
	import numpy as np

	# Put this script's own directory at the front of sys.path before the
	# project imports below run. Presumably game.py, model.py and trainer.py
	# live next to this file, so this lets them resolve regardless of the
	# current working directory - TODO confirm the layout.
	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

	from game import Game2048
	from model import Game2048Transformer
	from trainer import PPOTrainer, RolloutBuffer

	def _score_features(game: "Game2048") -> np.ndarray:
	    """Build the two-element score feature vector for the game's current state.

	    The features are ``accumulated_score / 50000`` and
	    ``situational_score / 200``, each capped at 1.0, as float32.
	    """
	    return np.array(
	        [
	            min(game.accumulated_score / 50000, 1.0),
	            min(game.situational_score / 200, 1.0),
	        ],
	        dtype=np.float32,
	    )


	def _save_checkpoint(model, save_path: str, game_idx: int, best_score, avg_score) -> None:
	    """Save the model weights and training statistics to ``save_path``."""
	    # os.path.dirname() returns "" for a bare filename such as "model.pt",
	    # and os.makedirs("") raises FileNotFoundError, so only create the
	    # parent directory when the path actually has one.
	    parent_dir = os.path.dirname(save_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)
	    torch.save(
	        {
	            'model_state_dict': model.state_dict(),
	            'game_idx': game_idx,
	            'best_score': best_score,
	            'avg_score': avg_score,
	        },
	        save_path,
	    )


	def train_simple(
	    num_games: int = 1000,
	    save_path: str = "checkpoints/model.pt",
	    print_interval: int = 10
	):
	    """Train a fresh Game2048Transformer with PPO by self-play and save it.

	    Plays ``num_games`` games on the CPU, sampling actions from the model.
	    Every transition is pushed into a rollout buffer; whenever the buffer
	    holds at least 64 transitions the trainer is updated and the buffer is
	    cleared. Progress is printed, and a checkpoint written, every
	    ``print_interval`` games; a final checkpoint is written at the end.

	    Args:
	        num_games: Number of games to play.
	        save_path: File the checkpoint dict is written to. Its parent
	            directory is created if it does not exist.
	        print_interval: Print progress and save a checkpoint every this
	            many games. Must be non-zero.

	    Returns:
	        The trained model.
	    """
	    # Imported once per call rather than once per game step (the original
	    # placement was inside the inner loop). Kept function-local so this
	    # block does not depend on the module's import list.
	    from trainer import Transition

	    print("=" * 50)
	    print("2048 AI Simple Training")
	    print("=" * 50)

	    device = "cpu"
	    model = Game2048Transformer().to(device)
	    trainer = PPOTrainer(model, lr=3e-4, device=device)

	    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
	    print(f"Training for {num_games} games...")
	    print("-" * 50)

	    # Running statistics.
	    scores = []
	    max_tiles = []
	    best_score = 0

	    # One buffer shared by all games. Creating a new buffer per game threw
	    # away whatever was left after the last 64-step update of each game,
	    # which almost always included the terminal (done=True) transition.
	    # The buffer is still cleared after every update, so it only ever holds
	    # transitions collected since the most recent update.
	    buffer = RolloutBuffer(capacity=10000)

	    start_time = time.time()

	    for game_idx in range(num_games):
	        # Play one full game.
	        game = Game2048()
	        game.reset()

	        while not game.game_over:
	            state = game.get_state()
	            scores_feat = _score_features(game)
	            valid = game.get_valid_actions()

	            # Add a leading batch dimension of 1 for the model.
	            state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
	            scores_t = torch.FloatTensor(scores_feat).unsqueeze(0).to(device)
	            valid_t = torch.BoolTensor(valid).unsqueeze(0).to(device)

	            # Sample an action (non-deterministic) without tracking gradients.
	            with torch.no_grad():
	                action, log_prob, value = model.get_action(
	                    state_t, scores_t, valid_t, deterministic=False
	                )

	            # Snapshot the pre-move state before the game advances.
	            old_state = state.copy()

	            next_state, reward, _moved, done = game.move(action)

	            buffer.push(
	                Transition(
	                    state=old_state,
	                    scores=scores_feat,
	                    action=action,
	                    reward=reward,
	                    next_state=next_state.copy(),
	                    next_scores=_score_features(game),
	                    done=done,
	                    log_prob=log_prob,
	                    value=value,
	                    valid_actions=valid,
	                )
	            )

	            # Update as soon as a batch of 64 transitions is available.
	            if len(buffer) >= 64:
	                trainer.update(buffer)
	                buffer.clear()

	        # Record the outcome of this game.
	        scores.append(game.accumulated_score)
	        max_tiles.append(game.get_max_tile())
	        best_score = max(best_score, game.accumulated_score)

	        # Periodic progress report and checkpoint.
	        if (game_idx + 1) % print_interval == 0:
	            elapsed = time.time() - start_time
	            avg_score = np.mean(scores[-print_interval:])
	            avg_max_tile = np.mean(max_tiles[-print_interval:])
	            speed = (game_idx + 1) / elapsed

	            # The separators were "\|" in the original: an invalid escape
	            # sequence that also printed a stray backslash.
	            print(
	                f"Game {game_idx + 1}/{num_games} | "
	                f"Avg Score: {avg_score:.0f} | "
	                f"Best: {best_score} | "
	                f"Max Tile: {avg_max_tile:.0f} | "
	                f"Speed: {speed:.2f} games/s"
	            )

	            # NOTE: periodic checkpoints store the 0-based index of the last
	            # finished game, while the final checkpoint stores num_games.
	            _save_checkpoint(model, save_path, game_idx, best_score, avg_score)

	    # Final checkpoint. With num_games == 0 there are no scores, so report
	    # 0 rather than taking the mean / max of an empty list.
	    final_avg = np.mean(scores[-100:]) if scores else 0.0
	    _save_checkpoint(model, save_path, num_games, best_score, final_avg)

	    elapsed = time.time() - start_time
	    print("\n" + "=" * 50)
	    print("Training Complete!")
	    print("=" * 50)
	    print(f"Total games: {num_games}")
	    print(f"Total time: {elapsed:.1f}s")
	    print(f"Average score (last 100): {final_avg:.0f}")
	    print(f"Best score: {best_score}")
	    print(f"Best max tile: {max(max_tiles, default=0)}")
	    print(f"Model saved to: {save_path}")

	    return model

	# Script entry point: when this file is executed directly (not imported),
	# run a 500-game training session with the default save path and interval.
	if __name__ == "__main__":
	    train_simple(num_games=500)