Spaces:

trioskosmos
/

LovecaSim

Sleeping

App Files Files Community

LovecaSim / ai /agents /fast_mcts.py

trioskosmos

Upload ai/agents/fast_mcts.py with huggingface_hub

25d7b1e verified 3 months ago

raw

history blame contribute delete

6.15 kB

	import math
	from dataclasses import dataclass
	from typing import Dict, List, Tuple

	import numpy as np

	# Assuming GameState interface from existing code
	# We import the actual GameState to be safe
	from engine.game.game_state import GameState


	@dataclass
	class HeuristicMCTSConfig:
	num_simulations: int = 100
	c_puct: float = 1.4
	depth_limit: int = 50


	class HeuristicNode:
	def __init__(self, parent=None, prior=1.0):
	self.parent = parent
	self.children: Dict[int, "HeuristicNode"] = {}
	self.visit_count = 0
	self.value_sum = 0.0
	self.prior = prior
	self.untried_actions: List[int] = []
	self.player_just_moved = -1

	@property
	def value(self):
	if self.visit_count == 0:
	return 0
	return self.value_sum / self.visit_count

	def ucb_score(self, c_puct):
	# Standard UCB1
	if self.visit_count == 0:
	return float("inf")

	# UCB = Q + c * sqrt(ln(N_parent) / N_child)
	# Note: AlphaZero uses a slightly different variant with Priors.
	# Since we don't have a policy network, we assume uniform priors or just use standard UCB.
	# Let's use standard UCB for "MCTS without training"

	parent_visits = self.parent.visit_count if self.parent else 1
	exploitation = self.value
	exploration = c_puct * math.sqrt(math.log(parent_visits) / self.visit_count)
	return exploitation + exploration


	class HeuristicMCTS:
	"""
	MCTS that uses random rollouts and heuristics instead of a Neural Network.
	This works 'without training' because it relies on the game rules (simulation)
	and hard-coded domain knowledge (rollout policy / terminal evaluation).
	"""

	def __init__(self, config: HeuristicMCTSConfig):
	self.config = config
	self.root = None

	def search(self, state: GameState) -> int:
	self.root = HeuristicNode(prior=1.0)
	# We need to copy state for the root? Actually search loop copies it.
	# But we need to know legal actions.
	legal = state.get_legal_actions()
	self.root.untried_actions = [i for i, x in enumerate(legal) if x]
	self.root.player_just_moved = 1 - state.current_player # Parent moved previously

	for _ in range(self.config.num_simulations):
	node = self.root
	sim_state = state.copy()

	# 1. Selection
	path = [node]
	while node.children and not node.untried_actions:
	action, node = self._select_best_step(node)
	sim_state = sim_state.step(action)
	path.append(node)

	# 2. Expansion
	if node.untried_actions:
	action = node.untried_actions.pop()
	sim_state = sim_state.step(action)
	child = HeuristicNode(parent=node, prior=1.0)
	child.player_just_moved = 1 - sim_state.current_player # The player who took 'action'
	node.children[action] = child
	node = child
	path.append(node)

	# 3. Simulation (Rollout)
	# Run until terminal or depth limit
	depth = 0
	while not sim_state.is_terminal() and depth < self.config.depth_limit:
	legal = sim_state.get_legal_actions()
	legal_indices = [i for i, x in enumerate(legal) if x]
	if not legal_indices:
	break
	# Random Policy (No training required)
	action = np.random.choice(legal_indices)
	sim_state = sim_state.step(action)
	depth += 1

	# 4. Backpropagation
	# If terminal, get reward. If cutoff, use heuristic.
	if sim_state.is_terminal():
	# reward is relative to current_player
	# We need reward from perspective of root player?
	# Usually standard MCTS backprops values flipping each layer
	reward = sim_state.get_reward(state.current_player) # 1.0 if root wins
	else:
	reward = self._heuristic_eval(sim_state, state.current_player)

	for i, n in enumerate(reversed(path)):
	n.visit_count += 1
	# If n.player_just_moved == root_player, this node represents a state AFTER root moved.
	# So its value should be positive if root won.
	# Standard: if player_just_moved won, +1.

	# Simpler view: All values tracked relative to Root Player.
	n.value_sum += reward

	# Select best move (robust child)
	if not self.root.children:
	return 0 # Fallback

	best_action = max(self.root.children.items(), key=lambda item: item[1].visit_count)[0]
	return best_action

	def _select_best_step(self, node: HeuristicNode) -> Tuple[int, HeuristicNode]:
	# Standard UCB
	best_score = -float("inf")
	best_item = None

	for action, child in node.children.items():
	score = child.ucb_score(self.config.c_puct)
	if score > best_score:
	best_score = score
	best_item = (action, child)

	return best_item

	def _heuristic_eval(self, state: GameState, root_player: int) -> float:
	"""
	Evaluate state without a neural network.
	Logic: More blades/hearts/lives = Better.
	"""
	p = state.players[root_player]
	opp = state.players[1 - root_player]

	# Score = (My Lives - Opp Lives) + 0.1 * (My Power - Opp Power)
	score = 0.0
	score += (len(p.success_lives) - len(opp.success_lives)) * 0.5

	my_power = p.get_total_blades(state.member_db)
	opp_power = opp.get_total_blades(state.member_db)
	score += (my_power - opp_power) * 0.05

	# Clamp to [-1, 1]
	return max(-1.0, min(1.0, score))


	if __name__ == "__main__":
	pass