"""
Improved FruitBox environment that addresses several issues in the baseline:
- Optional backward board generation for solvable boards (high coverage).
- Illegal actions advance time and can carry a penalty; episodes end when no legal actions.
- Incremental action-mask updates so we do not rescan every rectangle on illegal steps.
- Reward can include zero-valued cells to encourage 0 활용 전략.
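
Example (a minimal usage sketch; assumes this module and
envs.backward_generator are importable on the Python path):

    env = FruitBoxEnvImproved(rows=10, cols=17, illegal_action_reward=-0.5)
    obs, info = env.reset(seed=0)
    action = env.sample_valid_action()
    if action is not None:
        obs, reward, terminated, truncated, info = env.step(action)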
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, List
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from envs.backward_generator import BackwardBoardGenerator
@dataclass
class FruitBoxImprovedConfig:
rows: int = 10
cols: int = 17
reward_per_cell: float = 1.0
reward_per_zero_cell: float = 0.0 # zero-valued cells (cleared apples) give no extra reward
illegal_action_reward: float = -1.0
max_steps: int = 500 # safety cap; original game uses time, not steps
# Board generation
use_backward_generator: bool = True
target_coverage: float = 0.95 # only used when use_backward_generator is True
    enforce_total_sum_mod_10: bool = True  # only used by the fallback random generator
# Rendering
render_mode: Optional[str] = None # "ansi" or None
class FruitBoxEnvImproved(gym.Env):
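    """FruitBox environment with precomputed rectangle actions, cached
    rectangle sums, and an incrementally maintained action mask.

    Observations are the (rows, cols) board of digits 0..9 (0 = cleared);
    actions index the precomputed list of axis-aligned rectangles."""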
metadata = {"render_modes": ["ansi"], "render_fps": 30}
def __init__(self, config: Optional[FruitBoxImprovedConfig] = None, **kwargs):
super().__init__()
if config is None:
cfg = FruitBoxImprovedConfig(**kwargs) if kwargs else FruitBoxImprovedConfig()
else:
cfg = config
for k, v in kwargs.items():
setattr(cfg, k, v)
self.cfg: FruitBoxImprovedConfig = cfg
R, C = self.cfg.rows, self.cfg.cols
assert R > 0 and C > 0, "rows and cols must be positive"
# Observation: integers 0..9 (0 means empty)
self.observation_space = spaces.Box(low=0, high=9, shape=(R, C), dtype=np.int8)
# Actions: choose any axis-aligned rectangle (r1,c1,r2,c2) with r1<=r2, c1<=c2
rects = []
for r1 in range(R):
for r2 in range(r1, R):
for c1 in range(C):
for c2 in range(c1, C):
rects.append((r1, c1, r2, c2))
self.rects: np.ndarray = np.array(rects, dtype=np.int32) # (N, 4)
self.n_actions: int = self.rects.shape[0]
self.action_space = spaces.Discrete(self.n_actions)
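        # Action count: R(R+1)/2 row spans times C(C+1)/2 column spans,
        # e.g. 55 * 153 = 8415 rectangles for the default 10x17 board.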
# Precompute indices for vectorized prefix-sum rectangle queries
self._idx_r1 = self.rects[:, 0]
self._idx_c1 = self.rects[:, 1]
self._idx_r2p = self.rects[:, 2] + 1 # r2+1
self._idx_c2p = self.rects[:, 3] + 1 # c2+1
# Cell -> list of rectangles that include the cell (for incremental updates)
self._cell_to_rects: List[np.ndarray] = self._build_cell_to_rects()
        self.board: np.ndarray = np.zeros((R, C), dtype=np.int16)
        self.steps: int = 0
        self._last_solution = None  # populated by the backward generator, if used
        self.np_random = np.random.default_rng()
# Cached per-rect sums and mask
self._rect_sums: np.ndarray = np.zeros(self.n_actions, dtype=np.int32)
self._action_mask: np.ndarray = np.zeros(self.n_actions, dtype=bool)
# ---------- utilities ----------
def _build_cell_to_rects(self) -> List[np.ndarray]:
R, C = self.cfg.rows, self.cfg.cols
mapping: List[List[int]] = [[] for _ in range(R * C)]
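        # Single pass over all rectangles; cell (r, c) is flattened to index
        # r * C + c. Memory is proportional to the total rectangle area,
        # paid once at construction time.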
for idx, (r1, c1, r2, c2) in enumerate(self.rects):
for r in range(r1, r2 + 1):
base = r * C
for c in range(c1, c2 + 1):
mapping[base + c].append(idx)
return [np.array(indices, dtype=np.int32) for indices in mapping]
@staticmethod
def _padded_prefix_sums(arr: np.ndarray) -> np.ndarray:
"""Return (R+1, C+1) padded summed-area table."""
R, C = arr.shape
ps = np.zeros((R + 1, C + 1), dtype=np.int32)
ps[1:, 1:] = arr.cumsum(axis=0).cumsum(axis=1)
return ps
def _rect_sums_vectorized(self, ps: np.ndarray) -> np.ndarray:
"""Compute sums for all rectangles using padded prefix sums (vectorized)."""
return (
ps[self._idx_r2p, self._idx_c2p]
- ps[self._idx_r1, self._idx_c2p]
- ps[self._idx_r2p, self._idx_c1]
+ ps[self._idx_r1, self._idx_c1]
)
def _gen_board(self) -> np.ndarray:
"""Generate a board; prefers solvable boards via backward generator."""
R, C = self.cfg.rows, self.cfg.cols
if self.cfg.use_backward_generator:
gen_seed = int(self.np_random.integers(0, 1_000_000_000))
generator = BackwardBoardGenerator(rows=R, cols=C, seed=gen_seed)
board, solution = generator.generate(target_coverage=self.cfg.target_coverage)
self._last_solution = solution
return board.astype(np.int16, copy=False)
# Fallback: random board with sum%10 adjusted
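        # Every legal move removes cells summing to exactly 10, so a fully
        # clearable board needs a total sum divisible by 10; the loop below
        # bumps random cells upward until that holds.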
low, high = 1, 9
board = self.np_random.integers(low, high + 1, size=(R, C), dtype=np.int16)
if self.cfg.enforce_total_sum_mod_10:
delta = int((10 - (board.sum() % 10)) % 10)
tries = 0
while delta > 0 and tries < 100:
r = int(self.np_random.integers(0, R))
c = int(self.np_random.integers(0, C))
inc = min(9 - int(board[r, c]), delta)
if inc > 0:
board[r, c] += inc
delta -= inc
tries += 1
return board
def _compute_full_mask(self, board: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""Compute sums and mask for all rectangles."""
ps_val = self._padded_prefix_sums(board)
sums = self._rect_sums_vectorized(ps_val)
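        # Game rule: a rectangle is legal iff its values sum to exactly 10.
        # Fully cleared rectangles sum to 0, so they can never become legal.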
mask = (sums == 10)
return sums.astype(np.int32, copy=False), mask
def _update_after_clear(self, r1: int, c1: int, r2: int, c2: int, cleared_vals: np.ndarray):
"""
Incrementally update rectangle sums/mask after setting a region to zero.
cleared_vals is the pre-zeroing values of shape (r2-r1+1, c2-c1+1).
"""
R, C = self.cfg.rows, self.cfg.cols
deltas: Dict[int, int] = {}
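        # Only rectangles overlapping the cleared region change; accumulate a
        # per-rectangle delta first so each affected rectangle is touched once.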
for dr, row in enumerate(range(r1, r2 + 1)):
base = row * C
for dc, col in enumerate(range(c1, c2 + 1)):
val = int(cleared_vals[dr, dc])
if val == 0:
continue
cell_rects = self._cell_to_rects[base + col]
for rect_idx in cell_rects:
deltas[rect_idx] = deltas.get(rect_idx, 0) + val
for rect_idx, delta in deltas.items():
self._rect_sums[rect_idx] -= delta
self._action_mask[rect_idx] = (self._rect_sums[rect_idx] == 10)
# ---------- Gymnasium API ----------
def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None) -> Tuple[np.ndarray, dict]:
        super().reset(seed=seed)  # seeds self.np_random per the Gymnasium API
self.steps = 0
self.board = self._gen_board().astype(np.int16, copy=False)
self._rect_sums, self._action_mask = self._compute_full_mask(self.board)
        info = {"action_mask": self._action_mask.copy()}  # copy: the mask is mutated in place on later steps
obs = self.board.clip(0, 9).astype(np.int8, copy=False)
return obs, info
def step(self, action: int):
assert isinstance(action, (int, np.integer)), "action must be an integer index"
terminated = False
truncated = False
reward = 0.0
# Illegal action: advance time, optional penalty, end if no legal actions remain.
if action < 0 or action >= self.n_actions or not self._action_mask[action]:
self.steps += 1
reward = float(self.cfg.illegal_action_reward)
if not self._action_mask.any():
terminated = True
if self.steps >= self.cfg.max_steps:
truncated = True
obs = self.board.clip(0, 9).astype(np.int8, copy=False)
            info = {"action_mask": self._action_mask.copy(), "illegal_action": True}
return obs, reward, terminated, truncated, info
r1, c1, r2, c2 = self.rects[action]
region = self.board[r1 : r2 + 1, c1 : c2 + 1]
cleared_vals = region.copy()
cells_total = region.size
cells_nonzero = int(np.sum(region > 0))
cells_zero = cells_total - cells_nonzero
# Apply action
self.board[r1 : r2 + 1, c1 : c2 + 1] = 0
self.steps += 1
reward = (
self.cfg.reward_per_cell * float(cells_nonzero)
+ self.cfg.reward_per_zero_cell * float(cells_zero)
)
# Incremental mask update
self._update_after_clear(r1, c1, r2, c2, cleared_vals)
if not self._action_mask.any():
terminated = True
if self.steps >= self.cfg.max_steps:
truncated = True
obs = self.board.clip(0, 9).astype(np.int8, copy=False)
        info = {"action_mask": self._action_mask.copy(), "illegal_action": False}
return obs, float(reward), terminated, truncated, info
# ---------- helpers ----------
def legal_actions(self) -> np.ndarray:
return np.nonzero(self._action_mask)[0]
def sample_valid_action(self) -> Optional[int]:
legal = self.legal_actions()
if legal.size == 0:
return None
return int(self.np_random.choice(legal))
# ---------- rendering ----------
def render(self):
if self.cfg.render_mode != "ansi":
return
lines = []
lines.append(f"Steps={self.steps}")
lines.append("+" + "---" * self.cfg.cols + "+")
for r in range(self.cfg.rows):
row_vals = " ".join(f"{int(v):1d}" for v in self.board[r])
lines.append(f"| {row_vals} |")
lines.append("+" + "---" * self.cfg.cols + "+")
return "\n".join(lines)
def close(self):
pass
# ---- quick smoke test ----
if __name__ == "__main__":
env = FruitBoxEnvImproved(FruitBoxImprovedConfig(render_mode="ansi"))
obs, info = env.reset(seed=0)
print("Initial legal actions:", len(np.nonzero(info["action_mask"])[0]))
total = 0.0
while True:
mask = info["action_mask"]
if not mask.any():
break
        a = int(np.flatnonzero(mask)[0])  # greedy: always take the first legal rectangle
obs, r, terminated, truncated, info = env.step(a)
total += r
if env.cfg.render_mode == "ansi":
print(env.render())
if terminated or truncated:
break
print("Episode total reward:", total)