"""
AI Tutor RL Environment — models a student learning across N subjects.

State:  proficiency scores in [0, 1] for each subject
Action: which subject to focus on (0..N-1)
Reward: proficiency of the focused subject after the step
  → encourages the agent to focus on subjects where it can make progress

Dynamics:
  - Focused subject gains +12–28 percentage points (simulates a focused study session)
  - All other subjects decay by 0.5–2.5 percentage points (simulates forgetting / drift)
  - Episode ends when all subjects reach 98%+ mastery, or is truncated after 200 steps
"""

from __future__ import annotations
import numpy as np
import gymnasium as gym
from gymnasium import spaces

# One row per subject: (display name, UI colour as hex, UI icon).
# Keeping the attributes side by side means the three parallel lists
# derived below can never drift out of alignment.
_SUBJECT_TABLE = (
    ("Mathematics", "#6366f1", "∑"),
    ("Physics", "#10b981", "⚛"),
    ("Literature", "#f59e0b", "📖"),
    ("History", "#ec4899", "🏛"),
    ("Computer Science", "#3b82f6", "</>"),
)

# Parallel lists, index-aligned with the action / observation indices.
SUBJECTS = [row[0] for row in _SUBJECT_TABLE]
SUBJECT_COLORS = [row[1] for row in _SUBJECT_TABLE]
SUBJECT_ICONS = [row[2] for row in _SUBJECT_TABLE]

# Default number of subjects modelled by the environment.
N_SUBJECTS = len(SUBJECTS)


class AITutorEnv(gym.Env):
    """
    Gymnasium-compatible tutoring environment.

    Observation space: Box([0,1]^N) — normalised proficiency per subject
    Action space:      Discrete(N)  — which subject to study

    All randomness is drawn from ``self.np_random`` (seeded through
    ``reset(seed=...)``), so episodes are reproducible for a given seed.
    The internal state is always kept as ``float32`` so observations match
    the declared observation space.
    """

    metadata = {"render_modes": []}

    def __init__(self, n_subjects: int = N_SUBJECTS):
        """Create an environment tracking ``n_subjects`` proficiency scores."""
        super().__init__()
        self.n = n_subjects
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(n_subjects,), dtype=np.float32
        )
        self.action_space = spaces.Discrete(n_subjects)
        self.state = np.zeros(n_subjects, dtype=np.float32)
        self.step_count = 0
        self.max_steps = 200   # hard cap prevents infinite episodes

    def reset(self, seed: int | None = None, options: dict | None = None):
        """Start a new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``; it seeds
                ``self.np_random``, which drives every random draw here.
            options: Accepted for API compatibility; not used.

        Returns:
            ``(observation, info)`` — a copy of the initial proficiency
            vector and an empty info dict.
        """
        super().reset(seed=seed)
        # Start students with low, varied proficiency. Use the env's own
        # generator (not the global np.random) so `seed` is honoured.
        self.state = self.np_random.uniform(0.05, 0.35, self.n).astype(np.float32)
        self.step_count = 0
        return self.state.copy(), {}

    def step(self, action: int):
        """Study one subject for a step and let the others decay.

        Args:
            action: Index of the subject to focus on, in ``0..n-1``.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where the
            reward is the focused subject's proficiency after the step,
            ``terminated`` is True once every subject is at 0.98 or above,
            and ``truncated`` is True once ``max_steps`` steps have elapsed.

        Raises:
            IndexError: If ``action`` is outside ``0..n-1``. Negative
                indices are rejected rather than silently wrapping around
                to another subject.
        """
        if not 0 <= action < self.n:
            raise IndexError(
                f"action {action!r} is out of range for {self.n} subjects"
            )

        self.step_count += 1

        # Learning gain for focused subject
        gain = float(self.np_random.uniform(0.12, 0.28))
        self.state[action] = min(1.0, self.state[action] + gain)

        # Forgetting for all other subjects
        decay = self.np_random.uniform(0.005, 0.025, self.n)
        decay[action] = 0.0
        # `decay` is float64; cast back so the state (and therefore the
        # observation) stays float32 as declared by observation_space.
        self.state = np.maximum(0.0, self.state - decay).astype(np.float32)

        reward = float(self.state[action])                    # reward = current mastery
        mastered = bool(np.all(self.state >= 0.98))
        truncated = self.step_count >= self.max_steps
        terminated = mastered

        info = {"mastered": mastered, "step": self.step_count}
        return self.state.copy(), reward, terminated, truncated, info

    def set_state(self, proficiency_pct: list[float]):
        """Inject a specific state (from UI sliders, in 0–100 range).

        Values are scaled to [0, 1] and clipped; the step counter is reset.

        Raises:
            ValueError: If ``proficiency_pct`` does not contain exactly one
                value per subject.
        """
        values = np.asarray(proficiency_pct, dtype=np.float32)
        if values.shape != (self.n,):
            raise ValueError(
                f"expected {self.n} proficiency values, got shape {values.shape}"
            )
        self.state = np.clip(values / 100.0, 0.0, 1.0)
        self.step_count = 0