AI-Tutor-A2C / core /environment.py
Daksh C Jain
Complete revamp — professional A2C tutoring platform
5251c5a
"""
AI Tutor RL Environment — models a student learning across N subjects.
State: proficiency scores in [0, 1] for each subject
Action: which subject to focus on (0..N-1)
Reward: proficiency of the focused subject after the step
→ encourages the agent to focus on subjects where it can make progress
Dynamics:
- Focused subject gains 12–28 percentage points (simulates a focused study session)
- All other subjects decay by 0.5–2.5 percentage points (simulates forgetting / drift)
- Episode ends when all subjects reach 98%+ mastery, or is truncated after `max_steps` (200) steps
"""
from __future__ import annotations
import numpy as np
import gymnasium as gym
from gymnasium import spaces
# Display names of the subjects the student studies. The position of a name in
# this list is the subject index used by the other per-subject lists below.
SUBJECTS = [
    "Mathematics",
    "Physics",
    "Literature",
    "History",
    "Computer Science",
]

# One hex colour per subject, parallel to SUBJECTS
# (presumably consumed by the UI layer; not used in this module).
SUBJECT_COLORS = ["#6366f1", "#10b981", "#f59e0b", "#ec4899", "#3b82f6"]

# One short glyph per subject, parallel to SUBJECTS. The non-ASCII glyphs were
# previously stored as mis-decoded UTF-8 byte sequences (mojibake); they are
# restored here to the intended characters.
SUBJECT_ICONS = ["∑", "⚛", "📖", "🏛", "</>"]

# Default number of subjects, derived from SUBJECTS so the two cannot drift.
N_SUBJECTS = len(SUBJECTS)
class AITutorEnv(gym.Env):
    """
    Gymnasium-compatible tutoring environment.

    Observation space: Box([0, 1]^N) — normalised proficiency per subject.
    Action space: Discrete(N) — index of the subject to study this step.

    Each step raises the chosen subject by a random gain and lowers every
    other subject by a small random decay. The reward is the chosen subject's
    proficiency after the step. An episode terminates once every subject is
    at or above ``MASTERY_THRESHOLD`` and is truncated after ``max_steps``
    steps.

    All randomness is drawn from ``self.np_random`` so that
    ``reset(seed=...)`` makes episodes reproducible.
    """

    metadata = {"render_modes": []}

    # (low, high) bounds of the uniform per-step gain for the studied subject.
    GAIN_RANGE = (0.12, 0.28)
    # (low, high) bounds of the uniform per-step decay for every other subject.
    DECAY_RANGE = (0.005, 0.025)
    # (low, high) bounds of the uniform initial proficiency drawn by reset().
    INITIAL_RANGE = (0.05, 0.35)
    # Proficiency every subject must reach for the episode to terminate.
    MASTERY_THRESHOLD = 0.98

    def __init__(self, n_subjects: int = N_SUBJECTS, max_steps: int = 200):
        """
        Build the spaces and an all-zero initial state.

        Args:
            n_subjects: Number of subjects (size of the observation vector
                and of the discrete action space).
            max_steps: Hard cap on episode length; reaching it sets the
                ``truncated`` flag so episodes cannot run forever.
        """
        super().__init__()
        self.n = n_subjects
        self.observation_space = spaces.Box(
            low=0.0, high=1.0, shape=(n_subjects,), dtype=np.float32
        )
        self.action_space = spaces.Discrete(n_subjects)
        self.state = np.zeros(n_subjects, dtype=np.float32)
        self.step_count = 0
        self.max_steps = max_steps

    def reset(self, seed: int | None = None, options: dict | None = None):
        """
        Start a new episode with low, varied proficiency in every subject.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``, which seeds
                ``self.np_random``.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``observation`` is a float32 copy
            of the new state and ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        low, high = self.INITIAL_RANGE
        # Sample from the env's own generator (not the global np.random) so
        # the seed passed above actually determines the initial state.
        self.state = self.np_random.uniform(low, high, self.n).astype(np.float32)
        self.step_count = 0
        return self.state.copy(), {}

    def step(self, action: int):
        """
        Study one subject for a step and let the others decay.

        Args:
            action: Index of the subject to study, in ``[0, n)``.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``reward`` is the studied subject's proficiency after the step;
            ``terminated`` is true once all subjects are mastered;
            ``truncated`` is true once ``max_steps`` steps have been taken;
            ``info`` carries ``"mastered"`` and ``"step"``.

        Raises:
            ValueError: If ``action`` is outside ``[0, n)``.
        """
        action = int(action)
        # Reject bad indices up front: a negative index would otherwise wrap
        # around silently and train a different subject than requested.
        if not 0 <= action < self.n:
            raise ValueError(
                f"action must be in [0, {self.n}), got {action}"
            )
        self.step_count += 1

        rng = self.np_random
        gain = float(rng.uniform(*self.GAIN_RANGE))
        decay = rng.uniform(*self.DECAY_RANGE, size=self.n)
        decay[action] = 0.0  # the studied subject is not forgotten this step

        # Do the arithmetic in float64, then cast back so the stored state and
        # returned observation keep the float32 dtype of observation_space.
        updated = self.state.astype(np.float64)
        updated[action] += gain
        updated -= decay
        self.state = np.clip(updated, 0.0, 1.0).astype(np.float32)

        reward = float(self.state[action])  # reward = current mastery
        mastered = bool(np.all(self.state >= self.MASTERY_THRESHOLD))
        truncated = self.step_count >= self.max_steps
        terminated = mastered
        info = {"mastered": mastered, "step": self.step_count}
        return self.state.copy(), reward, terminated, truncated, info

    def set_state(self, proficiency_pct: list[float]):
        """
        Inject a specific state (from UI sliders, in 0–100 range).

        Values are divided by 100 and clipped to [0, 1]; the step counter is
        reset to zero.

        Args:
            proficiency_pct: One percentage per subject.

        Raises:
            ValueError: If the number of values differs from the number of
                subjects.
        """
        values = np.asarray(proficiency_pct, dtype=np.float32)
        # A wrong-length vector would only fail later, inside step(); fail
        # here where the cause is obvious.
        if values.shape != (self.n,):
            raise ValueError(
                f"expected {self.n} proficiency values, got shape {values.shape}"
            )
        self.state = np.clip(values / 100.0, 0.0, 1.0).astype(np.float32)
        self.step_count = 0