Spaces:
Sleeping
Sleeping
File size: 12,590 Bytes
6f44ddb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 | """
run_rollout.py — Load a trained EduForge model and run a 12-turn demo episode.
Prints per-turn state transitions: Confusion | Strategy | Reward.
Usage
-----
# With a trained model:
python scripts/run_rollout.py --model_dir outputs/eduforge_grpo/final_model
# Without a model (rule-based fallback for demo):
python scripts/run_rollout.py --no_model
"""
from __future__ import annotations
import argparse
import re
import sys
import os
# Ensure project root is on path when run directly
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from src.environment.openenv_wrapper import EduForgeEnv
from src.environment.student_fsm import MisconceptionType, TutorAction
from src.rewards.engine import RewardEngine
# ---------------------------------------------------------------------------
# ANSI colour helpers
# ---------------------------------------------------------------------------
# ANSI SGR escape sequences used to style terminal output in this script.
RESET = "\033[0m"  # clears all styling; _c() appends it after every styled span
BOLD = "\033[1m"  # bold / increased intensity
CYAN = "\033[96m"  # bright cyan foreground
GREEN = "\033[92m"  # bright green foreground
YELLOW = "\033[93m"  # bright yellow foreground
RED = "\033[91m"  # bright red foreground
GREY = "\033[90m"  # bright black (grey) foreground
BLUE = "\033[94m"  # bright blue foreground
def _c(text: str, colour: str) -> str:
    """Return *text* prefixed with the *colour* escape code and followed by RESET.

    *colour* may be a single code or several concatenated codes
    (callers in this file pass e.g. ``BOLD + CYAN``).
    """
    return "{}{}{}".format(colour, text, RESET)
def _bar(value: float, max_val: float = 10.0, width: int = 20, fill: str = "β", empty: str = "β") -> str:
filled = int(round((value / max_val) * width))
return fill * filled + empty * (width - filled)
# ---------------------------------------------------------------------------
# Model loader
# ---------------------------------------------------------------------------
def load_model(model_dir: str):
    """Load a model and tokenizer from *model_dir*.

    Unsloth is tried first (4-bit weights, 2048-token context, switched to
    inference mode).  If an ``ImportError`` is raised anywhere in that path,
    the plain Hugging Face ``transformers`` loaders are used instead.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    try:
        from unsloth import FastLanguageModel  # type: ignore

        loaded = FastLanguageModel.from_pretrained(
            model_name=model_dir,
            max_seq_length=2048,
            load_in_4bit=True,
        )
        model, tokenizer = loaded
        FastLanguageModel.for_inference(model)
    except ImportError:
        # Fallback: plain HF transformers
        from transformers import AutoModelForCausalLM, AutoTokenizer  # type: ignore

        hf_tokenizer = AutoTokenizer.from_pretrained(model_dir)
        hf_model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")
        return hf_model, hf_tokenizer
    else:
        return model, tokenizer
# ---------------------------------------------------------------------------
# Action generator
# ---------------------------------------------------------------------------
# System instruction that _model_action() places ahead of every user prompt.
# It asks the model to answer with a <STRATEGY> tag followed by a <RESPONSE>
# tag and lists the strategy names the model may use.
_SYSTEM_PROMPT = (
    "You are an expert AI tutor. Always respond with:\n"
    "<STRATEGY>{strategy}</STRATEGY>\n"
    "<RESPONSE>{tutoring response}</RESPONSE>\n"
    "Valid strategies: explain, worked_example, hint, question, correct_fact, analogize, repeat."
)
# Strategy the rule-based fallback agent uses for each misconception type
# (looked up by _rule_based_action()).
_EFFECTIVE_MAP: dict[MisconceptionType, TutorAction] = {
    MisconceptionType.PROCEDURAL: TutorAction.WORKED_EXAMPLE,
    MisconceptionType.CONCEPTUAL: TutorAction.EXPLAIN,
    MisconceptionType.FACTUAL: TutorAction.CORRECT_FACT,
    MisconceptionType.TRANSFER: TutorAction.ANALOGIZE,
}
# Canned tutor utterances, keyed by strategy.  _rule_based_action() takes its
# <RESPONSE> text from the list that belongs to the strategy it selected.
# NOTE(review): some entries contain mis-encoded punctuation -- confirm the
# intended characters against the upstream file.
_RESPONSE_BANK: dict[TutorAction, list[str]] = {
    TutorAction.EXPLAIN: [
        "Let me walk you through the core concept step by step.",
        "The key idea here is that the rule applies becauseβ¦",
    ],
    TutorAction.WORKED_EXAMPLE: [
        "Here's a concrete example β watch each step carefully.",
        "Let's solve a simpler version first so you can see the pattern.",
    ],
    TutorAction.HINT: [
        "Think about what happens if you apply the rule backwards.",
        "What do you notice about the first term?",
    ],
    TutorAction.QUESTION: [
        "Can you tell me what you think the first step should be?",
        "Why do you think that step comes before the other?",
    ],
    TutorAction.CORRECT_FACT: [
        "Actually, the correct fact is: the value is always positive here.",
        "That's a common mix-up β the actual definition is slightly different.",
    ],
    TutorAction.ANALOGIZE: [
        "Think of it like water flowing through pipes β same rules apply.",
        "It's similar to how a map scale works; the ratio stays constant.",
    ],
    # Single entry only.
    TutorAction.REPEAT: [
        "As I mentioned, the key step isβ¦",
    ],
}
import random as _random
def _rule_based_action(misconception: MisconceptionType, turn: int, prev_action: str | None) -> str:
    """Deterministic rule-based fallback agent for demo without a trained model.

    The strategy is the one ``_EFFECTIVE_MAP`` pairs with *misconception*,
    except on every third turn (``turn % 3 == 2``), where a question is asked
    instead so the same strategy is not repeated on every turn.

    The response text is chosen from ``_RESPONSE_BANK`` by rotating on *turn*
    rather than by random sampling, so the same arguments always produce the
    same action string, as the "deterministic" contract requires.

    Args:
        misconception: Misconception type of the current student; must be a
            key of ``_EFFECTIVE_MAP``.
        turn: Current turn index.
        prev_action: Previous action string.  Accepted for interface
            compatibility; not used by the current rules.

    Returns:
        An action string made of a ``<STRATEGY>`` tag line followed by a
        ``<RESPONSE>`` tag line.

    Raises:
        KeyError: If *misconception* has no entry in ``_EFFECTIVE_MAP``.
    """
    strategy = _EFFECTIVE_MAP[misconception]
    # Occasionally vary to avoid repetition penalty
    if turn % 3 == 2:
        strategy = TutorAction.QUESTION
    candidates = _RESPONSE_BANK[strategy]
    # Rotate through the bank by turn index: reproducible, and independent of
    # the global random state.
    response = candidates[turn % len(candidates)]
    return f"<STRATEGY>{strategy.value}</STRATEGY>\n<RESPONSE>{response}</RESPONSE>"
def _model_action(model, tokenizer, prompt: str) -> str:
    """Sample one tutor reply from *model* for the given *prompt*.

    The prompt is wrapped in a ``<|system|>`` / ``<|user|>`` / ``<|assistant|>``
    template with ``_SYSTEM_PROMPT`` as the system part, tokenised, and moved
    to the model's device.  Up to 200 new tokens are sampled at temperature
    0.7 with gradients disabled.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens skipped and surrounding whitespace stripped.
    """
    import torch

    chat_text = f"<|system|>\n{_SYSTEM_PROMPT}\n<|user|>\n{prompt}\n<|assistant|>"
    encoded = tokenizer([chat_text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Everything up to the prompt length is the echoed input; keep the rest.
    prompt_len = encoded["input_ids"].shape[1]
    completion_ids = generated[0][prompt_len:]
    reply = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return reply.strip()
# ---------------------------------------------------------------------------
# Pretty printer
# ---------------------------------------------------------------------------
def _print_header(misconception: MisconceptionType) -> None:
    """Print the demo banner: a blank line, a rule, the title, the upper-cased
    misconception type, a closing rule and a trailing blank line."""
    rule = _c("β" * 66, BOLD)
    banner = (
        "",
        rule,
        _c(" EduForge β Demo Rollout", BOLD + CYAN),
        _c(f" Student misconception type: {misconception.value.upper()}", BOLD),
        rule,
        "",
    )
    for line in banner:
        print(line)
def _print_turn(
    turn: int,
    confusion: float,
    attention: float,
    student_text: str,
    action_str: str,
    strategy: str | None,
    reward: float,
    done: bool,
    done_reason: str | None,
) -> None:
    """Print one turn of the rollout as a boxed, colour-coded panel.

    The panel shows the student's utterance, the parsed strategy (or a
    placeholder when *strategy* is falsy), up to 80 characters of the tutor's
    ``<RESPONSE>`` text when that tag is present in *action_str*, and a
    metrics row with confusion, attention and reward.  When *done* is true a
    final line reports how the episode ended.

    Colour rules: confusion above 6 is red, above 3 yellow, otherwise green;
    attention below 3 is red, below 6 yellow, otherwise green; reward is
    green only when strictly positive.
    """
    if confusion > 6:
        conf_colour = RED
    elif confusion > 3:
        conf_colour = YELLOW
    else:
        conf_colour = GREEN

    if attention < 3:
        attn_colour = RED
    elif attention < 6:
        attn_colour = YELLOW
    else:
        attn_colour = GREEN

    print(_c(f"βββ Turn {turn:>2} {'β' * 52}", GREY))
    print(f"β {_c('Student :', CYAN)} {student_text}")
    print(f"β {_c('Strategy:', BLUE)} {strategy or '(none parsed)'}")

    # Tutor response excerpt
    tutor_match = re.search(r"<RESPONSE>(.*?)</RESPONSE>", action_str, re.DOTALL | re.I)
    if tutor_match:
        tutor_text = tutor_match.group(1).strip()
        excerpt = tutor_text[:80]
        print(f"β {_c('Tutor :', BLUE)} {excerpt}")

    # Metrics row
    conf_gauge = _bar(confusion, 10.0, width=14)
    attn_gauge = _bar(attention, 10.0, width=14)
    reward_colour = GREEN if reward > 0 else RED
    conf_label = _c('Confusion', conf_colour)
    conf_value = _c(f'{confusion:.2f}', conf_colour)
    attn_label = _c('Attention', attn_colour)
    attn_value = _c(f'{attention:.2f}', attn_colour)
    reward_label = _c('Reward', reward_colour)
    reward_value = _c(f'{reward:+.4f}', reward_colour)
    print(
        f"β {conf_label} {conf_gauge} {conf_value} "
        f"{attn_label} {attn_gauge} {attn_value} "
        f"{reward_label} {reward_value}"
    )

    if done:
        endings = {"success": "β RESOLVED", "timeout": "β TIMEOUT", "disengaged": "β DISENGAGED"}
        verdict = endings.get(done_reason or "", "β DONE")
        verdict_colour = (GREEN if done_reason == "success" else RED) + BOLD
        print(f"β {_c(verdict, verdict_colour)}")

    print(_c("β" + "β" * 64, GREY))
def _print_summary(results: list[dict]) -> None:
    """Print the end-of-episode summary table.

    Reports the outcome, number of turns, confusion at start and end with the
    signed change, average confusion, total reward, and a per-strategy usage
    table sorted by descending count.

    Args:
        results: One dict per turn, in order.  Each dict must provide the
            keys ``"reward"``, ``"confusion_before"``, ``"confusion"`` and
            ``"strategy"``; ``"done_reason"`` is read from the last entry if
            present.  An empty list prints a short notice and returns, since
            there is nothing to index or average.
    """
    from collections import Counter

    if not results:
        # E.g. a rollout started with max_turns=0: no first/last turn exists.
        print(_c("No turns were played; nothing to summarise.", YELLOW))
        return

    total_r = sum(r["reward"] for r in results)
    c_start = results[0]["confusion_before"]
    c_end = results[-1]["confusion"]
    # Signed change (end - start): negative means confusion went down.
    change = c_end - c_start
    change_colour = RED if change > 0 else GREEN
    done_r = results[-1].get("done_reason")
    if done_r == "success":
        outcome = _c("β SUCCESS", GREEN + BOLD)
    else:
        outcome = _c(f"β {(done_r or 'unknown').upper()}", RED + BOLD)
    avg_confusion = sum(r["confusion"] for r in results) / len(results)
    action_counts = Counter(r["strategy"] or "(none)" for r in results)
    total_actions = sum(action_counts.values())

    print()
    print(_c("β" * 66, BOLD))
    print(_c(" Episode Summary", BOLD + CYAN))
    print(_c("β" * 66, BOLD))
    print(f" Outcome : {outcome}")
    print(f" Turns used : {len(results)}")
    # The sign comes from the number itself, so an increase is shown as "+x.xx"
    # in red instead of a hard-coded minus sign in green.
    print(f" Confusion Ξ : {c_start:.2f} β {c_end:.2f} ({_c(f'{change:+.2f}', change_colour)})")
    print(f" Avg confusion : {_c(f'{avg_confusion:.2f}', YELLOW)}")
    print(f" Total reward : {_c(f'{total_r:+.4f}', GREEN if total_r > 0 else RED)}")
    print(_c(" β" * 33, GREY))
    print(f" {'Action':<18} {'Count':>5} {'%':>6}")
    print(_c(" β" * 33, GREY))
    # most_common() sorts by count, descending; ties keep first-seen order.
    for act, cnt in action_counts.most_common():
        pct = cnt / total_actions * 100
        print(f" {act:<18} {cnt:>5} {pct:>5.1f}%")
    print(_c("β" * 66, BOLD))
    print()
# ---------------------------------------------------------------------------
# Main rollout runner
# ---------------------------------------------------------------------------
def run_rollout(
    model=None,
    tokenizer=None,
    seed: int = 0,
    max_turns: int = 12,
) -> list[dict]:
    """Play one demo episode and print it turn by turn.

    A fresh ``EduForgeEnv`` (built with *seed*) and ``RewardEngine`` are
    created and reset.  On each of at most *max_turns* turns an action is
    produced -- by the model when both *model* and *tokenizer* are given,
    otherwise by the rule-based fallback -- the environment is stepped, the
    reward engine scores the transition, and the turn is printed and
    recorded.  The loop stops early when the environment reports ``done``.
    A summary is printed at the end.

    Returns:
        One dict per played turn with the keys ``turn``,
        ``confusion_before``, ``confusion``, ``attention``, ``strategy``,
        ``reward`` and ``done_reason``.
    """
    env = EduForgeEnv(seed=seed)
    reward_engine = RewardEngine()
    obs = env.reset()
    reward_engine.reset()
    _print_header(obs.misconception_id)

    use_model = model is not None and tokenizer is not None
    strategy_tag = re.compile(r"<STRATEGY>\s*\w+\s*</STRATEGY>", re.I)
    history: list[dict] = []
    last_action: str | None = None

    for _ in range(max_turns):
        confusion_before = obs.confusion

        # Generate action
        if use_model:
            prompt = f"[Turn {obs.turn}] Student: \"{obs.student_response}\"\nMisconception: {obs.misconception_id.value}"
            action_str = _model_action(model, tokenizer, prompt)
        else:
            action_str = _rule_based_action(obs.misconception_id, obs.turn, last_action)

        # Environment step (the environment's own reward is not used here)
        obs, _env_reward, done, info = env.step(action_str)

        # Reward
        has_strategy_tag = strategy_tag.search(action_str) is not None
        _total, breakdown = reward_engine.compute(
            confusion_before=confusion_before,
            confusion_after=obs.confusion,
            attention_after=obs.attention,
            action_text=action_str,
            format_valid=has_strategy_tag,
            done=done,
            done_reason=info.done_reason,
            episode_length=obs.turn,
        )

        parsed = info.parsed_action
        strategy = parsed.value if parsed else None

        _print_turn(
            turn=obs.turn,
            confusion=obs.confusion,
            attention=obs.attention,
            student_text=obs.student_response,
            action_str=action_str,
            strategy=strategy,
            reward=breakdown.total,
            done=done,
            done_reason=info.done_reason,
        )

        record = {
            "turn": obs.turn,
            "confusion_before": confusion_before,
            "confusion": obs.confusion,
            "attention": obs.attention,
            "strategy": strategy,
            "reward": breakdown.total,
            "done_reason": info.done_reason,
        }
        history.append(record)

        last_action = action_str
        if done:
            break

    _print_summary(history)
    return history
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command line, optionally load a model, and run one rollout.

    A model is loaded only when ``--no_model`` is absent and ``--model_dir``
    is given; without a model directory a notice is printed and the
    rule-based fallback is used.
    """
    parser = argparse.ArgumentParser(description="EduForge rollout demo")
    cli_options = (
        ("--model_dir", dict(default=None, help="Path to trained model directory")),
        ("--no_model", dict(action="store_true", help="Use rule-based fallback (no GPU needed)")),
        ("--seed", dict(type=int, default=0, help="Episode seed")),
        ("--max_turns", dict(type=int, default=12, help="Maximum turns")),
    )
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    model, tokenizer = None, None
    wants_model = not args.no_model
    if wants_model and args.model_dir is None:
        print(_c("No --model_dir given. Using rule-based fallback.", YELLOW))
    elif wants_model:
        print(_c(f"Loading model from {args.model_dir} β¦", CYAN))
        model, tokenizer = load_model(args.model_dir)
        print(_c("Model loaded.", GREEN))

    run_rollout(model=model, tokenizer=tokenizer, seed=args.seed, max_turns=args.max_turns)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|