| """ |
| Quilltale — Automated Evaluation Runner |
| |
| Plays through a fixed scenario and measures three metrics: |
| 1. invalid_transition_rate — how often the GM proposes invalid state changes |
| 2. memory_utilisation_rate — how often NPC memories actually shape narration |
| 3. factual_consistency_rate — how often narration stays consistent with world state |
| |
| Run with: |
| python eval_runner.py |
| |
| Outputs: |
| eval_results/report_<timestamp>.json |
| eval_results/report_<timestamp>.txt |
| """ |
|
|
| import json |
| import os |
| import logging |
| from datetime import datetime |
| from pathlib import Path |
| from dataclasses import dataclass, field |
|
|
| from src.world.state import WorldState |
| from src.agents.game_master import GameMasterAgent |
| from src.llm import get_llm |
| from src.llm.base import BaseLLM |
|
|
# Configure the root logger at WARNING so lower-severity messages are suppressed
# while the evaluation prints its own progress with print().
logging.basicConfig(level=logging.WARNING)
|
|
| |
| |
| |
| |
|
|
# Fixed script of player actions; run_evaluation executes them in order, one per turn.
EVAL_SCENARIO = [
    # Look around the tavern and inspect objects in it.
    "look around the tavern carefully",
    "examine the wanted notice on the wall",
    "examine the rusty dagger on the table",

    # Conversation with Marta.
    "talk to Marta the barkeep",
    "ask Marta about the dagger",
    "ask Marta about the chest upstairs",

    # Hostile action toward Marta.
    "threaten Marta to tell you what she knows",

    # Pick up the items examined earlier.
    "pick up the dagger",
    "pick up the wanted notice",

    # NOTE(review): kept apart from the pickups above in the original layout;
    # the reason (e.g. an item that is not in the tavern) is not visible here — confirm.
    "pick up the strange coin",

    # Leave the tavern.
    "go north to the street",
    "go east to the market",

    # Conversation with Aldric.
    "talk to Aldric the merchant",
    "ask Aldric about the strange coin",

    # Return to the tavern.
    "go back to the tavern",

    # Talk to Marta again, after the earlier threat.
    "talk to Marta",

    # Go upstairs and look at the chest.
    "go upstairs to my room",
    "examine the locked chest",

    # NOTE(review): no earlier action picks up an "old iron key"; presumably it is
    # in the starting inventory or this step is meant to be rejected — confirm.
    "use the old iron key on the chest",

    # NOTE(review): final move; whether "south" is a valid exit at this point
    # depends on the world file and is not visible here.
    "go south",
]
|
|
|
|
| |
|
|
# System prompt passed as the second argument to generate_json() by both judge
# functions in this file; it instructs the judge to answer with JSON only.
JUDGE_SYSTEM = """
You are an objective evaluator assessing AI game master output quality.
You always respond with valid JSON only. No preamble or explanation outside the JSON.
"""
|
|
def judge_memory_utilisation(
    llm: "BaseLLM",
    narration: str,
    npc_memories: list,
    npc_name: str,
) -> dict:
    """
    Ask the LLM judge whether a narration reflects an NPC's recorded memories.

    Args:
        llm: Judge model; only ``generate_json(prompt, system)`` is called on it.
        narration: Game-master narration for the turn being judged.
        npc_memories: Memory records exposing ``turn``, ``emotional_tone``,
            ``significance`` and ``description``. Only the first three records
            are shown to the judge.
        npc_name: Name of the NPC the memories belong to.

    Returns:
        The judge's verdict, shaped like
        ``{"reflects_memory": bool, "confidence": float, "reason": str}``.
        With no memories the verdict is trivially positive. If the judge call
        raises, or its reply is not a JSON object, the failure is logged and a
        negative verdict with confidence 0.0 and reason "Judge call failed."
        is returned.
    """
    if not npc_memories:
        return {"reflects_memory": True, "confidence": 1.0, "reason": "No memories to reflect."}

    memory_text = "\n".join(
        f" - Turn {m.turn} ({m.emotional_tone}, sig={m.significance}): {m.description}"
        for m in npc_memories[:3]
    )

    prompt = f"""
{npc_name} has these recorded memories of the player:
{memory_text}

The game master produced this narration:
"{narration}"

Does the narration reflect any of these memories through {npc_name}'s behaviour,
tone, dialogue, or reaction? Even subtle reflection counts (e.g. wariness, coldness,
gratitude shown through action rather than stated directly).

Respond with JSON:
{{"reflects_memory": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}}
"""
    try:
        raw = llm.generate_json(prompt, JUDGE_SYSTEM)
        # Accept a provider that already returns the parsed object.
        judgement = raw if isinstance(raw, dict) else json.loads(raw)
        if not isinstance(judgement, dict):
            # Callers read the verdict with .get(); a list or scalar is unusable.
            raise ValueError(f"judge returned {type(judgement).__name__}, expected a JSON object")
    except Exception:
        # Best-effort boundary around a remote model call: keep the evaluation
        # running, but leave a trace so a failure is not mistaken for a verdict.
        logging.getLogger(__name__).warning("Memory-utilisation judge call failed", exc_info=True)
        return {"reflects_memory": False, "confidence": 0.0, "reason": "Judge call failed."}

    verdict = judgement.get("reflects_memory")
    if isinstance(verdict, str):
        # A quoted "false" would otherwise be truthy and count as a positive verdict.
        judgement["reflects_memory"] = verdict.strip().lower() in ("true", "yes", "1")
    return judgement
|
|
|
|
def judge_factual_consistency(
    llm: "BaseLLM",
    narration: str,
    world_context: str,
) -> dict:
    """
    Ask the LLM judge whether a narration contradicts the recorded world state.

    Args:
        llm: Judge model; only ``generate_json(prompt, system)`` is called on it.
        narration: Game-master narration for the turn being judged.
        world_context: Text summary of the world state the narration is
            checked against.

    Returns:
        The judge's verdict, shaped like
        ``{"is_consistent": bool, "confidence": float, "violation": str}``.
        If the judge call raises, or its reply is not a JSON object, the
        failure is logged and ``is_consistent`` is reported as True with
        confidence 0.0 and violation "Judge call failed."; a confidence of 0.0
        is therefore the signal that no real verdict was obtained.
    """
    prompt = f"""
The current world state contains these facts:
{world_context}

The game master produced this narration:
"{narration}"

Does the narration contradict any recorded facts? Look for:
- Items mentioned that aren't in the current location or inventory
- NPCs described as present when they are not listed
- Movement described to locations not reachable from current exits
- Health or inventory states that differ from recorded values

Respond with JSON:
{{"is_consistent": true/false, "confidence": 0.0-1.0, "violation": "describe any contradiction, or empty string if none"}}
"""
    try:
        raw = llm.generate_json(prompt, JUDGE_SYSTEM)
        # Accept a provider that already returns the parsed object.
        judgement = raw if isinstance(raw, dict) else json.loads(raw)
        if not isinstance(judgement, dict):
            # Callers read the verdict with .get(); a list or scalar is unusable.
            raise ValueError(f"judge returned {type(judgement).__name__}, expected a JSON object")
    except Exception:
        # Best-effort boundary around a remote model call: keep the evaluation
        # running, but leave a trace so a failure is not mistaken for a verdict.
        logging.getLogger(__name__).warning("Factual-consistency judge call failed", exc_info=True)
        return {"is_consistent": True, "confidence": 0.0, "violation": "Judge call failed."}

    verdict = judgement.get("is_consistent")
    if isinstance(verdict, str):
        # A quoted "false" would otherwise be truthy and hide a reported contradiction.
        judgement["is_consistent"] = verdict.strip().lower() in ("true", "yes", "1")
    return judgement
|
|
|
|
| |
|
|
@dataclass
class TurnRecord:
    """Everything recorded about one scripted turn of an evaluation run."""

    # 1-based position of the action in the scenario.
    turn: int
    # Player action text that was sent to the game master.
    action: str
    # Narration the game master returned for this action.
    narration: str
    # The "state_update" mapping from the game master's result.
    state_update: dict
    # The "changes_applied" list from the game master's result.
    changes_applied: list[str]
    # Subset of changes_applied whose text contains "REJECTED".
    rejected_transitions: list[str]
    # Verdict from the memory judge; empty when the turn was not judged.
    memory_judgement: dict = field(default_factory=dict)
    # Verdict from the consistency judge; empty when the turn was not judged.
    consistency_judgement: dict = field(default_factory=dict)
    # Names of the NPCs at the player's location before the turn was processed.
    npcs_present: list[str] = field(default_factory=list)
    # True when at least one of those NPCs was alive and had recorded memories.
    npc_memories_present: bool = False
|
|
|
|
@dataclass
class EvalReport:
    """Aggregated counters, derived rates and per-turn records for one evaluation run."""

    scenario_name: str = "default_world"
    total_turns: int = 0
    timestamp: str = ""

    # Raw counters accumulated turn by turn.
    total_transitions_attempted: int = 0
    total_transitions_rejected: int = 0
    total_turns_with_npcs: int = 0
    total_turns_memory_reflected: int = 0
    total_turns_memory_judged: int = 0
    total_turns_consistent: int = 0
    total_turns_consistency_judged: int = 0

    # Rates derived from the counters by compute_rates().
    invalid_transition_rate: float = 0.0
    memory_utilisation_rate: float = 0.0
    factual_consistency_rate: float = 0.0

    # Full per-turn log. The annotation is a string so it is resolved lazily
    # and only needed by type checkers.
    turns: "list[TurnRecord]" = field(default_factory=list)

    # Samples kept for the written report.
    rejection_examples: list[str] = field(default_factory=list)
    memory_failures: list[dict] = field(default_factory=list)
    consistency_violations: list[dict] = field(default_factory=list)

    def compute_rates(self):
        """Fill in each rate as numerator / denominator, rounded to 3 decimals.

        A rate whose denominator is not positive is left at its current value.
        """
        rate_sources = (
            ("invalid_transition_rate", "total_transitions_rejected", "total_transitions_attempted"),
            ("memory_utilisation_rate", "total_turns_memory_reflected", "total_turns_memory_judged"),
            ("factual_consistency_rate", "total_turns_consistent", "total_turns_consistency_judged"),
        )
        for rate_name, hits_name, total_name in rate_sources:
            denominator = getattr(self, total_name)
            if denominator <= 0:
                continue
            setattr(self, rate_name, round(getattr(self, hits_name) / denominator, 3))
|
|
|
|
| |
|
|
def run_evaluation(
    scenario: list[str] = EVAL_SCENARIO,
    world_path: str = "data/worlds/default.json",
    llm_name: str = "gemini",
    run_judge: bool = True,
) -> EvalReport:
    """
    Play through the scenario automatically and collect metrics.

    Args:
        scenario: List of player actions to execute in order. The list is only
            read, never modified.
        world_path: Path to the world JSON file (read as UTF-8).
        llm_name: LLM provider to use for the GM; the judge uses the same provider.
        run_judge: Whether to run LLM judge calls for memory and consistency.
            Set False to only measure invalid transition rate (cheaper).

    Returns:
        An EvalReport with counters, per-turn records and computed rates.
    """
    print(f"Running evaluation: {len(scenario)} turns, judge={'on' if run_judge else 'off'}")
    print("-" * 60)

    # Explicit UTF-8 so loading does not depend on the platform's default
    # encoding; the reports are written as UTF-8 as well.
    world_file = Path(world_path)
    state = WorldState.from_json(world_file.read_text(encoding="utf-8"))

    llm = get_llm(llm_name)
    judge_llm = get_llm(llm_name) if run_judge else None
    gm = GameMasterAgent(llm)

    report = EvalReport(
        scenario_name=world_file.stem,
        timestamp=datetime.now().isoformat(),
    )

    opening = gm.generate_opening(state)
    print(f"Opening: {opening['narration'][:80]}...")
    print()

    # Keys of a state update that count as one attempted transition each.
    transition_keys = ("move_player", "pickup_item", "drop_item", "npc_state")

    for turn_no, action in enumerate(scenario, start=1):
        print(f"Turn {turn_no:02d}: {action}")

        # Snapshot taken BEFORE the turn: the judges compare the narration
        # against what was true when the player acted.
        world_context_before = state.to_context_summary()
        # Materialised because the collection is iterated more than once below.
        npcs_present = list(state.npcs_in_location(state.player.location))
        npcs_with_memories = [
            npc for npc in npcs_present
            if npc.alive and len(npc.memories) > 0
        ]
        if npcs_present:
            report.total_turns_with_npcs += 1

        result = gm.process_turn(action, state)
        narration = result["narration"]

        # --- Metric 1: invalid transitions ---------------------------------
        # `or` also covers an explicit None for either entry.
        update = result.get("state_update") or {}
        changes_applied = result.get("changes_applied") or []
        rejected = [change for change in changes_applied if "REJECTED" in change]
        attempted = sum(1 for key in transition_keys if key in update)
        # Every rejection implies an attempt. Without this floor, several
        # rejections under one key would push the rate above 100%.
        attempted = max(attempted, len(rejected))

        report.total_transitions_attempted += attempted
        report.total_transitions_rejected += len(rejected)
        if rejected:
            report.rejection_examples.extend(rejected[:2])

        # --- Metric 2: memory utilisation ----------------------------------
        memory_judgement = {}
        if run_judge and npcs_with_memories:
            report.total_turns_memory_judged += 1
            # Judge against the NPC holding the most memories.
            primary_npc = max(npcs_with_memories, key=lambda n: len(n.memories))
            memory_judgement = judge_memory_utilisation(
                judge_llm,
                narration,
                primary_npc.relevant_memories(),
                primary_npc.name,
            )
            if memory_judgement.get("reflects_memory", False):
                report.total_turns_memory_reflected += 1
            else:
                report.memory_failures.append({
                    "turn": turn_no,
                    "action": action,
                    "narration": narration,
                    "reason": memory_judgement.get("reason", ""),
                })

        # --- Metric 3: factual consistency ---------------------------------
        consistency_judgement = {}
        if run_judge:
            report.total_turns_consistency_judged += 1
            consistency_judgement = judge_factual_consistency(
                judge_llm,
                narration,
                world_context_before,
            )
            if consistency_judgement.get("is_consistent", True):
                report.total_turns_consistent += 1
            else:
                report.consistency_violations.append({
                    "turn": turn_no,
                    "action": action,
                    "narration": narration,
                    "violation": consistency_judgement.get("violation", ""),
                })

        # --- Per-turn record -------------------------------------------------
        report.turns.append(TurnRecord(
            turn=turn_no,
            action=action,
            narration=narration,
            state_update=update,
            changes_applied=changes_applied,
            rejected_transitions=rejected,
            memory_judgement=memory_judgement,
            consistency_judgement=consistency_judgement,
            npcs_present=[n.name for n in npcs_present],
            npc_memories_present=bool(npcs_with_memories),
        ))
        report.total_turns += 1

        print(f" → {narration[:80]}...")
        if rejected:
            print(f" ✗ REJECTED: {rejected}")
        print()

    report.compute_rates()
    return report
|
|
|
|
| |
|
|
def write_report(report: "EvalReport", output_dir: str = "eval_results") -> tuple[Path, Path]:
    """
    Write the evaluation report as JSON and as a human-readable text summary.

    Both files are named ``report_<timestamp>`` with ``:`` and ``.`` in the
    timestamp replaced by ``-`` so the name is valid on every platform.

    Args:
        report: The report to serialise. Nested objects are serialised through
            their ``__dict__``.
        output_dir: Target directory; created, including missing parents, if
            it does not exist.

    Returns:
        ``(json_path, txt_path)`` of the two files written.
    """
    out_dir = Path(output_dir)
    # parents=True so a nested directory such as "runs/today/eval" also works.
    out_dir.mkdir(parents=True, exist_ok=True)
    timestamp = report.timestamp.replace(":", "-").replace(".", "-")

    # --- JSON dump ---------------------------------------------------------
    json_path = out_dir / f"report_{timestamp}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report.__dict__, f, indent=2, default=lambda o: o.__dict__)

    def shown_rate(rate, denominator):
        # A metric with nothing measured (e.g. judge disabled) is shown as
        # "n/a"; "0.0%" would read as a measured total failure.
        return f"{rate:.1%}" if denominator else "n/a"

    # --- Text summary ------------------------------------------------------
    lines = [
        "QUILLTALE — EVALUATION REPORT\n",
        "=" * 60 + "\n\n",
        f"Scenario: {report.scenario_name}\n",
        f"Timestamp: {report.timestamp}\n",
        f"Turns: {report.total_turns}\n\n",
        "METRICS\n",
        "-" * 40 + "\n",
        (
            f"Invalid transition rate: "
            f"{shown_rate(report.invalid_transition_rate, report.total_transitions_attempted)} "
            f"({report.total_transitions_rejected}/{report.total_transitions_attempted} transitions rejected)\n"
        ),
        (
            f"Memory utilisation rate: "
            f"{shown_rate(report.memory_utilisation_rate, report.total_turns_memory_judged)} "
            f"({report.total_turns_memory_reflected}/{report.total_turns_memory_judged} turns with NPCs reflected memory)\n"
        ),
        (
            f"Factual consistency rate: "
            f"{shown_rate(report.factual_consistency_rate, report.total_turns_consistency_judged)} "
            f"({report.total_turns_consistent}/{report.total_turns_consistency_judged} turns without contradictions)\n\n"
        ),
    ]

    if report.rejection_examples:
        lines.append("REJECTED TRANSITIONS (sample)\n")
        lines.append("-" * 40 + "\n")
        lines.extend(f" {ex}\n" for ex in report.rejection_examples[:5])
        lines.append("\n")

    if report.memory_failures:
        lines.append(f"MEMORY UTILISATION FAILURES ({len(report.memory_failures)} turns)\n")
        lines.append("-" * 40 + "\n")
        for mf in report.memory_failures[:3]:
            lines.append(f" Turn {mf['turn']}: {mf['action']}\n")
            lines.append(f" Narration: {mf['narration'][:100]}...\n")
            lines.append(f" Reason: {mf['reason']}\n\n")

    if report.consistency_violations:
        lines.append(f"CONSISTENCY VIOLATIONS ({len(report.consistency_violations)} turns)\n")
        lines.append("-" * 40 + "\n")
        for cv in report.consistency_violations[:3]:
            lines.append(f" Turn {cv['turn']}: {cv['action']}\n")
            lines.append(f" Narration: {cv['narration'][:100]}...\n")
            lines.append(f" Violation: {cv['violation']}\n\n")

    lines.append("TURN-BY-TURN SUMMARY\n")
    lines.append("-" * 40 + "\n")
    for t in report.turns:
        status = "✓" if not t.rejected_transitions else "✗"
        # Judge markers appear only for turns that were actually judged.
        mem = ""
        if t.memory_judgement:
            mem = " [mem:✓]" if t.memory_judgement.get("reflects_memory") else " [mem:✗]"
        con = ""
        if t.consistency_judgement:
            con = " [con:✓]" if t.consistency_judgement.get("is_consistent") else " [con:✗]"
        lines.append(f" {status} T{t.turn:02d} {t.action[:45]:<45}{mem}{con}\n")

    txt_path = out_dir / f"report_{timestamp}.txt"
    with open(txt_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    print("Report written to:")
    print(f" {json_path}")
    print(f" {txt_path}")
    return json_path, txt_path
|
|
|
|
| |
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the flags, run the scenario, write both
    # report files, then echo the three headline rates.
    import argparse

    cli = argparse.ArgumentParser(description="Run Quilltale evaluation")
    cli.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM judge calls (only measure transition rate)",
    )
    cli.add_argument(
        "--llm",
        default="gemini",
        help="LLM provider: gemini or claude",
    )
    cli.add_argument(
        "--world",
        default="data/worlds/default.json",
        help="Path to world JSON file",
    )
    options = cli.parse_args()

    judge_enabled = not options.no_judge
    report = run_evaluation(
        scenario=EVAL_SCENARIO,
        world_path=options.world,
        llm_name=options.llm,
        run_judge=judge_enabled,
    )

    write_report(report)

    rule = "=" * 60
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    headline_rates = (
        ("Invalid transition rate:", report.invalid_transition_rate),
        ("Memory utilisation rate:", report.memory_utilisation_rate),
        ("Factual consistency rate:", report.factual_consistency_rate),
    )
    for label, rate in headline_rates:
        print(f"{label} {rate:.1%}")