Spaces:

aeesh1
/

quilltale

Running

App Files Files Community

quilltale / eval_runner.py

aeesh1

set up eval pipeline and run evaluations

b759ee5 8 days ago

raw

history blame contribute delete

16.3 kB

	"""
	Quilltale — Automated Evaluation Runner

	Plays through a fixed scenario and measures three metrics:
	1. invalid_transition_rate — how often the GM proposes invalid state changes
	2. memory_utilisation_rate — how often NPC memories actually shape narration
	3. factual_consistency_rate — how often narration stays consistent with world state

	Run with:
	python eval_runner.py

	Outputs:
	eval_results/report_<timestamp>.json
	eval_results/report_<timestamp>.txt
	"""

	import json
	import os
	import logging
	from datetime import datetime
	from pathlib import Path
	from dataclasses import dataclass, field

	from src.world.state import WorldState
	from src.agents.game_master import GameMasterAgent
	from src.llm import get_llm
	from src.llm.base import BaseLLM

	logging.basicConfig(level=logging.WARNING)

	### Evaluation scenario ########################################
	### A fixed sequence of actions designed to stress-test all three metrics.
	### Covers: movement, item interaction, NPC conversation,
	### NPC memory escalation, invalid attempts, multi-step routing.

	### The scenario is assembled from one sub-list per phase so each phase can be
	### read (and its turn numbers checked) on its own. The result is a single
	### flat list of 20 player actions, executed in order.
	EVAL_SCENARIO = (
	    ### Turns 1-3: scene establishment and item inspection
	    [
	        "look around the tavern carefully",
	        "examine the wanted notice on the wall",
	        "examine the rusty dagger on the table",
	    ]
	    ### Turns 4-6: conversation that builds up Marta's memory
	    + [
	        "talk to Marta the barkeep",
	        "ask Marta about the dagger",
	        "ask Marta about the chest upstairs",
	    ]
	    ### Turn 7: memory escalation
	    + ["threaten Marta to tell you what she knows"]
	    ### Turns 8-9: item interaction
	    + [
	        "pick up the dagger",
	        "pick up the wanted notice",
	    ]
	    ### Turn 10: invalid attempt, the item is not in this location
	    + ["pick up the strange coin"]
	    ### Turns 11-12: movement
	    + [
	        "go north to the street",
	        "go east to the market",
	    ]
	    ### Turns 13-14: NPC interaction in a new location
	    + [
	        "talk to Aldric the merchant",
	        "ask Aldric about the strange coin",
	    ]
	    ### Turn 15: multi-step routing back
	    + ["go back to the tavern"]
	    ### Turn 16: check that Marta still remembers the threat
	    + ["talk to Marta"]
	    ### Turns 17-18: upstairs exploration
	    + [
	        "go upstairs to my room",
	        "examine the locked chest",
	    ]
	    ### Turn 19: use key on chest
	    + ["use the old iron key on the chest"]
	    ### Turn 20: invalid movement attempt
	    + ["go south"]
	)


	### LLM Judge ########################################

	### System prompt shared by both judge calls. It is runtime text sent to the
	### LLM, so its wording and whitespace are left exactly as they were.
	JUDGE_SYSTEM = """
	You are an objective evaluator assessing AI game master output quality.
	You always respond with valid JSON only. No preamble or explanation outside the JSON.
	"""


	def _strip_code_fence(text: str) -> str:
	    """Remove a surrounding markdown code fence (``` or ```json) from *text*, if any."""
	    stripped = text.strip()
	    if stripped.startswith("```"):
	        stripped = stripped.strip("`")
	        ### An optional language tag directly follows the opening fence.
	        if stripped[:4].lower() == "json":
	            stripped = stripped[4:]
	    return stripped


	def _ask_judge(llm: BaseLLM, prompt: str, fallback: dict) -> dict:
	    """
	    Send *prompt* to the judge LLM and return its verdict as a dict.

	    Accepts either a JSON string or an already-parsed dict from
	    ``llm.generate_json``. Any exception raised by the call or the parse, and
	    any reply that is not a JSON object, is logged and answered with a copy of
	    *fallback* so that one bad judge reply cannot abort an evaluation run.
	    """
	    log = logging.getLogger(__name__)
	    try:
	        raw = llm.generate_json(prompt, JUDGE_SYSTEM)
	        if isinstance(raw, dict):
	            parsed = raw
	        else:
	            if isinstance(raw, str):
	                raw = _strip_code_fence(raw)
	            parsed = json.loads(raw)
	    except Exception:
	        ### Best-effort boundary: provider errors are not known here, so catch
	        ### broadly, but leave a trace instead of failing silently.
	        log.warning("LLM judge call failed", exc_info=True)
	        return dict(fallback)

	    if not isinstance(parsed, dict):
	        ### Callers read the verdict with .get(); a list or scalar would crash them.
	        log.warning("LLM judge returned %s instead of a JSON object", type(parsed).__name__)
	        return dict(fallback)
	    return parsed


	def judge_memory_utilisation(
	    llm: BaseLLM,
	    narration: str,
	    npc_memories: list,
	    npc_name: str,
	) -> dict:
	    """
	    Ask the LLM judge whether the narration reflects the NPC's recorded memories.

	    Args:
	        llm: Judge model; must provide ``generate_json(prompt, system)``.
	        narration: Game-master narration to assess.
	        npc_memories: Memory objects exposing ``turn``, ``emotional_tone``,
	            ``significance`` and ``description``. Only the first three are
	            shown to the judge.
	        npc_name: Name used for the NPC inside the prompt.

	    Returns:
	        {"reflects_memory": bool, "confidence": float, "reason": str}.
	        With no memories the judge is not called and the result is
	        ``reflects_memory=True``. If the judge call fails the result is
	        ``reflects_memory=False`` with reason "Judge call failed.".
	    """
	    if not npc_memories:
	        return {"reflects_memory": True, "confidence": 1.0, "reason": "No memories to reflect."}

	    memory_text = "\n".join(
	        f" - Turn {m.turn} ({m.emotional_tone}, sig={m.significance}): {m.description}"
	        for m in npc_memories[:3]
	    )

	    prompt = f"""
	{npc_name} has these recorded memories of the player:
	{memory_text}

	The game master produced this narration:
	"{narration}"

	Does the narration reflect any of these memories through {npc_name}'s behaviour,
	tone, dialogue, or reaction? Even subtle reflection counts (e.g. wariness, coldness,
	gratitude shown through action rather than stated directly).

	Respond with JSON:
	{{"reflects_memory": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}}
	"""
	    ### The fallback text is kept stable: it is the only marker of a failed
	    ### call in the returned dict.
	    return _ask_judge(
	        llm,
	        prompt,
	        {"reflects_memory": False, "confidence": 0.0, "reason": "Judge call failed."},
	    )


	def judge_factual_consistency(
	    llm: BaseLLM,
	    narration: str,
	    world_context: str,
	) -> dict:
	    """
	    Ask the LLM judge whether the narration contradicts the world state.

	    Args:
	        llm: Judge model; must provide ``generate_json(prompt, system)``.
	        narration: Game-master narration to assess.
	        world_context: Text listing the world facts to check against.

	    Returns:
	        {"is_consistent": bool, "confidence": float, "violation": str}.
	        If the judge call fails the result is ``is_consistent=True`` with
	        ``confidence=0.0`` and violation "Judge call failed."; callers that
	        aggregate verdicts should check for that marker.
	    """
	    prompt = f"""
	The current world state contains these facts:
	{world_context}

	The game master produced this narration:
	"{narration}"

	Does the narration contradict any recorded facts? Look for:
	- Items mentioned that aren't in the current location or inventory
	- NPCs described as present when they are not listed
	- Movement described to locations not reachable from current exits
	- Health or inventory states that differ from recorded values

	Respond with JSON:
	{{"is_consistent": true/false, "confidence": 0.0-1.0, "violation": "describe any contradiction, or empty string if none"}}
	"""
	    return _ask_judge(
	        llm,
	        prompt,
	        {"is_consistent": True, "confidence": 0.0, "violation": "Judge call failed."},
	    )


	### Metrics collector ########################################

	@dataclass
	class TurnRecord:
	    """Everything recorded about one evaluated turn of the scenario."""

	    ### 1-based position of the action within the scenario.
	    turn: int
	    ### Player action text that was sent to the game master.
	    action: str
	    ### Narration returned by the game master for this action.
	    narration: str
	    ### The "state_update" dict from the game master's result.
	    state_update: dict
	    ### The "changes_applied" entries from the game master's result.
	    changes_applied: list[str]
	    ### Subset of changes_applied whose text contains "REJECTED".
	    rejected_transitions: list[str]
	    ### Judge output for memory utilisation; empty when the turn was not judged.
	    memory_judgement: dict = field(default_factory=dict)
	    ### Judge output for factual consistency; empty when the turn was not judged.
	    consistency_judgement: dict = field(default_factory=dict)
	    ### Names of the NPCs in the player's location for this turn.
	    npcs_present: list[str] = field(default_factory=list)
	    ### True when at least one living NPC present already had memories.
	    npc_memories_present: bool = False


	@dataclass
	class EvalReport:
	    """
	    Aggregated result of one evaluation run.

	    Holds the raw counters accumulated turn by turn, the three rates derived
	    from them by ``compute_rates``, the per-turn records and a sample of
	    failure examples for the written report.
	    """

	    scenario_name: str = "default_world"
	    total_turns: int = 0
	    timestamp: str = ""

	    ### Raw counts
	    total_transitions_attempted: int = 0
	    total_transitions_rejected: int = 0
	    total_turns_with_npcs: int = 0
	    total_turns_memory_reflected: int = 0
	    total_turns_memory_judged: int = 0
	    total_turns_consistent: int = 0
	    total_turns_consistency_judged: int = 0

	    ### Derived metrics
	    invalid_transition_rate: float = 0.0
	    memory_utilisation_rate: float = 0.0
	    factual_consistency_rate: float = 0.0

	    ### Turn-by-turn records
	    turns: list[TurnRecord] = field(default_factory=list)

	    ### Failure examples
	    rejection_examples: list[str] = field(default_factory=list)
	    memory_failures: list[dict] = field(default_factory=list)
	    consistency_violations: list[dict] = field(default_factory=list)

	    @staticmethod
	    def _ratio(numerator: int, denominator: int) -> float:
	        """Return numerator/denominator rounded to 3 places, or 0.0 when nothing was counted."""
	        if denominator > 0:
	            return round(numerator / denominator, 3)
	        return 0.0

	    def compute_rates(self):
	        """
	        Recompute the three derived rates from the current raw counts.

	        Every rate is overwritten on each call, so a rate whose denominator
	        has dropped to zero is reset to 0.0 rather than keeping a stale value
	        from an earlier computation.
	        """
	        self.invalid_transition_rate = self._ratio(
	            self.total_transitions_rejected, self.total_transitions_attempted
	        )
	        self.memory_utilisation_rate = self._ratio(
	            self.total_turns_memory_reflected, self.total_turns_memory_judged
	        )
	        self.factual_consistency_rate = self._ratio(
	            self.total_turns_consistent, self.total_turns_consistency_judged
	        )


	### Evaluation runner ########################################

	def _read_verdict(judgement, key: str):
	    """
	    Split a judge response into (judgement_dict, verdict).

	    ``verdict`` is True/False when the judge gave a usable answer under *key*
	    and None when it did not: the response is not a dict, the key is missing,
	    or the response is the judges' failure fallback (marked by the text
	    "Judge call failed." in its reason/violation field).
	    """
	    if not isinstance(judgement, dict):
	        return {}, None
	    failed = "Judge call failed." in (judgement.get("reason"), judgement.get("violation"))
	    if failed or key not in judgement:
	        return judgement, None
	    value = judgement[key]
	    if isinstance(value, str):
	        ### A model may quote the boolean; bool("false") would be True.
	        return judgement, value.strip().lower() == "true"
	    return judgement, bool(value)


	def run_evaluation(
	    scenario: list[str] = EVAL_SCENARIO,
	    world_path: str = "data/worlds/default.json",
	    llm_name: str = "gemini",
	    run_judge: bool = True,
	) -> EvalReport:
	    """
	    Play through the scenario automatically and collect metrics.

	    Judge responses that carry no usable verdict (failed call, missing key)
	    are stored on the turn record but are left out of the memory and
	    consistency counters, so a failing judge cannot inflate or deflate the
	    reported rates.

	    Args:
	        scenario: List of player actions to execute in order.
	        world_path: Path to the world JSON file.
	        llm_name: LLM provider to use for the GM (and for the judge).
	        run_judge: Whether to run LLM judge calls for memory and consistency.
	            Set False to only measure invalid transition rate (cheaper).

	    Returns:
	        The populated EvalReport with rates already computed.
	    """
	    print(f"Running evaluation: {len(scenario)} turns, judge={'on' if run_judge else 'off'}")
	    print("-" * 60)

	    with open(world_path, encoding="utf-8") as f:
	        state = WorldState.from_json(f.read())

	    llm = get_llm(llm_name)
	    judge_llm = get_llm(llm_name) if run_judge else None
	    gm = GameMasterAgent(llm)

	    report = EvalReport(
	        scenario_name=Path(world_path).stem,
	        timestamp=datetime.now().isoformat(),
	    )
	    log = logging.getLogger(__name__)

	    ### Generate opening (not counted in metrics — no action to evaluate)
	    opening = gm.generate_opening(state)
	    print(f"Opening: {opening['narration'][:80]}...")
	    print()

	    ### State-update keys that count as one attempted transition each.
	    ### NOTE(review): rejections are counted per "REJECTED" entry, so the
	    ### rejected total can exceed the attempted total — confirm intended.
	    transition_keys = ("move_player", "pickup_item", "drop_item", "npc_state")

	    for turn_no, action in enumerate(scenario, start=1):
	        print(f"Turn {turn_no:02d}: {action}")

	        ### Everything below up to process_turn is a snapshot of the world
	        ### as it was when the action was issued.
	        ### NOTE(review): the consistency judge only sees this pre-turn
	        ### context, so changes made by this turn may look like contradictions.
	        world_context_before = state.to_context_summary()
	        npcs_present = state.npcs_in_location(state.player.location)
	        npcs_with_memories = [
	            npc for npc in npcs_present
	            if npc.alive and len(npc.memories) > 0
	        ]

	        ### Pick the NPC to judge and copy its memories before the turn runs.
	        ### Presumably process_turn can record new memories; judging against
	        ### those would compare the narration with a memory of itself.
	        primary_npc = None
	        memories_before = []
	        if run_judge and npcs_with_memories:
	            ### Judge against the NPC with most memories
	            primary_npc = max(npcs_with_memories, key=lambda n: len(n.memories))
	            memories_before = list(primary_npc.relevant_memories())

	        result = gm.process_turn(action, state)
	        narration = result["narration"]

	        ### Count transitions ("or" also covers an explicit None value)
	        update = result.get("state_update") or {}
	        changes_applied = result.get("changes_applied") or []
	        attempted = sum(1 for k in transition_keys if k in update)
	        rejected = [c for c in changes_applied if "REJECTED" in c]

	        report.total_transitions_attempted += attempted
	        report.total_transitions_rejected += len(rejected)

	        if rejected:
	            report.rejection_examples.extend(rejected[:2])

	        ### Memory utilisation judgement
	        memory_judgement = {}
	        if primary_npc is not None:
	            memory_judgement, reflected = _read_verdict(
	                judge_memory_utilisation(
	                    judge_llm,
	                    narration,
	                    memories_before,
	                    primary_npc.name,
	                ),
	                "reflects_memory",
	            )
	            if reflected is None:
	                log.warning("Turn %d: no usable memory verdict; turn excluded from rate", turn_no)
	            else:
	                report.total_turns_memory_judged += 1
	                if reflected:
	                    report.total_turns_memory_reflected += 1
	                else:
	                    report.memory_failures.append({
	                        "turn": turn_no,
	                        "action": action,
	                        "narration": narration,
	                        "reason": memory_judgement.get("reason", ""),
	                    })

	        ### Factual consistency judgement
	        consistency_judgement = {}
	        if run_judge:
	            consistency_judgement, consistent = _read_verdict(
	                judge_factual_consistency(
	                    judge_llm,
	                    narration,
	                    world_context_before,
	                ),
	                "is_consistent",
	            )
	            if consistent is None:
	                log.warning("Turn %d: no usable consistency verdict; turn excluded from rate", turn_no)
	            else:
	                report.total_turns_consistency_judged += 1
	                if consistent:
	                    report.total_turns_consistent += 1
	                else:
	                    report.consistency_violations.append({
	                        "turn": turn_no,
	                        "action": action,
	                        "narration": narration,
	                        "violation": consistency_judgement.get("violation", ""),
	                    })

	        ### Record turn
	        report.turns.append(TurnRecord(
	            turn=turn_no,
	            action=action,
	            narration=narration,
	            state_update=update,
	            changes_applied=changes_applied,
	            rejected_transitions=rejected,
	            memory_judgement=memory_judgement,
	            consistency_judgement=consistency_judgement,
	            npcs_present=[n.name for n in npcs_present],
	            npc_memories_present=bool(npcs_with_memories),
	        ))
	        report.total_turns += 1

	        print(f" → {narration[:80]}...")
	        if rejected:
	            print(f" ✗ REJECTED: {rejected}")
	        print()

	    report.compute_rates()
	    return report


	### Report writer ########################################

	def _json_fallback(obj):
	    """Convert a value json cannot encode: sets become lists, objects their attributes, anything else its str()."""
	    if isinstance(obj, (set, frozenset)):
	        return list(obj)
	    try:
	        return vars(obj)
	    except TypeError:
	        ### Losing one odd value is better than losing the whole report.
	        return str(obj)


	def _format_rate(rate: float, measured: int) -> str:
	    """Format *rate* as a percentage, or "n/a" when nothing was measured for it."""
	    return f"{rate:.1%}" if measured > 0 else "n/a"


	def write_report(report: EvalReport, output_dir: str = "eval_results") -> tuple[Path, Path]:
	    """
	    Write the report as a full JSON dump and a human-readable text summary.

	    Args:
	        report: The evaluation report to persist.
	        output_dir: Target directory; created (including parents) if missing.

	    Returns:
	        (json_path, txt_path) of the two files written. File names are derived
	        from the report timestamp with ":" and "." replaced by "-".
	    """
	    out_dir = Path(output_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)
	    stamp = report.timestamp.replace(":", "-").replace(".", "-")

	    ### Full JSON data report
	    json_path = out_dir / f"report_{stamp}.json"
	    with open(json_path, "w", encoding="utf-8") as f:
	        json.dump(vars(report), f, indent=2, default=_json_fallback)

	    ### Report Summary, assembled in memory and written in one go
	    lines = [
	        "QUILLTALE — EVALUATION REPORT\n",
	        "=" * 60 + "\n\n",
	        f"Scenario: {report.scenario_name}\n",
	        f"Timestamp: {report.timestamp}\n",
	        f"Turns: {report.total_turns}\n\n",
	        "METRICS\n",
	        "-" * 40 + "\n",
	        ### A rate with a zero denominator was never measured (e.g. judge
	        ### switched off); show "n/a" rather than a misleading 0.0%.
	        f"Invalid transition rate: "
	        f"{_format_rate(report.invalid_transition_rate, report.total_transitions_attempted)} "
	        f"({report.total_transitions_rejected}/{report.total_transitions_attempted} transitions rejected)\n",
	        f"Memory utilisation rate: "
	        f"{_format_rate(report.memory_utilisation_rate, report.total_turns_memory_judged)} "
	        f"({report.total_turns_memory_reflected}/{report.total_turns_memory_judged} turns with NPCs reflected memory)\n",
	        f"Factual consistency rate: "
	        f"{_format_rate(report.factual_consistency_rate, report.total_turns_consistency_judged)} "
	        f"({report.total_turns_consistent}/{report.total_turns_consistency_judged} turns without contradictions)\n\n",
	    ]

	    if report.rejection_examples:
	        lines.append("REJECTED TRANSITIONS (sample)\n")
	        lines.append("-" * 40 + "\n")
	        lines.extend(f" {ex}\n" for ex in report.rejection_examples[:5])
	        lines.append("\n")

	    if report.memory_failures:
	        lines.append(f"MEMORY UTILISATION FAILURES ({len(report.memory_failures)} turns)\n")
	        lines.append("-" * 40 + "\n")
	        for mf in report.memory_failures[:3]:
	            lines.append(f" Turn {mf['turn']}: {mf['action']}\n")
	            lines.append(f" Narration: {mf['narration'][:100]}...\n")
	            lines.append(f" Reason: {mf['reason']}\n\n")

	    if report.consistency_violations:
	        lines.append(f"CONSISTENCY VIOLATIONS ({len(report.consistency_violations)} turns)\n")
	        lines.append("-" * 40 + "\n")
	        for cv in report.consistency_violations[:3]:
	            lines.append(f" Turn {cv['turn']}: {cv['action']}\n")
	            lines.append(f" Narration: {cv['narration'][:100]}...\n")
	            lines.append(f" Violation: {cv['violation']}\n\n")

	    lines.append("TURN-BY-TURN SUMMARY\n")
	    lines.append("-" * 40 + "\n")
	    for t in report.turns:
	        status = "✓" if not t.rejected_transitions else "✗"
	        ### Judgement markers appear only for turns that were actually judged.
	        mem = ""
	        if t.memory_judgement:
	            mem = " [mem:✓]" if t.memory_judgement.get("reflects_memory") else " [mem:✗]"
	        con = ""
	        if t.consistency_judgement:
	            con = " [con:✓]" if t.consistency_judgement.get("is_consistent") else " [con:✗]"
	        lines.append(f" {status} T{t.turn:02d} {t.action[:45]:<45}{mem}{con}\n")

	    txt_path = out_dir / f"report_{stamp}.txt"
	    with open(txt_path, "w", encoding="utf-8") as f:
	        f.writelines(lines)

	    print("Report written to:")
	    print(f" {json_path}")
	    print(f" {txt_path}")
	    return json_path, txt_path


	### Entry point ########################################

	def main(argv=None):
	    """
	    Command-line entry point: run the evaluation, write the report files and
	    print a short summary.

	    Args:
	        argv: Argument list to parse; None means use the process arguments.
	    """
	    import argparse

	    parser = argparse.ArgumentParser(description="Run Quilltale evaluation")
	    parser.add_argument("--no-judge", action="store_true",
	                        help="Skip LLM judge calls (only measure transition rate)")
	    parser.add_argument("--llm", default="gemini",
	                        help="LLM provider: gemini or claude")
	    parser.add_argument("--world", default="data/worlds/default.json",
	                        help="Path to world JSON file")
	    args = parser.parse_args(argv)

	    report = run_evaluation(
	        scenario=EVAL_SCENARIO,
	        world_path=args.world,
	        llm_name=args.llm,
	        run_judge=not args.no_judge,
	    )

	    write_report(report)

	    def shown(rate, measured):
	        ### A zero denominator means the metric was never measured (for
	        ### example with --no-judge); "0.0%" would read as a real result.
	        return f"{rate:.1%}" if measured > 0 else "n/a"

	    print("\n" + "=" * 60)
	    print("SUMMARY")
	    print("=" * 60)
	    print(f"Invalid transition rate: "
	          f"{shown(report.invalid_transition_rate, report.total_transitions_attempted)}")
	    print(f"Memory utilisation rate: "
	          f"{shown(report.memory_utilisation_rate, report.total_turns_memory_judged)}")
	    print(f"Factual consistency rate: "
	          f"{shown(report.factual_consistency_rate, report.total_turns_consistency_judged)}")


	if __name__ == "__main__":
	    main()