### Spaces: aeesh1 / quilltale — Running — File size: 16,263 Bytes — b759ee5

"""
Quilltale — Automated Evaluation Runner

Plays through a fixed scenario and measures three metrics:
  1. invalid_transition_rate   — how often the GM proposes invalid state changes
  2. memory_utilisation_rate   — how often NPC memories actually shape narration
  3. factual_consistency_rate  — how often narration stays consistent with world state

Run with:
    python eval_runner.py

Outputs:
    eval_results/report_<timestamp>.json
    eval_results/report_<timestamp>.txt
"""

import json
import os
import logging
from datetime import datetime
from pathlib import Path
from dataclasses import dataclass, field

from src.world.state import WorldState
from src.agents.game_master import GameMasterAgent
from src.llm import get_llm
from src.llm.base import BaseLLM

### Ask for root-logger output at WARNING level and above.
logging.basicConfig(level=logging.WARNING)

### Evaluation scenario ########################################
### A fixed sequence of actions designed to stress-test all three metrics.
### Covers: movement, item interaction, NPC conversation,
###         NPC memory escalation, invalid attempts, multi-step routing.

### Twenty player actions, executed in list order. This list is the default
### `scenario` argument of run_evaluation(); turn numbers in the comments
### below are 1-based positions in the list.
EVAL_SCENARIO = [
    ### Turns 1-3: look at the opening scene and its objects
    "look around the tavern carefully",
    "examine the wanted notice on the wall",
    "examine the rusty dagger on the table",

    ### Turns 4-6: talk to Marta so that she accumulates memories of the player
    "talk to Marta the barkeep",
    "ask Marta about the dagger",
    "ask Marta about the chest upstairs",

    ### Turn 7: escalate, giving Marta a hostile memory
    "threaten Marta to tell you what she knows",

    ### Turns 8-9: pick up items
    "pick up the dagger",
    "pick up the wanted notice",

    ### Turn 10: invalid attempt, the item is not in this location
    "pick up the strange coin",

    ### Turns 11-12: move to other locations
    "go north to the street",
    "go east to the market",

    ### Turns 13-14: talk to an NPC in the new location
    "talk to Aldric the merchant",
    "ask Aldric about the strange coin",

    ### Turn 15: return trip that needs more than one step of routing
    "go back to the tavern",

    ### Turn 16: check that Marta still remembers the threat from turn 7
    "talk to Marta",

    ### Turns 17-18: explore upstairs
    "go upstairs to my room",
    "examine the locked chest",

    ### Turn 19: use the key on the chest
    "use the old iron key on the chest",

    ### Turn 20: invalid movement attempt
    "go south",
]


### LLM Judge ########################################

### Passed as the second argument to llm.generate_json() by both judge
### functions in this file; it instructs the judge model to answer with JSON only.
JUDGE_SYSTEM = """
You are an objective evaluator assessing AI game master output quality.
You always respond with valid JSON only. No preamble or explanation outside the JSON.
"""

def judge_memory_utilisation(
    llm: BaseLLM,
    narration: str,
    npc_memories: list,
    npc_name: str,
) -> dict:
    """
    Ask the LLM judge whether a narration reflects an NPC's recorded memories.

    Args:
        llm:          Judge model; only generate_json(prompt, system) is called on it.
        narration:    Narration text the game master produced for the turn.
        npc_memories: Memory objects exposing turn, emotional_tone, significance
                      and description. Only the first three are shown to the judge.
        npc_name:     Name used for the NPC inside the judge prompt.

    Returns:
        The judge's verdict, expected to look like
        {"reflects_memory": bool, "confidence": float, "reason": str}.
        "reflects_memory" is always present and always a real bool.
        With no memories the narration is accepted outright (confidence 1.0).
        If the judge call fails, or its reply is not a JSON object, a
        reflects_memory=False / confidence=0.0 fallback is returned instead
        of raising, and the failure is logged.
    """
    if not npc_memories:
        return {"reflects_memory": True, "confidence": 1.0, "reason": "No memories to reflect."}

    memory_text = "\n".join(
        f"  - Turn {m.turn} ({m.emotional_tone}, sig={m.significance}): {m.description}"
        for m in npc_memories[:3]
    )

    prompt = f"""
            {npc_name} has these recorded memories of the player:
            {memory_text}

            The game master produced this narration:
            "{narration}"

            Does the narration reflect any of these memories through {npc_name}'s behaviour,
            tone, dialogue, or reaction? Even subtle reflection counts (e.g. wariness, coldness,
            gratitude shown through action rather than stated directly).

            Respond with JSON:
            {{"reflects_memory": true/false, "confidence": 0.0-1.0, "reason": "one sentence"}}
            """
    try:
        raw = llm.generate_json(prompt, JUDGE_SYSTEM)
        ### Accept either JSON text or an already-parsed object from the provider.
        verdict = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
        if not isinstance(verdict, dict):
            raise ValueError(
                f"judge returned {type(verdict).__name__}, expected a JSON object"
            )
    except Exception:
        ### Best-effort boundary around the external LLM call: one bad reply must
        ### not abort the whole evaluation run, but it should leave a trace.
        logging.getLogger(__name__).warning(
            "Memory-utilisation judge call failed", exc_info=True
        )
        return {"reflects_memory": False, "confidence": 0.0, "reason": "Judge call failed."}

    ### A verdict such as "false" (a string) is truthy; normalise it so callers
    ### that test the flag directly do not count it as a positive.
    flag = verdict.get("reflects_memory", False)
    if isinstance(flag, str):
        flag = flag.strip().lower() in ("true", "yes", "1")
    verdict["reflects_memory"] = bool(flag)
    return verdict


def judge_factual_consistency(
    llm: BaseLLM,
    narration: str,
    world_context: str,
) -> dict:
    """
    Ask the LLM judge whether a narration contradicts the recorded world state.

    Args:
        llm:           Judge model; only generate_json(prompt, system) is called on it.
        narration:     Narration text the game master produced for the turn.
        world_context: Textual summary of the world facts to check against.

    Returns:
        The judge's verdict, expected to look like
        {"is_consistent": bool, "confidence": float, "violation": str}.
        "is_consistent" is always present and always a real bool; when the
        judge omits it the narration is treated as consistent.
        If the judge call fails, or its reply is not a JSON object, an
        is_consistent=True / confidence=0.0 fallback is returned instead of
        raising, and the failure is logged.
    """
    prompt = f"""
              The current world state contains these facts:
              {world_context}

              The game master produced this narration:
              "{narration}"

              Does the narration contradict any recorded facts? Look for:
              - Items mentioned that aren't in the current location or inventory
              - NPCs described as present when they are not listed
              - Movement described to locations not reachable from current exits
              - Health or inventory states that differ from recorded values

              Respond with JSON:
              {{"is_consistent": true/false, "confidence": 0.0-1.0, "violation": "describe any contradiction, or empty string if none"}}
              """
    try:
        raw = llm.generate_json(prompt, JUDGE_SYSTEM)
        ### Accept either JSON text or an already-parsed object from the provider.
        verdict = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
        if not isinstance(verdict, dict):
            raise ValueError(
                f"judge returned {type(verdict).__name__}, expected a JSON object"
            )
    except Exception:
        ### Best-effort boundary around the external LLM call: one bad reply must
        ### not abort the whole evaluation run, but it should leave a trace.
        ### NOTE(review): the fallback says is_consistent=True, so run_evaluation
        ### tallies a failed judge call as a consistent turn; confidence=0.0 is
        ### the only marker of the failure in the stored judgement.
        logging.getLogger(__name__).warning(
            "Factual-consistency judge call failed", exc_info=True
        )
        return {"is_consistent": True, "confidence": 0.0, "violation": "Judge call failed."}

    ### A verdict such as "false" (a string) is truthy; normalise it so callers
    ### that test the flag directly do not count a contradiction as consistent.
    flag = verdict.get("is_consistent", True)
    if isinstance(flag, str):
        flag = flag.strip().lower() in ("true", "yes", "1")
    verdict["is_consistent"] = bool(flag)
    return verdict


### Metrics collector ########################################

@dataclass
class TurnRecord:
    """Everything captured for one scenario turn by run_evaluation()."""

    ### 1-based position of the action in the scenario.
    turn: int
    ### Player action text that was sent to the game master.
    action: str
    ### Narration returned by the game master for this action.
    narration: str
    ### The "state_update" dict from the game master result ({} when absent).
    state_update: dict
    ### The "changes_applied" entries from the game master result.
    changes_applied: list[str]
    ### Subset of changes_applied whose text contains "REJECTED".
    rejected_transitions: list[str]
    ### Verdict from judge_memory_utilisation(); stays {} when it was not run.
    memory_judgement: dict = field(default_factory=dict)
    ### Verdict from judge_factual_consistency(); stays {} when it was not run.
    consistency_judgement: dict = field(default_factory=dict)
    ### Names of the NPCs found at the player's location before the turn ran.
    npcs_present: list[str] = field(default_factory=list)
    ### True when at least one of those NPCs was alive and had memories.
    npc_memories_present: bool = False


@dataclass
class EvalReport:
    """
    Aggregate result of one evaluation run: raw tallies, the three derived
    rates, the per-turn records and a few collected failure examples.
    """

    scenario_name: str = "default_world"
    total_turns: int = 0
    timestamp: str = ""

    ### Tallies accumulated while the scenario is played
    total_transitions_attempted: int = 0
    total_transitions_rejected: int = 0
    total_turns_with_npcs: int = 0
    total_turns_memory_reflected: int = 0
    total_turns_memory_judged: int = 0
    total_turns_consistent: int = 0
    total_turns_consistency_judged: int = 0

    ### Rates filled in by compute_rates()
    invalid_transition_rate: float = 0.0
    memory_utilisation_rate: float = 0.0
    factual_consistency_rate: float = 0.0

    ### One TurnRecord per executed turn
    turns: list[TurnRecord] = field(default_factory=list)

    ### Samples kept for the written report
    rejection_examples: list[str] = field(default_factory=list)
    memory_failures: list[dict] = field(default_factory=list)
    consistency_violations: list[dict] = field(default_factory=list)

    def compute_rates(self):
        """
        Derive the three rates from the raw tallies, rounded to 3 decimals.

        A rate whose denominator is not positive is left at its current value.
        """
        ratio_sources = (
            (
                "invalid_transition_rate",
                self.total_transitions_rejected,
                self.total_transitions_attempted,
            ),
            (
                "memory_utilisation_rate",
                self.total_turns_memory_reflected,
                self.total_turns_memory_judged,
            ),
            (
                "factual_consistency_rate",
                self.total_turns_consistent,
                self.total_turns_consistency_judged,
            ),
        )
        for rate_attr, hits, total in ratio_sources:
            if total <= 0:
                continue
            setattr(self, rate_attr, round(hits / total, 3))


### Evaluation runner ########################################

def run_evaluation(
    scenario: list[str] = EVAL_SCENARIO,
    world_path: str = "data/worlds/default.json",
    llm_name: str = "gemini",
    run_judge: bool = True,
) -> EvalReport:
    """
    Play through the scenario automatically and collect metrics.

    Args:
        scenario:   List of player actions to execute in order.
        world_path: Path to the world JSON file (read as UTF-8).
        llm_name:   LLM provider to use for the GM (and for the judge).
        run_judge:  Whether to run LLM judge calls for memory and consistency.
                    Set False to only measure invalid transition rate (cheaper).

    Returns:
        An EvalReport with per-turn records, raw tallies and computed rates.

    Raises:
        OSError: if world_path cannot be opened.
    """
    print(f"Running evaluation: {len(scenario)} turns, judge={'on' if run_judge else 'off'}")
    print("-" * 60)

    ### Explicit encoding so the world file is decoded the same way on every
    ### platform; the reports are written as UTF-8 too.
    with open(world_path, encoding="utf-8") as f:
        state = WorldState.from_json(f.read())

    llm = get_llm(llm_name)
    judge_llm = get_llm(llm_name) if run_judge else None
    gm = GameMasterAgent(llm)

    report = EvalReport(
        scenario_name=Path(world_path).stem,
        timestamp=datetime.now().isoformat(),
    )

    ### Generate opening (not counted in metrics — no action to evaluate)
    opening = gm.generate_opening(state)
    print(f"Opening: {opening['narration'][:80]}...")
    print()

    ### Keys of a state update that count as an attempted transition.
    transition_keys = {"move_player", "pickup_item", "drop_item", "npc_state"}

    for turn_no, action in enumerate(scenario, start=1):
        print(f"Turn {turn_no:02d}: {action}")

        ### Snapshot everything the judges compare against BEFORE the GM acts.
        world_context_before = state.to_context_summary()
        npcs_present = state.npcs_in_location(state.player.location)
        npcs_with_memories = [
            npc for npc in npcs_present
            if npc.alive and len(npc.memories) > 0
        ]
        if npcs_present:
            report.total_turns_with_npcs += 1

        ### Judge against the NPC with most memories. Its memories are copied
        ### here, before process_turn, so that a memory recorded during this
        ### very turn cannot be what the narration is judged against.
        primary_npc = None
        memories_before = []
        if run_judge and npcs_with_memories:
            primary_npc = max(npcs_with_memories, key=lambda n: len(n.memories))
            memories_before = list(primary_npc.relevant_memories())

        result = gm.process_turn(action, state)
        narration = result["narration"]
        changes_applied = result["changes_applied"]

        ### Count transitions ("or {}" also covers an explicit None update)
        update = result.get("state_update") or {}
        attempted = sum(1 for k in transition_keys if k in update)
        rejected = [c for c in changes_applied if "REJECTED" in c]

        report.total_transitions_attempted += attempted
        report.total_transitions_rejected += len(rejected)

        if rejected:
            report.rejection_examples.extend(rejected[:2])

        ### Memory utilisation judgement
        memory_judgement = {}
        if primary_npc is not None:
            report.total_turns_memory_judged += 1
            memory_judgement = judge_memory_utilisation(
                judge_llm,
                narration,
                memories_before,
                primary_npc.name,
            )
            if memory_judgement.get("reflects_memory", False):
                report.total_turns_memory_reflected += 1
            else:
                report.memory_failures.append({
                    "turn": turn_no,
                    "action": action,
                    "narration": narration,
                    "reason": memory_judgement.get("reason", ""),
                })

        ### Factual consistency judgement
        consistency_judgement = {}
        if run_judge:
            report.total_turns_consistency_judged += 1
            consistency_judgement = judge_factual_consistency(
                judge_llm,
                narration,
                world_context_before,
            )
            if consistency_judgement.get("is_consistent", True):
                report.total_turns_consistent += 1
            else:
                report.consistency_violations.append({
                    "turn": turn_no,
                    "action": action,
                    "narration": narration,
                    "violation": consistency_judgement.get("violation", ""),
                })

        ### Record turn
        report.turns.append(
            TurnRecord(
                turn=turn_no,
                action=action,
                narration=narration,
                state_update=update,
                changes_applied=changes_applied,
                rejected_transitions=rejected,
                memory_judgement=memory_judgement,
                consistency_judgement=consistency_judgement,
                npcs_present=[n.name for n in npcs_present],
                npc_memories_present=bool(npcs_with_memories),
            )
        )
        report.total_turns += 1

        print(f"         → {narration[:80]}...")
        if rejected:
            print(f"         ✗ REJECTED: {rejected}")
        print()

    report.compute_rates()
    return report


### Report writer ########################################

def write_report(report: EvalReport, output_dir: str = "eval_results"):
    """
    Write the evaluation report as a full JSON dump plus a text summary.

    Both files are named report_<timestamp>.<ext>, where the timestamp is
    report.timestamp with ":" and "." replaced by "-".

    Args:
        report:     The report to serialise.
        output_dir: Target directory; created (including parents) if missing.

    Returns:
        (json_path, txt_path) as Path objects.
    """
    out_dir = Path(output_dir)
    ### parents=True so a nested output directory does not fail on first use.
    out_dir.mkdir(parents=True, exist_ok=True)
    timestamp = report.timestamp.replace(":", "-").replace(".", "-")

    ### Full JSON data report; nested record objects are dumped via __dict__.
    json_path = out_dir / f"report_{timestamp}.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report.__dict__, f, indent=2, default=lambda o: o.__dict__)

    ### Report summary, assembled in memory and written in one go.
    rule = "-" * 40 + "\n"
    lines = [
        "QUILLTALE — EVALUATION REPORT\n",
        "=" * 60 + "\n\n",
        f"Scenario:  {report.scenario_name}\n",
        f"Timestamp: {report.timestamp}\n",
        f"Turns:     {report.total_turns}\n\n",
        "METRICS\n",
        rule,
        f"Invalid transition rate:    {report.invalid_transition_rate:.1%}  "
        f"({report.total_transitions_rejected}/{report.total_transitions_attempted} transitions rejected)\n",
        f"Memory utilisation rate:    {report.memory_utilisation_rate:.1%}  "
        f"({report.total_turns_memory_reflected}/{report.total_turns_memory_judged} turns with NPCs reflected memory)\n",
        f"Factual consistency rate:   {report.factual_consistency_rate:.1%}  "
        f"({report.total_turns_consistent}/{report.total_turns_consistency_judged} turns without contradictions)\n\n",
    ]

    if report.rejection_examples:
        lines.append("REJECTED TRANSITIONS (sample)\n")
        lines.append(rule)
        lines.extend(f"  {ex}\n" for ex in report.rejection_examples[:5])
        lines.append("\n")

    if report.memory_failures:
        lines.append(f"MEMORY UTILISATION FAILURES ({len(report.memory_failures)} turns)\n")
        lines.append(rule)
        for mf in report.memory_failures[:3]:
            lines.append(f"  Turn {mf['turn']}: {mf['action']}\n")
            lines.append(f"  Narration: {mf['narration'][:100]}...\n")
            lines.append(f"  Reason: {mf['reason']}\n\n")

    if report.consistency_violations:
        lines.append(f"CONSISTENCY VIOLATIONS ({len(report.consistency_violations)} turns)\n")
        lines.append(rule)
        for cv in report.consistency_violations[:3]:
            lines.append(f"  Turn {cv['turn']}: {cv['action']}\n")
            lines.append(f"  Narration: {cv['narration'][:100]}...\n")
            lines.append(f"  Violation: {cv['violation']}\n\n")

    lines.append("TURN-BY-TURN SUMMARY\n")
    lines.append(rule)
    for t in report.turns:
        status = "✓" if not t.rejected_transitions else "✗"
        mem = ""
        if t.memory_judgement:
            mem = " [mem:✓]" if t.memory_judgement.get("reflects_memory") else " [mem:✗]"
        con = ""
        if t.consistency_judgement:
            ### Default True mirrors how run_evaluation tallies a verdict that
            ### lacks the "is_consistent" key, so marker and rate agree.
            con = " [con:✓]" if t.consistency_judgement.get("is_consistent", True) else " [con:✗]"
        lines.append(f"  {status} T{t.turn:02d} {t.action[:45]:<45}{mem}{con}\n")

    txt_path = out_dir / f"report_{timestamp}.txt"
    txt_path.write_text("".join(lines), encoding="utf-8")

    print("Report written to:")
    print(f"  {json_path}")
    print(f"  {txt_path}")
    return json_path, txt_path


### Entry point ########################################

if __name__ == "__main__":
    ### Command-line entry: parse options, run the scenario, write both report
    ### files, then echo the three headline rates to the console.
    import argparse

    cli = argparse.ArgumentParser(description="Run Quilltale evaluation")
    cli.add_argument(
        "--no-judge",
        action="store_true",
        help="Skip LLM judge calls (only measure transition rate)",
    )
    cli.add_argument(
        "--llm",
        default="gemini",
        help="LLM provider: gemini or claude",
    )
    cli.add_argument(
        "--world",
        default="data/worlds/default.json",
        help="Path to world JSON file",
    )
    opts = cli.parse_args()

    ### --no-judge switches the judge off, hence the negation.
    judge_enabled = not opts.no_judge
    report = run_evaluation(
        scenario=EVAL_SCENARIO,
        world_path=opts.world,
        llm_name=opts.llm,
        run_judge=judge_enabled,
    )

    write_report(report)

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    print(f"Invalid transition rate:   {report.invalid_transition_rate:.1%}")
    print(f"Memory utilisation rate:   {report.memory_utilisation_rate:.1%}")
    print(f"Factual consistency rate:  {report.factual_consistency_rate:.1%}")