"""Deterministic seeker simulator with hidden internal state.

Why rule-based / deterministic?
-------------------------------
The OpenEnv graders must be reproducible. An LLM-driven seeker would make
reward non-deterministic and fail the "score variance check" in Phase 2 of
judging. We deliberately trade some linguistic realism for full determinism
so that the same action sequence always yields the same reward — a hard
requirement of the hackathon rubric ("graders deterministic and
reproducible").

Design
------
The seeker is a finite-state machine with continuous hidden variables:

    distress  ∈ [0, 1]  — how emotionally overwhelmed the seeker feels
    trust     ∈ [0, 1]  — how safe the seeker feels with the agent
    openness  ∈ [0, 1]  — willingness to reveal the *true* issue
    revealed  ∈ {0, 1}  — has the core issue surfaced yet?
    stage     ∈ enum    — opening / exploring / reflecting / planning / closing

On each turn, the environment analyses the agent's reply with a small bank of
deterministic feature detectors (keyword/regex based), then applies a
transition rule to update the hidden state and pick the seeker's next
utterance from a scripted response tree indexed by (stage, features).
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Tuple


class Stage(str, Enum):
    """Conversation stages, declared in the order the seeker progresses."""

    OPENING = "opening"
    EXPLORING = "exploring"
    REFLECTING = "reflecting"
    PLANNING = "planning"
    CLOSING = "closing"


# ---------------------------------------------------------------------------
# Feature detectors — deterministic text analysis of the agent's reply.
# ---------------------------------------------------------------------------

EMPATHY_PATTERNS = [
    r"\bi\s+(hear|understand|get|see)\s+(you|that|how)",
    r"\bthat\s+(sounds|must\s+be|seems)\b",
    r"\bit\s+makes\s+sense\b",
    r"\bi\s+can\s+imagine\b",
    r"\bthank\s+you\s+for\s+sharing\b",
    r"\bi'?m\s+(here|glad|sorry)\b",
]

VALIDATION_PATTERNS = [
    r"\byour\s+feelings?\s+(are|make)\s+(valid|sense)",
    r"\bit'?s\s+(okay|ok|normal|understandable)\s+to\s+feel",
    r"\banyone\s+would\s+feel\b",
    r"\bof\s+course\s+you\s+(feel|are)\b",
]

OPEN_QUESTION_PATTERNS = [
    r"\bhow\s+(are|do|did|does)\b",
    r"\bwhat\s+(is|are|do|does|has|makes|brought|happened)\b",
    r"\bcan\s+you\s+tell\s+me\s+more\b",
    r"\bwould\s+you\s+like\s+to\s+(talk|share)\b",
]

ADVICE_PATTERNS = [
    r"\byou\s+should\b",
    r"\byou\s+(need|have|ought)\s+to\b",
    r"\btry\s+(to|doing|this)\b",
    r"\bjust\s+(do|go|try|stop|start)\b",
    r"\bwhy\s+don'?t\s+you\b",
    r"\bmy\s+advice\b",
]

DISMISSIVE_PATTERNS = [
    r"\bget\s+over\s+it\b",
    r"\bstop\s+(complaining|whining|crying)\b",
    r"\byou'?re\s+overreacting\b",
    r"\bit'?s\s+not\s+a\s+big\s+deal\b",
    r"\bcalm\s+down\b",
    r"\bit\s+could\s+be\s+worse\b",
]

INTERROGATIVE_PATTERNS = [
    # rapid-fire closed questions (trust drain when high)
    r"\?\s*\?",
]

SAFETY_PATTERNS = [
    r"\bare\s+you\s+safe\b",
    r"\bprofessional\s+help\b",
    r"\bcrisis\s+line\b",
    r"\btherapist\b",
]

# The pattern banks above only accept the ASCII apostrophe. Replies frequently
# contain typographic apostrophes ("I’m", "it’s", "don’t"), so those are
# folded to "'" before matching; otherwise such replies silently score zero.
_APOSTROPHE_TABLE = str.maketrans({"\u2018": "'", "\u2019": "'", "\u02bc": "'"})


def _normalize(text: str) -> str:
    """Lower-case *text* and fold typographic apostrophes to ASCII "'"."""
    return text.lower().translate(_APOSTROPHE_TABLE)


def _count_matches(patterns: List[str], text: str) -> int:
    """Return how many of *patterns* match somewhere in *text*.

    Each pattern contributes at most 1 regardless of how many times it
    occurs. Matching is done on the normalised (lower-cased,
    apostrophe-folded) text.
    """
    t = _normalize(text)
    return sum(1 for p in patterns if re.search(p, t))


@dataclass
class Features:
    """Per-reply feature counts produced by :func:`extract_features`."""

    empathy: int
    validation: int
    open_question: int
    advice: int
    dismissive: int
    interrogative: int
    safety: int
    length: int
    closed_question: int  # any '?' not matched by open
    bare: bool  # very short / empty reply


def extract_features(text: str) -> Features:
    """Run every detector bank over the agent's reply.

    ``None`` or empty input is treated as an empty string. ``length`` is the
    length of the whitespace-stripped reply, and ``bare`` is set when that
    length is below 8 characters. ``closed_question`` is the number of '?'
    characters minus the number of open-question patterns matched, floored
    at 0.
    """
    stripped = (text or "").strip()

    # _count_matches normalises case itself, so the stripped text is passed
    # through unchanged.
    open_q = _count_matches(OPEN_QUESTION_PATTERNS, stripped)
    total_q = stripped.count("?")

    return Features(
        empathy=_count_matches(EMPATHY_PATTERNS, stripped),
        validation=_count_matches(VALIDATION_PATTERNS, stripped),
        open_question=open_q,
        advice=_count_matches(ADVICE_PATTERNS, stripped),
        dismissive=_count_matches(DISMISSIVE_PATTERNS, stripped),
        interrogative=_count_matches(INTERROGATIVE_PATTERNS, stripped),
        safety=_count_matches(SAFETY_PATTERNS, stripped),
        length=len(stripped),
        closed_question=max(0, total_q - open_q),
        bare=len(stripped) < 8,
    )


# ---------------------------------------------------------------------------
# Seeker state + scripted persona
# ---------------------------------------------------------------------------


@dataclass
class SeekerPersona:
    """Static configuration describing the seeker's initial state + script."""

    task_id: str
    scenario_brief: str
    surface_concern: str  # what seeker says at turn 0
    true_issue: str  # hidden; only revealed if openness crosses threshold
    initial_distress: float
    initial_trust: float
    initial_openness: float
    reveal_threshold: float  # openness value at which true_issue is revealed
    trust_fragility: float  # how much a misstep drops trust (0..1)
    openness_gain_per_empathy: float
    distress_drop_per_validation: float
    # Scripted utterances by stage when cooperative
    opening_lines: List[str]
    exploring_lines: List[str]
    reflecting_lines: List[str]
    planning_lines: List[str]
    closing_lines: List[str]
    reveal_line: str  # said the turn openness crosses reveal_threshold
    # Adverse reactions
    dismissed_lines: List[str] = field(default_factory=list)
    advice_too_early_lines: List[str] = field(default_factory=list)


@dataclass
class SeekerState:
    """Mutable hidden state updated each turn."""

    persona: SeekerPersona
    distress: float
    trust: float
    openness: float
    revealed: bool
    stage: Stage
    last_line_idx_by_stage: Dict[Stage, int]
    turn: int

    @classmethod
    def from_persona(cls, persona: SeekerPersona) -> "SeekerState":
        """Build the turn-0 state from the persona's initial values."""
        return cls(
            persona=persona,
            distress=persona.initial_distress,
            trust=persona.initial_trust,
            openness=persona.initial_openness,
            revealed=False,
            stage=Stage.OPENING,
            # -1 so that the first line picked for any stage is index 0.
            last_line_idx_by_stage={s: -1 for s in Stage},
            turn=0,
        )

    # Snapshot for lookahead simulation — must be cheap and pure.
    def snapshot(self) -> "SeekerState":
        """Return an independent copy; the (static) persona is shared."""
        return SeekerState(
            persona=self.persona,
            distress=self.distress,
            trust=self.trust,
            openness=self.openness,
            revealed=self.revealed,
            stage=self.stage,
            # Copied so stepping the snapshot cannot advance the live
            # state's line rotation.
            last_line_idx_by_stage=dict(self.last_line_idx_by_stage),
            turn=self.turn,
        )


def _clip(x: float) -> float:
    """Clamp *x* to the closed interval [0, 1]."""
    return max(0.0, min(1.0, x))


# Stage ordering used for "progress" scalar in [0,1]
STAGE_ORDER: List[Stage] = [
    Stage.OPENING,
    Stage.EXPLORING,
    Stage.REFLECTING,
    Stage.PLANNING,
    Stage.CLOSING,
]


def stage_progress(stage: Stage) -> float:
    """Map *stage* to its position in STAGE_ORDER, scaled to [0, 1]."""
    return STAGE_ORDER.index(stage) / (len(STAGE_ORDER) - 1)


def resolution_score(state: SeekerState) -> float:
    """Scalar summary of how 'resolved' the conversation currently is, in [0,1].

    Weighted combination of stage progress, trust gained, distress relieved,
    and whether the true issue surfaced. This is the quantity the
    future-oriented reward tries to project forward under an oracle policy.
    """
    p = state.persona
    progress = stage_progress(state.stage)
    # Only improvements relative to the persona's starting point count;
    # regressions contribute 0 rather than a negative term.
    trust_gain = max(0.0, state.trust - p.initial_trust)
    distress_relief = max(0.0, p.initial_distress - state.distress)
    reveal_bonus = 1.0 if state.revealed else 0.0
    # Gains are normalised by the available headroom; the 1e-6 floor avoids
    # division by zero when a persona starts at trust 1.0 or distress 0.0.
    return _clip(
        0.40 * progress
        + 0.25 * trust_gain / max(1e-6, 1.0 - p.initial_trust)
        + 0.25 * distress_relief / max(1e-6, p.initial_distress)
        + 0.10 * reveal_bonus
    )


# ---------------------------------------------------------------------------
# Transition: given current state + agent features, produce new state +
# seeker's next utterance + transition info.
# ---------------------------------------------------------------------------


@dataclass
class Transition:
    """Result of one :func:`step_seeker` call."""

    new_state: SeekerState
    seeker_utterance: str
    flags: Dict[str, bool]  # e.g. {"dismissed": True, "advice_too_early": False, ...}


def _next_line(state: SeekerState, stage: Stage, pool: List[str]) -> str:
    """Return the next line of *pool* in round-robin order.

    Advances ``state.last_line_idx_by_stage[stage]`` as a side effect. An
    empty pool yields the placeholder "..." and leaves the counter untouched.
    """
    if not pool:
        return "..."
    idx = (state.last_line_idx_by_stage[stage] + 1) % len(pool)
    state.last_line_idx_by_stage[stage] = idx
    return pool[idx]


def step_seeker(state: SeekerState, features: Features) -> Transition:
    """Apply one turn of seeker dynamics given the agent's extracted features.

    *state* is mutated **in place** and the returned ``Transition.new_state``
    is that same object. Callers that need to preserve the original must pass
    ``state.snapshot()`` instead (the env passes the live state).

    Rules are applied in a fixed order: dismissive penalty, premature-advice
    penalty, empathy/validation gains, open-question gains, interrogation
    penalty, bare-reply penalty, stage progression, reveal check, utterance
    selection. Finally ``state.turn`` is incremented.
    """
    p = state.persona
    flags: Dict[str, bool] = {
        "dismissed": False,
        "advice_too_early": False,
        "bare_reply": features.bare,
        "empathic": features.empathy + features.validation > 0,
        "interrogated": False,
        "revealed_this_turn": False,
    }

    # --- 1. Dismissive / hostile language: hard drop on trust & distress spike.
    if features.dismissive > 0:
        state.trust = _clip(state.trust - 0.4 * (1.0 + p.trust_fragility))
        state.distress = _clip(state.distress + 0.15)
        state.openness = _clip(state.openness - 0.2)
        flags["dismissed"] = True

    # --- 2. Premature advice (advice before trust ≥ 0.55): trust drop, openness drop.
    # Evaluated against the trust value left by step 1.
    if features.advice > 0 and state.trust < 0.55:
        state.trust = _clip(state.trust - 0.15 * (1.0 + p.trust_fragility))
        state.openness = _clip(state.openness - 0.1)
        flags["advice_too_early"] = True

    # --- 3. Empathy & validation: trust + openness up, distress down.
    if features.empathy > 0 or features.validation > 0:
        gain = p.openness_gain_per_empathy * (features.empathy + features.validation)
        state.trust = _clip(state.trust + 0.12 * (features.empathy + features.validation))
        state.openness = _clip(state.openness + gain)
        # Only validation (not plain empathy) lowers distress.
        state.distress = _clip(
            state.distress - p.distress_drop_per_validation * features.validation
        )

    # --- 4. Open questions: small trust gain, nudges stage forward.
    if features.open_question > 0:
        state.trust = _clip(state.trust + 0.05)
        state.openness = _clip(state.openness + 0.04)

    # --- 5. Interrogation (many closed questions or multiple "?"): trust drain.
    if features.closed_question >= 3 or features.interrogative > 0:
        state.trust = _clip(state.trust - 0.1)
        flags["interrogated"] = True

    # --- 6. Bare / empty reply: small penalty across the board.
    if features.bare:
        state.trust = _clip(state.trust - 0.05)
        state.distress = _clip(state.distress + 0.02)

    # --- 7. Stage progression (monotonic forward with cooperative conditions).
    def advance_to(s: Stage) -> None:
        if STAGE_ORDER.index(s) > STAGE_ORDER.index(state.stage):
            state.stage = s

    # At most one stage is advanced per turn (elif chain).
    if state.stage == Stage.OPENING and (
        features.empathy + features.validation + features.open_question > 0
    ):
        advance_to(Stage.EXPLORING)
    elif state.stage == Stage.EXPLORING and state.trust >= 0.5 and state.openness >= 0.5:
        advance_to(Stage.REFLECTING)
    elif state.stage == Stage.REFLECTING and state.revealed and state.distress <= 0.5:
        advance_to(Stage.PLANNING)
    elif state.stage == Stage.PLANNING and features.open_question + features.empathy > 0:
        advance_to(Stage.CLOSING)

    # --- 8. Reveal check (cross threshold once).
    # Runs after stage progression, so a reveal that happens on this turn can
    # only unlock REFLECTING -> PLANNING on a later turn.
    if not state.revealed and state.openness >= p.reveal_threshold:
        state.revealed = True
        flags["revealed_this_turn"] = True

    # --- 9. Pick seeker's next utterance.
    # Adverse pools rotate on the *current stage's* counter, i.e. they share
    # last_line_idx_by_stage[state.stage] with that stage's cooperative pool.
    if flags["dismissed"] and p.dismissed_lines:
        utterance = _next_line(state, state.stage, p.dismissed_lines)
    elif flags["advice_too_early"] and p.advice_too_early_lines:
        utterance = _next_line(state, state.stage, p.advice_too_early_lines)
    elif flags["revealed_this_turn"]:
        utterance = p.reveal_line
    else:
        pool_by_stage = {
            Stage.OPENING: p.opening_lines,
            Stage.EXPLORING: p.exploring_lines,
            Stage.REFLECTING: p.reflecting_lines,
            Stage.PLANNING: p.planning_lines,
            Stage.CLOSING: p.closing_lines,
        }
        utterance = _next_line(state, state.stage, pool_by_stage[state.stage])

    state.turn += 1
    return Transition(new_state=state, seeker_utterance=utterance, flags=flags)


# ---------------------------------------------------------------------------
# Oracle policy for the future-oriented reward lookahead.
# ---------------------------------------------------------------------------


def oracle_features(state: SeekerState) -> Features:
    """What the 'oracle' agent would do from this state.

    Picks the stage-appropriate ideal action:
      - opening/exploring: empathy + open question
      - reflecting: empathy + validation
      - planning: open question + mild advice (trust is high here)
      - closing: empathy + safety mention
    """
    s = state.stage
    if s in (Stage.OPENING, Stage.EXPLORING):
        return Features(
            empathy=1,
            validation=0,
            open_question=1,
            advice=0,
            dismissive=0,
            interrogative=0,
            safety=0,
            length=80,
            closed_question=0,
            bare=False,
        )
    if s == Stage.REFLECTING:
        return Features(
            empathy=1,
            validation=1,
            open_question=0,
            advice=0,
            dismissive=0,
            interrogative=0,
            safety=0,
            length=90,
            closed_question=0,
            bare=False,
        )
    if s == Stage.PLANNING:
        return Features(
            empathy=0,
            validation=0,
            open_question=1,
            advice=1,
            dismissive=0,
            interrogative=0,
            safety=0,
            length=90,
            closed_question=0,
            bare=False,
        )
    return Features(  # CLOSING
        empathy=1,
        validation=0,
        open_question=0,
        advice=0,
        dismissive=0,
        interrogative=0,
        safety=1,
        length=90,
        closed_question=0,
        bare=False,
    )


def simulate_oracle_rollout(state: SeekerState, k: int) -> float:
    """Run the oracle policy from a snapshot for k steps and return the final
    resolution_score. Used by the future-oriented reward.

    The rollout operates on ``state.snapshot()``, so *state* itself is left
    untouched. With ``k <= 0`` no step is taken and the current score is
    returned.
    """
    sim = state.snapshot()
    for _ in range(k):
        step_seeker(sim, oracle_features(sim))
    return resolution_score(sim)