citadel / scripts /demo_export.py
Astro-Dude's picture
demo: exercise citation pathway + deterministic artifact + CI + walkthrough
b9182e8
Raw
History Blame Contribute Delete
12.9 kB
"""
Citadel — Demo Playbook Exporter
Runs a baseline council (naive Commander + teaching rule-based Oversight)
across every task in `TASK_PLAN` (one easy, one medium, and two hard
scenarios) and all three adversary generations, then dumps the accumulated
shared playbook to `playbook_export.md` at the repo root.
Purpose: give judges a self-contained artifact showing how the playbook
grows across the curriculum — no LLM, no GPU, no API keys needed.
Usage (from repo root):
python scripts/demo_export.py
Outputs:
./playbook_export.md (human-readable markdown)
./playbook_demo.json (raw state, separate from the production playbook.json)
"""
from __future__ import annotations
import os
import random
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
# Directory two levels above this file; with the script living in scripts/,
# that is the repository root.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Put the repo root at the front of sys.path. Presumably this is what lets the
# project-local imports that follow (environment, models, playbook, governance)
# resolve when the script is launched as `python scripts/demo_export.py`.
sys.path.insert(0, str(REPO_ROOT))
from environment import CitadelEnvironment, default_oversight_policy
from models import (
IncidentAction,
CommanderProposal,
IncidentState,
OversightAction,
OversightDecision,
SYSTEM_NAMES,
)
from playbook import (
Playbook,
SYSTEM_TYPE_TAGS,
ADVERSARY_GEN_TAGS,
reset_default_playbook,
)
from governance import DESTRUCTIVE_BASTION_ACTIONS, DATA_HOLDING_SYSTEMS
# Fixed seed used by main() for every RNG the demo touches, so the same
# input yields the same exported artifact across commits.
DEMO_SEED = 4242

# Curriculum for the demo: (task_id, adversary_gen) pairs, in run order.
TASK_PLAN = [
    # Gen 1 — script kiddie baseline
    ("easy_1", 1),
    # Gen 2 — adaptive attacker (pivots, false flags)
    ("medium_1", 2),
    # Gen 3 — deceptive APT (anti-evidence planted)
    ("hard_1", 3),
    # Gen 3 — engineered deception-catch scenario
    ("hard_2", 3),
]
def teaching_oversight(
    proposal: CommanderProposal,
    state: IncidentState,
    action_history: List[Dict[str, Any]],
) -> OversightAction:
    """Run the env's default oversight policy and attach a one-line lesson.

    The returned object is exactly what ``default_oversight_policy`` produced;
    the only change made here is that its ``lesson_text`` is overwritten when
    one of the situational rules below matches. The point is to make the
    shared playbook fill up during a baseline (no-LLM) run.

    Args:
        proposal: The Commander's proposed action; ``action`` and
            ``target_system`` are read from it.
        state: Current incident state; ``alerts`` and ``adversary_gen`` are
            read from it.
        action_history: Passed through untouched to the default policy.

    Returns:
        The default policy's ``OversightAction``, possibly with
        ``lesson_text`` set.
    """
    from models import ActionType

    verdict = default_oversight_policy(proposal, state, action_history)

    # Resolve the target index to a system name; out-of-range maps to "".
    target_idx = proposal.target_system
    target_name = SYSTEM_NAMES[target_idx] if 0 <= target_idx < len(SYSTEM_NAMES) else ""

    action_idx = proposal.action
    is_destructive = action_idx in DESTRUCTIVE_BASTION_ACTIONS
    is_data_sys = target_name in DATA_HOLDING_SYSTEMS

    # Confidence of the most recent alert on the target with a positive
    # confidence value; 0.0 when there is no such alert.
    alert_conf = next(
        (
            alert.confidence
            for alert in reversed(state.alerts)
            if alert.source_system == target_name and alert.confidence > 0
        ),
        0.0,
    )

    gen = state.adversary_gen
    # Target and generation are embedded in the text so that lessons from
    # different contexts are not collapsed by playbook de-duplication.
    label = target_name or "unknown_target"
    prefix = f"[Gen {gen} / {label}] "

    def _select_lesson() -> str:
        """Return the text of the first matching rule, or "" when none match."""
        if verdict.decision == int(OversightDecision.VETO) and is_destructive and is_data_sys:
            return (
                prefix
                + "Destructive actions on data systems require "
                + "investigate() first — veto when uninvestigated regardless of alert volume."
            )
        if is_destructive and is_data_sys and 0 < alert_conf < 0.3:
            return (
                prefix
                + "Alerts below 30% confidence on data systems "
                + "should trigger investigate() before isolate() — low-confidence signals are bait."
            )
        if gen == 3 and is_destructive and target_name and alert_conf > 0.7:
            return (
                f"[Gen 3 / {label}] Gen 3 plants anti-evidence at high severity on compromised "
                "systems — high-confidence 'all clear' warrants a second look, not auto-approval."
            )
        if gen == 2 and action_idx == int(ActionType.ISOLATE_SYSTEM):
            return (
                f"[Gen 2 / {label}] Gen 2 pivots around isolated systems — isolate() without a "
                "companion hunt_threat() on adjacent targets leaks containment."
            )
        if action_idx == int(ActionType.INVESTIGATE_SYSTEM) and is_data_sys:
            return (
                prefix
                + "Early investigate() on data-holding systems before any "
                + "destructive action is the canonical safe path."
            )
        if action_idx == int(ActionType.DEPLOY_MONITORING):
            return (
                prefix
                + "Monitoring deployed without an open ServiceNow ticket "
                + "wastes a step — sequence governance (open_servicenow_incident) first on P1 scenarios."
            )
        if is_destructive and not is_data_sys:
            return (
                prefix
                + "Isolating non-data systems is cheap; reserve veto "
                + "budget for destructive actions on data-holding systems."
            )
        if verdict.decision == int(OversightDecision.REVISE):
            return (
                prefix
                + "Short justifications correlate with weak proposals — "
                + "require evidence pointers on every destructive action."
            )
        return ""

    lesson = _select_lesson()
    if lesson:
        verdict.lesson_text = lesson
    return verdict
def _pick_relevant_lesson(
    playbook: Playbook,
    adversary_gen: int,
    target_name: str,
    rng: random.Random,
) -> Optional[str]:
    """Pick the id of a context-matching lesson to cite, or None.

    Lessons are ranked here instead of via ``Playbook.retrieve()``. Per the
    original author's notes, ``retrieve()`` ranks on values derived from
    ``time.time()``, which would make the exported artifact differ between
    runs; production callers are still expected to use ``retrieve()``.
    The notes also say the env calls ``record_outcome`` for any lesson that
    appears in ``IncidentAction.cited_lessons``, so citing any relevant
    lesson is enough to populate wins/losses.

    Ranking is by number of tags shared with the query (descending), with
    ``lesson_id`` as a deterministic tiebreaker. Lessons sharing no tag are
    discarded. When candidates exist, ``rng`` is consulted: a draw above 0.60
    yields no citation, otherwise one of the top four candidates is chosen.

    Args:
        playbook: Lesson store; must support ``len()`` and ``.all()``.
        adversary_gen: Generation used to look up the generation tag.
        target_name: System name used to look up the system-type tag.
        rng: Source of randomness; only consumed when candidates exist.

    Returns:
        A ``lesson_id``, or None when nothing matches or the draw says skip.
    """
    if len(playbook) == 0:
        return None

    # Query tags: always the generation tag, plus the system-type tag when
    # the target has one.
    wanted = [ADVERSARY_GEN_TAGS.get(adversary_gen, "gen_1_script")]
    if target_name in SYSTEM_TYPE_TAGS:
        wanted.append(SYSTEM_TYPE_TAGS[target_name])
    query_tags = set(wanted)

    # Score every lesson once, order by (most overlap, lesson_id), then keep
    # only lessons that share at least one tag with the query.
    scored = [(len(query_tags & set(ls.tags)), ls) for ls in playbook.all()]
    scored.sort(key=lambda pair: (-pair[0], pair[1].lesson_id))
    shortlist = [ls for overlap, ls in scored if overlap]

    # Short-circuit keeps the RNG untouched when there is nothing to cite.
    if not shortlist or rng.random() > 0.60:
        return None
    return rng.choice(shortlist[:4]).lesson_id
def naive_proposal(
    state: IncidentState,
    hour: int,
    playbook: Optional[Playbook] = None,
    rng: Optional[random.Random] = None,
) -> IncidentAction:
    """Build the demo Commander's proposal for a given hour.

    The action is taken from a fixed four-step rotation indexed by ``hour``.
    ISOLATE_SYSTEM is paired with a data-holding target and every other
    action with a non-data target, so the council exercises
    destructive-on-data cases.

    The target index is shifted by one for every completed pass through the
    rotation. The rotation and both target lists have the same length, so
    indexing targets with the bare ``hour`` would pin ISOLATE_SYSTEM (always
    rotation slot 1) to a single data system and leave one non-data system
    never selected. For the first pass (hours 0-3) the choice is the same as
    with the bare ``hour`` index.

    When both ``playbook`` and ``rng`` are supplied, the proposal may cite one
    lesson chosen by ``_pick_relevant_lesson``.

    Args:
        state: Current incident state; only ``adversary_gen`` is read.
        hour: Zero-based step index within the episode.
        playbook: Optional lesson store to cite from.
        rng: Optional RNG driving the citation choice.

    Returns:
        An ``IncidentAction`` with action, target, justification text and
        zero or one cited lesson id.
    """
    from models import ActionType

    action_rotation = [
        ActionType.INVESTIGATE_SYSTEM,   # safe on anything
        ActionType.ISOLATE_SYSTEM,       # destructive — interesting on data sys
        ActionType.DEPLOY_MONITORING,
        ActionType.PATCH_VULNERABILITY,
    ]
    # Data systems are used for the destructive action, the rest for safe ones.
    data_targets = [2, 3, 4, 6]   # database, file_server, email_server, backup_server
    other_targets = [0, 1, 5, 7]  # web, app, workstations, firewall

    # cycle = completed passes through the rotation, slot = position in this pass.
    cycle, slot = divmod(hour, len(action_rotation))
    action = action_rotation[slot]
    pool = data_targets if action == ActionType.ISOLATE_SYSTEM else other_targets
    # Advance by one position per pass so each pool is actually cycled through.
    target = pool[(slot + cycle) % len(pool)]
    target_name = SYSTEM_NAMES[target] if 0 <= target < len(SYSTEM_NAMES) else ""

    cited: List[str] = []
    if playbook is not None and rng is not None:
        lid = _pick_relevant_lesson(playbook, state.adversary_gen, target_name, rng)
        if lid is not None:
            cited.append(lid)

    return IncidentAction(
        action=int(action),
        target_system=target,
        justification=f"Baseline rotation: {action.name.lower()} on {target_name} at hour {hour}.",
        cited_lessons=cited,
    )
def run_episode(
    env: CitadelEnvironment,
    task_id: str,
    adversary_gen: int,
    rng: random.Random,
) -> Dict[str, Any]:
    """Play one baseline episode and record a retrospective lesson.

    Resets ``env`` for the given task and generation, then feeds it
    ``naive_proposal`` actions for at most 12 steps, stopping early when a
    step's observation reports ``done``. Afterwards a retrospective lesson
    summarising the outcome is written to the env's playbook.

    Args:
        env: Environment to drive; its ``_state`` and ``_playbook`` are read.
        task_id: Task identifier passed to ``env.reset``.
        adversary_gen: Adversary generation passed to ``env.reset``.
        rng: RNG forwarded to ``naive_proposal`` for citation choices.

    Returns:
        Summary dict with task id, generation, step count, rounded
        exfiltration, outcome label and number of steps that cited a lesson.
    """
    env.reset(task_id=task_id, adversary_gen=adversary_gen)

    steps = 0
    citations_issued = 0
    # The step counter doubles as the hour index handed to the proposal.
    while steps < 12:
        proposal = naive_proposal(env._state, steps, playbook=env._playbook, rng=rng)
        citations_issued += 1 if proposal.cited_lessons else 0
        observation = env.step(proposal)
        steps += 1
        if observation.done:
            break

    # Bucket the final exfiltration level into an outcome label.
    exfil = env._state.data_exfiltrated
    if exfil < 0.2:
        outcome = "contained"
    elif exfil < 0.8:
        outcome = "degraded"
    else:
        outcome = "catastrophic"

    # Post-episode retrospective lesson; the text embeds task and generation.
    retrospective_text = (
        f"[retrospective] Baseline council on {task_id} vs Gen {adversary_gen} "
        f"ended {outcome} after {steps} steps (exfil={exfil:.2f}). "
        f"Trained policy should improve on this floor."
    )
    retrospective_tags = [
        f"gen_{adversary_gen}",
        f"task_{task_id}",
        "retrospective",
        outcome,
    ]
    env._playbook.write(
        text=retrospective_text,
        tags=retrospective_tags,
        adversary_gen=adversary_gen,
        task_id=task_id,
        hour=steps,
    )

    return {
        "task_id": task_id,
        "adversary_gen": adversary_gen,
        "steps": steps,
        "data_exfiltrated": round(exfil, 3),
        "outcome": outcome,
        "citations_issued": citations_issued,
    }
def main() -> None:
    """Run every episode in ``TASK_PLAN`` and write the demo artifacts.

    Writes ``playbook_demo.json`` (via the playbook's own ``save``) and
    ``playbook_export.md`` under ``REPO_ROOT``, printing one progress line per
    episode and a final summary line.
    """
    # Reproducibility: seed both the global RNG and the dedicated instance
    # handed to the episodes. (Original author's note: the env reseeds its own
    # RNG per task from task config.)
    random.seed(DEMO_SEED)
    demo_rng = random.Random(DEMO_SEED)

    # Start from a fresh, isolated playbook file so the production
    # ./playbook.json is never touched.
    demo_json = str(REPO_ROOT / "playbook_demo.json")
    if os.path.exists(demo_json):
        os.remove(demo_json)
    playbook = reset_default_playbook(path=demo_json)

    env = CitadelEnvironment(oversight_policy=teaching_oversight)

    summaries: List[Dict[str, Any]] = []
    for task_id, gen in TASK_PLAN:
        result = run_episode(env, task_id, gen, demo_rng)
        summaries.append(result)
        print(
            f" ran {task_id} | Gen {gen} -> {result['steps']} steps, "
            f"exfil={result['data_exfiltrated']}, "
            f"citations={result['citations_issued']}",
            flush=True,
        )
    playbook.save()

    # Citation/utility totals: these show the playbook *mechanic* in the
    # artifact, not just the lesson text.
    total_citations = 0
    total_wins = 0
    total_losses = 0
    cited_lessons = 0
    for lesson in playbook.all():
        total_citations += lesson.citations
        total_wins += lesson.wins
        total_losses += lesson.losses
        if lesson.citations > 0:
            cited_lessons += 1

    out_path = REPO_ROOT / "playbook_export.md"

    preamble = [
        "<!-- Generated by scripts/demo_export.py — do not edit by hand. -->",
        "<!-- Re-run: `python scripts/demo_export.py` from repo root. -->",
        "",
        "> Baseline council (naive Commander + teaching rule-based Oversight) "
        "across all three tasks and adversary generations. "
        "Trained weights will produce richer lessons; this is the floor.",
        "",
        "## Runs",
        "",
        "| Task | Adversary Gen | Steps | Data Exfiltrated | Lessons Cited |",
        "|---|---|---|---|---|",
    ]
    run_rows = [
        f"| `{s['task_id']}` | Gen {s['adversary_gen']} | {s['steps']} | "
        f"{s['data_exfiltrated']} | {s['citations_issued']} |"
        for s in summaries
    ]
    snapshot = [
        "",
        "## Playbook mechanic — end-of-run snapshot",
        "",
        f"- {cited_lessons} of {len(playbook)} lessons cited at least once",
        f"- {total_citations} total citations across the council's runs",
        f"- {total_wins}W / {total_losses}L on cited lessons "
        f"(env auto-records via `Playbook.record_outcome` on each cited step)",
        "",
    ]

    # Header lines, then the playbook's own markdown rendering, newline-joined
    # with a trailing newline.
    document = "\n".join(preamble + run_rows + snapshot + [playbook.as_markdown()]) + "\n"
    out_path.write_text(document, encoding="utf-8")

    print(
        f"\nWrote {out_path.relative_to(REPO_ROOT)} "
        f"({len(playbook)} lessons, {total_citations} citations, "
        f"{total_wins}W/{total_losses}L)"
    )
# Script entry point: run the exporter only when executed directly, not on import.
if __name__ == "__main__":
    main()