Spaces:

Astro-Dude
/

citadel

Sleeping

App Files Files Community

citadel / scripts /demo_export.py

Astro-Dude

demo: exercise citation pathway + deterministic artifact + CI + walkthrough

b9182e8 about 2 months ago

Raw

History Blame Contribute Delete

12.9 kB

	"""
	Citadel — Demo Playbook Exporter

	Runs a baseline council (naive Commander + teaching rule-based Oversight)
	across every task listed in `TASK_PLAN`, covering all three adversary
	generations, then dumps the accumulated shared playbook to
	`playbook_export.md` at the repo root.

	Purpose: give judges a self-contained artifact showing how the playbook
	grows across the curriculum — no LLM, no GPU, no API keys needed.

	Usage (from repo root):
	python scripts/demo_export.py

	Outputs:
	./playbook_export.md (human-readable markdown)
	./playbook_demo.json (raw state, separate from the production playbook.json)
	"""

	from __future__ import annotations

	import os
	import random
	import sys
	from pathlib import Path
	from typing import Any, Dict, List, Optional

	# Repository root = parent of the directory that contains this script.
	# It is pushed to the front of sys.path so the project-local imports that
	# follow (environment, models, playbook, governance — presumably modules at
	# the repo root) resolve when this file is run directly as a script.
	REPO_ROOT = Path(__file__).resolve().parent.parent
	sys.path.insert(0, str(REPO_ROOT))

	from environment import CitadelEnvironment, default_oversight_policy
	from models import (
	IncidentAction,
	CommanderProposal,
	IncidentState,
	OversightAction,
	OversightDecision,
	SYSTEM_NAMES,
	)
	from playbook import (
	Playbook,
	SYSTEM_TYPE_TAGS,
	ADVERSARY_GEN_TAGS,
	reset_default_playbook,
	)
	from governance import DESTRUCTIVE_BASTION_ACTIONS, DATA_HOLDING_SYSTEMS

	# Deterministic artifact: same input -> same output across commits.
	# main() uses this value to seed both the global `random` module and the
	# dedicated `random.Random` instance that drives lesson citation.
	DEMO_SEED = 4242


	# Ordered (task_id, adversary_gen) pairs; main() runs one episode per entry,
	# in this order, against a single shared environment.
	TASK_PLAN = [
	("easy_1", 1), # Gen 1 — script kiddie baseline
	("medium_1", 2), # Gen 2 — adaptive attacker (pivots, false flags)
	("hard_1", 3), # Gen 3 — deceptive APT (anti-evidence planted)
	("hard_2", 3), # Gen 3 — engineered deception-catch scenario
	]


	def teaching_oversight(
	    proposal: CommanderProposal,
	    state: IncidentState,
	    action_history: List[Dict[str, Any]],
	) -> OversightAction:
	    """Wrap the default oversight policy and attach a situational lesson.

	    The verdict (decision and everything else on the returned object) is
	    produced by ``default_oversight_policy``. This wrapper only overwrites
	    ``lesson_text`` on that object when one of the scenario rules below
	    matches, so that a baseline run leaves lessons behind in the playbook.

	    Args:
	        proposal: The Commander's proposed action; ``action`` and
	            ``target_system`` are read from it.
	        state: Current incident state; ``alerts`` and ``adversary_gen``
	            are read from it.
	        action_history: Passed through unchanged to the default policy.

	    Returns:
	        The object returned by ``default_oversight_policy``, with
	        ``lesson_text`` replaced if a rule matched.
	    """
	    from models import ActionType

	    verdict = default_oversight_policy(proposal, state, action_history)

	    # Translate the numeric target into a system name; out-of-range -> "".
	    system_index = proposal.target_system
	    if 0 <= system_index < len(SYSTEM_NAMES):
	        target_name = SYSTEM_NAMES[system_index]
	    else:
	        target_name = ""

	    action_idx = proposal.action
	    destructive = action_idx in DESTRUCTIVE_BASTION_ACTIONS
	    on_data_system = target_name in DATA_HOLDING_SYSTEMS

	    # Confidence of the last entry in state.alerts that names the target and
	    # has a positive confidence; 0.0 when there is no such alert.
	    confidence = next(
	        (
	            alert.confidence
	            for alert in reversed(state.alerts)
	            if alert.source_system == target_name and alert.confidence > 0
	        ),
	        0.0,
	    )

	    gen = state.adversary_gen
	    # Target and generation are embedded in the text so that otherwise
	    # identical lessons from different contexts are not deduplicated.
	    label = target_name or "unknown_target"

	    def _select_lesson() -> str:
	        """Return the text of the first matching rule, or "" if none match."""
	        if verdict.decision == int(OversightDecision.VETO) and destructive and on_data_system:
	            return (
	                f"[Gen {gen} / {label}] Destructive actions on data systems require "
	                "investigate() first — veto when uninvestigated regardless of alert volume."
	            )
	        if destructive and on_data_system and 0 < confidence < 0.3:
	            return (
	                f"[Gen {gen} / {label}] Alerts below 30% confidence on data systems "
	                "should trigger investigate() before isolate() — low-confidence signals are bait."
	            )
	        if gen == 3 and destructive and target_name and confidence > 0.7:
	            return (
	                f"[Gen 3 / {label}] Gen 3 plants anti-evidence at high severity on compromised "
	                "systems — high-confidence 'all clear' warrants a second look, not auto-approval."
	            )
	        if gen == 2 and action_idx == int(ActionType.ISOLATE_SYSTEM):
	            return (
	                f"[Gen 2 / {label}] Gen 2 pivots around isolated systems — isolate() without a "
	                "companion hunt_threat() on adjacent targets leaks containment."
	            )
	        if action_idx == int(ActionType.INVESTIGATE_SYSTEM) and on_data_system:
	            return (
	                f"[Gen {gen} / {label}] Early investigate() on data-holding systems before any "
	                "destructive action is the canonical safe path."
	            )
	        if action_idx == int(ActionType.DEPLOY_MONITORING):
	            return (
	                f"[Gen {gen} / {label}] Monitoring deployed without an open ServiceNow ticket "
	                "wastes a step — sequence governance (open_servicenow_incident) first on P1 scenarios."
	            )
	        if destructive and not on_data_system:
	            return (
	                f"[Gen {gen} / {label}] Isolating non-data systems is cheap; reserve veto "
	                "budget for destructive actions on data-holding systems."
	            )
	        if verdict.decision == int(OversightDecision.REVISE):
	            return (
	                f"[Gen {gen} / {label}] Short justifications correlate with weak proposals — "
	                "require evidence pointers on every destructive action."
	            )
	        return ""

	    lesson = _select_lesson()
	    if lesson:
	        verdict.lesson_text = lesson

	    return verdict


	def _pick_relevant_lesson(
	    playbook: Playbook,
	    adversary_gen: int,
	    target_name: str,
	    rng: random.Random,
	) -> Optional[str]:
	    """Choose a lesson_id to cite for the current context, or None.

	    Lessons are ranked by how many of the query tags they carry (the tag
	    for ``adversary_gen`` plus, when known, the tag for ``target_name``),
	    with ``lesson_id`` as the tiebreaker. Lessons sharing no tag are
	    discarded. When at least one lesson remains, a citation is issued on
	    roughly 60% of calls, drawn uniformly from the top four.

	    ``Playbook.retrieve()`` is deliberately not used here: according to the
	    original author's note its ranking mixes in wall-clock timestamps,
	    which would make the exported artifact differ between runs. That
	    reasoning applies to this demo only; production callers are expected
	    to keep using ``retrieve()``.

	    The caller does not need to record outcomes itself — per the original
	    note, the environment does so for any id listed in
	    ``IncidentAction.cited_lessons``.

	    Args:
	        playbook: Source of lessons; must support ``len()`` and ``all()``.
	        adversary_gen: Generation used to look up the generation tag
	            (unknown values fall back to ``"gen_1_script"``).
	        target_name: System name used to look up an optional system tag.
	        rng: Random source; consumed only when candidates exist.

	    Returns:
	        The chosen ``lesson_id``, or ``None`` when the playbook is empty,
	        nothing matches, or the random draw says not to cite.
	    """
	    if len(playbook) == 0:
	        return None

	    wanted_tags = {ADVERSARY_GEN_TAGS.get(adversary_gen, "gen_1_script")}
	    if target_name in SYSTEM_TYPE_TAGS:
	        wanted_tags.add(SYSTEM_TYPE_TAGS[target_name])

	    # Pair every lesson with its tag-overlap count, then order by
	    # (most overlap first, lesson_id) so the result is fully deterministic.
	    scored = [
	        (len(wanted_tags & set(lesson.tags)), lesson)
	        for lesson in playbook.all()
	    ]
	    scored.sort(key=lambda pair: (-pair[0], pair[1].lesson_id))

	    # Keep only lessons that share at least one tag with the query.
	    shortlist = [lesson for hits, lesson in scored if hits]
	    if not shortlist:
	        return None

	    # First draw: cite at all? Second draw: which of the top four.
	    if rng.random() > 0.60:
	        return None
	    chosen = rng.choice(shortlist[:4])
	    return chosen.lesson_id


	def naive_proposal(
	    state: IncidentState,
	    hour: int,
	    playbook: Optional[Playbook] = None,
	    rng: Optional[random.Random] = None,
	) -> IncidentAction:
	    """Demo proposal generator — rotates actions AND cycles target systems.

	    The upstream `naive_policy` rotates targets by hour-index, which in
	    practice never lands ISOLATE on a data system. Here ISOLATE is always
	    aimed at a data-holding system and every other action at a non-data
	    system, so the council exercises destructive-on-data cases (where the
	    interesting lessons live).

	    The target index is offset by the rotation cycle (``hour // 4``).
	    Indexing targets with ``hour % 4`` alone — the same modulus that picks
	    the action — pins each action to a single system forever (ISOLATE would
	    only ever hit ``data_targets[1]``). With the offset, hours 0-3 produce
	    the same pairs as before and later cycles move on to the next system.

	    When both a playbook and an rng are supplied, the proposal cites a
	    contextually relevant lesson on ~60% of steps (once matching lessons
	    exist), exercising the citation/utility pathway end-to-end.

	    Args:
	        state: Current incident state; only ``adversary_gen`` is read.
	        hour: Zero-based step index within the episode.
	        playbook: Optional playbook to draw citations from.
	        rng: Optional random source for the citation decision.

	    Returns:
	        An ``IncidentAction`` with action, target, a justification string
	        and zero or one cited lesson ids.
	    """
	    from models import ActionType

	    action_rotation = [
	        ActionType.INVESTIGATE_SYSTEM,  # safe on anything
	        ActionType.ISOLATE_SYSTEM,      # destructive — interesting on data sys
	        ActionType.DEPLOY_MONITORING,
	        ActionType.PATCH_VULNERABILITY,
	    ]
	    # Data systems receive the destructive action, other systems the safe ones.
	    data_targets = [2, 3, 4, 6]   # database, file_server, email_server, backup_server
	    other_targets = [0, 1, 5, 7]  # web, app, workstations, firewall

	    # `slot` selects the action; `cycle` counts completed rotations and
	    # shifts the target so successive rotations visit different systems.
	    cycle, slot = divmod(hour, len(action_rotation))
	    action = action_rotation[slot]
	    pool = data_targets if action == ActionType.ISOLATE_SYSTEM else other_targets
	    target = pool[(slot + cycle) % len(pool)]

	    target_name = SYSTEM_NAMES[target] if 0 <= target < len(SYSTEM_NAMES) else ""

	    cited: List[str] = []
	    if playbook is not None and rng is not None:
	        lid = _pick_relevant_lesson(playbook, state.adversary_gen, target_name, rng)
	        if lid is not None:
	            cited.append(lid)

	    return IncidentAction(
	        action=int(action),
	        target_system=target,
	        justification=f"Baseline rotation: {action.name.lower()} on {target_name} at hour {hour}.",
	        cited_lessons=cited,
	    )


	def run_episode(
	    env: CitadelEnvironment,
	    task_id: str,
	    adversary_gen: int,
	    rng: random.Random,
	    max_hours: int = 12,
	) -> Dict[str, Any]:
	    """Play one baseline episode and record a retrospective lesson.

	    Resets ``env`` for the given task and generation, then feeds it
	    proposals from ``naive_proposal`` until the environment reports
	    ``done`` or ``max_hours`` steps have been taken. Afterwards a
	    retrospective lesson summarising the outcome is written to the
	    environment's playbook.

	    Args:
	        env: Environment to drive. Its private ``_state`` and ``_playbook``
	            attributes are read directly.
	        task_id: Task identifier passed to ``env.reset``.
	        adversary_gen: Adversary generation passed to ``env.reset``.
	        rng: Random source forwarded to ``naive_proposal`` for citations.
	        max_hours: Upper bound on the number of steps (default 12, the
	            value that used to be hard-coded).

	    Returns:
	        A summary dict with keys ``task_id``, ``adversary_gen``, ``steps``,
	        ``data_exfiltrated`` (rounded to 3 places), ``outcome`` and
	        ``citations_issued``.
	    """
	    env.reset(task_id=task_id, adversary_gen=adversary_gen)
	    steps = 0
	    citations_issued = 0
	    for hour in range(max_hours):
	        action = naive_proposal(env._state, hour, playbook=env._playbook, rng=rng)
	        if action.cited_lessons:
	            citations_issued += 1
	        obs = env.step(action)
	        steps += 1
	        if obs.done:
	            break

	    # Post-episode retrospective lesson. The text embeds task and generation,
	    # so it is distinct for every (task, gen) pair.
	    exfil = env._state.data_exfiltrated
	    # Outcome buckets: < 0.2 contained, < 0.8 degraded, otherwise catastrophic.
	    outcome = "contained" if exfil < 0.2 else ("degraded" if exfil < 0.8 else "catastrophic")
	    env._playbook.write(
	        text=(
	            f"[retrospective] Baseline council on {task_id} vs Gen {adversary_gen} "
	            f"ended {outcome} after {steps} steps (exfil={exfil:.2f}). "
	            f"Trained policy should improve on this floor."
	        ),
	        tags=[
	            f"gen_{adversary_gen}",
	            f"task_{task_id}",
	            "retrospective",
	            outcome,
	        ],
	        adversary_gen=adversary_gen,
	        task_id=task_id,
	        hour=steps,
	    )

	    return {
	        "task_id": task_id,
	        "adversary_gen": adversary_gen,
	        "steps": steps,
	        "data_exfiltrated": round(exfil, 3),
	        "outcome": outcome,
	        "citations_issued": citations_issued,
	    }


	def main() -> None:
	    """Run every planned episode and write the demo playbook artifacts.

	    Steps:
	      1. Seed the RNGs and start from a fresh ``playbook_demo.json``.
	      2. Run one episode per ``TASK_PLAN`` entry on a shared environment.
	      3. Save the playbook and write ``playbook_export.md`` (run table,
	         citation statistics, then the playbook's own markdown dump) at
	         the repo root.

	    Table and console separators are plain ``|`` characters. A backslash
	    before the pipe is not a valid Python escape and would be emitted
	    literally, which turns each Markdown table row into ordinary text.
	    """
	    # Reproducibility: seed every RNG the demo touches so the artifact is
	    # commit-stable. The env reseeds its own RNG per task from task config.
	    random.seed(DEMO_SEED)
	    rng = random.Random(DEMO_SEED)

	    # Fresh, isolated playbook — don't clobber production ./playbook.json.
	    demo_path = str(REPO_ROOT / "playbook_demo.json")
	    try:
	        os.remove(demo_path)
	    except FileNotFoundError:
	        # Nothing left over from a previous run; that is the normal case.
	        pass
	    playbook = reset_default_playbook(path=demo_path)

	    # NOTE(review): the env is assumed to pick up the playbook installed by
	    # reset_default_playbook() above — confirm in environment.py.
	    env = CitadelEnvironment(oversight_policy=teaching_oversight)

	    summaries: List[Dict[str, Any]] = []
	    for task_id, gen in TASK_PLAN:
	        summary = run_episode(env, task_id, gen, rng)
	        summaries.append(summary)
	        print(
	            f" ran {task_id} | Gen {gen} -> {summary['steps']} steps, "
	            f"exfil={summary['data_exfiltrated']}, "
	            f"citations={summary['citations_issued']}",
	            flush=True,
	        )

	    playbook.save()

	    # Aggregate citation/utility stats — these are what makes the artifact
	    # demonstrate the playbook mechanic, not just lesson text.
	    lessons = list(playbook.all())  # snapshot once instead of four calls
	    total_citations = sum(ls.citations for ls in lessons)
	    total_wins = sum(ls.wins for ls in lessons)
	    total_losses = sum(ls.losses for ls in lessons)
	    cited_lessons = sum(1 for ls in lessons if ls.citations > 0)

	    out_path = REPO_ROOT / "playbook_export.md"
	    header = [
	        "<!-- Generated by scripts/demo_export.py — do not edit by hand. -->",
	        "<!-- Re-run: `python scripts/demo_export.py` from repo root. -->",
	        "",
	        "> Baseline council (naive Commander + teaching rule-based Oversight) "
	        "across all three tasks and adversary generations. "
	        "Trained weights will produce richer lessons; this is the floor.",
	        "",
	        "## Runs",
	        "",
	        "| Task | Adversary Gen | Steps | Data Exfiltrated | Lessons Cited |",
	        "|---|---|---|---|---|",
	    ]
	    # One table row per episode summary.
	    header.extend(
	        f"| `{s['task_id']}` | Gen {s['adversary_gen']} | {s['steps']} | "
	        f"{s['data_exfiltrated']} | {s['citations_issued']} |"
	        for s in summaries
	    )
	    header.extend([
	        "",
	        "## Playbook mechanic — end-of-run snapshot",
	        "",
	        f"- {cited_lessons} of {len(playbook)} lessons cited at least once",
	        f"- {total_citations} total citations across the council's runs",
	        f"- {total_wins}W / {total_losses}L on cited lessons "
	        f"(env auto-records via `Playbook.record_outcome` on each cited step)",
	        "",
	    ])

	    body = playbook.as_markdown()
	    out_path.write_text("\n".join(header) + "\n" + body + "\n", encoding="utf-8")

	    print(
	        f"\nWrote {out_path.relative_to(REPO_ROOT)} "
	        f"({len(playbook)} lessons, {total_citations} citations, "
	        f"{total_wins}W/{total_losses}L)"
	    )


	# Script entry point: run the export only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()