Spaces:

Majen
/

new_model

Sleeping

new_model / test_env.py

Ayu

feat: RecallTrace Tasks 1-9 complete - belief calibration + curriculum + plots

d19137b about 1 month ago

24.8 kB

	"""
	RecallTrace — ContaminationEnv Simulation
	Tasks 1-9: Environment, Tools, F1, Hidden Nodes,
	Belief Calibration, Training, Curriculum, Plots
	"""

	# ─── Required installs (for cold Colab run) ──────────────────────────────────
	# !pip install networkx numpy matplotlib

	import json
	import os
	import numpy as np
	import networkx as nx
	import matplotlib
	matplotlib.use("Agg") # headless — no display needed
	import matplotlib.pyplot as plt

	# ─── Always use relative paths so code runs anywhere (Task 8 fix) ─────────────
	os.makedirs("plots", exist_ok=True)
	PLOT_DIR = "plots"
	RESULTS_FILE = "training_results.json"


	# =============================================================================
	# ContaminationEnv (Tasks 1-4 + 5 + 7)
	# =============================================================================

	class ContaminationEnv:
	"""
	Supply-chain contamination environment with:
	- Random DAG generation per reset() [Task 1]
	- 4 noisy investigation tools [Task 2]
	- F1-scored finalize() [Task 3]
	- Hidden intervention nodes [Task 4]
	- Belief-calibrated finalize_with_beliefs() [Task 5]
	- Adversarial curriculum difficulty levels [Task 7]
	"""

	def __init__(self, difficulty_level: int = 3):
	self.graph = None
	self.contaminated_nodes: set = set()
	self.hidden_nodes: set = set()
	self.source_nodes: set = set()
	self.difficulty_level = max(1, min(5, difficulty_level))

	def set_difficulty(self, level: int) -> None:
	"""Set difficulty 1 (easy) … 5 (very hard)."""
	self.difficulty_level = max(1, min(5, level))

	# ── Task 1 + 7: Reset ────────────────────────────────────────────────────

	def reset(self) -> dict:
	"""Generate a new contamination scenario scaled to current difficulty."""
	params = {
	1: dict(n_range=(6, 8), n_sources=2, n_hidden=0, edge_p=0.25),
	2: dict(n_range=(8, 10), n_sources=2, n_hidden=1, edge_p=0.30),
	3: dict(n_range=(10, 13), n_sources=3, n_hidden=1, edge_p=0.30),
	4: dict(n_range=(12, 14), n_sources=3, n_hidden=2, edge_p=0.35),
	5: dict(n_range=(14, 16), n_sources=4, n_hidden=2, edge_p=0.40),
	}[self.difficulty_level]

	n_nodes = np.random.randint(*params["n_range"])
	self.graph = nx.DiGraph()
	self.graph.add_nodes_from(range(n_nodes))

	for i in range(n_nodes):
	for j in range(i + 1, n_nodes):
	if np.random.random() < params["edge_p"]:
	self.graph.add_edge(i, j)

	n_sources = min(params["n_sources"], n_nodes)
	self.source_nodes = set(
	np.random.choice(n_nodes, n_sources, replace=False).tolist()
	)

	n_hidden = min(params["n_hidden"], len(self.source_nodes))
	self.hidden_nodes = (
	set(np.random.choice(list(self.source_nodes), n_hidden, replace=False).tolist())
	if n_hidden > 0 else set()
	)

	self.contaminated_nodes = set(self.source_nodes)
	self._spread_contamination()

	return {
	"n_nodes": n_nodes,
	"graph_structure": list(self.graph.edges()),
	"observable_nodes": [n for n in range(n_nodes) if n not in self.hidden_nodes],
	"difficulty": self.difficulty_level,
	"n_hidden": len(self.hidden_nodes),
	"message": (
	f"Difficulty {self.difficulty_level}: {n_nodes}-node graph, "
	f"{len(self.hidden_nodes)} hidden source(s)."
	),
	}

	def _spread_contamination(self) -> None:
	to_contaminate = set(self.contaminated_nodes)
	for source in self.contaminated_nodes:
	to_contaminate.update(nx.descendants(self.graph, source))
	self.contaminated_nodes = to_contaminate

	# ── Task 2: Tools ────────────────────────────────────────────────────────

	def inspect_node(self, node_id: int) -> dict:
	"""Noisy visual inspection (80% TP / 10% FP). Blocked on hidden nodes."""
	if node_id not in self.graph.nodes():
	return {"error": "Node does not exist"}
	if node_id in self.hidden_nodes:
	return {
	"error": "Cannot inspect this node",
	"reason": "Node is not directly observable",
	"hint": "Examine downstream nodes to infer its state",
	}
	is_cont = node_id in self.contaminated_nodes
	obs = np.random.random() < (0.8 if is_cont else 0.1)
	return {
	"node_id": node_id,
	"appears_contaminated": bool(obs),
	"confidence": "medium",
	"upstream_count": len(list(self.graph.predecessors(node_id))),
	"downstream_count": len(list(self.graph.successors(node_id))),
	}

	def test_batch(self, node_id: int) -> dict:
	"""Lab test (95% TP / 5% FP). Blocked on hidden nodes."""
	if node_id not in self.graph.nodes():
	return {"error": "Node does not exist"}
	if node_id in self.hidden_nodes:
	return {
	"error": "Cannot test this node",
	"reason": "Node is not directly testable",
	"hint": "Infer contamination from causal structure",
	}
	is_cont = node_id in self.contaminated_nodes
	pos = np.random.random() < (0.95 if is_cont else 0.05)
	return {
	"node_id": node_id,
	"test_result": "POSITIVE" if pos else "NEGATIVE",
	"confidence": "high",
	"cost": 10,
	}

	def trace_upstream(self, node_id: int) -> dict:
	if node_id not in self.graph.nodes():
	return {"error": "Node does not exist"}
	parents = list(self.graph.predecessors(node_id))
	return {"node_id": node_id, "immediate_upstream": parents, "upstream_count": len(parents)}

	def trace_downstream(self, node_id: int) -> dict:
	if node_id not in self.graph.nodes():
	return {"error": "Node does not exist"}
	children = list(self.graph.successors(node_id))
	return {"node_id": node_id, "immediate_downstream": children, "downstream_count": len(children)}

	# ── Task 3: Finalize (F1) ─────────────────────────────────────────────────

	def finalize(self, suspected_nodes) -> dict:
	"""Score binary guess with F1 (precision + recall)."""
	suspected = set(suspected_nodes)
	actual = self.contaminated_nodes
	tp = len(suspected & actual)
	fp = len(suspected - actual)
	fn = len(actual - suspected)
	precision = tp / (tp + fp) if suspected else 0.0
	recall = tp / (tp + fn) if actual else 0.0
	f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
	return {
	"f1_score": f1, "precision": precision, "recall": recall,
	"true_positives": tp, "false_positives": fp, "false_negatives": fn,
	"suspected_nodes": list(suspected), "actual_contaminated": list(actual),
	"total_nodes": self.graph.number_of_nodes(),
	}

	# ── Task 5: Finalize with Belief Calibration ──────────────────────────────

	def finalize_with_beliefs(self, beliefs: dict) -> dict:
	"""
	Score the agent's probabilistic beliefs.

	Args:
	beliefs: {node_id: confidence_probability} e.g. {1: 0.9, 3: 0.4}

	Returns:
	Dict with f1_score, calibration_score (Brier), total_reward, breakdown.
	"""
	suspected = {n for n, conf in beliefs.items() if conf > 0.5}
	actual = self.contaminated_nodes

	tp = len(suspected & actual)
	fp = len(suspected - actual)
	fn = len(actual - suspected)
	precision = tp / (tp + fp) if suspected else 0.0
	recall = tp / (tp + fn) if actual else 0.0
	f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

	calibration = self._calculate_calibration(beliefs)

	# 70% accuracy + 30% calibration
	total_reward = 0.7 * f1 + 0.3 * calibration

	return {
	"f1_score": round(f1, 4),
	"calibration_score": round(calibration, 4),
	"total_reward": round(total_reward, 4),
	"precision": round(precision, 4),
	"recall": round(recall, 4),
	"breakdown": self._get_belief_breakdown(beliefs),
	}

	def _calculate_calibration(self, beliefs: dict) -> float:
	"""Inverted Brier score: 1 = perfect calibration, 0 = worst."""
	if not beliefs:
	return 0.0
	brier = sum(
	(conf - (1 if n in self.contaminated_nodes else 0)) ** 2
	for n, conf in beliefs.items()
	)
	return round(1 - brier / len(beliefs), 4)

	def _get_belief_breakdown(self, beliefs: dict) -> list:
	"""Classify each prediction by correctness and confidence."""
	breakdown = []
	for node_id, confidence in beliefs.items():
	is_cont = node_id in self.contaminated_nodes
	if is_cont and confidence > 0.5:
	result = "CORRECT_HIGH_CONF"
	elif is_cont:
	result = "MISSED_LOW_CONF"
	elif confidence > 0.5:
	result = "FALSE_ALARM_HIGH_CONF"
	else:
	result = "CORRECT_LOW_CONF"
	breakdown.append({
	"node": node_id,
	"confidence": round(confidence, 3),
	"actually_contaminated": is_cont,
	"result": result,
	})
	return breakdown


	# =============================================================================
	# Heuristic Agent (causal inference — same as Tasks 1-4)
	# =============================================================================

	def simple_heuristic_agent(env: ContaminationEnv, n_nodes: int) -> dict:
	"""
	Inspect all observable nodes, infer hidden nodes causally.
	Returns belief dict {node_id: confidence}.
	"""
	observable = [n for n in range(n_nodes) if n not in env.hidden_nodes]
	hidden = list(env.hidden_nodes)
	beliefs = {}

	# Step 1: lab-test observable nodes
	for node in observable:
	result = env.test_batch(node)
	if result.get("test_result") == "POSITIVE":
	beliefs[node] = 0.92
	elif result.get("test_result") == "NEGATIVE":
	beliefs[node] = 0.08

	# Step 2: causal inference for hidden nodes (multi-pass)
	changed = True
	while changed:
	changed = False
	for h in hidden:
	if h in beliefs:
	continue
	parents = list(env.graph.predecessors(h))
	children = list(env.graph.successors(h))

	# If a known-contaminated parent -> this node must be contaminated
	if any(beliefs.get(p, 0) > 0.5 for p in parents):
	beliefs[h] = 0.85
	changed = True
	continue

	# If all children are contaminated -> infer hidden source
	if children and all(beliefs.get(c, 0) > 0.5 for c in children):
	beliefs[h] = 0.75
	changed = True
	continue

	# Partial evidence from children
	if children:
	pos_children = sum(1 for c in children if beliefs.get(c, 0) > 0.5)
	ratio = pos_children / len(children)
	if ratio > 0:
	beliefs[h] = round(0.4 + 0.4 * ratio, 3)
	changed = True

	return beliefs


	def random_agent(n_nodes: int) -> dict:
	"""Purely random baseline."""
	return {
	i: float(np.random.random())
	for i in range(n_nodes)
	if np.random.random() > 0.5
	}


	# =============================================================================
	# Task 6: Training Loop (30 episodes)
	# =============================================================================

	def train_agent(n_episodes: int = 30, difficulty: int = 3) -> tuple:
	"""Run n_episodes and track F1, calibration, and total reward."""
	env = ContaminationEnv(difficulty_level=difficulty)
	rewards, f1_scores, calibration_scores = [], [], []

	print(f"\n{'='*55}")
	print(f" Training Agent — {n_episodes} Episodes (difficulty={difficulty})")
	print(f"{'='*55}")

	for ep in range(n_episodes):
	state = env.reset()
	n_nodes = state["n_nodes"]
	beliefs = simple_heuristic_agent(env, n_nodes)
	result = env.finalize_with_beliefs(beliefs)

	rewards.append(result["total_reward"])
	f1_scores.append(result["f1_score"])
	calibration_scores.append(result["calibration_score"])

	if (ep + 1) % 5 == 0:
	print(f" Ep {ep+1:3d}/{n_episodes} \| F1={result['f1_score']:.3f} "
	f"Cal={result['calibration_score']:.3f} "
	f"Reward={result['total_reward']:.3f}")

	print(f"\n Final averages -> F1={np.mean(f1_scores):.3f} "
	f"Cal={np.mean(calibration_scores):.3f} "
	f"Reward={np.mean(rewards):.3f}")

	return rewards, f1_scores, calibration_scores


	# =============================================================================
	# Task 7: Adversarial Curriculum (5 difficulty stages)
	# =============================================================================

	def train_with_curriculum(total_episodes: int = 50) -> tuple:
	"""Train from difficulty 1 -> 5, stepping up every 10 episodes."""
	env = ContaminationEnv(difficulty_level=1)
	rewards, difficulties = [], []

	print(f"\n{'='*55}")
	print(f" Curriculum Training — {total_episodes} Episodes")
	print(f"{'='*55}")

	for ep in range(total_episodes):
	level = min(5, 1 + ep // 10)
	env.set_difficulty(level)
	state = env.reset()
	beliefs = simple_heuristic_agent(env, state["n_nodes"])
	result = env.finalize_with_beliefs(beliefs)

	rewards.append(result["total_reward"])
	difficulties.append(level)

	if (ep + 1) % 10 == 0:
	print(f" Ep {ep+1:3d}/{total_episodes} \| "
	f"Difficulty={level} Reward={result['total_reward']:.3f}")

	return rewards, difficulties


	# =============================================================================
	# Task 9: Baseline Comparison
	# =============================================================================

	def compare_baselines(n_trials: int = 20, difficulty: int = 3) -> dict:
	"""Compare random vs heuristic agent across n_trials."""
	env = ContaminationEnv(difficulty_level=difficulty)
	results = {"random": [], "heuristic": []}

	for _ in range(n_trials):
	state = env.reset()
	n_nodes = state["n_nodes"]

	# Random baseline
	rg = random_agent(n_nodes)
	results["random"].append(env.finalize_with_beliefs(rg)["f1_score"])

	# Heuristic baseline
	hg = simple_heuristic_agent(env, n_nodes)
	results["heuristic"].append(env.finalize_with_beliefs(hg)["f1_score"])

	return {k: {"mean": round(float(np.mean(v)), 4),
	"std": round(float(np.std(v)), 4)}
	for k, v in results.items()}


	# =============================================================================
	# Plot helpers (Task 6 + 9) — always save as files, never rely on display
	# =============================================================================

	def plot_training_curves(rewards, f1_scores, calibration_scores):
	fig, axes = plt.subplots(1, 3, figsize=(15, 4))
	episodes = range(1, len(rewards) + 1)

	axes[0].plot(episodes, rewards, "b-", linewidth=2)
	axes[0].set_xlabel("Episode"); axes[0].set_ylabel("Total Reward")
	axes[0].set_title("Learning Curve: Total Reward"); axes[0].grid(True, alpha=0.3)

	axes[1].plot(episodes, f1_scores, "g-", linewidth=2)
	axes[1].set_xlabel("Episode"); axes[1].set_ylabel("F1 Score")
	axes[1].set_title("Detection Accuracy (F1)"); axes[1].grid(True, alpha=0.3)

	axes[2].plot(episodes, calibration_scores, "r-", linewidth=2)
	axes[2].set_xlabel("Episode"); axes[2].set_ylabel("Calibration Score")
	axes[2].set_title("Belief Calibration"); axes[2].grid(True, alpha=0.3)

	plt.tight_layout()
	path = os.path.join(PLOT_DIR, "training_curves.png")
	plt.savefig(path, dpi=150, bbox_inches="tight")
	plt.close()
	print(f" Saved -> {path}")


	def plot_curriculum(rewards, difficulties):
	fig, ax = plt.subplots(figsize=(10, 5))
	ax2 = ax.twinx()

	ax.plot(rewards, "b-", linewidth=2, label="Reward")
	ax2.plot(difficulties, "r--", linewidth=2, label="Difficulty", alpha=0.7)

	ax.set_xlabel("Episode"); ax.set_ylabel("Reward", color="b")
	ax2.set_ylabel("Difficulty Level", color="r")
	ax.set_title("Curriculum Learning: Reward vs Difficulty")
	ax.grid(True, alpha=0.3)

	lines1, labels1 = ax.get_legend_handles_labels()
	lines2, labels2 = ax2.get_legend_handles_labels()
	ax.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

	path = os.path.join(PLOT_DIR, "curriculum_learning.png")
	plt.savefig(path, dpi=150, bbox_inches="tight")
	plt.close()
	print(f" Saved -> {path}")


	def plot_baseline_comparison(baselines):
	fig, ax = plt.subplots(figsize=(8, 6))
	names = list(baselines.keys())
	means = [baselines[k]["mean"] for k in names]
	stds = [baselines[k]["std"] for k in names]
	colors = ["#ff6b6b", "#6bcf7f"]

	bars = ax.bar(names, means, yerr=stds, capsize=6,
	color=colors, edgecolor="black", linewidth=0.8)
	ax.set_ylabel("F1 Score", fontsize=12)
	ax.set_title("Baseline Comparison: Detection Performance", fontsize=13, fontweight="bold")
	ax.set_ylim(0, 1.05)
	ax.grid(True, alpha=0.3, axis="y")
	for bar, mean in zip(bars, means):
	ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
	f"{mean:.3f}", ha="center", va="bottom", fontweight="bold")

	path = os.path.join(PLOT_DIR, "baseline_comparison.png")
	plt.savefig(path, dpi=150, bbox_inches="tight")
	plt.close()
	print(f" Saved -> {path}")


	def plot_before_after(f1_scores):
	first5 = f1_scores[:5]
	last5 = f1_scores[-5:]

	fig, ax = plt.subplots(figsize=(8, 6))
	ax.scatter([1] * len(first5), first5, s=120, alpha=0.7, color="red", label="First 5 Episodes")
	ax.scatter([2] * len(last5), last5, s=120, alpha=0.7, color="green",label="Last 5 Episodes")

	ax.plot([1, 2], [np.mean(first5), np.mean(last5)], "k--", linewidth=2, alpha=0.5)
	ax.set_xticks([1, 2]); ax.set_xticklabels(["Before Training", "After Training"])
	ax.set_ylabel("F1 Score"); ax.set_title("Learning Progress: Before vs After")
	ax.legend(); ax.grid(True, alpha=0.3, axis="y"); ax.set_ylim(0, 1.05)

	path = os.path.join(PLOT_DIR, "before_after.png")
	plt.savefig(path, dpi=150, bbox_inches="tight")
	plt.close()
	print(f" Saved -> {path}")


	# =============================================================================
	# Task 9: Generate everything for Shreya
	# =============================================================================

	def generate_all_plots_for_shreya():
	print("\n" + "="*55)
	print(" Generating All Plots & Results")
	print("="*55)

	# ── Training run ──────────────────────────────────────────────────────────
	print("\n[1/4] Training agent (30 episodes, difficulty 3)…")
	rewards, f1, cal = train_agent(n_episodes=30, difficulty=3)
	plot_training_curves(rewards, f1, cal)
	plot_before_after(f1)

	# ── Curriculum run ────────────────────────────────────────────────────────
	print("\n[2/4] Curriculum training (50 episodes, difficulty 1->5)…")
	cur_rewards, cur_diff = train_with_curriculum(total_episodes=50)
	plot_curriculum(cur_rewards, cur_diff)

	# ── Baseline comparison ───────────────────────────────────────────────────
	print("\n[3/4] Baseline comparison (20 trials)…")
	baselines = compare_baselines(n_trials=20, difficulty=3)
	plot_baseline_comparison(baselines)

	# ── Save JSON ─────────────────────────────────────────────────────────────
	print("\n[4/4] Saving results JSON…")
	data = {
	"training": {
	"n_episodes": 30,
	"difficulty": 3,
	"final_f1": float(f1[-1]),
	"final_calibration": float(cal[-1]),
	"final_reward": float(rewards[-1]),
	"avg_f1": round(float(np.mean(f1)), 4),
	"avg_calibration": round(float(np.mean(cal)), 4),
	"avg_reward": round(float(np.mean(rewards)), 4),
	"improvement_f1": round(float(f1[-1] - f1[0]), 4),
	},
	"curriculum": {
	"n_episodes": 50,
	"final_reward": float(cur_rewards[-1]),
	"avg_reward": round(float(np.mean(cur_rewards)), 4),
	},
	"baselines": baselines,
	"plots": [
	os.path.join(PLOT_DIR, f)
	for f in ["training_curves.png", "before_after.png",
	"curriculum_learning.png", "baseline_comparison.png"]
	],
	}
	with open(RESULTS_FILE, "w") as fh:
	json.dump(data, fh, indent=2)
	print(f" Saved -> {RESULTS_FILE}")

	print("\n" + "="*55)
	print(" RESULTS FOR SHREYA")
	print("="*55)
	t = data["training"]
	print(f" Avg F1 Score : {t['avg_f1']:.3f}")
	print(f" Avg Calibration : {t['avg_calibration']:.3f}")
	print(f" Avg Total Reward : {t['avg_reward']:.3f}")
	print(f" F1 Improvement : +{t['improvement_f1']:.3f}")
	print(f"\n Baselines (F1):")
	for name, stats in baselines.items():
	print(f" {name:12s}: {stats['mean']:.3f} ± {stats['std']:.3f}")
	print(f" All plots saved to -> {PLOT_DIR}/")
	print("="*55)

	return data


	# =============================================================================
	# Main — runs everything end-to-end
	# =============================================================================

	if __name__ == "__main__":
	print("RecallTrace — Tasks 1-9 Simulation")
	print("="*55)

	# ── Quick sanity check (Tasks 1-4) ────────────────────────────────────────
	print("\n[SANITY] 10-episode automated agent run…")
	f1_history = []
	for ep in range(10):
	env = ContaminationEnv(difficulty_level=3)
	state = env.reset()
	beliefs = simple_heuristic_agent(env, state["n_nodes"])
	r = env.finalize_with_beliefs(beliefs)
	f1_history.append(r["f1_score"])
	print(f" Ep {ep+1:2d} \| nodes={state['n_nodes']:2d} "
	f"\| hidden={state['n_hidden']} "
	f"\| F1={r['f1_score']:.3f} "
	f"\| Cal={r['calibration_score']:.3f} "
	f"\| Reward={r['total_reward']:.3f}")
	print(f" => Mean F1 over 10 episodes: {np.mean(f1_history):.3f}")

	# ── Task 5: Belief calibration demo ──────────────────────────────────────
	print("\n[TASK 5] Belief calibration example…")
	env = ContaminationEnv(difficulty_level=3)
	env.reset()
	demo_beliefs = {
	n: float(np.random.random())
	for n in range(env.graph.number_of_nodes())
	}
	result = env.finalize_with_beliefs(demo_beliefs)
	print(f" F1={result['f1_score']:.3f} "
	f"Calibration={result['calibration_score']:.3f} "
	f"Total Reward={result['total_reward']:.3f}")

	# ── Tasks 6, 7, 9: Full training + plots ─────────────────────────────────
	data = generate_all_plots_for_shreya()
	print("All done! Done")