import argparse import csv import os import random import sys import time import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec import numpy as np from tqdm import tqdm sys.path.insert(0, os.path.dirname(__file__)) from src.tasks import EasyTask, MediumTask, HardTask from src.agent import DeterministicAgent TASK_MAP = { "Easy": EasyTask, "Medium": MediumTask, "Hard": HardTask, } def run_single(seed: int) -> dict: random.seed(seed) np.random.seed(seed) agent = DeterministicAgent() results = {"seed": seed} total_score = 0.0 for i, (level, TaskClass) in enumerate(TASK_MAP.items()): task_seed = seed + i * 999 task = TaskClass() state = task.reset(seed=task_seed) done = False steps = 0 total_reward = 0.0 while not done: action = agent.get_action(state) result = task.step(action) state = result.state total_reward += result.reward done = result.done steps += 1 if steps > 500: break score = task.evaluate() info = result.info total_score += score results[f"{level}_score"] = round(score, 6) results[f"{level}_reward"] = round(total_reward, 2) results[f"{level}_cleared"] = info["total_cleared"] results[f"{level}_wait"] = round(info["avg_waiting_time"], 2) results["overall_score"] = round(total_score / len(TASK_MAP), 6) return results def compute_stats(values: np.ndarray, label: str) -> dict: return { "label": label, "mean": float(np.mean(values)), "std": float(np.std(values)), "min": float(np.min(values)), "max": float(np.max(values)), "p5": float(np.percentile(values, 5)), "p95": float(np.percentile(values, 95)), } COLORS = { "Easy": "#4ECDC4", "Medium": "#FFD166", "Hard": "#EF476F", "Overall": "#6A0572", } def plot_histograms(all_records: list, output_path: str): metrics = ["Easy", "Medium", "Hard", "Overall"] fig, axes = plt.subplots(2, 2, figsize=(13, 9)) axes = axes.flatten() for ax, m in zip(axes, metrics): key = f"{m}_score" if m != "Overall" else "overall_score" vals = np.array([r[key] for r in all_records]) mean, std = vals.mean(), vals.std() ax.hist(vals, bins=40, color=COLORS[m], edgecolor="white", linewidth=0.5, alpha=0.85) ax.axvline(mean, color="black", linestyle="--", linewidth=1.4, label=f"Mean={mean:.4f}") ax.axvline(mean - std, color="gray", linestyle=":", linewidth=1.1) ax.axvline(mean + std, color="gray", linestyle=":", linewidth=1.1, label=f"±1σ={std:.4f}") ax.set_title(f"{m} Score Distribution", fontsize=12, fontweight="bold") ax.set_xlabel("Score (0–1)", fontsize=10) ax.set_ylabel("Frequency", fontsize=10) ax.legend(fontsize=9) ax.grid(axis="y", linestyle="--", alpha=0.4) ax.spines[["top", "right"]].set_visible(False) fig.suptitle( f"Stress Test Score Distributions ({len(all_records)} runs)", fontsize=14, fontweight="bold", y=1.01 ) fig.tight_layout() plt.savefig(output_path, dpi=150, bbox_inches="tight") plt.close(fig) print(f"Histogram saved → {output_path}") def plot_time_series(all_records: list, output_path: str): scores = [r["overall_score"] for r in all_records] runs = list(range(1, len(scores) + 1)) window = min(50, len(scores) // 10) rolling = np.convolve(scores, np.ones(window) / window, mode="valid") fig, ax = plt.subplots(figsize=(13, 4)) ax.plot(runs, scores, color="#cccccc", linewidth=0.6, label="Run score") ax.plot( range(window, len(scores) + 1), rolling, color=COLORS["Overall"], linewidth=2.0, label=f"Rolling mean (w={window})" ) ax.axhline(np.mean(scores), color="black", linestyle="--", linewidth=1.2, label=f"Global mean={np.mean(scores):.4f}") ax.set_title("Overall Score Over Simulation Runs", fontsize=13, fontweight="bold") ax.set_xlabel("Run number") ax.set_ylabel("Overall Score") ax.set_ylim(0, 1.05) ax.legend(fontsize=9) ax.grid(linestyle="--", alpha=0.35) ax.spines[["top", "right"]].set_visible(False) fig.tight_layout() plt.savefig(output_path, dpi=150, bbox_inches="tight") plt.close(fig) print(f"Time-series saved → {output_path}") def save_csv(records: list, path: str): if not records: return fieldnames = list(records[0].keys()) with open(path, "w", newline="") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(records) print(f"CSV saved → {path}") def validate(stats: dict, all_records: list) -> tuple[bool, list]: issues = [] overall_scores = np.array([r["overall_score"] for r in all_records]) if stats["std"] < 1e-6: issues.append("Std dev ≈ 0 → outputs are IDENTICAL across all runs (possible hardcoding)") if stats["min"] < 0 or stats["max"] > 1: issues.append(f"Scores out of range: min={stats['min']:.4f}, max={stats['max']:.4f}") z_scores = np.abs((overall_scores - stats["mean"]) / max(stats["std"], 1e-9)) extreme_runs = int((z_scores > 3).sum()) if extreme_runs > len(all_records) * 0.01: issues.append(f"{extreme_runs} extreme outliers detected (>3σ) — possible instability") if stats["mean"] < 0.5: issues.append(f"Mean score {stats['mean']:.4f} < 0.5 — agent performing worse than chance") return (len(issues) == 0), issues def main(): parser = argparse.ArgumentParser(description="Stress test the Smart Traffic environment") parser.add_argument("-n", "--runs", type=int, default=1000, help="Number of simulation runs (default: 1000)") parser.add_argument("--base-seed", type=int, default=None, help="Base seed for deterministic run sequence (optional)") parser.add_argument("--out-dir", type=str, default=".", help="Output directory for CSV and plots") args = parser.parse_args() N = args.runs base_rng = random.Random(args.base_seed) print(f"\n{'='*55}") print(f" SMART TRAFFIC STRESS TEST") print(f" Runs: {N} | Base seed: {args.base_seed or 'random'}") print(f"{'='*55}\n") all_records = [] t0 = time.time() for _ in tqdm(range(N), desc="Simulating", unit="run", ncols=72, colour="cyan"): seed = base_rng.randint(1, 999_999) rec = run_single(seed) all_records.append(rec) elapsed = time.time() - t0 print(f"\nCompleted {N} runs in {elapsed:.1f}s ({elapsed/N*1000:.1f} ms/run)\n") overall_scores = np.array([r["overall_score"] for r in all_records]) easy_scores = np.array([r["Easy_score"] for r in all_records]) medium_scores = np.array([r["Medium_score"] for r in all_records]) hard_scores = np.array([r["Hard_score"] for r in all_records]) overall_stats = compute_stats(overall_scores, "Overall") print(f"{'='*55}") print(f" STRESS TEST REPORT") print(f"{'='*55}") print(f" Runs : {N}") print(f" Elapsed : {elapsed:.1f}s") print() print(f" {'Metric':<18} {'Easy':>8} {'Medium':>8} {'Hard':>8} {'Overall':>9}") print(f" {'-'*55}") for label, vals in [("Mean", [easy_scores.mean(), medium_scores.mean(), hard_scores.mean(), overall_scores.mean()]), ("Std Dev", [easy_scores.std(), medium_scores.std(), hard_scores.std(), overall_scores.std()]), ("Min", [easy_scores.min(), medium_scores.min(), hard_scores.min(), overall_scores.min()]), ("Max", [easy_scores.max(), medium_scores.max(), hard_scores.max(), overall_scores.max()]), ("P5", [np.percentile(easy_scores, 5), np.percentile(medium_scores, 5), np.percentile(hard_scores, 5), np.percentile(overall_scores, 5)]), ("P95", [np.percentile(easy_scores,95), np.percentile(medium_scores,95), np.percentile(hard_scores,95), np.percentile(overall_scores,95)])]: print(f" {label:<18} {vals[0]:>8.4f} {vals[1]:>8.4f} {vals[2]:>8.4f} {vals[3]:>9.4f}") print() stable, issues = validate(overall_stats, all_records) print(f"{'='*55}") print(f" Stability Verdict:") if stable: print(f" STABLE ✅ — All validation rules passed") else: print(f" UNSTABLE ❌ — Issues detected:") for issue in issues: print(f" • {issue}") print(f"{'='*55}\n") os.makedirs(args.out_dir, exist_ok=True) csv_path = os.path.join(args.out_dir, "stress_test_results.csv") hist_path = os.path.join(args.out_dir, "stress_test_histogram.png") ts_path = os.path.join(args.out_dir, "stress_test_timeseries.png") save_csv(all_records, csv_path) plot_histograms(all_records, hist_path) plot_time_series(all_records, ts_path) print("\nDone.") return 0 if stable else 1 if __name__ == "__main__": sys.exit(main())