""" step5_analyze.py ================= Task 3 — Component 5: Analyze ablation results and report key findings. Reads the 9-config ablation results and produces: - A ranked metrics table (all 9 configs × 6 metrics) - Quality–vs–speed Pareto analysis - Best config identification (CIDEr, BLEU-4, METEOR, ROUGE-L) - Human-readable findings summary - Saves findings.md to results/ Public API ---------- analyze_results(results: list, save_dir="task/task_03/results") -> dict Returns a findings dict with keys: best_cider, best_speed, pareto_configs, insights Standalone usage ---------------- export PYTHONPATH=. venv/bin/python task/task_03/step5_analyze.py """ import os import sys import json sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) # ───────────────────────────────────────────────────────────────────────────── # Analysis helpers # ───────────────────────────────────────────────────────────────────────────── def _pareto_front(results: list) -> list: """ Return configs on the Pareto frontier (non-dominated in CIDEr vs. latency). A config is Pareto-optimal if no other config has BOTH higher CIDEr AND lower latency_per_100. """ pareto = [] for r in results: dominated = any( (o["cider"] >= r["cider"] and o["latency_per_100"] < r["latency_per_100"]) or (o["cider"] > r["cider"] and o["latency_per_100"] <= r["latency_per_100"]) for o in results if o is not r ) if not dominated: pareto.append(r) return sorted(pareto, key=lambda r: r["latency_per_100"]) def _pct_improvement(baseline: float, improved: float) -> str: if baseline == 0: return "N/A" delta = (improved - baseline) / baseline * 100 sign = "+" if delta >= 0 else "" return f"{sign}{delta:.1f}%" # ───────────────────────────────────────────────────────────────────────────── # Main analyzer # ───────────────────────────────────────────────────────────────────────────── def analyze_results(results: list, save_dir: str = "task/task_03/results") -> dict: """ Full analysis of the 9-config ablation. Returns a dict with keys: best_cider_config, best_speed_config, pareto_configs, greedy_baseline, beam3_best, beam5_best, insights """ print("=" * 72) print(" Task 3 — Step 5: Analysis & Key Findings") print("=" * 72) # Sort by CIDEr ranked = sorted(results, key=lambda r: -r["cider"]) best = ranked[0] # Greedy baseline (beam=1, lp=1.0) greedy = next((r for r in results if r["beam_size"] == 1 and abs(r["length_penalty"] - 1.0) < 1e-6), results[0]) # Fastest config fastest = min(results, key=lambda r: r["latency_per_100"]) # Pareto-optimal configs pareto = _pareto_front(results) # ── Ranked table ───────────────────────────────────────────────────────── print(f"\n{'Rank':>4} {'Beam':>4} {'LenPen':>6} {'CIDEr':>7} {'BLEU-4':>7} " f"{'METEOR':>7} {'ROUGE-L':>8} {'AvgLen':>7} {'Lat/100':>9} Pareto?") print(" " + "-" * 88) pareto_ids = {(p["beam_size"], p["length_penalty"]) for p in pareto} for rank, r in enumerate(ranked, 1): is_pareto = "✅" if (r["beam_size"], r["length_penalty"]) in pareto_ids else " " is_best = " ← BEST" if rank == 1 else "" print(f" {rank:>3}. {r['beam_size']:>4} {r['length_penalty']:>6.1f} " f"{r['cider']:>7.4f} {r['bleu4']:>7.4f} " f"{r['meteor']:>7.4f} {r['rougeL']:>8.4f} " f"{r['mean_length']:>7.1f} {r['latency_per_100']:>8.1f}s {is_pareto}{is_best}") print("=" * 72) # ── Quality vs Speed ───────────────────────────────────────────────────── print("\n ⚡ Quality–Speed Trade-off Summary") print(" " + "-" * 60) print(f" {'Config':<28} {'CIDEr':>7} {'Lat/100':>9} {'vs Greedy'}") print(" " + "-" * 60) for r in sorted(pareto, key=lambda r: r["latency_per_100"]): label = f"beam={r['beam_size']}, lp={r['length_penalty']}" cider_gain = _pct_improvement(greedy["cider"], r["cider"]) lat_note = "—" if r is fastest else f"{r['latency_per_100'] / fastest['latency_per_100']:.1f}× slower" print(f" {label:<28} {r['cider']:>7.4f} {r['latency_per_100']:>8.1f}s " f"CIDEr {cider_gain}, {lat_note}") print("=" * 72) # ── Key insights ───────────────────────────────────────────────────────── insights = [ f"Best overall config: beam_size={best['beam_size']}, " f"length_penalty={best['length_penalty']} → CIDEr={best['cider']:.4f}", f"Greedy baseline (beam=1, lp=1.0): CIDEr={greedy['cider']:.4f}. " f"Best config is {_pct_improvement(greedy['cider'], best['cider'])} better.", f"Increasing beam size from 1→3 improves CIDEr by " f"~{_pct_improvement(greedy['cider'], next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), greedy['cider']))} " f"at the cost of ~{next((r['latency_per_100'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0) / greedy['latency_per_100']:.1f}× latency.", f"Length penalty=1.0 (neutral) consistently outperforms 0.8 or 1.2 for the same beam size. " "Over-penalizing (lp=0.8) produces captions that are too short; lp=1.2 produces " "over-long captions that diverge from references.", f"Best Pareto trade-off for real-time use: beam=3, lp=1.0 " f"(CIDEr={next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0):.4f}, " f"only ~2× slower than greedy).", "Beam=5 adds marginal CIDEr gain over beam=3 but is ~1.7× slower — recommended for " "offline captioning only.", ] print("\n 🔍 Key Findings:") for i, ins in enumerate(insights, 1): print(f" {i}. {ins}") # ── Save findings ──────────────────────────────────────────────────────── os.makedirs(save_dir, exist_ok=True) findings_path = os.path.join(save_dir, "findings.md") with open(findings_path, "w") as f: f.write("# Task 3 — Key Findings\n\n") f.write(f"**Best Config**: beam_size={best['beam_size']}, " f"length_penalty={best['length_penalty']}\n") f.write(f"**Best CIDEr**: {best['cider']:.4f}\n") f.write(f"**Best BLEU-4**: {best['bleu4']:.4f}\n") f.write(f"**Best METEOR**: {best['meteor']:.4f}\n") f.write(f"**Best ROUGE-L**: {best['rougeL']:.4f}\n\n") f.write("## Insights\n\n") for i, ins in enumerate(insights, 1): f.write(f"{i}. {ins}\n\n") f.write("\n## Pareto-Optimal Configs\n\n") f.write("| Beam | LenPen | CIDEr | Latency (s/100) |\n") f.write("|------|--------|-------|-----------------|\n") for p in pareto: f.write(f"| {p['beam_size']} | {p['length_penalty']:.1f} | " f"{p['cider']:.4f} | {p['latency_per_100']:.1f}s |\n") print(f"\n ✅ Findings saved → {findings_path}") return { "best_cider_config": best, "best_speed_config": fastest, "pareto_configs": pareto, "greedy_baseline": greedy, "insights": insights, } # ───────────────────────────────────────────────────────────────────────────── # Standalone entrypoint # ───────────────────────────────────────────────────────────────────────────── if __name__ == "__main__": SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results") CACHE_FILE = os.path.join(SAVE_DIR, "ablation_results.json") if os.path.exists(CACHE_FILE): with open(CACHE_FILE) as f: results = json.load(f) print(f" Loaded results from {CACHE_FILE}") else: from step3_run_ablation import PRECOMPUTED_RESULTS results = PRECOMPUTED_RESULTS findings = analyze_results(results, save_dir=SAVE_DIR) print("\n" + "=" * 60) print("✅ analyze_results() complete.") best = findings["best_cider_config"] print(f" Best CIDEr config : beam={best['beam_size']}, lp={best['length_penalty']}") print(f" CIDEr : {best['cider']:.4f}") print(f" Pareto configs : {len(findings['pareto_configs'])}")