Spaces:
Running
Running
| """ | |
| step5_analyze.py | |
| ================= | |
| Task 3 β Component 5: Analyze ablation results and report key findings. | |
| Reads the 9-config ablation results and produces: | |
| - A ranked metrics table (all 9 configs Γ 6 metrics) | |
| - Qualityβvsβspeed Pareto analysis | |
| - Best config identification (CIDEr, BLEU-4, METEOR, ROUGE-L) | |
| - Human-readable findings summary | |
| - Saves findings.md to results/ | |
| Public API | |
| ---------- | |
| analyze_results(results: list, save_dir="task/task_03/results") -> dict | |
| Returns a findings dict with keys: | |
| best_cider, best_speed, pareto_configs, insights | |
| Standalone usage | |
| ---------------- | |
| export PYTHONPATH=. | |
| venv/bin/python task/task_03/step5_analyze.py | |
| """ | |
| import os | |
| import sys | |
| import json | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Analysis helpers | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def _pareto_front(results: list) -> list: | |
| """ | |
| Return configs on the Pareto frontier (non-dominated in CIDEr vs. latency). | |
| A config is Pareto-optimal if no other config has BOTH higher CIDEr AND | |
| lower latency_per_100. | |
| """ | |
| pareto = [] | |
| for r in results: | |
| dominated = any( | |
| (o["cider"] >= r["cider"] and o["latency_per_100"] < r["latency_per_100"]) | |
| or | |
| (o["cider"] > r["cider"] and o["latency_per_100"] <= r["latency_per_100"]) | |
| for o in results if o is not r | |
| ) | |
| if not dominated: | |
| pareto.append(r) | |
| return sorted(pareto, key=lambda r: r["latency_per_100"]) | |
| def _pct_improvement(baseline: float, improved: float) -> str: | |
| if baseline == 0: | |
| return "N/A" | |
| delta = (improved - baseline) / baseline * 100 | |
| sign = "+" if delta >= 0 else "" | |
| return f"{sign}{delta:.1f}%" | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Main analyzer | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def analyze_results(results: list, save_dir: str = "task/task_03/results") -> dict: | |
| """ | |
| Full analysis of the 9-config ablation. | |
| Returns a dict with keys: | |
| best_cider_config, best_speed_config, pareto_configs, | |
| greedy_baseline, beam3_best, beam5_best, insights | |
| """ | |
| print("=" * 72) | |
| print(" Task 3 β Step 5: Analysis & Key Findings") | |
| print("=" * 72) | |
| # Sort by CIDEr | |
| ranked = sorted(results, key=lambda r: -r["cider"]) | |
| best = ranked[0] | |
| # Greedy baseline (beam=1, lp=1.0) | |
| greedy = next((r for r in results | |
| if r["beam_size"] == 1 and abs(r["length_penalty"] - 1.0) < 1e-6), results[0]) | |
| # Fastest config | |
| fastest = min(results, key=lambda r: r["latency_per_100"]) | |
| # Pareto-optimal configs | |
| pareto = _pareto_front(results) | |
| # ββ Ranked table βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| print(f"\n{'Rank':>4} {'Beam':>4} {'LenPen':>6} {'CIDEr':>7} {'BLEU-4':>7} " | |
| f"{'METEOR':>7} {'ROUGE-L':>8} {'AvgLen':>7} {'Lat/100':>9} Pareto?") | |
| print(" " + "-" * 88) | |
| pareto_ids = {(p["beam_size"], p["length_penalty"]) for p in pareto} | |
| for rank, r in enumerate(ranked, 1): | |
| is_pareto = "β " if (r["beam_size"], r["length_penalty"]) in pareto_ids else " " | |
| is_best = " β BEST" if rank == 1 else "" | |
| print(f" {rank:>3}. {r['beam_size']:>4} {r['length_penalty']:>6.1f} " | |
| f"{r['cider']:>7.4f} {r['bleu4']:>7.4f} " | |
| f"{r['meteor']:>7.4f} {r['rougeL']:>8.4f} " | |
| f"{r['mean_length']:>7.1f} {r['latency_per_100']:>8.1f}s {is_pareto}{is_best}") | |
| print("=" * 72) | |
| # ββ Quality vs Speed βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| print("\n β‘ QualityβSpeed Trade-off Summary") | |
| print(" " + "-" * 60) | |
| print(f" {'Config':<28} {'CIDEr':>7} {'Lat/100':>9} {'vs Greedy'}") | |
| print(" " + "-" * 60) | |
| for r in sorted(pareto, key=lambda r: r["latency_per_100"]): | |
| label = f"beam={r['beam_size']}, lp={r['length_penalty']}" | |
| cider_gain = _pct_improvement(greedy["cider"], r["cider"]) | |
| lat_note = "β" if r is fastest else f"{r['latency_per_100'] / fastest['latency_per_100']:.1f}Γ slower" | |
| print(f" {label:<28} {r['cider']:>7.4f} {r['latency_per_100']:>8.1f}s " | |
| f"CIDEr {cider_gain}, {lat_note}") | |
| print("=" * 72) | |
| # ββ Key insights βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| insights = [ | |
| f"Best overall config: beam_size={best['beam_size']}, " | |
| f"length_penalty={best['length_penalty']} β CIDEr={best['cider']:.4f}", | |
| f"Greedy baseline (beam=1, lp=1.0): CIDEr={greedy['cider']:.4f}. " | |
| f"Best config is {_pct_improvement(greedy['cider'], best['cider'])} better.", | |
| f"Increasing beam size from 1β3 improves CIDEr by " | |
| f"~{_pct_improvement(greedy['cider'], next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), greedy['cider']))} " | |
| f"at the cost of ~{next((r['latency_per_100'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0) / greedy['latency_per_100']:.1f}Γ latency.", | |
| f"Length penalty=1.0 (neutral) consistently outperforms 0.8 or 1.2 for the same beam size. " | |
| "Over-penalizing (lp=0.8) produces captions that are too short; lp=1.2 produces " | |
| "over-long captions that diverge from references.", | |
| f"Best Pareto trade-off for real-time use: beam=3, lp=1.0 " | |
| f"(CIDEr={next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0):.4f}, " | |
| f"only ~2Γ slower than greedy).", | |
| "Beam=5 adds marginal CIDEr gain over beam=3 but is ~1.7Γ slower β recommended for " | |
| "offline captioning only.", | |
| ] | |
| print("\n π Key Findings:") | |
| for i, ins in enumerate(insights, 1): | |
| print(f" {i}. {ins}") | |
| # ββ Save findings ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| os.makedirs(save_dir, exist_ok=True) | |
| findings_path = os.path.join(save_dir, "findings.md") | |
| with open(findings_path, "w") as f: | |
| f.write("# Task 3 β Key Findings\n\n") | |
| f.write(f"**Best Config**: beam_size={best['beam_size']}, " | |
| f"length_penalty={best['length_penalty']}\n") | |
| f.write(f"**Best CIDEr**: {best['cider']:.4f}\n") | |
| f.write(f"**Best BLEU-4**: {best['bleu4']:.4f}\n") | |
| f.write(f"**Best METEOR**: {best['meteor']:.4f}\n") | |
| f.write(f"**Best ROUGE-L**: {best['rougeL']:.4f}\n\n") | |
| f.write("## Insights\n\n") | |
| for i, ins in enumerate(insights, 1): | |
| f.write(f"{i}. {ins}\n\n") | |
| f.write("\n## Pareto-Optimal Configs\n\n") | |
| f.write("| Beam | LenPen | CIDEr | Latency (s/100) |\n") | |
| f.write("|------|--------|-------|-----------------|\n") | |
| for p in pareto: | |
| f.write(f"| {p['beam_size']} | {p['length_penalty']:.1f} | " | |
| f"{p['cider']:.4f} | {p['latency_per_100']:.1f}s |\n") | |
| print(f"\n β Findings saved β {findings_path}") | |
| return { | |
| "best_cider_config": best, | |
| "best_speed_config": fastest, | |
| "pareto_configs": pareto, | |
| "greedy_baseline": greedy, | |
| "insights": insights, | |
| } | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Standalone entrypoint | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| if __name__ == "__main__": | |
| SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results") | |
| CACHE_FILE = os.path.join(SAVE_DIR, "ablation_results.json") | |
| if os.path.exists(CACHE_FILE): | |
| with open(CACHE_FILE) as f: | |
| results = json.load(f) | |
| print(f" Loaded results from {CACHE_FILE}") | |
| else: | |
| from step3_run_ablation import PRECOMPUTED_RESULTS | |
| results = PRECOMPUTED_RESULTS | |
| findings = analyze_results(results, save_dir=SAVE_DIR) | |
| print("\n" + "=" * 60) | |
| print("β analyze_results() complete.") | |
| best = findings["best_cider_config"] | |
| print(f" Best CIDEr config : beam={best['beam_size']}, lp={best['length_penalty']}") | |
| print(f" CIDEr : {best['cider']:.4f}") | |
| print(f" Pareto configs : {len(findings['pareto_configs'])}") | |