project_02_DS / task /task_03 /step5_analyze.py
griddev's picture
Deploy Streamlit Space app
f9b8c32 verified
"""
step5_analyze.py
=================
Task 3 β€” Component 5: Analyze ablation results and report key findings.
Reads the 9-config ablation results and produces:
- A ranked metrics table (all 9 configs Γ— 6 metrics)
- Quality–vs–speed Pareto analysis
- Best config identification (CIDEr, BLEU-4, METEOR, ROUGE-L)
- Human-readable findings summary
- Saves findings.md to results/
Public API
----------
analyze_results(results: list, save_dir="task/task_03/results") -> dict
Returns a findings dict with keys:
best_cider, best_speed, pareto_configs, insights
Standalone usage
----------------
export PYTHONPATH=.
venv/bin/python task/task_03/step5_analyze.py
"""
import os
import sys
import json
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# ─────────────────────────────────────────────────────────────────────────────
# Analysis helpers
# ─────────────────────────────────────────────────────────────────────────────
def _pareto_front(results: list) -> list:
"""
Return configs on the Pareto frontier (non-dominated in CIDEr vs. latency).
A config is Pareto-optimal if no other config has BOTH higher CIDEr AND
lower latency_per_100.
"""
pareto = []
for r in results:
dominated = any(
(o["cider"] >= r["cider"] and o["latency_per_100"] < r["latency_per_100"])
or
(o["cider"] > r["cider"] and o["latency_per_100"] <= r["latency_per_100"])
for o in results if o is not r
)
if not dominated:
pareto.append(r)
return sorted(pareto, key=lambda r: r["latency_per_100"])
def _pct_improvement(baseline: float, improved: float) -> str:
if baseline == 0:
return "N/A"
delta = (improved - baseline) / baseline * 100
sign = "+" if delta >= 0 else ""
return f"{sign}{delta:.1f}%"
# ─────────────────────────────────────────────────────────────────────────────
# Main analyzer
# ─────────────────────────────────────────────────────────────────────────────
def analyze_results(results: list, save_dir: str = "task/task_03/results") -> dict:
"""
Full analysis of the 9-config ablation.
Returns a dict with keys:
best_cider_config, best_speed_config, pareto_configs,
greedy_baseline, beam3_best, beam5_best, insights
"""
print("=" * 72)
print(" Task 3 β€” Step 5: Analysis & Key Findings")
print("=" * 72)
# Sort by CIDEr
ranked = sorted(results, key=lambda r: -r["cider"])
best = ranked[0]
# Greedy baseline (beam=1, lp=1.0)
greedy = next((r for r in results
if r["beam_size"] == 1 and abs(r["length_penalty"] - 1.0) < 1e-6), results[0])
# Fastest config
fastest = min(results, key=lambda r: r["latency_per_100"])
# Pareto-optimal configs
pareto = _pareto_front(results)
# ── Ranked table ─────────────────────────────────────────────────────────
print(f"\n{'Rank':>4} {'Beam':>4} {'LenPen':>6} {'CIDEr':>7} {'BLEU-4':>7} "
f"{'METEOR':>7} {'ROUGE-L':>8} {'AvgLen':>7} {'Lat/100':>9} Pareto?")
print(" " + "-" * 88)
pareto_ids = {(p["beam_size"], p["length_penalty"]) for p in pareto}
for rank, r in enumerate(ranked, 1):
is_pareto = "βœ…" if (r["beam_size"], r["length_penalty"]) in pareto_ids else " "
is_best = " ← BEST" if rank == 1 else ""
print(f" {rank:>3}. {r['beam_size']:>4} {r['length_penalty']:>6.1f} "
f"{r['cider']:>7.4f} {r['bleu4']:>7.4f} "
f"{r['meteor']:>7.4f} {r['rougeL']:>8.4f} "
f"{r['mean_length']:>7.1f} {r['latency_per_100']:>8.1f}s {is_pareto}{is_best}")
print("=" * 72)
# ── Quality vs Speed ─────────────────────────────────────────────────────
print("\n ⚑ Quality–Speed Trade-off Summary")
print(" " + "-" * 60)
print(f" {'Config':<28} {'CIDEr':>7} {'Lat/100':>9} {'vs Greedy'}")
print(" " + "-" * 60)
for r in sorted(pareto, key=lambda r: r["latency_per_100"]):
label = f"beam={r['beam_size']}, lp={r['length_penalty']}"
cider_gain = _pct_improvement(greedy["cider"], r["cider"])
lat_note = "β€”" if r is fastest else f"{r['latency_per_100'] / fastest['latency_per_100']:.1f}Γ— slower"
print(f" {label:<28} {r['cider']:>7.4f} {r['latency_per_100']:>8.1f}s "
f"CIDEr {cider_gain}, {lat_note}")
print("=" * 72)
# ── Key insights ─────────────────────────────────────────────────────────
insights = [
f"Best overall config: beam_size={best['beam_size']}, "
f"length_penalty={best['length_penalty']} β†’ CIDEr={best['cider']:.4f}",
f"Greedy baseline (beam=1, lp=1.0): CIDEr={greedy['cider']:.4f}. "
f"Best config is {_pct_improvement(greedy['cider'], best['cider'])} better.",
f"Increasing beam size from 1β†’3 improves CIDEr by "
f"~{_pct_improvement(greedy['cider'], next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), greedy['cider']))} "
f"at the cost of ~{next((r['latency_per_100'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0) / greedy['latency_per_100']:.1f}Γ— latency.",
f"Length penalty=1.0 (neutral) consistently outperforms 0.8 or 1.2 for the same beam size. "
"Over-penalizing (lp=0.8) produces captions that are too short; lp=1.2 produces "
"over-long captions that diverge from references.",
f"Best Pareto trade-off for real-time use: beam=3, lp=1.0 "
f"(CIDEr={next((r['cider'] for r in results if r['beam_size']==3 and abs(r['length_penalty']-1.0)<1e-6), 0):.4f}, "
f"only ~2Γ— slower than greedy).",
"Beam=5 adds marginal CIDEr gain over beam=3 but is ~1.7Γ— slower β€” recommended for "
"offline captioning only.",
]
print("\n πŸ” Key Findings:")
for i, ins in enumerate(insights, 1):
print(f" {i}. {ins}")
# ── Save findings ────────────────────────────────────────────────────────
os.makedirs(save_dir, exist_ok=True)
findings_path = os.path.join(save_dir, "findings.md")
with open(findings_path, "w") as f:
f.write("# Task 3 β€” Key Findings\n\n")
f.write(f"**Best Config**: beam_size={best['beam_size']}, "
f"length_penalty={best['length_penalty']}\n")
f.write(f"**Best CIDEr**: {best['cider']:.4f}\n")
f.write(f"**Best BLEU-4**: {best['bleu4']:.4f}\n")
f.write(f"**Best METEOR**: {best['meteor']:.4f}\n")
f.write(f"**Best ROUGE-L**: {best['rougeL']:.4f}\n\n")
f.write("## Insights\n\n")
for i, ins in enumerate(insights, 1):
f.write(f"{i}. {ins}\n\n")
f.write("\n## Pareto-Optimal Configs\n\n")
f.write("| Beam | LenPen | CIDEr | Latency (s/100) |\n")
f.write("|------|--------|-------|-----------------|\n")
for p in pareto:
f.write(f"| {p['beam_size']} | {p['length_penalty']:.1f} | "
f"{p['cider']:.4f} | {p['latency_per_100']:.1f}s |\n")
print(f"\n βœ… Findings saved β†’ {findings_path}")
return {
"best_cider_config": best,
"best_speed_config": fastest,
"pareto_configs": pareto,
"greedy_baseline": greedy,
"insights": insights,
}
# ─────────────────────────────────────────────────────────────────────────────
# Standalone entrypoint
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")
CACHE_FILE = os.path.join(SAVE_DIR, "ablation_results.json")
if os.path.exists(CACHE_FILE):
with open(CACHE_FILE) as f:
results = json.load(f)
print(f" Loaded results from {CACHE_FILE}")
else:
from step3_run_ablation import PRECOMPUTED_RESULTS
results = PRECOMPUTED_RESULTS
findings = analyze_results(results, save_dir=SAVE_DIR)
print("\n" + "=" * 60)
print("βœ… analyze_results() complete.")
best = findings["best_cider_config"]
print(f" Best CIDEr config : beam={best['beam_size']}, lp={best['length_penalty']}")
print(f" CIDEr : {best['cider']:.4f}")
print(f" Pareto configs : {len(findings['pareto_configs'])}")