"""Aggregate per-pipeline benchmark metrics into a final summary JSON file."""

from benchmark_utils import (
    ACCURACY_REPORT_PATH,
    BENCHMARK_RESULTS_PATH,
    FINAL_SUMMARY_PATH,
    PIPELINES,
    read_json,
    write_json,
)


def main() -> None:
    """Build the per-pipeline summary and write it to ``FINAL_SUMMARY_PATH``.

    For every name in ``PIPELINES`` the summary holds the average total
    tokens, latency and estimated cost across all benchmark rows, the
    accuracy figures copied from the accuracy report, and the percentage
    reduction of each average relative to the ``"llm_only"`` pipeline.

    Raises:
        FileNotFoundError: If the benchmark results load as empty.
    """
    # The second argument to read_json looks like a fallback value used when
    # the file cannot be loaded -- NOTE(review): confirm in benchmark_utils.
    benchmark_rows = read_json(BENCHMARK_RESULTS_PATH, [])
    accuracy = read_json(ACCURACY_REPORT_PATH, {})
    if not benchmark_rows:
        raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}")

    summary = {}
    for pipeline in PIPELINES:
        metrics = [_pipeline_metrics(row, pipeline) for row in benchmark_rows]
        # `or {}` also covers an explicit null entry in the accuracy report.
        pipeline_accuracy = accuracy.get(pipeline) or {}
        summary[pipeline] = {
            "avg_total_tokens": average(metrics, "total_tokens"),
            "avg_latency_seconds": average(metrics, "latency_seconds"),
            "avg_estimated_cost": average(metrics, "estimated_cost"),
            "llm_judge_pass_rate": pipeline_accuracy.get("llm_judge_pass_rate"),
            "bertscore_f1": pipeline_accuracy.get("bertscore_f1"),
        }

    # If "llm_only" is absent, every reduction below comes out as None.
    baseline = summary.get("llm_only", {})
    for item in summary.values():
        item["token_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_total_tokens"), item.get("avg_total_tokens")
        )
        item["latency_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_latency_seconds"), item.get("avg_latency_seconds")
        )
        item["cost_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_estimated_cost"), item.get("avg_estimated_cost")
        )

    write_json(FINAL_SUMMARY_PATH, summary)
    print(f"Saved final summary to {FINAL_SUMMARY_PATH}")


def _pipeline_metrics(row, pipeline):
    """Return the metrics dict for *pipeline* in *row*, or ``{}`` if absent.

    A row without a ``"pipelines"`` mapping, or whose entry for *pipeline*
    is missing or not a dict (e.g. JSON ``null``), yields ``{}`` so it
    contributes no values to the averages instead of raising.
    """
    pipelines = row.get("pipelines") if isinstance(row, dict) else None
    if not isinstance(pipelines, dict):
        return {}
    metrics = pipelines.get(pipeline)
    return metrics if isinstance(metrics, dict) else {}


def average(rows, key):
    """Return the mean of the numeric ``row[key]`` values, or ``None``.

    Args:
        rows: Iterable of dicts.
        key: Key to look up in each row.

    Returns:
        The arithmetic mean of all ``int``/``float`` values found under
        *key*, or ``None`` when no row has a numeric value there.
    """
    values = []
    for row in rows:
        value = row.get(key)
        # bool is a subclass of int; a stray true/false must not be averaged
        # as 1/0.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            continue
        values.append(value)
    return sum(values) / len(values) if values else None


def reduction(baseline, current):
    """Return the percentage by which *current* is below *baseline*.

    A positive result means *current* is smaller than *baseline*; a negative
    result means it is larger.

    Returns:
        ``(baseline - current) / baseline * 100``, or ``None`` when
        *baseline* is missing or zero (which would divide by zero) or
        *current* is ``None``.
    """
    if not baseline:
        return None
    if current is None:
        return None
    return ((baseline - current) / baseline) * 100


if __name__ == "__main__":
    main()