# Source: graphrag-benchmark/scripts/build_final_summary.py
# Last commit: "Deploy GraphRAG benchmark backend" (83aed13), by VedantDhavan
# File size: 1.93 kB
from benchmark_utils import (
ACCURACY_REPORT_PATH,
BENCHMARK_RESULTS_PATH,
FINAL_SUMMARY_PATH,
PIPELINES,
read_json,
write_json,
)
def _as_mapping(value):
    """Return *value* unchanged when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _pipeline_metrics(row, pipeline):
    """Return the metrics dict stored for *pipeline* in one benchmark row.

    A row without a "pipelines" entry, or with a null/non-dict entry for the
    pipeline, yields an empty dict so the row is simply skipped by the
    averaging step instead of aborting the whole summary.
    """
    per_pipeline = _as_mapping(_as_mapping(row).get("pipelines"))
    return _as_mapping(per_pipeline.get(pipeline))


def main() -> None:
    """Build the per-pipeline summary and write it to FINAL_SUMMARY_PATH.

    Loads the benchmark rows from BENCHMARK_RESULTS_PATH and the accuracy
    report from ACCURACY_REPORT_PATH, then for every pipeline in PIPELINES:

    * averages "total_tokens", "latency_seconds" and "estimated_cost"
      across all rows,
    * copies "llm_judge_pass_rate" and "bertscore_f1" from the accuracy
      report (None when absent),
    * adds the percentage reduction of each averaged metric relative to the
      "llm_only" pipeline.

    The result is written with write_json and the output path is printed.

    Raises:
        FileNotFoundError: If the loaded benchmark results are empty.
    """
    # The second argument is presumably the fallback returned when the file
    # cannot be read -- confirm against benchmark_utils.read_json.
    benchmark_rows = read_json(BENCHMARK_RESULTS_PATH, [])
    accuracy = _as_mapping(read_json(ACCURACY_REPORT_PATH, {}))
    if not benchmark_rows:
        raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}")

    summary = {}
    for pipeline in PIPELINES:
        metrics = [_pipeline_metrics(row, pipeline) for row in benchmark_rows]
        # A null accuracy entry is treated the same as a missing one.
        scores = _as_mapping(accuracy.get(pipeline))
        summary[pipeline] = {
            "avg_total_tokens": average(metrics, "total_tokens"),
            "avg_latency_seconds": average(metrics, "latency_seconds"),
            "avg_estimated_cost": average(metrics, "estimated_cost"),
            "llm_judge_pass_rate": scores.get("llm_judge_pass_rate"),
            "bertscore_f1": scores.get("bertscore_f1"),
        }

    # If "llm_only" was not summarised, every reduction below becomes None.
    baseline = summary.get("llm_only", {})
    for item in summary.values():
        item["token_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_total_tokens"), item.get("avg_total_tokens")
        )
        item["latency_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_latency_seconds"), item.get("avg_latency_seconds")
        )
        item["cost_reduction_vs_llm_only"] = reduction(
            baseline.get("avg_estimated_cost"), item.get("avg_estimated_cost")
        )

    write_json(FINAL_SUMMARY_PATH, summary)
    print(f"Saved final summary to {FINAL_SUMMARY_PATH}")
def average(rows, key):
    """Return the arithmetic mean of the numeric ``row[key]`` values in *rows*.

    Rows that are not dicts, rows without *key*, and rows whose value is not
    an int or float are skipped. Booleans are skipped too: ``bool`` is a
    subclass of ``int``, so a stray True/False would otherwise be averaged
    as 1/0.

    Args:
        rows: Iterable of metric dicts (other entries are ignored).
        key: Name of the metric to average.

    Returns:
        The mean of the collected values, or None when no row contributes one.
    """
    values = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        value = row.get(key)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            values.append(value)
    return sum(values) / len(values) if values else None
def reduction(baseline, current):
    """Return how much *current* is below *baseline*, as a percentage.

    A positive result means *current* is smaller than *baseline*; a negative
    result means it is larger. None is returned when *baseline* is falsy
    (None or zero, which would make the ratio undefined) or when *current*
    is None.
    """
    comparable = bool(baseline) and current is not None
    if not comparable:
        return None
    saved = baseline - current
    return (saved / baseline) * 100
# Run the summary build only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == "__main__":
    main()