Spaces:

VedantDhavan
/

graphrag-benchmark

Sleeping

App Files Files Community

graphrag-benchmark / scripts /evaluate_accuracy.py

VedantDhavan

Deploy public PDF upload backend

5b7114d about 1 month ago

Raw

History Blame Contribute Delete

3.15 kB

	import argparse

	from benchmark_utils import (
	ACCURACY_REPORT_PATH,
	BENCHMARK_RESULTS_PATH,
	PIPELINES,
	read_json,
	write_json,
	)
	from evaluation.bertscore_eval import compute_bertscore
	from evaluation.llm_judge import judge_answers


	def _pipeline_answer(row: dict, pipeline: str) -> str:
	    """Return the answer *pipeline* produced for *row*, or "" when there is none.

	    A row without a "pipelines" mapping, a pipeline without an entry, and an
	    entry whose "answer" is missing or null all yield the empty string, so
	    every absent-answer case is treated the same way.
	    """
	    entry = (row.get("pipelines") or {}).get(pipeline) or {}
	    return entry.get("answer") or ""


	def main() -> None:
	    """Score every pipeline's benchmark answers and write the accuracy report.

	    Reads the benchmark rows from BENCHMARK_RESULTS_PATH, runs the LLM judge
	    (and, unless skipped, BERTScore) once per entry of PIPELINES, and writes
	    one summary dict per pipeline to ACCURACY_REPORT_PATH.

	    Command-line flags:
	        --allow-skip-judge: do not fail when the judge yields no verdicts;
	            the pass rate is written as null instead.
	        --skip-bertscore: copy "bertscore_f1" from the report already at
	            ACCURACY_REPORT_PATH instead of recomputing it. The value is
	            None when that report has no entry for the pipeline.

	    Raises:
	        FileNotFoundError: if no benchmark rows could be read.
	        RuntimeError: if a pipeline ends up with no non-"SKIP" verdict and
	            --allow-skip-judge was not given.
	    """
	    parser = argparse.ArgumentParser(description="Evaluate benchmark answer accuracy.")
	    parser.add_argument(
	        "--allow-skip-judge",
	        action="store_true",
	        help="Write null LLM judge pass rates instead of failing when judge calls are unavailable.",
	    )
	    parser.add_argument(
	        "--skip-bertscore",
	        action="store_true",
	        help="Reuse existing BERTScore values from scientific_accuracy_report.json instead of recomputing them.",
	    )
	    args = parser.parse_args()

	    rows = read_json(BENCHMARK_RESULTS_PATH, [])
	    if not rows:
	        raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}")

	    existing_report = read_json(ACCURACY_REPORT_PATH, {})

	    # Questions and reference answers do not depend on the pipeline, so they
	    # are extracted once instead of once per pipeline.
	    questions = [row.get("question", "") for row in rows]
	    references = [row.get("correct_answer", "") for row in rows]

	    report = {}
	    for pipeline in PIPELINES:
	        answers = [_pipeline_answer(row, pipeline) for row in rows]
	        judge_rows = [
	            {
	                "question": question,
	                "correct_answer": reference,
	                "system_answer": answer,
	            }
	            for question, reference, answer in zip(questions, references, answers)
	        ]
	        judge_details = judge_answers(
	            judge_rows,
	            return_details=True,
	            raise_on_error=not args.allow_skip_judge,
	        )

	        # "SKIP" marks an item the judge could not evaluate; such items are
	        # excluded from the pass-rate denominator.
	        verdicts = [item["verdict"] for item in judge_details]
	        judged = [verdict for verdict in verdicts if verdict != "SKIP"]
	        skipped = [item for item in judge_details if item["verdict"] == "SKIP"]

	        if not judged and not args.allow_skip_judge:
	            errors = sorted({item.get("error") or "unknown error" for item in skipped})
	            raise RuntimeError(
	                "LLM judge produced no verdicts. "
	                "Set HF_TOKEN and ensure the judge model is available. "
	                f"Errors: {'; '.join(errors)}"
	            )

	        if args.skip_bertscore:
	            bert_f1 = existing_report.get(pipeline, {}).get("bertscore_f1")
	        else:
	            bert_f1 = compute_bertscore(answers, references)["mean_f1"]

	        pass_rate = (
	            sum(verdict == "PASS" for verdict in judged) / len(judged) if judged else None
	        )
	        report[pipeline] = {
	            "llm_judge_pass_rate": pass_rate,
	            "llm_judge_verdicts": verdicts,
	            "llm_judge_judged_count": len(judged),
	            "llm_judge_skipped_count": len(skipped),
	            "llm_judge_errors": sorted(
	                {item.get("error") for item in skipped if item.get("error")}
	            ),
	            "bertscore_f1": bert_f1,
	            "num_questions": len(rows),
	        }

	    write_json(ACCURACY_REPORT_PATH, report)
	    print(f"Saved accuracy report to {ACCURACY_REPORT_PATH}")


	# Script entry point: run the evaluation only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	main()