Spaces:
Running
Running
File size: 3,146 Bytes
5b7114d 83aed13 5b7114d 83aed13 5b7114d 83aed13 5b7114d 83aed13 5b7114d 83aed13 5b7114d 83aed13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | import argparse
from benchmark_utils import (
ACCURACY_REPORT_PATH,
BENCHMARK_RESULTS_PATH,
PIPELINES,
read_json,
write_json,
)
from evaluation.bertscore_eval import compute_bertscore
from evaluation.llm_judge import judge_answers
def _parse_args() -> argparse.Namespace:
    """Parse the command-line flags accepted by this script.

    Returns:
        Namespace with two boolean attributes, ``allow_skip_judge`` and
        ``skip_bertscore``; both default to ``False``.
    """
    parser = argparse.ArgumentParser(description="Evaluate benchmark answer accuracy.")
    parser.add_argument(
        "--allow-skip-judge",
        action="store_true",
        help="Write null LLM judge pass rates instead of failing when judge calls are unavailable.",
    )
    parser.add_argument(
        "--skip-bertscore",
        action="store_true",
        help="Reuse existing BERTScore values from scientific_accuracy_report.json instead of recomputing them.",
    )
    return parser.parse_args()


def _evaluate_pipeline(
    pipeline: str,
    rows: list,
    questions: list,
    references: list,
    existing_report: dict,
    *,
    allow_skip_judge: bool,
    skip_bertscore: bool,
) -> dict:
    """Build the accuracy-report entry for a single pipeline.

    Args:
        pipeline: Key used to look up this pipeline's answer in each row's
            ``"pipelines"`` mapping.
        rows: Benchmark result rows; each must contain a ``"pipelines"`` key.
        questions: Question text per row, aligned with ``rows``.
        references: Reference answer per row, aligned with ``rows``.
        existing_report: Previously saved report, consulted only when
            ``skip_bertscore`` is set.
        allow_skip_judge: When true, a run in which every verdict is "SKIP"
            yields a ``None`` pass rate instead of raising.
        skip_bertscore: When true, reuse ``bertscore_f1`` from
            ``existing_report`` (``None`` if absent) instead of recomputing.

    Returns:
        The report entry for ``pipeline``.

    Raises:
        KeyError: If a row has no ``"pipelines"`` key.
        RuntimeError: If no verdict other than "SKIP" was produced and
            ``allow_skip_judge`` is false.
    """
    # A missing pipeline entry or missing answer is scored as an empty string.
    answers = [row["pipelines"].get(pipeline, {}).get("answer", "") for row in rows]
    judge_rows = [
        {
            "question": question,
            "correct_answer": reference,
            "system_answer": answer,
        }
        for question, reference, answer in zip(questions, references, answers)
    ]
    judge_details = judge_answers(
        judge_rows,
        return_details=True,
        raise_on_error=not allow_skip_judge,
    )
    verdicts = [item["verdict"] for item in judge_details]
    # Any verdict other than "SKIP" counts towards the pass-rate denominator.
    judged = [verdict for verdict in verdicts if verdict != "SKIP"]
    skipped = [item for item in judge_details if item["verdict"] == "SKIP"]

    if not judged and not allow_skip_judge:
        errors = sorted({item.get("error") or "unknown error" for item in skipped})
        raise RuntimeError(
            "LLM judge produced no verdicts. "
            "Set HF_TOKEN and ensure the judge model is available. "
            f"Errors: {'; '.join(errors)}"
        )

    if skip_bertscore:
        bert_f1 = existing_report.get(pipeline, {}).get("bertscore_f1")
    else:
        bert_f1 = compute_bertscore(answers, references)["mean_f1"]

    return {
        "llm_judge_pass_rate": (
            sum(verdict == "PASS" for verdict in judged) / len(judged) if judged else None
        ),
        "llm_judge_verdicts": verdicts,
        "llm_judge_judged_count": len(judged),
        "llm_judge_skipped_count": len(skipped),
        # Distinct, non-empty error messages reported for skipped items.
        "llm_judge_errors": sorted(
            {item.get("error") for item in skipped if item.get("error")}
        ),
        "bertscore_f1": bert_f1,
        "num_questions": len(rows),
    }


def main() -> None:
    """Score every pipeline's benchmark answers and save the accuracy report.

    Reads the benchmark rows from ``BENCHMARK_RESULTS_PATH``, evaluates each
    pipeline in ``PIPELINES`` with the LLM judge and BERTScore, writes the
    combined report to ``ACCURACY_REPORT_PATH`` and prints a confirmation.

    Raises:
        FileNotFoundError: If no benchmark rows could be read.
        RuntimeError: If the LLM judge yields no verdicts for a pipeline and
            ``--allow-skip-judge`` was not given.
    """
    args = _parse_args()

    rows = read_json(BENCHMARK_RESULTS_PATH, [])
    if not rows:
        raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}")
    existing_report = read_json(ACCURACY_REPORT_PATH, {})

    # Questions and reference answers are the same for every pipeline, so
    # they are extracted once instead of once per loop iteration.
    questions = [row.get("question", "") for row in rows]
    references = [row.get("correct_answer", "") for row in rows]

    report = {}
    for pipeline in PIPELINES:
        report[pipeline] = _evaluate_pipeline(
            pipeline,
            rows,
            questions,
            references,
            existing_report,
            allow_skip_judge=args.allow_skip_judge,
            skip_bertscore=args.skip_bertscore,
        )

    write_json(ACCURACY_REPORT_PATH, report)
    print(f"Saved accuracy report to {ACCURACY_REPORT_PATH}")
# Run the evaluation only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or any scoring.
if __name__ == "__main__":
    main()
|