Spaces:
Sleeping
Sleeping
| import argparse | |
| from benchmark_utils import ( | |
| ACCURACY_REPORT_PATH, | |
| BENCHMARK_RESULTS_PATH, | |
| PIPELINES, | |
| read_json, | |
| write_json, | |
| ) | |
| from evaluation.bertscore_eval import compute_bertscore | |
| from evaluation.llm_judge import judge_answers | |
| def main() -> None: | |
| parser = argparse.ArgumentParser(description="Evaluate benchmark answer accuracy.") | |
| parser.add_argument( | |
| "--allow-skip-judge", | |
| action="store_true", | |
| help="Write null LLM judge pass rates instead of failing when judge calls are unavailable.", | |
| ) | |
| parser.add_argument( | |
| "--skip-bertscore", | |
| action="store_true", | |
| help="Reuse existing BERTScore values from scientific_accuracy_report.json instead of recomputing them.", | |
| ) | |
| args = parser.parse_args() | |
| rows = read_json(BENCHMARK_RESULTS_PATH, []) | |
| if not rows: | |
| raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}") | |
| existing_report = read_json(ACCURACY_REPORT_PATH, {}) | |
| report = {} | |
| for pipeline in PIPELINES: | |
| answers = [row["pipelines"].get(pipeline, {}).get("answer", "") for row in rows] | |
| references = [row.get("correct_answer", "") for row in rows] | |
| judge_rows = [ | |
| { | |
| "question": row.get("question", ""), | |
| "correct_answer": row.get("correct_answer", ""), | |
| "system_answer": row["pipelines"].get(pipeline, {}).get("answer", ""), | |
| } | |
| for row in rows | |
| ] | |
| judge_details = judge_answers( | |
| judge_rows, | |
| return_details=True, | |
| raise_on_error=not args.allow_skip_judge, | |
| ) | |
| verdicts = [item["verdict"] for item in judge_details] | |
| judged = [verdict for verdict in verdicts if verdict != "SKIP"] | |
| skipped = [item for item in judge_details if item["verdict"] == "SKIP"] | |
| if not judged and not args.allow_skip_judge: | |
| errors = sorted({item.get("error") or "unknown error" for item in skipped}) | |
| raise RuntimeError( | |
| "LLM judge produced no verdicts. " | |
| "Set HF_TOKEN and ensure the judge model is available. " | |
| f"Errors: {'; '.join(errors)}" | |
| ) | |
| if args.skip_bertscore: | |
| bert_f1 = existing_report.get(pipeline, {}).get("bertscore_f1") | |
| else: | |
| bert = compute_bertscore(answers, references) | |
| bert_f1 = bert["mean_f1"] | |
| report[pipeline] = { | |
| "llm_judge_pass_rate": ( | |
| sum(verdict == "PASS" for verdict in judged) / len(judged) if judged else None | |
| ), | |
| "llm_judge_verdicts": verdicts, | |
| "llm_judge_judged_count": len(judged), | |
| "llm_judge_skipped_count": len(skipped), | |
| "llm_judge_errors": sorted( | |
| {item.get("error") for item in skipped if item.get("error")} | |
| ), | |
| "bertscore_f1": bert_f1, | |
| "num_questions": len(rows), | |
| } | |
| write_json(ACCURACY_REPORT_PATH, report) | |
| print(f"Saved accuracy report to {ACCURACY_REPORT_PATH}") | |
| if __name__ == "__main__": | |
| main() | |