File size: 3,146 Bytes
5b7114d
 
83aed13
 
 
 
 
 
 
 
 
 
 
 
5b7114d
 
 
 
 
 
 
 
 
 
 
 
 
83aed13
 
 
 
5b7114d
83aed13
 
 
 
 
 
 
 
 
 
 
 
5b7114d
 
 
 
 
 
83aed13
5b7114d
 
 
 
 
 
 
 
 
 
 
 
 
 
83aed13
 
 
 
 
5b7114d
 
 
 
 
 
 
83aed13
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import argparse

from benchmark_utils import (
    ACCURACY_REPORT_PATH,
    BENCHMARK_RESULTS_PATH,
    PIPELINES,
    read_json,
    write_json,
)
from evaluation.bertscore_eval import compute_bertscore
from evaluation.llm_judge import judge_answers


def _pipeline_answer(row: dict, pipeline: str) -> str:
    """Return the answer stored for ``pipeline`` in ``row``, or ``""`` if absent.

    A missing or null ``"pipelines"`` mapping, pipeline entry, or ``"answer"``
    value all collapse to the empty string, so a single malformed row cannot
    abort the whole evaluation with a ``KeyError`` or ``AttributeError``.
    """
    entry = (row.get("pipelines") or {}).get(pipeline) or {}
    answer = entry.get("answer")
    return "" if answer is None else answer


def main() -> None:
    """Score every pipeline's benchmark answers and write the accuracy report.

    Command-line flags:
        --allow-skip-judge: pass ``raise_on_error=False`` to the LLM judge and
            record a ``None`` pass rate when no verdict other than ``"SKIP"``
            comes back, instead of raising.
        --skip-bertscore: copy ``bertscore_f1`` from the report already stored
            at ``ACCURACY_REPORT_PATH`` instead of calling
            ``compute_bertscore``.

    For each name in ``PIPELINES`` the report stores the judge pass rate, the
    raw verdict list, judged/skipped counts, the distinct judge error strings,
    the BERTScore F1 and the number of benchmark rows.

    Raises:
        FileNotFoundError: ``BENCHMARK_RESULTS_PATH`` yields no rows.
        RuntimeError: the judge produced only ``"SKIP"`` verdicts for a
            pipeline and ``--allow-skip-judge`` was not given.
    """
    parser = argparse.ArgumentParser(description="Evaluate benchmark answer accuracy.")
    parser.add_argument(
        "--allow-skip-judge",
        action="store_true",
        help="Write null LLM judge pass rates instead of failing when judge calls are unavailable.",
    )
    parser.add_argument(
        "--skip-bertscore",
        action="store_true",
        help="Reuse existing BERTScore values from scientific_accuracy_report.json instead of recomputing them.",
    )
    args = parser.parse_args()

    rows = read_json(BENCHMARK_RESULTS_PATH, [])
    if not rows:
        raise FileNotFoundError(f"No benchmark results found in {BENCHMARK_RESULTS_PATH}")

    # Read the previous report before anything is overwritten; it is the
    # source of BERTScore values when --skip-bertscore is set.
    existing_report = read_json(ACCURACY_REPORT_PATH, {})

    # The reference answers do not depend on the pipeline, so build them once.
    references = [row.get("correct_answer", "") for row in rows]

    report = {}
    for pipeline in PIPELINES:
        # Extract each answer once and share it between the judge and BERTScore.
        answers = [_pipeline_answer(row, pipeline) for row in rows]
        judge_rows = [
            {
                "question": row.get("question", ""),
                "correct_answer": row.get("correct_answer", ""),
                "system_answer": answer,
            }
            for row, answer in zip(rows, answers)
        ]
        judge_details = judge_answers(
            judge_rows,
            return_details=True,
            raise_on_error=not args.allow_skip_judge,
        )
        verdicts = [item["verdict"] for item in judge_details]
        judged = [verdict for verdict in verdicts if verdict != "SKIP"]
        skipped = [item for item in judge_details if item["verdict"] == "SKIP"]
        if not judged and not args.allow_skip_judge:
            errors = sorted({item.get("error") or "unknown error" for item in skipped})
            raise RuntimeError(
                "LLM judge produced no verdicts. "
                "Set HF_TOKEN and ensure the judge model is available. "
                f"Errors: {'; '.join(errors)}"
            )

        if args.skip_bertscore:
            # Stays None when the previous report has no value for this pipeline.
            bert_f1 = existing_report.get(pipeline, {}).get("bertscore_f1")
        else:
            bert = compute_bertscore(answers, references)
            bert_f1 = bert["mean_f1"]

        report[pipeline] = {
            # The pass rate is taken over judged answers only; None marks
            # "nothing was judged" rather than a misleading 0.0.
            "llm_judge_pass_rate": (
                sum(verdict == "PASS" for verdict in judged) / len(judged) if judged else None
            ),
            "llm_judge_verdicts": verdicts,
            "llm_judge_judged_count": len(judged),
            "llm_judge_skipped_count": len(skipped),
            "llm_judge_errors": sorted(
                {item.get("error") for item in skipped if item.get("error")}
            ),
            "bertscore_f1": bert_f1,
            "num_questions": len(rows),
        }

    write_json(ACCURACY_REPORT_PATH, report)
    print(f"Saved accuracy report to {ACCURACY_REPORT_PATH}")


# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()