"""
Contains supplementary routines for post-hoc validation/inspection of the results:
    - Additional safeguard that dev set results are reliable (external recomputation of F1/EM metrics).
    - Offers example-level inspection to users.
"""

import json
import pandas as pd
from pathlib import Path
from src.utils.constants import Col
from src.evaluation.squad_v2_official import normalize_answer, compute_exact, compute_f1


def validate_experiment(exp_dir: Path, df: pd.DataFrame) -> pd.DataFrame:
    """Load predictions, compute scores, validate against saved metrics."""
    exp_dir = Path(exp_dir)
    # Load and merge predictions
    preds = json.loads((exp_dir / "predictions.json").read_text())
    pred_series = pd.Series(preds, name="predicted_answer")

    df_eval = df.set_index(Col.QUESTION_ID.value).join(pred_series)
    assert df_eval["predicted_answer"].isna().sum() == 0, "Missing predictions"

    df_eval = _compute_scores(df_eval)
    computed_em = 100.0 * df_eval["em_score"].mean()
    computed_f1 = 100.0 * df_eval["f1_score"].mean()

    # Compare with saved
    saved = json.loads((exp_dir / "metrics.json").read_text())
    saved_em, saved_f1 = saved["exact_score"], saved["f1_score"]

    print(f"\n{exp_dir.name}")
    print(f"Computed: EM={computed_em:.2f}%, F1={computed_f1:.2f}%")
    print(f"Saved:    EM={saved_em:.2f}%, F1={saved_f1:.2f}%")
    # Allow a small tolerance for floating-point / rounding differences.
    if abs(computed_em - saved_em) < 0.01 and abs(computed_f1 - saved_f1) < 0.01:
        print("MATCH\n")
    else:
        print("MISMATCH - check evaluation pipeline\n")
    return df_eval
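

def show_worst_examples(df_eval: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """Example-level inspection: return the n predictions with the lowest F1.

    A minimal sketch, assuming df_eval is the frame returned by
    validate_experiment (question-id index; predicted_answer, em_score and
    f1_score columns). The selected columns are illustrative.
    """
    cols = [Col.ANSWER_TEXTS.value, "predicted_answer", "em_score", "f1_score"]
    return df_eval.sort_values("f1_score").head(n)[cols]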


def _compute_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Adds em_score and f1_score columns."""
    scores = []
    for _, row in df.iterrows():
        golds = row[Col.ANSWER_TEXTS.value]
        pred = row["predicted_answer"]

        # SQuAD v2 convention: unanswerable questions are scored against the
        # empty string. Gold answers that normalize to an empty string are
        # dropped so they cannot grant a free match; if none remain, fall
        # back to [""].
        if not golds:
            golds = [""]
        else:
            golds = [g for g in golds if normalize_answer(str(g))] or [""]

        # Score against the best-matching gold answer, as in the official script.
        em = max((compute_exact(g, pred) for g in golds), default=0)
        f1 = max((compute_f1(g, pred) for g in golds), default=0.0)
        scores.append((em, f1))

    df = df.copy()
    df["em_score"] = [s[0] for s in scores]
    df["f1_score"] = [s[1] for s in scores]
    return df
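

if __name__ == "__main__":
    # Minimal usage sketch. Assumes the dev-set dataframe is stored as a
    # parquet file; the CLI arguments and file format here are illustrative
    # assumptions, not a fixed interface.
    import argparse

    parser = argparse.ArgumentParser(description="Recompute EM/F1 for a finished experiment.")
    parser.add_argument("exp_dir", type=Path, help="Directory containing predictions.json and metrics.json")
    parser.add_argument("dev_df", type=Path, help="Parquet file holding the dev-set dataframe")
    args = parser.parse_args()

    df_scored = validate_experiment(args.exp_dir, pd.read_parquet(args.dev_df))
    print(show_worst_examples(df_scored))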