File size: 3,295 Bytes
69dc570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import argparse
import json
import os
from pathlib import Path

from src.evaluation.metrics import bleu_score, flatten_ocr_json, levenshtein_similarity


REPO_ROOT = Path(__file__).resolve().parents[2]
GOLD_ROOT = REPO_ROOT / "data" / "lloyd-jones-soph-170" / "ocr"
TEXT_GOLD_PATH = GOLD_ROOT / "lloyd-jones-text.json"
APPARATUS_GOLD_PATH = GOLD_ROOT / "lloyd-jones-apparatus.json"


def build_result_payload(
    model_name: str,
    revision: str,
    precision: str,
    easy_text_path: str,
    easy_apparatus_path: str,
    hard_text_path: str,
    hard_apparatus_path: str,
) -> dict:
    """Build a leaderboard-compatible result dict for one model's OCR run.

    Args:
        model_name: Model identifier recorded in the payload config.
        revision: Model revision/sha (e.g. a git commit or "main").
        precision: Model dtype, with or without the "torch." prefix.
        easy_text_path: Path to the easy-split text prediction JSON.
        easy_apparatus_path: Path to the easy-split apparatus prediction JSON.
        hard_text_path: Path to the hard-split text prediction JSON.
        hard_apparatus_path: Path to the hard-split apparatus prediction JSON.

    Returns:
        Dict with a "config" section (model metadata) and a "results" section
        holding Levenshtein/BLEU scores scaled from percentages to [0, 1].
    """
    text_gold = _load_json(TEXT_GOLD_PATH)
    apparatus_gold = _load_json(APPARATUS_GOLD_PATH)

    # Both splits are scored against the same gold transcription, so build
    # the joined reference once (the original computed it twice verbatim).
    reference = _join_sections(text_gold, apparatus_gold)
    easy_prediction = _join_sections(_load_json(easy_text_path), _load_json(easy_apparatus_path))
    hard_prediction = _join_sections(_load_json(hard_text_path), _load_json(hard_apparatus_path))

    return {
        "config": {
            "model_dtype": _normalize_precision(precision),
            "model_name": model_name,
            "model_sha": revision,
        },
        "results": {
            # Metrics return percentages; the leaderboard expects [0, 1].
            "easy_levenshtein": {"score": levenshtein_similarity(reference, easy_prediction) / 100.0},
            "easy_bleu": {"score": bleu_score(reference, easy_prediction) / 100.0},
            "hard_levenshtein": {"score": levenshtein_similarity(reference, hard_prediction) / 100.0},
            "hard_bleu": {"score": bleu_score(reference, hard_prediction) / 100.0},
        },
    }


def main() -> None:
    """CLI entry point: score predictions and write a leaderboard result JSON."""
    parser = argparse.ArgumentParser(description="Build a leaderboard-compatible result JSON for the OCR benchmark.")
    parser.add_argument("--model-name", required=True)
    parser.add_argument("--revision", default="main")
    parser.add_argument("--precision", default="float16")
    parser.add_argument("--easy-text", required=True)
    parser.add_argument("--easy-apparatus", required=True)
    parser.add_argument("--hard-text", required=True)
    parser.add_argument("--hard-apparatus", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    payload = build_result_payload(
        model_name=args.model_name,
        revision=args.revision,
        precision=args.precision,
        easy_text_path=args.easy_text,
        easy_apparatus_path=args.easy_apparatus,
        hard_text_path=args.hard_text,
        hard_apparatus_path=args.hard_apparatus,
    )

    output_path = Path(args.output)
    # os.makedirs(os.path.dirname(...)) fails with FileNotFoundError when
    # --output is a bare filename (dirname == ""); Path.parent.mkdir handles
    # both the bare-filename and nested-directory cases.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Pin UTF-8: the payload is dumped with ensure_ascii=False, so non-ASCII
    # characters may appear and must not depend on the platform encoding.
    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)


def _load_json(path: str | Path) -> dict[str, str]:
    with open(path) as handle:
        return json.load(handle)


def _join_sections(text_json: dict[str, str], apparatus_json: dict[str, str]) -> str:
    """Flatten both OCR sections and concatenate them under section tags.

    The result is "[TEXT]\\n<flattened text>\\n[APPARATUS]\\n<flattened apparatus>",
    matching the format both references and predictions are scored in.
    """
    text_body = flatten_ocr_json(text_json)
    apparatus_body = flatten_ocr_json(apparatus_json)
    return f"[TEXT]\n{text_body}\n[APPARATUS]\n{apparatus_body}"


def _normalize_precision(precision: str) -> str:
    if precision.startswith("torch."):
        return precision
    return f"torch.{precision}"


# Script entry point: only run the CLI when executed directly, not on import.
if __name__ == "__main__":
    main()