"""
EEE-specific preprocessing for entity resolution.

Raw strings from the EEE datastore often encode multiple entity types in a
single field (e.g. ``evaluation_name`` contains both benchmark and metric).
These helpers extract clean, resolvable strings before passing them to the
resolver.

Usage::

    from eval_entity_resolver.eee import extract_metric, clean_eval_name

    metric_raw = extract_metric("Accuracy on IFEval")       # → "Accuracy"
    bench_raw  = clean_eval_name("bfcl.live.live_accuracy")  # → "bfcl live"
"""
from __future__ import annotations

import re


# ------------------------------------------------------------------
# Metric extraction
# ------------------------------------------------------------------

def extract_metric(metric_desc: str) -> str:
    """Extract a reusable metric name from an EEE evaluation description.

    EEE configs rarely provide a structured metric_id.  Instead the metric
    lives inside ``evaluation_description`` in one of several formats:

    * **"X on Y"** β€” ``"Accuracy on IFEval"`` β†’ ``"Accuracy"``
    * **Dot notation** β€” ``"bfcl.live.live_accuracy"`` β†’ ``"accuracy"``
    * **Verbose description** β€” ``"Chat accuracy - includes easy subsets"``
      β†’ ``"accuracy"`` (keyword extraction)
    * **No keyword** β€” ``"Global MMLU Lite - Arabic"`` β†’ ``"score"``
      (generic fallback)

    The returned string is passed to the resolver, which maps it to a
    canonical metric entity via alias lookup / normalized match.
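
    Doctest-style examples of the mappings above::

        >>> extract_metric("Accuracy on IFEval")
        'Accuracy'
        >>> extract_metric("bfcl.live.live_accuracy")
        'Accuracy'
        >>> extract_metric("Global MMLU Lite - Arabic")
        'score'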
    """
    text = metric_desc.strip()
    if not text:
        return text

    from_dot = False

    # 1. Dot notation: "bfcl.live.live_accuracy" → last segment → "live accuracy"
    if "." in text and " " not in text:
        text = text.rsplit(".", 1)[1].replace("_", " ").strip()
        from_dot = True

    # 2. "X on Y" pattern: "Accuracy on IFEval" β†’ "Accuracy"
    if not from_dot:
        m = re.match(r"^(.+?)\s+on\s+\S+", text, re.IGNORECASE)
        if m:
            text = m.group(1).strip()

    # 3. Try keyword extraction on any multi-word text or dot-notation segment.
    #    Single bare words ("Accuracy", "F1", "EM") pass straight to the resolver.
    word_count = len(text.split())
    needs_extraction = from_dot or word_count > 1

    if needs_extraction:
        canonical = _keyword_extract(text)
        if canonical:
            return canonical
        # No keyword found — verbose descriptions (4+ words) → generic fallback.
        # Short phrases (2-3 words) pass through so the resolver can still
        # match them via alias (e.g. "Equivalent (CoT)" → cot-correct).
        if not from_dot and word_count > 3:
            return "score"

    return text


# Ordered from most-specific to most-generic.  When multiple patterns
# match, the earliest *position* in the input text wins (see
# _keyword_extract).
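# Illustration: in "mean win rate vs baseline" both "mean win rate" (position 0)
# and "win rate" (position 5) match; the earlier match wins, so _keyword_extract
# returns "Mean Win Rate".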
_METRIC_KEYWORDS: list[tuple[str, str]] = [
    # Multi-word / compound patterns
    (r"pass@8",                          "Pass@8"),
    (r"pass@1",                          "Pass@1"),
    (r"mean[\s_-]*win[\s_-]*rate",       "Mean Win Rate"),
    (r"win[\s_-]*rate",                  "Win Rate"),
    (r"mean[\s_-]*response[\s_-]*time",  "Mean Response Time"),
    (r"mean[\s_-]*score",                "Mean Score"),
    (r"exact[\s_-]*match",               "Exact Match"),
    (r"bleu[\s_-]*4",                    "BLEU-4"),
    (r"cot[\s_-]*correct",              "COT correct"),
    (r"wb[\s_-]*score",                  "WB Score"),
    (r"avg[\s_-]*attempts",              "Average Attempts"),
    (r"latency[\s_-]*mean",              "mean-latency"),
    (r"latency.*(?:p95|95th)",            "p95-latency"),
    (r"latency.*(?:std|standard)",        "latency-stddev"),
    (r"max[\s_-]*delta",                 "max-delta"),
    (r"benchmark\s+evaluation",          "score"),
    (r"outperform",                      "rank"),
    # Compound accuracy types (before generic accuracy)
    # Patterns sourced from metric_names in evaleval/card_backend eval-list.
    (r"ast[\s_-]*accuracy",              "AST Accuracy"),
    (r"overall[\s_-]*accuracy",          "Accuracy"),
    (r"(?:ir)?relevance[\s_-]*detection[\s_-]*accuracy", "Accuracy"),
    (r"no[\s_-]*snippet[\s_-]*accuracy", "Accuracy"),
    (r"long[\s_-]*context[\s_-]*accuracy", "Accuracy"),
    (r"kv[\s_-]*accuracy",               "Accuracy"),
    (r"vector[\s_-]*accuracy",           "Accuracy"),
    (r"recursive[\s_-]*summarization[\s_-]*accuracy", "Accuracy"),
    (r"total[\s_-]*cost",                "cost"),
    (r"cost[\s_-]*per[\s_-]*task",       "cost-per-task"),
    # Single-word patterns (generic; listed last so compound matches win ties)
    (r"\baccuracy\b",                    "Accuracy"),
    (r"\bacc\b",                         "Accuracy"),
    (r"\bscores?\b",                     "score"),
    (r"\bf1\b",                          "F1"),
    (r"\bem\b",                          "Exact Match"),
    (r"\belo\b",                         "Elo Rating"),
    (r"\branks?\b",                      "rank"),
    (r"\bcosts?\b",                      "cost"),
    (r"\bharmlessness\b",                "harmlessness"),
    (r"\bstddev\b",                      "stddev"),
]


def _keyword_extract(text: str) -> str | None:
    """Return the canonical metric name for the first keyword found in *text*."""
    lower = text.lower()
    best: str | None = None
    best_pos = len(lower) + 1
    for pattern, canonical in _METRIC_KEYWORDS:
        m = re.search(pattern, lower)
        if m and m.start() < best_pos:
            best_pos = m.start()
            best = canonical
    return best


# ------------------------------------------------------------------
# Benchmark-name cleaning
# ------------------------------------------------------------------

# Trailing metric patterns for space-separated names (e.g.
# "Gaming Score" β†’ "Gaming").  Checked with ``re.search`` against
# the lowered name; the first match wins.
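# Underscore names take the same path after normalisation, e.g. a name like
# "fibble1_arena_avg_latency_ms" becomes "fibble1 arena avg latency ms", whose
# "avg latency ms" suffix is stripped to leave "fibble1 arena".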
_TRAILING_METRIC_RE: list[str] = [
    r"mean\s+win\s+rate$",
    r"mean\s+response\s+time$",
    r"mean\s+score$",
    r"win\s+rate$",
    r"avg\s+attempts$",
    r"avg\s+latency\s+ms$",
    r"cost\s+per\s+\d+\s+calls\s+usd$",
    r"cost\s+per\s+task$",
    r"pass@\d+$",
    r"\b(?:score|accuracy|acc|elo|rank|f1|em)$",
]


def clean_eval_name(eval_name: str) -> str:
    """Strip embedded metric information from an ``evaluation_name``.

    EEE configs often encode both benchmark *and* metric in a single
    ``evaluation_name`` string.  This function extracts the benchmark
    portion so that the metric lives only in ``metric_id``.

    Patterns handled:

    * **Dot notation** — ``"bfcl.live.live_accuracy"`` → ``"bfcl live"``
      (last segment is the metric, everything before is the benchmark)
    * **Underscore suffix** — ``"fibble1_arena_win_rate"`` → ``"fibble1 arena"``
    * **Trailing words** — ``"Gaming Score"`` → ``"Gaming"``
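
    Doctest-style examples::

        >>> clean_eval_name("bfcl.live.live_simple_ast_accuracy")
        'bfcl live'
        >>> clean_eval_name("fibble1_arena_win_rate")
        'fibble1 arena'
        >>> clean_eval_name("Gaming Score")
        'Gaming'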
    """
    name = eval_name.strip()
    if not name:
        return name

    # --- 1. Dot notation: split on last dot ------------------------------
    # The last segment is the metric; everything before is the benchmark.
    # e.g. "bfcl.live.live_simple_ast_accuracy" β†’ "bfcl live"
    if "." in name and " " not in name:
        parts = name.rsplit(".", 1)[0].split(".")
        return " ".join(p.replace("_", " ") for p in parts)

    # --- 2. Underscore/space names: strip trailing metric keywords -------
    # Normalise underscores to spaces so "fibble1_arena_win_rate" and
    # "Gaming Score" use the same codepath.
    has_underscores = "_" in name and " " not in name
    normalized = name.replace("_", " ") if has_underscores else name

    lower = normalized.lower()
    for pattern in _TRAILING_METRIC_RE:
        m = re.search(pattern, lower)
        if m:
            prefix = normalized[: m.start()].strip()
            if prefix:
                return prefix
            break  # matched but prefix is empty — fall through

    return name
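

if __name__ == "__main__":
    # Illustrative smoke check: print the example mappings documented above.
    # Assumes the module is importable as ``eval_entity_resolver.eee`` per the
    # module docstring, e.g. ``python -m eval_entity_resolver.eee``.
    for raw in (
        "Accuracy on IFEval",
        "bfcl.live.live_accuracy",
        "Global MMLU Lite - Arabic",
    ):
        print(f"extract_metric({raw!r}) -> {extract_metric(raw)!r}")
    for raw in (
        "bfcl.live.live_simple_ast_accuracy",
        "fibble1_arena_win_rate",
        "Gaming Score",
    ):
        print(f"clean_eval_name({raw!r}) -> {clean_eval_name(raw)!r}")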