| """ | |
| EEE-specific preprocessing for entity resolution. | |
| Raw strings from the EEE datastore often encode multiple entity types in a | |
| single field (e.g. ``evaluation_name`` contains both benchmark and metric). | |
| These helpers extract clean, resolvable strings before passing them to the | |
| resolver. | |
| Usage:: | |
| from eval_entity_resolver.eee import extract_metric, clean_eval_name | |
| metric_raw = extract_metric("Accuracy on IFEval") # β "Accuracy" | |
| bench_raw = clean_eval_name("bfcl.live.live_accuracy") # β "bfcl live" | |
| """ | |
from __future__ import annotations

import re

# ------------------------------------------------------------------
# Metric extraction
# ------------------------------------------------------------------
def extract_metric(metric_desc: str) -> str:
    """Extract a reusable metric name from an EEE evaluation description.

    EEE configs rarely provide a structured metric_id. Instead the metric
    lives inside ``evaluation_description`` in one of several formats:

    * **"X on Y"** → ``"Accuracy on IFEval"`` → ``"Accuracy"``
    * **Dot notation** → ``"bfcl.live.live_accuracy"`` → ``"Accuracy"``
    * **Verbose description** → ``"Chat accuracy - includes easy subsets"``
      → ``"Accuracy"`` (keyword extraction)
    * **No keyword** → ``"Global MMLU Lite - Arabic"`` → ``"score"``
      (generic fallback)

    The returned string is passed to the resolver, which maps it to a
    canonical metric entity via alias lookup / normalized match.
    """
    text = metric_desc.strip()
    if not text:
        return text

    from_dot = False

    # 1. Dot notation: "bfcl.live.live_accuracy" → last segment → "live accuracy"
    if "." in text and " " not in text:
        text = text.rsplit(".", 1)[1].replace("_", " ").strip()
        from_dot = True

    # 2. "X on Y" pattern: "Accuracy on IFEval" → "Accuracy"
    if not from_dot:
        m = re.match(r"^(.+?)\s+on\s+\S+", text, re.IGNORECASE)
        if m:
            text = m.group(1).strip()

    # 3. Try keyword extraction on any multi-word text or dot-notation segment.
    #    Single bare words ("Accuracy", "F1", "EM") pass straight to the resolver.
    word_count = len(text.split())
    needs_extraction = from_dot or word_count > 1
    if needs_extraction:
        canonical = _keyword_extract(text)
        if canonical:
            return canonical

    # No keyword found → verbose descriptions (4+ words) → generic fallback.
    # Short phrases (2-3 words) pass through so the resolver can still
    # match them via alias (e.g. "Equivalent (CoT)" → cot-correct).
    if not from_dot and word_count > 3:
        return "score"
    return text
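
# Illustrative traces of the three steps above. The inputs are made-up
# examples, not strings taken from a real EEE config:
#
#   extract_metric("bfcl.live.live_accuracy")
#       step 1 → "live accuracy", step 3 keyword → "Accuracy"
#   extract_metric("Helpfulness on ArenaHard")
#       step 2 → "Helpfulness" (single bare word, passed to the resolver as-is)
#   extract_metric("Overall quality of model responses")
#       no keyword, 5 words → "score" fallback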

# Ordered from most-specific to most-generic. When multiple patterns
# match, the earliest *position* in the input text wins (see
# _keyword_extract).
_METRIC_KEYWORDS: list[tuple[str, str]] = [
    # Multi-word / compound patterns
    (r"pass@8", "Pass@8"),
    (r"pass@1", "Pass@1"),
    (r"mean[\s_-]*win[\s_-]*rate", "Mean Win Rate"),
    (r"win[\s_-]*rate", "Win Rate"),
    (r"mean[\s_-]*response[\s_-]*time", "Mean Response Time"),
    (r"mean[\s_-]*score", "Mean Score"),
    (r"exact[\s_-]*match", "Exact Match"),
    (r"bleu[\s_-]*4", "BLEU-4"),
    (r"cot[\s_-]*correct", "COT correct"),
    (r"wb[\s_-]*score", "WB Score"),
    (r"avg[\s_-]*attempts", "Average Attempts"),
    (r"latency[\s_-]*mean", "mean-latency"),
    (r"latency.*(?:p95|95th)", "p95-latency"),
    (r"latency.*(?:std|standard)", "latency-stddev"),
    (r"max[\s_-]*delta", "max-delta"),
    (r"benchmark\s+evaluation", "score"),
    (r"outperform", "rank"),
    # Compound accuracy types (before generic accuracy)
    # Patterns sourced from metric_names in evaleval/card_backend eval-list.
    (r"ast[\s_-]*accuracy", "AST Accuracy"),
    (r"overall[\s_-]*accuracy", "Accuracy"),
    (r"(?:ir)?relevance[\s_-]*detection[\s_-]*accuracy", "Accuracy"),
    (r"no[\s_-]*snippet[\s_-]*accuracy", "Accuracy"),
    (r"long[\s_-]*context[\s_-]*accuracy", "Accuracy"),
    (r"kv[\s_-]*accuracy", "Accuracy"),
    (r"vector[\s_-]*accuracy", "Accuracy"),
    (r"recursive[\s_-]*summarization[\s_-]*accuracy", "Accuracy"),
    (r"total[\s_-]*cost", "cost"),
    (r"cost[\s_-]*per[\s_-]*task", "cost-per-task"),
    # Single-word patterns (generic, checked last by position)
    (r"\baccuracy\b", "Accuracy"),
    (r"\bacc\b", "Accuracy"),
    (r"\bscores?\b", "score"),
    (r"\bf1\b", "F1"),
    (r"\bem\b", "Exact Match"),
    (r"\belo\b", "Elo Rating"),
    (r"\branks?\b", "rank"),
    (r"\bcosts?\b", "cost"),
    (r"\bharmlessness\b", "harmlessness"),
    (r"\bstddev\b", "stddev"),
]


def _keyword_extract(text: str) -> str | None:
    """Return the canonical metric name for the first keyword found in *text*."""
    lower = text.lower()
    best: str | None = None
    best_pos = len(lower) + 1
    for pattern, canonical in _METRIC_KEYWORDS:
        m = re.search(pattern, lower)
        if m and m.start() < best_pos:
            best_pos = m.start()
            best = canonical
    return best
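
# Position beats list order. For a hypothetical input like
# "accuracy and win rate", the pattern ``win[\s_-]*rate`` (listed earlier)
# matches at position 13, but ``\baccuracy\b`` matches at position 0, so
# _keyword_extract returns "Accuracy". List order only breaks ties between
# patterns that match at the same position.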


# ------------------------------------------------------------------
# Benchmark-name cleaning
# ------------------------------------------------------------------

# Trailing metric patterns for space-separated names (e.g.
# "Gaming Score" → "Gaming"). Checked with ``re.search`` against
# the lowered name; the first match wins.
_TRAILING_METRIC_RE: list[str] = [
    r"mean\s+win\s+rate$",
    r"mean\s+response\s+time$",
    r"mean\s+score$",
    r"win\s+rate$",
    r"avg\s+attempts$",
    r"avg\s+latency\s+ms$",
    r"cost\s+per\s+\d+\s+calls\s+usd$",
    r"cost\s+per\s+task$",
    r"pass@\d+$",
    r"\b(?:score|accuracy|acc|elo|rank|f1|em)$",
]
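
# e.g. (hypothetical names) "agentx_cost_per_1000_calls_usd" → "agentx",
# "Chatbot Arena Elo" → "Chatbot Arena". Every pattern is anchored with
# ``$``, so mid-name keywords (e.g. "score_board_v2") are left untouched.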


def clean_eval_name(eval_name: str) -> str:
    """Strip embedded metric information from an ``evaluation_name``.

    EEE configs often encode both benchmark *and* metric in a single
    ``evaluation_name`` string. This function extracts the benchmark
    portion so that the metric lives only in ``metric_id``.

    Patterns handled:

    * **Dot notation** → ``"bfcl.live.live_accuracy"`` → ``"bfcl live"``
      (last segment is the metric, everything before is the benchmark)
    * **Underscore suffix** → ``"fibble1_arena_win_rate"`` → ``"fibble1 arena"``
    * **Trailing words** → ``"Gaming Score"`` → ``"Gaming"``
    """
    name = eval_name.strip()
    if not name:
        return name

    # --- 1. Dot notation: split on last dot ------------------------------
    # The last segment is the metric; everything before is the benchmark.
    # e.g. "bfcl.live.live_simple_ast_accuracy" → "bfcl live"
    if "." in name and " " not in name:
        parts = name.rsplit(".", 1)[0].split(".")
        return " ".join(p.replace("_", " ") for p in parts)

    # --- 2. Underscore/space names: strip trailing metric keywords -------
    # Normalise underscores to spaces so "fibble1_arena_win_rate" and
    # "Gaming Score" use the same codepath.
    has_underscores = "_" in name and " " not in name
    normalized = name.replace("_", " ") if has_underscores else name
    lower = normalized.lower()
    for pattern in _TRAILING_METRIC_RE:
        m = re.search(pattern, lower)
        if m:
            prefix = normalized[: m.start()].strip()
            if prefix:
                return prefix
            break  # matched but prefix is empty → fall through
    return name
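

# Minimal smoke test; run with ``python -m eval_entity_resolver.eee``
# (the module path is an assumption taken from the usage example in the
# module docstring). Expected outputs follow the rules documented above.
if __name__ == "__main__":
    _cases = [
        (extract_metric, "Accuracy on IFEval", "Accuracy"),
        (extract_metric, "bfcl.live.live_accuracy", "Accuracy"),
        (extract_metric, "Global MMLU Lite - Arabic", "score"),
        (clean_eval_name, "bfcl.live.live_accuracy", "bfcl live"),
        (clean_eval_name, "fibble1_arena_win_rate", "fibble1 arena"),
        (clean_eval_name, "Gaming Score", "Gaming"),
    ]
    for _func, _raw, _expected in _cases:
        _got = _func(_raw)
        _status = "ok" if _got == _expected else "MISMATCH"
        print(f"{_status}: {_func.__name__}({_raw!r}) -> {_got!r}")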