Spaces:

meetkai
/

modelchorus-evals

Runtime error

App Files Files Community

modelchorus-evals / evals /f1_utils.py

brycemeetkai

Mirror evals/ from 4ac99d72af66

a540a5c verified 18 days ago

raw

history blame contribute delete

11.1 kB

	"""Shared macro-F1 utilities for classification / MCQ / claim tasks.

	Provides the macro-averaged F1 aggregation and a common process_results
	helper used across all language-specific classification utils modules.

	Registers:

	- ``regex_last``: like lm_eval's ``regex`` filter, but picks a match from
	``findall`` using ``group_select``; default ``group_select=-1`` is the
	last match (needed when CoT/reasoning mentions labels before the answer).
	- ``strip_think_recover``: drop ``<think>…</think>`` so
	downstream ``regex`` sees only the final answer channel; if that tail is empty
	(e.g. stop at ``\\n\\n`` before content), fall back to the last non-empty line
	of the reasoning block (see ``run_eval.py`` merge format).
	- ``regex_label_set``: pick the last occurrence of any allowed label from a
	per-doc field (e.g. ``labels_str`` for SIB-200, ``intents_str`` for InjongoIntent).
	Robust to channel-marker leak (e.g. a leaked ``<|channel|>`` header before the
	answer), models that say "the answer is X", and substring collisions
	(``science/technology`` vs ``science``) -- labels are matched longest-first.
	- ``strip_channel_header``: drop a Harmony-style channel-marker prefix
	(``<channel|>`` / ``<|channel|>`` and optional trailing ``<|message|>``) from
	the start of the response. Useful for open-text generation tasks
	(summarization / QA / open generation) where the actual answer is correct
	but the chat template leaks tokens at the start. No-op when no marker found.
	"""

	import re

	from lm_eval.api.filter import Filter
	from lm_eval.api.registry import register_filter


	def _strip_think_tags(text: str) -> str:
	"""Strip <think>...</think> reasoning wrapper (e.g. Qwen thinking models)."""
	if "</think>" in text:
	return text.split("</think>")[-1].strip()
	return text


	@register_filter("strip_think_recover")
	class StripThinkRecoverFilter(Filter):
	    """Reduce each response to its answer channel before MCQ ``regex`` runs.

	    Text after the last ``</think>`` is kept when it is non-empty. If that
	    tail is empty but a ``</think>`` marker exists, the last non-blank line
	    of the reasoning block (the text before the first ``</think>``, minus
	    anything up to and including a leading ``<think>``) is used instead.
	    Non-string responses are treated as the empty string.
	    """

	    def __init__(self) -> None:
	        pass

	    @staticmethod
	    def _recover(resp):
	        """Return the answer tail of one response, or the recovered line."""
	        text = resp if isinstance(resp, str) else ""
	        answer = _strip_think_tags(text)
	        if answer or "</think>" not in text:
	            return answer

	        # Empty answer tail: fall back to the reasoning before the first
	        # closing tag, dropping anything up to and including "<think>".
	        reasoning = text.partition("</think>")[0]
	        _, opening_tag, inner = reasoning.partition("<think>")
	        if opening_tag:
	            reasoning = inner

	        non_blank = [
	            line.strip()
	            for line in reasoning.strip().splitlines()
	            if line.strip()
	        ]
	        return non_blank[-1] if non_blank else ""

	    def apply(self, resps, docs):
	        """Apply the recovery to every response of every instance.

	        ``docs`` is accepted for interface compatibility and is not read.
	        """
	        return [[self._recover(resp) for resp in inst] for inst in resps]


	def macro_f1_agg(items):
	    """Return the macro-averaged F1 across the labels present in the gold set.

	    ``items`` is a list of per-document sequences whose first two entries
	    are ``(pred, gold)``. Labels that only ever appear as predictions do not
	    get a class of their own; they only count as false positives /
	    false negatives for the gold labels. Returns ``0.0`` for empty input.
	    """
	    pairs = [(entry[0], entry[1]) for entry in items]

	    def f1_for(label):
	        # Tally the confusion counts for this one label.
	        tp = fp = fn = 0
	        for pred, gold in pairs:
	            pred_hit = pred == label
	            gold_hit = gold == label
	            if pred_hit and gold_hit:
	                tp += 1
	            elif pred_hit:
	                fp += 1
	            elif gold_hit:
	                fn += 1

	        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
	        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
	        if (precision + recall) > 0:
	            return 2 * precision * recall / (precision + recall)
	        return 0.0

	    # Only labels that occur in the gold column contribute a class score.
	    gold_labels = sorted({gold for _, gold in pairs})
	    per_label = [f1_for(label) for label in gold_labels]
	    if not per_label:
	        return 0.0
	    return sum(per_label) / len(per_label)


	@register_filter("regex_last")
	class RegexLastFilter(Filter):
	    """Regex extraction that selects one ``findall`` hit by index.

	    ``group_select`` indexes into the list of matches; negative values count
	    from the end (default ``-1`` = last match). Out-of-range indices are
	    clamped to the first / last match rather than raising.
	    """

	    def __init__(
	        self,
	        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
	        group_select: int = -1,
	        fallback: str = "[invalid]",
	    ) -> None:
	        self.regex = re.compile(regex_pattern)
	        self.group_select = group_select
	        self.fallback = fallback

	    def _extract(self, resp):
	        """Return the selected match for one response, or ``fallback``."""
	        text = resp if isinstance(resp, str) else ""
	        hits = self.regex.findall(text)
	        if not hits:
	            return self.fallback

	        if self.group_select < 0:
	            position = max(0, len(hits) + self.group_select)
	        else:
	            position = min(self.group_select, len(hits) - 1)

	        chosen = hits[position]
	        if isinstance(chosen, tuple):
	            # Multi-group pattern: take the first group that captured text.
	            chosen = next((grp for grp in chosen if grp), self.fallback)
	        return str(chosen).strip()

	    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
	        """Extract one value per response; ``docs`` is not read."""
	        return [[self._extract(resp) for resp in inst] for inst in resps]


	@register_filter("regex_label_set")
	class RegexLabelSetFilter(Filter):
	    """Pick the LAST occurrence of any allowed label from the response.

	    The allowed-label list is read from a per-doc field (default ``labels_str``,
	    e.g. ``"entertainment, geography, ..., science/technology, ..."``). Labels
	    are matched longest-first so multi-segment labels like
	    ``science/technology`` win over substring collisions like ``science``.

	    Robust to:

	    - ``<think>...</think>`` reasoning leak -- typically chain
	      ``strip_think_recover`` first to drop the reasoning block, then this
	      filter on the answer tail.
	    - Harmony / channel-marker leak (e.g. a leaked ``<|channel|>`` header
	      followed by the actual label) -- the regex still finds the trailing
	      label substring.
	    - "The answer is X" / "Final: X" patterns -- the LAST occurrence wins.

	    If the doc field is missing, empty, or no label matches, returns
	    ``fallback`` (default ``"[invalid]"``) so the row counts as wrong in F1.

	    Args:
	        labels_field: Name of the doc field holding the separator-joined labels.
	        separator: String that separates labels inside that field.
	        fallback: Value emitted when no label can be extracted.
	    """

	    def __init__(
	        self,
	        labels_field: str = "labels_str",
	        separator: str = ",",
	        fallback: str = "[invalid]",
	    ) -> None:
	        self.labels_field = labels_field
	        self.separator = separator
	        self.fallback = fallback

	    def _build_pattern(self, raw_labels: str):
	        """Compile an alternation over the labels in ``raw_labels``.

	        Returns ``None`` when the string contains no non-blank label.
	        """
	        labels = {part.strip() for part in raw_labels.split(self.separator)}
	        labels.discard("")
	        if not labels:
	            return None
	        # Longest-first so e.g. "science/technology" is tried before the bare
	        # "science" substring inside it; the secondary alphabetical key only
	        # makes the pattern text deterministic across runs.
	        ordered = sorted(labels, key=lambda lbl: (-len(lbl), lbl))
	        # Alternatives must be joined with a bare "|"; an escaped "\|" would
	        # be a literal pipe and the pattern would never match a single label.
	        return re.compile("(" + "|".join(re.escape(lbl) for lbl in ordered) + ")")

	    def _pick_last(self, pattern, resp) -> str:
	        """Return the last label found in ``resp``, or ``fallback``."""
	        if pattern is None:
	            return self.fallback
	        text = resp if isinstance(resp, str) else ""
	        matches = pattern.findall(text)
	        return matches[-1].strip() if matches else self.fallback

	    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
	        """Extract one label per response using each doc's own label set.

	        ``resps`` and ``docs`` are walked in lockstep; a ``None`` doc is
	        treated as having no labels.
	        """
	        # Many docs share the same labels string; build each pattern once.
	        compiled: dict = {}
	        out: list[list[str]] = []
	        for resp_set, doc in zip(resps, docs):
	            raw_labels = str((doc or {}).get(self.labels_field, "") or "")
	            if raw_labels not in compiled:
	                compiled[raw_labels] = self._build_pattern(raw_labels)
	            pattern = compiled[raw_labels]
	            out.append([self._pick_last(pattern, resp) for resp in resp_set])
	        return out


	@register_filter("strip_channel_header")
	class StripChannelHeaderFilter(Filter):
	    """Strip Harmony-style channel/message header leaks from the response.

	    Some providers (notably deepinfra fp8 Gemma) leak chat template tokens like
	    ``<|channel|>final<|message|>`` -- or partial fragments such as
	    ``s.<channel|>`` (where ``s.`` is the tail of the previous token) -- into
	    the assistant ``content``. For open-text tasks (summarization / QA / open
	    generation) this hurts every metric (ROUGE / BLEU / SAS-encoder / LLM judge)
	    because the prefix garbage drags down the score even when the actual answer
	    that follows is correct.

	    Strategy: anchored at the start of the response, match up to
	    ``max_prefix_chars`` (default 80) characters of any text followed by a
	    ``<channel|>`` / ``<|channel|>`` marker, optionally followed by up to 40
	    non-``<`` characters (e.g. ``final``) and a Harmony ``<|message|>`` /
	    ``<message|>`` marker. Drop everything matched, plus trailing whitespace.
	    No-op when no marker is found near the start, so safe for clean responses.

	    Order tip: chain after ``strip_think_recover`` so reasoning is dropped first
	    and this filter operates on the answer tail only.

	    Args:
	        max_prefix_chars: Maximum number of characters allowed before the
	            channel marker for the prefix to be stripped.
	    """

	    def __init__(self, max_prefix_chars: int = 80) -> None:
	        self.max_prefix_chars = int(max_prefix_chars)
	        # ^                                  - anchored at start
	        # .{0,N}?                            - up to N chars of garbage prefix (non-greedy)
	        # <\|?channel\|?>                    - both pipes optional: <channel|>, <|channel|>, ...
	        # (?:[^<]{0,40}<\|?message\|?>)?     - optional Harmony "...<|message|>" tail
	        # \s*                                - eat trailing whitespace
	        #
	        # The pipes must be escaped exactly once (``\|``): a doubled
	        # backslash would turn them into alternation operators and the
	        # pattern would fail to compile.
	        self._pattern = re.compile(
	            r"^.{0," + str(self.max_prefix_chars) + r"}?<\|?channel\|?>"
	            r"(?:[^<]{0,40}<\|?message\|?>)?\s*",
	            re.DOTALL,
	        )

	    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
	        """Remove a leading channel header from every response.

	        Non-string responses are replaced by ``""``; ``docs`` is not read.
	        """
	        out: list[list[str]] = []
	        for resp_set in resps:
	            stripped: list[str] = []
	            for resp in resp_set:
	                if not isinstance(resp, str):
	                    resp = ""
	                # The pattern is start-anchored, so at most one header is removed.
	                stripped.append(self._pattern.sub("", resp, count=1))
	            out.append(stripped)
	        return out


	def process_results_f1(doc, results, *, gold_key="target"):
	    """Return ``(pred, gold)`` for macro-F1 aggregation.

	    ``pred`` is the first result after stripping think wrappers. Full
	    reasoning is logged as ``reasoning_content`` when using ``run_eval.py``.

	    Most tasks use ``gold_key="target"``; override for tasks that store
	    the gold label under a different field name.

	    Args:
	        doc: Mapping for the current document; ``doc[gold_key]`` is the gold label.
	        results: Sequence of model outputs; only the first entry is used.
	        gold_key: Field of ``doc`` that stores the gold label.

	    Returns:
	        ``{"f1_macro": (pred, gold)}`` where both entries are stripped strings.
	        A missing, empty or non-string first result yields ``pred == ""``
	        (same convention as the filters in this module); a missing or
	        ``None`` gold yields ``gold == ""``; any other non-string gold is
	        converted with ``str()`` instead of raising.
	    """
	    first = results[0] if results else None
	    raw_response = first.strip() if isinstance(first, str) else ""
	    pred = _strip_think_tags(raw_response)

	    gold_value = doc.get(gold_key, "")
	    gold = "" if gold_value is None else str(gold_value).strip()
	    return {"f1_macro": (pred, gold)}