Spaces:

evaleval
/

entity-registry

Sleeping

App Files Files Community

entity-registry / packages /eval-entity-resolver /src /eval_entity_resolver /display.py

j-chim

Upload folder using huggingface_hub

4feff22 verified 11 days ago

raw

history blame contribute delete

6 kB

	"""Display-name humanization for canonical model slugs.

	Single source of truth for converting machine slugs (`gpt-4o-2024-05-13`) into
	human-friendly display names (`GPT-4o (2024-05-13)`). Used by refresh scripts
	and the seed migration; consumers (frontend, API) should NOT re-humanize but
	read `canonical_models.display_name` directly.

	Rules in priority order:
	1. Strip org prefix (`openai/gpt-5` -> `gpt-5`).
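	2. Strip and parenthesize a trailing date suffix (first matching form wins):
	- `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
	- `-YYYYMMDD` -> ` (YYYY-MM-DD)`
	- `-YYYY-MM` -> ` (YYYY-MM)`
	- `-MM-YYYY` -> ` (YYYY-MM)` (Cohere convention, reordered to ISO)
	- `-MMDD` (4-digit) -> ` (MMDD)`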
	3. Per-token formatting:
	- Known acronyms render uppercase (`gpt` -> `GPT`).
	- Mixed-case overrides apply (`moe` -> `MoE`).
	- Param sizes uppercase the unit (`7b` -> `7B`, `a22b` -> `A22B`,
	`8x7b` -> `8x7B`, `30m` -> `30M`).
	- Number+letter version tags preserve case (`4o` -> `4o`).
	- O-series stays lowercase (`o1`, `o3`).
	- Vendor-name overrides (`deepseek` -> `DeepSeek`).
	- Default: capitalize first letter.
	4. Glue an acronym token to the next token with a hyphen when the next
	token is a bare version number (digits + optional `.NN` + optional
	single non-size letter): `GPT 5 Mini` -> `GPT-5 Mini`,
	`GPT 4o ...` -> `GPT-4o ...`. Skipped when the next token is a param
	size like `7B`.
	"""

	from __future__ import annotations

	import re

	# Lower-cased tokens that render in ALL CAPS. `_format_token` consults
	# CASE_OVERRIDES first, so a token listed in both places gets the mixed-case
	# spelling; it still counts as an acronym for hyphen-gluing purposes.
	ACRONYMS: frozenset[str] = frozenset(
	    (
	        "asr", "clip", "dit", "glm", "gpt", "hf",
	        "llm", "mlp", "moe", "mt", "ocr", "qvq",
	        "qwq", "rlhf", "tts", "vit", "vl", "vlm",
	    )
	)

	# Mixed-case spellings that take precedence over the ALL-CAPS acronym rule.
	CASE_OVERRIDES: dict[str, str] = {"moe": "MoE", "vit": "ViT", "dit": "DiT"}

	# Vendor / family names that a plain "capitalize the first letter" would get
	# wrong (`deepseek` must read `DeepSeek`, not `Deepseek`). Deliberately a
	# short list of tokens the default rule mangles, not a branding registry.
	TOKEN_OVERRIDES: dict[str, str] = {
	    "deepseek": "DeepSeek",
	    "openai": "OpenAI",
	    "stepfun": "StepFun",
	    "moonshotai": "MoonshotAI",
	    "mistralai": "MistralAI",
	}

	# Trailing letters that mark a parameter count rather than a version letter.
	# A token such as `7b` following an acronym is therefore NOT hyphen-glued.
	_SIZE_SUFFIXES: frozenset[str] = frozenset(("b", "k", "m"))


	def humanize_model_slug(slug: str) -> str:
	    """Render a model slug as a human display name.

	    Accepts a bare slug (`gpt-4o-2024-05-13`) or a full canonical id
	    (`openai/gpt-4o-2024-05-13`); everything up to and including the first
	    `/` is dropped as the org prefix. A falsy `slug` yields `""`.

	    The remaining slug has its trailing date/code suffix split off, is cut
	    into hyphen-delimited tokens, and each token is formatted on its own.
	    An acronym token directly followed by a version token is joined to it
	    with a hyphen (`GPT-4o`); all other tokens are joined with single
	    spaces. The parenthesized suffix, if any, is appended last.
	    """
	    if not slug:
	        return ""
	    if "/" in slug:
	        slug = slug.split("/", 1)[1]

	    slug, suffix = _strip_date_suffix(slug)

	    # A doubled, leading, or trailing hyphen (`gpt--4o`, `gpt-4o-`) makes
	    # str.split emit empty tokens. Left in, they show up as stray spaces in
	    # the display name and sit between an acronym and its version number,
	    # preventing the hyphen-glue below.
	    tokens = [t for t in slug.split("-") if t]

	    out: list[str] = []
	    i = 0
	    while i < len(tokens):
	        rendered = _format_token(tokens[i])
	        has_next = i + 1 < len(tokens)
	        if (
	            has_next
	            and tokens[i].lower() in ACRONYMS
	            and _is_version_token(tokens[i + 1])
	        ):
	            # Consume the version token together with its acronym.
	            out.append(f"{rendered}-{_format_token(tokens[i + 1])}")
	            i += 2
	        else:
	            out.append(rendered)
	            i += 1

	    # strip() guards the degenerate case where only the suffix is left, so
	    # the result never starts with the suffix's separating space.
	    return (" ".join(out) + suffix).strip()


	def _strip_date_suffix(slug: str) -> tuple[str, str]:
	    """Pop a trailing date or 4-digit code; return (slug_without, ' (suffix)').

	    Returns `(slug, "")` unchanged when no trailing suffix is recognised.

	    Order matters: the bare 4-digit rule must come last, otherwise it would
	    claim the `2024` of `command-r-08-2024` and leave `08` behind as an
	    ordinary token.

	    Month and day fields are range-checked (01-12 and 01-31), so a numeric
	    tail that merely looks date-shaped (`-20251399`, `-2048-64`) is not
	    rendered as a date. Days are not checked against the month's length.
	    """
	    # Full ISO date: `-YYYY-MM-DD`
	    m = re.search(
	        r"-(20\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01]))$", slug
	    )
	    if m:
	        return slug[: m.start()], f" ({m.group(1)})"
	    # Compact date: `-YYYYMMDD`, re-punctuated as ISO.
	    m = re.search(r"-(20\d{2})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$", slug)
	    if m:
	        return slug[: m.start()], f" ({m.group(1)}-{m.group(2)}-{m.group(3)})"
	    # Year-month: `-YYYY-MM` (e.g. `gpt-5-2025-08`)
	    m = re.search(r"-(20\d{2})-(0[1-9]|1[0-2])$", slug)
	    if m:
	        return slug[: m.start()], f" ({m.group(1)}-{m.group(2)})"
	    # Cohere convention: `-MM-YYYY` (e.g. `command-r-08-2024`).
	    # Render as `(YYYY-MM)` for ISO-ordered display.
	    m = re.search(r"-(0[1-9]|1[0-2])-(20\d{2})$", slug)
	    if m:
	        return slug[: m.start()], f" ({m.group(2)}-{m.group(1)})"
	    # Bare 4-digit code: `-NNNN` (e.g. `grok-4-0709`, `kimi-k2-0711`).
	    # Not range-checked: it is an opaque code, not necessarily MMDD.
	    m = re.search(r"-(\d{4})$", slug)
	    if m:
	        return slug[: m.start()], f" ({m.group(1)})"
	    return slug, ""


	def _format_token(tok: str) -> str:
	    """Format a single hyphen-delimited slug token for display.

	    Rules are tried in order and the first match wins: mixed-case override,
	    acronym (upper-cased), vendor/family override, parameter size
	    (`7b` -> `7B`), `a22b`-style size (`A22B`), `8x7b`-style size (`8x7B`),
	    digit+letter version tag or o-series (returned lower-cased), and
	    finally "upper-case the first letter, keep the rest". An empty token is
	    returned unchanged.
	    """
	    if not tok:
	        return tok

	    key = tok.lower()

	    # Exact-match lookups, highest priority first.
	    cased = CASE_OVERRIDES.get(key)
	    if cased is not None:
	        return cased
	    if key in ACRONYMS:
	        return key.upper()
	    branded = TOKEN_OVERRIDES.get(key)
	    if branded is not None:
	        return branded

	    stem, last = key[:-1], key[-1]

	    # Param size: 7b, 70b, 1.5b, 30m
	    if re.fullmatch(r"\d+(?:\.\d+)?[bmk]", key):
	        return stem + last.upper()
	    # MoE active-expert form: a22b, a3b
	    if re.fullmatch(r"a\d+(?:\.\d+)?b", key):
	        return "A" + key[1:-1] + "B"
	    # MxNb: 8x7b -> 8x7B (the `x` stays lowercase)
	    if re.fullmatch(r"\d+x\d+(?:\.\d+)?b", key):
	        return stem + "B"

	    # Version tags (`4o`, `5o`: a number plus one non-size letter) and the
	    # o-series (`o1`, `o3`, `o4`) both render in lowercase.
	    is_version_tag = (
	        re.fullmatch(r"\d+(?:\.\d+)?[a-z]", key) is not None
	        and last not in _SIZE_SUFFIXES
	    )
	    is_o_series = re.fullmatch(r"o\d+", key) is not None
	    if is_version_tag or is_o_series:
	        return key

	    # Default: upper-case a leading letter and leave the remainder (and any
	    # token that starts with a non-letter) exactly as given.
	    first = tok[0]
	    if first.isalpha():
	        return first.upper() + tok[1:]
	    return tok


	def _is_version_token(tok: str) -> bool:
	    """Tell whether `tok` reads as a version rather than a parameter size.

	    A version is a number with an optional `.NN` part and at most one
	    trailing letter (`5`, `4.5`, `4o`), matched case-insensitively. The
	    token is rejected when that letter is one of `_SIZE_SUFFIXES`
	    (`7b`, `70m`), and when it does not have this shape at all.
	    """
	    parsed = re.fullmatch(r"(\d+(?:\.\d+)?)([a-z]?)", tok.lower())
	    # group(2) is "" for a bare number, which is never a size suffix.
	    return parsed is not None and parsed.group(2) not in _SIZE_SUFFIXES