j-chim's picture
Upload folder using huggingface_hub
4feff22 verified
"""Display-name humanization for canonical model slugs.
Single source of truth for converting machine slugs (`gpt-4o-2024-05-13`) into
human-friendly display names (`GPT-4o (2024-05-13)`). Used by refresh scripts
and the seed migration; consumers (frontend, API) should NOT re-humanize but
read `canonical_models.display_name` directly.
Rules in priority order:
1. Strip org prefix (`openai/gpt-5` -> `gpt-5`).
2. Strip and parenthesize a trailing date suffix:
- `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
- `-YYYYMMDD` -> ` (YYYY-MM-DD)`
- `-YYYY-MM` -> ` (YYYY-MM)`
- `-MM-YYYY` -> ` (YYYY-MM)`
- `-MMDD` (4-digit) -> ` (MMDD)`
3. Per-token formatting:
- Known acronyms render uppercase (`gpt` -> `GPT`).
- Mixed-case overrides apply (`moe` -> `MoE`).
- Param sizes uppercase the unit (`7b` -> `7B`, `a22b` -> `A22B`,
`8x7b` -> `8x7B`, `30m` -> `30M`).
- Number+letter version tags preserve case (`4o` -> `4o`).
- O-series stays lowercase (`o1`, `o3`).
- Vendor-name overrides (`deepseek` -> `DeepSeek`).
- Default: capitalize first letter.
4. Glue an acronym token to the next token with a hyphen when the next
token is a bare version number (digits + optional `.NN` + optional
single non-size letter): `GPT 5 Mini` -> `GPT-5 Mini`,
`GPT 4o ...` -> `GPT-4o ...`. Skipped when the next token is a param
size like `7B`.
"""
from __future__ import annotations
import re
# Lowercase tokens that `_format_token` renders in capitals (unless
# CASE_OVERRIDES supplies a mixed-case spelling) and that
# `humanize_model_slug` may hyphen-glue to a following version token.
ACRONYMS: frozenset[str] = frozenset(
    (
        "gpt", "glm", "llm",
        "vl", "vlm", "qvq", "qwq",
        "mt", "vit", "clip", "dit",
        "hf", "ocr", "tts", "asr",
        "moe", "mlp", "rlhf",
    )
)

# Tokens whose canonical spelling is mixed case instead of ALL CAPS.
# `_format_token` consults this table before ACRONYMS.
CASE_OVERRIDES: dict[str, str] = dict(
    moe="MoE",
    vit="ViT",
    dit="DiT",
)

# Vendor / family tokens that a plain first-letter capitalization would
# mangle (`deepseek` must display as `DeepSeek`, not `Deepseek`). Keep this
# short: it exists for tokens the automatic rule gets wrong, not as a
# general branding registry.
TOKEN_OVERRIDES: dict[str, str] = dict(
    deepseek="DeepSeek",
    openai="OpenAI",
    stepfun="StepFun",
    moonshotai="MoonshotAI",
    mistralai="MistralAI",
)

# Trailing letters that denote a parameter-count unit rather than a version
# letter. A token such as `7b` following an acronym is NOT hyphen-glued.
_SIZE_SUFFIXES: frozenset[str] = frozenset("bmk")
def humanize_model_slug(slug: str) -> str:
    """Render a model slug as a human display name.

    Accepts a bare slug (`gpt-4o-2024-05-13`) or a full canonical id
    (`openai/gpt-4o-2024-05-13`); everything up to and including the first
    `/` is dropped.

    Args:
        slug: Machine slug or canonical id. Falsy input yields `""`.

    Returns:
        The formatted tokens joined by single spaces, with each
        acronym + bare-version pair hyphen-glued, followed by the
        parenthesized date/code suffix when one was stripped.
    """
    if not slug:
        return ""
    if "/" in slug:
        slug = slug.split("/", 1)[1]
    slug, suffix = _strip_date_suffix(slug)

    # `str.split("-")` yields empty strings for leading, trailing, or doubled
    # hyphens. Drop them so they neither render as stray spaces nor sit
    # between an acronym and the version token it should be glued to.
    tokens = [t for t in slug.split("-") if t]
    formatted = [_format_token(t) for t in tokens]

    out: list[str] = []
    i = 0
    while i < len(formatted):
        glue_next = (
            i + 1 < len(formatted)
            and tokens[i].lower() in ACRONYMS
            and _is_version_token(tokens[i + 1])
        )
        if glue_next:
            # Acronym followed by a bare version: `GPT` + `4o` -> `GPT-4o`.
            out.append(f"{formatted[i]}-{formatted[i + 1]}")
            i += 2
        else:
            out.append(formatted[i])
            i += 1

    name = " ".join(out)
    # The suffix carries its own leading space; when no name tokens remain
    # (the slug was nothing but a date), don't return a leading blank.
    return name + suffix if name else suffix.lstrip()
def _strip_date_suffix(slug: str) -> tuple[str, str]:
    """Pop a trailing date or 4-digit code off `slug`.

    Patterns are tried most-specific first, since a partial match against a
    less-specific pattern would mis-render (e.g. `2025` as a bare 4-digit
    code when it is actually the year half of `2025-08`).

    Month fields must be `01`-`12` and day fields `01`-`31`, so numeric
    tails that cannot be a calendar date (`-2048-16`, `-20259999`) are not
    rewritten as dates. The bare 4-digit code is deliberately unvalidated.

    Args:
        slug: The slug to inspect.

    Returns:
        `(slug_without_suffix, " (suffix)")` when a pattern matches,
        otherwise `(slug, "")`. The suffix string includes its leading
        space so callers can append it directly.
    """
    # Full ISO date: `-YYYY-MM-DD`
    m = re.search(
        r"-(20\d{2}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01]))$", slug
    )
    if m:
        return slug[: m.start()], f" ({m.group(1)})"

    # Compact date: `-YYYYMMDD`, re-rendered with ISO separators.
    m = re.search(r"-(20\d{2})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])$", slug)
    if m:
        year, month, day = m.groups()
        return slug[: m.start()], f" ({year}-{month}-{day})"

    # Year-month: `-YYYY-MM` (e.g. `gpt-5-2025-08`)
    m = re.search(r"-(20\d{2})-(0[1-9]|1[0-2])$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)}-{m.group(2)})"

    # Cohere convention: `-MM-YYYY` (e.g. `command-r-08-2024`).
    # Render as `(YYYY-MM)` for ISO-ordered display.
    m = re.search(r"-(0[1-9]|1[0-2])-(20\d{2})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(2)}-{m.group(1)})"

    # Bare 4-digit code: `-NNNN` (e.g. `grok-4-0709`, `kimi-k2-0711`).
    m = re.search(r"-(\d{4})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)})"

    return slug, ""
def _format_token(tok: str) -> str:
    """Format one hyphen-delimited slug token for display.

    Rules are checked in order and the first hit wins: mixed-case override,
    acronym, vendor override, parameter sizes, version tags / O-series, and
    finally first-letter capitalization. An empty token is returned as-is.
    """
    if not tok:
        return tok

    key = tok.lower()

    # Table-driven spellings. CASE_OVERRIDES must be consulted before
    # ACRONYMS so `moe` becomes `MoE` rather than `MOE`.
    mixed_case = CASE_OVERRIDES.get(key)
    if mixed_case is not None:
        return mixed_case
    if key in ACRONYMS:
        return key.upper()
    vendor = TOKEN_OVERRIDES.get(key)
    if vendor is not None:
        return vendor

    # Plain param size (7b, 70b, 1.5b, 30m) or MoE active-expert form
    # (a22b, a3b): every letter in the token is uppercased.
    if re.fullmatch(r"\d+(?:\.\d+)?[bmk]", key) or re.fullmatch(
        r"a\d+(?:\.\d+)?b", key
    ):
        return key.upper()

    # MxNb (8x7b -> 8x7B): only the trailing unit is uppercased; the `x`
    # stays lowercase.
    if re.fullmatch(r"\d+x\d+(?:\.\d+)?b", key):
        return key[:-1] + "B"

    # Version tags (`4o`, `5o`: a number plus one letter that is not a size
    # unit) and O-series names (`o1`, `o3`, `o4`) are shown in lowercase.
    is_version_tag = (
        re.fullmatch(r"\d+(?:\.\d+)?[a-z]", key) is not None
        and key[-1] not in _SIZE_SUFFIXES
    )
    if is_version_tag or re.fullmatch(r"o\d+", key):
        return key

    # Fallback: uppercase a leading letter and keep the remainder exactly as
    # given; tokens that start with a non-letter pass through untouched.
    head = tok[0]
    if head.isalpha():
        return head.upper() + tok[1:]
    return tok
def _is_version_token(tok: str) -> bool:
    """Report whether `tok` reads as a bare version rather than a param size.

    A version is digits, an optional `.digits` part, and at most one
    trailing letter (`5`, `4.5`, `4o`). If that letter is a size unit from
    `_SIZE_SUFFIXES` (`7b`, `70m`), the token is a parameter size and the
    result is False. Matching is case-insensitive.
    """
    parsed = re.fullmatch(r"(\d+(?:\.\d+)?)([a-z]?)", tok.lower())
    return parsed is not None and parsed.group(2) not in _SIZE_SUFFIXES