Spaces:
Sleeping
Sleeping
File size: 5,998 Bytes
"""Display-name humanization for canonical model slugs.
Single source of truth for converting machine slugs (`gpt-4o-2024-05-13`) into
human-friendly display names (`GPT-4o (2024-05-13)`). Used by refresh scripts
and the seed migration; consumers (frontend, API) should NOT re-humanize but
read `canonical_models.display_name` directly.
Rules in priority order:
1. Strip org prefix (`openai/gpt-5` -> `gpt-5`).
2. Strip and parenthesize a trailing date suffix (first match wins):
- `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
- `-YYYYMMDD` -> ` (YYYY-MM-DD)`
- `-YYYY-MM` -> ` (YYYY-MM)`
- `-MM-YYYY` -> ` (YYYY-MM)`
- `-MMDD` (4-digit) -> ` (MMDD)`
3. Per-token formatting:
- Known acronyms render uppercase (`gpt` -> `GPT`).
- Mixed-case overrides apply (`moe` -> `MoE`).
- Param sizes uppercase the unit (`7b` -> `7B`, `a22b` -> `A22B`,
`8x7b` -> `8x7B`, `30m` -> `30M`).
- Number+letter version tags preserve case (`4o` -> `4o`).
- O-series stays lowercase (`o1`, `o3`).
- Vendor-name overrides (`deepseek` -> `DeepSeek`).
- Default: capitalize first letter.
4. Glue an acronym token to the next token with a hyphen when the next
token is a bare version number (digits + optional `.NN` + optional
single non-size letter): `GPT 5 Mini` -> `GPT-5 Mini`,
`GPT 4o ...` -> `GPT-4o ...`. Skipped when the next token is a param
size like `7B`.
"""
from __future__ import annotations
import re
# Tokens rendered in ALL CAPS unless CASE_OVERRIDES supplies a mixed-case
# form. Membership here is also what makes a token eligible for being
# hyphen-glued to a following version number (`gpt-5` -> `GPT-5`).
ACRONYMS: frozenset[str] = frozenset(
    (
        "asr",
        "clip",
        "dit",
        "glm",
        "gpt",
        "hf",
        "llm",
        "mlp",
        "moe",
        "mt",
        "ocr",
        "qvq",
        "qwq",
        "rlhf",
        "tts",
        "vit",
        "vl",
        "vlm",
    )
)

# Acronyms whose canonical rendering is mixed case rather than ALL CAPS.
# Keyed by the lowercased token; the key is derived from the display form so
# the two cannot drift apart.
CASE_OVERRIDES: dict[str, str] = {
    canonical.lower(): canonical for canonical in ("MoE", "ViT", "DiT")
}

# Vendor / family tokens that a plain "capitalize the first letter" rule would
# mangle (`deepseek` must display as `DeepSeek`, not `Deepseek`). Keep this
# short: it exists for tokens the default rule gets wrong, not as a general
# branding registry.
TOKEN_OVERRIDES: dict[str, str] = {
    canonical.lower(): canonical
    for canonical in (
        "DeepSeek",
        "OpenAI",
        "StepFun",
        "MoonshotAI",
        "MistralAI",
    )
}

# Trailing letters that denote a parameter-count unit rather than a version
# letter. A token such as `7b` following an acronym is therefore NOT
# hyphen-glued to it.
_SIZE_SUFFIXES: frozenset[str] = frozenset(("b", "k", "m"))
def humanize_model_slug(slug: str) -> str:
    """Render a model slug as a human display name.

    Accepts a bare slug (`gpt-4o-2024-05-13`) or a full canonical id
    (`openai/gpt-4o-2024-05-13`); everything up to the first `/` is dropped.

    The remaining slug has its trailing date/code suffix split off, is cut
    into hyphen-separated tokens, and each token is formatted on its own.
    An acronym token directly followed by a bare version token is joined to
    it with a hyphen (`GPT-5`); all other tokens are joined with spaces. The
    parenthesized suffix, if any, is appended last.

    Empty tokens (from leading, trailing or doubled hyphens) are discarded so
    they cannot introduce stray spaces. Returns `""` for an empty slug.
    """
    if not slug:
        return ""
    if "/" in slug:
        slug = slug.split("/", 1)[1]
    slug, suffix = _strip_date_suffix(slug)

    # str.split keeps an empty string for every leading, trailing or repeated
    # separator; joining those back would yield doubled or dangling spaces.
    tokens = [t for t in slug.split("-") if t]
    formatted = [_format_token(t) for t in tokens]

    out: list[str] = []
    i = 0
    while i < len(tokens):
        # The glue decision looks at the RAW tokens, not the formatted ones,
        # so it is independent of how a token ends up being capitalized.
        glue_to_next = (
            i + 1 < len(tokens)
            and tokens[i].lower() in ACRONYMS
            and _is_version_token(tokens[i + 1])
        )
        if glue_to_next:
            out.append(f"{formatted[i]}-{formatted[i + 1]}")
            i += 2
        else:
            out.append(formatted[i])
            i += 1

    name = " ".join(out)
    # `suffix` carries its own leading space; drop it when there is no name
    # in front of it so the result never starts with whitespace.
    return name + suffix if name else suffix.lstrip()
def _strip_date_suffix(slug: str) -> tuple[str, str]:
    """Pop a trailing date or 4-digit code; return (slug_without, ' (suffix)').

    The second element is `""` when nothing is recognised; otherwise it is
    the parenthesized suffix with a leading space, ready to be appended to
    the display name.

    Recognised forms, tried in this order:

    - `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
    - `-YYYYMMDD`   -> ` (YYYY-MM-DD)`
    - `-YYYY-MM`    -> ` (YYYY-MM)`
    - `-MM-YYYY`    -> ` (YYYY-MM)`
    - `-NNNN`       -> ` (NNNN)`

    Order matters: more specific patterns first, since a partial match
    against a less-specific pattern would mis-render (e.g. `2025` as a
    bare 4-digit code when it's actually the year half of `2025-08`).

    Date components are range-checked (year `20xx`, month 01-12, day 01-31)
    so that number pairs which only look date-shaped, such as `-2048-64` or
    `-20241399`, are left in the slug instead of being displayed as a date.
    Days are not checked against the month length. Bare 4-digit codes are
    opaque and not validated.
    """
    year = r"20\d{2}"
    month = r"(?:0[1-9]|1[0-2])"
    day = r"(?:0[1-9]|[12]\d|3[01])"

    # Full ISO date: `-YYYY-MM-DD`
    m = re.search(rf"-({year}-{month}-{day})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)})"

    # Compact date: `-YYYYMMDD`, re-rendered with separators.
    m = re.search(rf"-({year})({month})({day})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)}-{m.group(2)}-{m.group(3)})"

    # Year-month: `-YYYY-MM` (e.g. `gpt-5-2025-08`)
    m = re.search(rf"-({year})-({month})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)}-{m.group(2)})"

    # Cohere convention: `-MM-YYYY` (e.g. `command-r-08-2024`).
    # Render as `(YYYY-MM)` for ISO-ordered display.
    m = re.search(rf"-({month})-({year})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(2)}-{m.group(1)})"

    # Bare 4-digit code: `-NNNN` (e.g. `grok-4-0709`, `kimi-k2-0711`).
    m = re.search(r"-(\d{4})$", slug)
    if m:
        return slug[: m.start()], f" ({m.group(1)})"

    return slug, ""
def _format_token(tok: str) -> str:
    """Format one hyphen-separated slug token for display.

    Rules are applied to the lowercased token, first match wins:

    1. Mixed-case override (`moe` -> `MoE`).
    2. Acronym, rendered uppercase (`gpt` -> `GPT`).
    3. Vendor-name override (`deepseek` -> `DeepSeek`).
    4. Parameter size, unit letter uppercased (`7b` -> `7B`, `30m` -> `30M`).
    5. MoE active-parameter form (`a22b` -> `A22B`).
    6. Expert-count form (`8x7b` -> `8x7B`).
    7. Number + single non-size letter, or an o-series tag: returned in
       lowercase (`4o`, `o3`).
    8. Otherwise the first character is uppercased if it is a letter and the
       rest of the token is kept exactly as given.

    An empty token is returned unchanged.
    """
    if not tok:
        return tok
    key = tok.lower()

    # Table-driven renderings take precedence over every pattern rule.
    mixed_case = CASE_OVERRIDES.get(key)
    if mixed_case is not None:
        return mixed_case
    if key in ACRONYMS:
        return key.upper()
    vendor = TOKEN_OVERRIDES.get(key)
    if vendor is not None:
        return vendor

    def matches(pattern: str) -> bool:
        return re.fullmatch(pattern, key) is not None

    # Param size: 7b, 70b, 1.5b, 30m
    if matches(r"\d+(?:\.\d+)?[bmk]"):
        return f"{key[:-1]}{key[-1].upper()}"
    # MoE active-expert form: a22b, a3b
    if matches(r"a\d+(?:\.\d+)?b"):
        return f"A{key[1:-1]}B"
    # MxNb: 8x7b -> 8x7B (the `x` stays lowercase)
    if matches(r"\d+x\d+(?:\.\d+)?b"):
        return f"{key[:-1]}B"

    # Version tags (`4o`, `5o`) and o-series names (`o1`, `o3`) are both
    # rendered in lowercase.
    is_version_tag = (
        matches(r"\d+(?:\.\d+)?[a-z]") and key[-1] not in _SIZE_SUFFIXES
    )
    if is_version_tag or matches(r"o\d+"):
        return key

    # Default: capitalize the first letter, preserve the rest verbatim.
    first = tok[0]
    if first.isalpha():
        return first.upper() + tok[1:]
    return tok
def _is_version_token(tok: str) -> bool:
    """Tell whether `tok` reads as a bare version rather than a param size.

    A version is digits, an optional `.digits` part, and at most one trailing
    letter (case-insensitive) that is not a size unit: `5`, `4.5` and `4o`
    qualify, `7b` and `70m` do not. Anything else is not a version.
    """
    parsed = re.fullmatch(r"(\d+(?:\.\d+)?)([a-z]?)", tok.lower())
    # group(2) is "" when there is no trailing letter, which is never a size
    # suffix, so plain numbers count as versions.
    return parsed is not None and parsed.group(2) not in _SIZE_SUFFIXES
|