File size: 5,998 Bytes
4feff22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""Display-name humanization for canonical model slugs.

Single source of truth for converting machine slugs (`gpt-4o-2024-05-13`) into
human-friendly display names (`GPT-4o (2024-05-13)`). Used by refresh scripts
and the seed migration; consumers (frontend, API) should NOT re-humanize but
read `canonical_models.display_name` directly.

Rules in priority order:
1. Strip org prefix (`openai/gpt-5` -> `gpt-5`).
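2. Strip and parenthesize a trailing date suffix (most specific form first;
   years must start with `20`):
   - `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
   - `-YYYYMMDD`  -> ` (YYYY-MM-DD)`
   - `-YYYY-MM`   -> ` (YYYY-MM)`
   - `-MM-YYYY`   -> ` (YYYY-MM)`
   - `-NNNN` (bare 4-digit code, e.g. `0709`) -> ` (NNNN)`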
3. Per-token formatting:
   - Known acronyms render uppercase (`gpt` -> `GPT`).
   - Mixed-case overrides apply (`moe` -> `MoE`).
   - Param sizes uppercase the unit (`7b` -> `7B`, `a22b` -> `A22B`,
     `8x7b` -> `8x7B`, `30m` -> `30M`).
   - Number + single non-size letter version tags render lowercase
     (`4o` -> `4o`, `4O` -> `4o`).
   - O-series stays lowercase (`o1`, `o3`).
   - Vendor-name overrides (`deepseek` -> `DeepSeek`).
   - Default: capitalize first letter.
4. Glue an acronym token to the next token with a hyphen when the next
   token is a bare version number (digits + optional `.NN` + optional
   single non-size letter): `GPT 5 Mini` -> `GPT-5 Mini`,
   `GPT 4o ...` -> `GPT-4o ...`. Skipped when the next token is a param
   size like `7B`.
"""

from __future__ import annotations

import re

# Lowercase tokens treated as acronyms. `_format_token` upper-cases them
# unless CASE_OVERRIDES supplies a mixed-case spelling, and
# `humanize_model_slug` hyphen-glues them to a following bare version number.
ACRONYMS: frozenset[str] = frozenset(
    (
        "asr",
        "clip",
        "dit",
        "glm",
        "gpt",
        "hf",
        "llm",
        "mlp",
        "moe",
        "mt",
        "ocr",
        "qvq",
        "qwq",
        "rlhf",
        "tts",
        "vit",
        "vl",
        "vlm",
    )
)

# Acronym tokens that are conventionally written in mixed case instead of
# ALL CAPS. Checked before the plain acronym rule.
CASE_OVERRIDES: dict[str, str] = {"moe": "MoE", "vit": "ViT", "dit": "DiT"}

# Vendor / family tokens that a plain "capitalize the first letter" would
# mangle (`deepseek` must read `DeepSeek`, not `Deepseek`). Deliberately
# short: only add tokens the automatic rule gets wrong; this is not meant
# to be a general branding registry.
TOKEN_OVERRIDES: dict[str, str] = {
    "deepseek": "DeepSeek",
    "openai": "OpenAI",
    "stepfun": "StepFun",
    "moonshotai": "MoonshotAI",
    "mistralai": "MistralAI",
}

# Trailing letters that mark a parameter-count unit rather than a version
# letter. A token such as `7b` following an acronym is therefore NOT
# hyphen-glued to it.
_SIZE_SUFFIXES: frozenset[str] = frozenset(("b", "m", "k"))


def humanize_model_slug(slug: str) -> str:
    """Turn a machine model slug into its human-facing display name.

    `slug` may be bare (`gpt-4o-2024-05-13`) or a full canonical id with an
    org prefix (`openai/gpt-4o-2024-05-13`); everything up to and including
    the first `/` is discarded. A falsy `slug` yields `""`.

    The remaining name has its trailing date/code suffix split off, is cut
    on `-`, and each piece is formatted individually. An acronym piece that
    is immediately followed by a bare version piece is joined to it with a
    hyphen; all other pieces are joined with single spaces. The
    parenthesized suffix, if any, is appended last.
    """
    if not slug:
        return ""

    # Keep only what follows the first "/" when one is present.
    _, slash, after_org = slug.partition("/")
    name = after_org if slash else slug

    name, date_part = _strip_date_suffix(name)

    raw_pieces = name.split("-")
    pretty_pieces = [_format_token(piece) for piece in raw_pieces]

    words: list[str] = []
    skip_next = False
    for idx, pretty in enumerate(pretty_pieces):
        if skip_next:
            # This piece was already emitted glued to the preceding acronym.
            skip_next = False
            continue
        glue = (
            idx + 1 < len(pretty_pieces)
            and raw_pieces[idx].lower() in ACRONYMS
            and _is_version_token(raw_pieces[idx + 1])
        )
        if glue:
            words.append(f"{pretty}-{pretty_pieces[idx + 1]}")
            skip_next = True
        else:
            words.append(pretty)

    return " ".join(words) + date_part


def _strip_date_suffix(slug: str) -> tuple[str, str]:
    """Split a trailing date or 4-digit code off `slug`.

    Returns `(slug_without_suffix, rendered)` where `rendered` is the suffix
    formatted as `' (…)'` (leading space included), or `(slug, '')` when no
    rule matches.

    The rules are tried strictly in the order listed, most specific first:
    a looser rule matching early would grab only part of a longer suffix
    (e.g. reading `2025` as a bare 4-digit code when it is really the year
    half of `2025-08`).
    """
    rules = (
        # Full ISO date: `-YYYY-MM-DD`
        (
            r"-(20\d{2}-\d{2}-\d{2})$",
            lambda hit: f" ({hit.group(1)})",
        ),
        # Compact date: `-YYYYMMDD`, re-rendered with dashes.
        (
            r"-(20\d{6})$",
            lambda hit: f" ({hit.group(1)[:4]}-{hit.group(1)[4:6]}-{hit.group(1)[6:8]})",
        ),
        # Year-month: `-YYYY-MM` (e.g. `gpt-5-2025-08`)
        (
            r"-(20\d{2})-(\d{2})$",
            lambda hit: f" ({hit.group(1)}-{hit.group(2)})",
        ),
        # Cohere convention: `-MM-YYYY` (e.g. `command-r-08-2024`),
        # flipped to `(YYYY-MM)` so the display is ISO-ordered.
        (
            r"-(\d{2})-(20\d{2})$",
            lambda hit: f" ({hit.group(2)}-{hit.group(1)})",
        ),
        # Bare 4-digit code: `-NNNN` (e.g. `grok-4-0709`, `kimi-k2-0711`).
        (
            r"-(\d{4})$",
            lambda hit: f" ({hit.group(1)})",
        ),
    )
    for pattern, render in rules:
        hit = re.search(pattern, slug)
        if hit:
            # hit.start() is the hyphen that introduces the suffix.
            return slug[: hit.start()], render(hit)
    return slug, ""


def _format_token(tok: str) -> str:
    """Format one hyphen-delimited slug piece for display.

    Matching is done on the lowercased token; the first rule that applies
    wins, in this order:

    1. `CASE_OVERRIDES` spelling (`moe` -> `MoE`).
    2. `ACRONYMS` member -> all caps (`gpt` -> `GPT`).
    3. `TOKEN_OVERRIDES` spelling (`deepseek` -> `DeepSeek`).
    4. Param size -> unit letter upper-cased (`7b` -> `7B`, `30m` -> `30M`).
    5. MoE active-params form (`a22b` -> `A22B`).
    6. MxNb form (`8x7b` -> `8x7B`).
    7. Number + one non-size letter (`4o`) or o-series (`o1`) -> lowercased
       token.
    8. Otherwise the original token with its first character upper-cased
       when that character is alphabetic; unchanged if it is not.

    An empty token is returned unchanged.
    """
    if not tok:
        return tok

    key = tok.lower()

    # Fixed spellings. Mixed-case overrides outrank the plain acronym rule.
    mixed_case = CASE_OVERRIDES.get(key)
    if mixed_case is not None:
        return mixed_case
    if key in ACRONYMS:
        return key.upper()
    branded = TOKEN_OVERRIDES.get(key)
    if branded is not None:
        return branded

    body, last = key[:-1], key[-1]

    # Param size: 7b, 70b, 1.5b, 30m
    if re.fullmatch(r"\d+(?:\.\d+)?[bmk]", key):
        return body + last.upper()
    # MoE active-expert form: a22b, a3b
    if re.fullmatch(r"a\d+(?:\.\d+)?b", key):
        return "A" + key[1:-1] + "B"
    # MxNb: 8x7b -> 8x7B
    if re.fullmatch(r"\d+x\d+(?:\.\d+)?b", key):
        return body + "B"

    # Version tags (`4o`, `5o`: a number plus one letter that is not a size
    # unit) and o-series names (`o1`, `o3`, `o4`) both stay lowercase.
    stays_lowercase = (
        re.fullmatch(r"\d+(?:\.\d+)?[a-z]", key) and last not in _SIZE_SUFFIXES
    ) or re.fullmatch(r"o\d+", key)
    if stays_lowercase:
        return key

    # Default: capitalize the first letter and leave the rest as given.
    first = tok[0]
    if first.isalpha():
        return first.upper() + tok[1:]
    return tok


def _is_version_token(tok: str) -> bool:
    """Report whether `tok` is a bare version rather than a parameter size.

    A version is digits, an optional `.digits` part, and at most one
    trailing letter (compared case-insensitively) that is not one of
    `_SIZE_SUFFIXES`: `5`, `4.5` and `4o` qualify, `7b` and `70m` do not.
    """
    parsed = re.fullmatch(r"(\d+(?:\.\d+)?)([a-z]?)", tok.lower())
    # Group 2 is the optional trailing letter; it is "" when absent, which
    # is never a size suffix.
    return parsed is not None and parsed.group(2) not in _SIZE_SUFFIXES