Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- packages/eval-entity-resolver/src/eval_entity_resolver/__init__.py +2 -0
- packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-311.pyc +0 -0
- packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-314.pyc +0 -0
- packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/display.cpython-311.pyc +0 -0
- packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/display.cpython-314.pyc +0 -0
- packages/eval-entity-resolver/src/eval_entity_resolver/display.py +183 -0
- src/eval_card_registry/__pycache__/cli.cpython-314.pyc +0 -0
- src/eval_card_registry/services/__pycache__/hub_stats.cpython-314.pyc +0 -0
- src/eval_card_registry/services/__pycache__/resolution_service.cpython-314.pyc +0 -0
- src/eval_card_registry/services/hub_stats.py +13 -0
- src/eval_card_registry/services/resolution_service.py +10 -1
- src/eval_card_registry/store/__pycache__/queries.cpython-314.pyc +0 -0
- src/eval_card_registry/store/queries.py +24 -9
packages/eval-entity-resolver/src/eval_entity_resolver/__init__.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from eval_entity_resolver.alias_store import AliasStore
|
| 2 |
from eval_entity_resolver.canonical_store import CanonicalStore
|
|
|
|
| 3 |
from eval_entity_resolver.eee import clean_eval_name, extract_metric
|
| 4 |
from eval_entity_resolver.models import ResolutionResult, ResolverConfig
|
| 5 |
from eval_entity_resolver.resolver import Resolver
|
|
@@ -12,4 +13,5 @@ __all__ = [
|
|
| 12 |
"ResolutionResult",
|
| 13 |
"clean_eval_name",
|
| 14 |
"extract_metric",
|
|
|
|
| 15 |
]
|
|
|
|
| 1 |
from eval_entity_resolver.alias_store import AliasStore
|
| 2 |
from eval_entity_resolver.canonical_store import CanonicalStore
|
| 3 |
+
from eval_entity_resolver.display import humanize_model_slug
|
| 4 |
from eval_entity_resolver.eee import clean_eval_name, extract_metric
|
| 5 |
from eval_entity_resolver.models import ResolutionResult, ResolverConfig
|
| 6 |
from eval_entity_resolver.resolver import Resolver
|
|
|
|
| 13 |
"ResolutionResult",
|
| 14 |
"clean_eval_name",
|
| 15 |
"extract_metric",
|
| 16 |
+
"humanize_model_slug",
|
| 17 |
]
|
packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-311.pyc
CHANGED
|
Binary files a/packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-311.pyc and b/packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-311.pyc differ
|
|
|
packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-314.pyc
CHANGED
|
Binary files a/packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-314.pyc and b/packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/__init__.cpython-314.pyc differ
|
|
|
packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/display.cpython-311.pyc
ADDED
|
Binary file (7.88 kB). View file
|
|
|
packages/eval-entity-resolver/src/eval_entity_resolver/__pycache__/display.cpython-314.pyc
ADDED
|
Binary file (7.83 kB). View file
|
|
|
packages/eval-entity-resolver/src/eval_entity_resolver/display.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Display-name humanization for canonical model slugs.
|
| 2 |
+
|
| 3 |
+
Single source of truth for converting machine slugs (`gpt-4o-2024-05-13`) into
|
| 4 |
+
human-friendly display names (`GPT-4o (2024-05-13)`). Used by refresh scripts
|
| 5 |
+
and the seed migration; consumers (frontend, API) should NOT re-humanize but
|
| 6 |
+
read `canonical_models.display_name` directly.
|
| 7 |
+
|
| 8 |
+
Rules in priority order:
|
| 9 |
+
1. Strip org prefix (`openai/gpt-5` -> `gpt-5`).
|
| 10 |
+
2. Strip and parenthesize a trailing date suffix:
|
| 11 |
+
- `-YYYY-MM-DD` -> ` (YYYY-MM-DD)`
|
| 12 |
+
- `-YYYYMMDD` -> ` (YYYY-MM-DD)`
|
| 13 |
+
- `-MMDD` (4-digit) -> ` (MMDD)`
|
| 14 |
+
3. Per-token formatting:
|
| 15 |
+
- Known acronyms render uppercase (`gpt` -> `GPT`).
|
| 16 |
+
- Mixed-case overrides apply (`moe` -> `MoE`).
|
| 17 |
+
- Param sizes uppercase the unit (`7b` -> `7B`, `a22b` -> `A22B`,
|
| 18 |
+
`8x7b` -> `8x7B`, `30m` -> `30M`).
|
| 19 |
+
- Number+letter version tags preserve case (`4o` -> `4o`).
|
| 20 |
+
- O-series stays lowercase (`o1`, `o3`).
|
| 21 |
+
- Vendor-name overrides (`deepseek` -> `DeepSeek`).
|
| 22 |
+
- Default: capitalize first letter.
|
| 23 |
+
4. Glue an acronym token to the next token with a hyphen when the next
|
| 24 |
+
token is a bare version number (digits + optional `.NN` + optional
|
| 25 |
+
single non-size letter): `GPT 5 Mini` -> `GPT-5 Mini`,
|
| 26 |
+
`GPT 4o ...` -> `GPT-4o ...`. Skipped when the next token is a param
|
| 27 |
+
size like `7B`.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import re
|
| 33 |
+
|
| 34 |
+
# Lowercased tokens that are recognised as acronyms by the formatter and by
# the acronym/version hyphen-glue rule.
ACRONYMS: frozenset[str] = frozenset(
    (
        "gpt", "glm", "llm", "vl", "vlm", "qvq",
        "qwq", "mt", "vit", "clip", "dit", "hf",
        "ocr", "tts", "asr", "moe", "mlp", "rlhf",
    )
)

# Acronym tokens whose canonical spelling is mixed case, not ALL CAPS.
CASE_OVERRIDES: dict[str, str] = {"moe": "MoE", "vit": "ViT", "dit": "DiT"}

# Vendor / family tokens that a plain first-letter capitalisation would
# mangle — e.g. `deepseek` must display as `DeepSeek`. Deliberately short:
# this covers tokens the default rule gets wrong, it is not a general
# branding registry.
TOKEN_OVERRIDES: dict[str, str] = {
    "deepseek": "DeepSeek",
    "openai": "OpenAI",
    "stepfun": "StepFun",
    "moonshotai": "MoonshotAI",
    "mistralai": "MistralAI",
}

# Trailing letters that denote a parameter-count unit rather than a version
# letter. A token such as `7b` following an acronym is NOT hyphen-glued.
_SIZE_SUFFIXES: frozenset[str] = frozenset(("b", "m", "k"))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def humanize_model_slug(slug: str) -> str:
    """Render a model slug as a human display name.

    Accepts a bare slug (`gpt-4o-2024-05-13`) or a full canonical id
    (`openai/gpt-4o-2024-05-13`); everything up to the first `/` is treated
    as the org prefix and dropped.

    Args:
        slug: Machine slug or canonical id. Falsy input yields `""`.

    Returns:
        The formatted tokens joined by single spaces, with an acronym token
        hyphen-glued to a directly following bare version token
        (`gpt-5-mini` -> `GPT-5 Mini`), followed by the parenthesised
        date/code suffix when one was stripped. The result never carries
        leading or trailing whitespace.
    """
    if not slug:
        return ""
    if "/" in slug:
        slug = slug.split("/", 1)[1]

    # Strip surrounding whitespace first: the date patterns are anchored at
    # the end of the string and would otherwise miss `"gpt-4-0613 "`.
    slug, suffix = _strip_date_suffix(slug.strip())

    # Doubled, leading or trailing hyphens split into empty tokens. Drop them
    # BEFORE formatting so `tokens` and `formatted` stay index-aligned, no
    # stray spaces are emitted, and the glue rule still sees the version
    # token right after its acronym (`gpt--4` -> `GPT-4`).
    tokens = [t for t in slug.split("-") if t]
    formatted = [_format_token(t) for t in tokens]

    out: list[str] = []
    i = 0
    while i < len(formatted):
        glue_next = (
            i + 1 < len(formatted)
            and tokens[i].lower() in ACRONYMS
            and _is_version_token(tokens[i + 1])
        )
        if glue_next:
            # Acronym + bare version number render as one hyphenated unit.
            out.append(f"{formatted[i]}-{formatted[i + 1]}")
            i += 2
        else:
            out.append(formatted[i])
            i += 1

    # The suffix starts with a space; strip so a slug that consisted only of
    # a date/code suffix does not come back with a leading blank.
    return (" ".join(out) + suffix).strip()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Regex fragments for calendar-valid month / day fields. Without them any
# digit run is accepted, so `-20249999` would render as `(2024-99-99)`.
_DATE_SUFFIX_MONTH = r"(?:0[1-9]|1[0-2])"
_DATE_SUFFIX_DAY = r"(?:0[1-9]|[12]\d|3[01])"

# (compiled pattern, output template) pairs, tried in order. Order matters:
# more specific patterns come first, since a partial match against a less
# specific one would mis-render (e.g. `2025` as a bare 4-digit code when it
# is actually the year half of `2025-08`). Template indices refer to the
# pattern's capture groups.
_DATE_SUFFIX_RULES = (
    # Full ISO date: `-YYYY-MM-DD`
    (
        re.compile(rf"-(20\d{{2}})-({_DATE_SUFFIX_MONTH})-({_DATE_SUFFIX_DAY})$"),
        " ({0}-{1}-{2})",
    ),
    # Compact date: `-YYYYMMDD`, rendered in ISO form.
    (
        re.compile(rf"-(20\d{{2}})({_DATE_SUFFIX_MONTH})({_DATE_SUFFIX_DAY})$"),
        " ({0}-{1}-{2})",
    ),
    # Year-month: `-YYYY-MM` (e.g. `gpt-5-2025-08`)
    (
        re.compile(rf"-(20\d{{2}})-({_DATE_SUFFIX_MONTH})$"),
        " ({0}-{1})",
    ),
    # Cohere convention: `-MM-YYYY` (e.g. `command-r-08-2024`), rendered as
    # `(YYYY-MM)` for ISO-ordered display.
    (
        re.compile(rf"-({_DATE_SUFFIX_MONTH})-(20\d{{2}})$"),
        " ({1}-{0})",
    ),
    # Bare 4-digit code: `-NNNN` (e.g. `grok-4-0709`, `kimi-k2-0711`).
    # Intentionally unvalidated: it is an opaque code, not necessarily MMDD.
    (re.compile(r"-(\d{4})$"), " ({0})"),
)


def _strip_date_suffix(slug: str) -> tuple[str, str]:
    """Pop a trailing date or 4-digit code off `slug`.

    Args:
        slug: Slug with any org prefix already removed.

    Returns:
        `(slug_without_suffix, " (rendered suffix)")` for the first matching
        rule, or `(slug, "")` when nothing matches. Date-shaped suffixes with
        an impossible month (not 01-12) or day (not 01-31) are not treated as
        dates and fall through to the later rules.
    """
    for pattern, template in _DATE_SUFFIX_RULES:
        m = pattern.search(slug)
        if m:
            return slug[: m.start()], template.format(*m.groups())
    return slug, ""
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _format_token(tok: str) -> str:
    """Format one hyphen-separated slug token for display.

    Rules are tried in this order, matching on the lowercased token:
    mixed-case override, acronym (upper-cased), vendor override, parameter
    size (`7b` -> `7B`), MoE active-parameter form (`a22b` -> `A22B`),
    `MxNb` form (`8x7b` -> `8x7B`), number+letter version tag (`4o`, kept
    lowercase), o-series (`o1`, kept lowercase). Anything else gets its first
    character upper-cased when that character is a letter, and is otherwise
    returned unchanged. A falsy token is returned as-is.
    """
    if not tok:
        return tok

    key = tok.lower()

    # Table-driven renderings.
    mixed_case = CASE_OVERRIDES.get(key)
    if mixed_case is not None:
        return mixed_case
    if key in ACRONYMS:
        return key.upper()
    branded = TOKEN_OVERRIDES.get(key)
    if branded is not None:
        return branded

    stem, last = key[:-1], key[-1]

    # Param size: 7b, 70b, 1.5b, 30m
    if re.fullmatch(r"\d+(?:\.\d+)?[bmk]", key):
        return stem + last.upper()
    # MoE active-expert form: a22b, a3b
    if re.fullmatch(r"a\d+(?:\.\d+)?b", key):
        return "A" + key[1:-1] + "B"
    # MxNb: 8x7b -> 8x7B
    if re.fullmatch(r"\d+x\d+(?:\.\d+)?b", key):
        return stem + "B"

    # Version tags (`4o`, `5o`: number plus one letter that is not a size
    # unit) and the o-series (`o1`, `o3`, `o4`) stay lowercase.
    number_plus_letter = re.fullmatch(r"\d+(?:\.\d+)?[a-z]", key)
    if number_plus_letter and last not in _SIZE_SUFFIXES:
        return key
    if re.fullmatch(r"o\d+", key):
        return key

    # Default: capitalize first letter, preserve the rest of the original.
    first = tok[0]
    if not first.isalpha():
        return tok
    return first.upper() + tok[1:]
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _is_version_token(tok: str) -> bool:
    """Tell whether `tok` reads as a version rather than a parameter size.

    A version is digits, an optional `.digits` part, and at most one trailing
    letter that is not a size unit: `5`, `4.5` and `4o` qualify, `7b` and
    `70m` do not. Matching is case-insensitive.
    """
    parsed = re.fullmatch(r"(\d+(?:\.\d+)?)([a-z]?)", tok.lower())
    # group(2) is "" when there is no trailing letter, which is never a unit.
    return parsed is not None and parsed.group(2) not in _SIZE_SUFFIXES
|
src/eval_card_registry/__pycache__/cli.cpython-314.pyc
CHANGED
|
Binary files a/src/eval_card_registry/__pycache__/cli.cpython-314.pyc and b/src/eval_card_registry/__pycache__/cli.cpython-314.pyc differ
|
|
|
src/eval_card_registry/services/__pycache__/hub_stats.cpython-314.pyc
CHANGED
|
Binary files a/src/eval_card_registry/services/__pycache__/hub_stats.cpython-314.pyc and b/src/eval_card_registry/services/__pycache__/hub_stats.cpython-314.pyc differ
|
|
|
src/eval_card_registry/services/__pycache__/resolution_service.cpython-314.pyc
CHANGED
|
Binary files a/src/eval_card_registry/services/__pycache__/resolution_service.cpython-314.pyc and b/src/eval_card_registry/services/__pycache__/resolution_service.cpython-314.pyc differ
|
|
|
src/eval_card_registry/services/hub_stats.py
CHANGED
|
@@ -199,9 +199,22 @@ class HubStatsClient:
|
|
| 199 |
return self._con
|
| 200 |
# Import lazily so processes that never call lookup() don't pay
|
| 201 |
# the duckdb import cost.
|
|
|
|
| 202 |
import duckdb
|
| 203 |
con = duckdb.connect()
|
| 204 |
con.execute("INSTALL httpfs; LOAD httpfs;")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
self._con = con
|
| 206 |
return con
|
| 207 |
|
|
|
|
| 199 |
return self._con
|
| 200 |
# Import lazily so processes that never call lookup() don't pay
|
| 201 |
# the duckdb import cost.
|
| 202 |
+
import os
|
| 203 |
import duckdb
|
| 204 |
con = duckdb.connect()
|
| 205 |
con.execute("INSTALL httpfs; LOAD httpfs;")
|
| 206 |
+
# Authenticate parquet fetches when HF_TOKEN is in the environment
|
| 207 |
+
# (typical on the deployed Space). Unauth limit is 500 req/5min;
|
| 208 |
+
# one DuckDB read_parquet against the remote file streams via
|
| 209 |
+
# several range requests and a sync that auto-creates many drafts
|
| 210 |
+
# can brush that ceiling. With auth the ceiling is ~30k/5min.
|
| 211 |
+
hf_token = os.environ.get("HF_TOKEN")
|
| 212 |
+
if hf_token:
|
| 213 |
+
escaped = hf_token.replace("'", "''")
|
| 214 |
+
con.execute(
|
| 215 |
+
f"CREATE SECRET hf_auth (TYPE HTTP, BEARER_TOKEN '{escaped}', "
|
| 216 |
+
f"SCOPE 'https://huggingface.co');"
|
| 217 |
+
)
|
| 218 |
self._con = con
|
| 219 |
return con
|
| 220 |
|
src/eval_card_registry/services/resolution_service.py
CHANGED
|
@@ -15,6 +15,7 @@ from datetime import datetime, timezone
|
|
| 15 |
from typing import Optional
|
| 16 |
|
| 17 |
from eval_entity_resolver import AliasStore, CanonicalStore, Resolver, ResolverConfig, ResolutionResult
|
|
|
|
| 18 |
|
| 19 |
from eval_card_registry.config import settings
|
| 20 |
from eval_card_registry.store.hf_store import RegistryStore
|
|
@@ -294,9 +295,17 @@ class ResolutionService:
|
|
| 294 |
candidate_id = f"{candidate_id}-{str(uuid.uuid4())[:8]}"
|
| 295 |
|
| 296 |
now = _now()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
base = {
|
| 298 |
"id": candidate_id,
|
| 299 |
-
"display_name":
|
| 300 |
"metadata": "{}",
|
| 301 |
"review_status": "draft",
|
| 302 |
"created_at": now,
|
|
|
|
| 15 |
from typing import Optional
|
| 16 |
|
| 17 |
from eval_entity_resolver import AliasStore, CanonicalStore, Resolver, ResolverConfig, ResolutionResult
|
| 18 |
+
from eval_entity_resolver.display import humanize_model_slug
|
| 19 |
|
| 20 |
from eval_card_registry.config import settings
|
| 21 |
from eval_card_registry.store.hf_store import RegistryStore
|
|
|
|
| 295 |
candidate_id = f"{candidate_id}-{str(uuid.uuid4())[:8]}"
|
| 296 |
|
| 297 |
now = _now()
|
| 298 |
+
# Models get a humanized display name (`gpt-5-2025-08-07` ->
|
| 299 |
+
# `GPT-5 (2025-08-07)`); other entity types pass `raw_value`
|
| 300 |
+
# through — benchmark/metric/harness/org names are usually
|
| 301 |
+
# already in their preferred display form.
|
| 302 |
+
if entity_type == "model":
|
| 303 |
+
display = humanize_model_slug(raw_value) or raw_value
|
| 304 |
+
else:
|
| 305 |
+
display = raw_value
|
| 306 |
base = {
|
| 307 |
"id": candidate_id,
|
| 308 |
+
"display_name": display,
|
| 309 |
"metadata": "{}",
|
| 310 |
"review_status": "draft",
|
| 311 |
"created_at": now,
|
src/eval_card_registry/store/__pycache__/queries.cpython-314.pyc
CHANGED
|
Binary files a/src/eval_card_registry/store/__pycache__/queries.cpython-314.pyc and b/src/eval_card_registry/store/__pycache__/queries.cpython-314.pyc differ
|
|
|
src/eval_card_registry/store/queries.py
CHANGED
|
@@ -85,9 +85,12 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
|
|
| 85 |
`root_model_id`, `lineage_origin_org_id`, and inherited `open_weights`
|
| 86 |
columns.
|
| 87 |
|
| 88 |
-
- `root_model_id`: walk parents up through
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
- `lineage_origin_org_id`: walk through any non-`variant` edge
|
| 92 |
(quantized / finetune / merge / adapter) to the deepest ancestor,
|
| 93 |
then read its `org_id`. For Meta-originated models = self.org_id;
|
|
@@ -117,8 +120,8 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
|
|
| 117 |
ow = row.get("open_weights")
|
| 118 |
open_by_id[cid] = None if _is_na(ow) else bool(ow)
|
| 119 |
|
| 120 |
-
def _walk(start: str,
|
| 121 |
-
"""Walk parents through edges
|
| 122 |
Returns the deepest reachable id; stops on no-match or cycle."""
|
| 123 |
visited = {start}
|
| 124 |
current = start
|
|
@@ -128,7 +131,7 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
|
|
| 128 |
for p in edges:
|
| 129 |
if not isinstance(p, dict):
|
| 130 |
continue
|
| 131 |
-
if
|
| 132 |
next_id = p["id"]
|
| 133 |
break
|
| 134 |
if not next_id or next_id in visited or next_id not in parents_by_id:
|
|
@@ -136,6 +139,17 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
|
|
| 136 |
visited.add(next_id)
|
| 137 |
current = next_id
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
def _inherit_open_from_ancestors(start: str) -> Optional[bool]:
|
| 140 |
"""Walk ONLY ancestors (skip self) through `variant` + `quantized`
|
| 141 |
edges and return the first explicit `open_weights` value found.
|
|
@@ -166,11 +180,12 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
|
|
| 166 |
open_updates: dict[str, Optional[bool]] = {}
|
| 167 |
inherited_count = 0
|
| 168 |
for cid in parents_by_id:
|
| 169 |
-
# Identity root via quantized-
|
| 170 |
-
|
|
|
|
| 171 |
root_updates[cid] = root if root != cid else None
|
| 172 |
# Lineage origin via any non-variant edge; org of deepest ancestor
|
| 173 |
-
ancestor = _walk(cid,
|
| 174 |
lineage_updates[cid] = org_by_id.get(ancestor) or org_by_id.get(cid)
|
| 175 |
# Open weights — explicit self value WINS; only fall back to
|
| 176 |
# ancestor inheritance when self has no value set. Never overwrite
|
|
|
|
| 85 |
`root_model_id`, `lineage_origin_org_id`, and inherited `open_weights`
|
| 86 |
columns.
|
| 87 |
|
| 88 |
+
- `root_model_id`: walk parents up through edges that preserve API
|
| 89 |
+
identity β `quantized` (different precision, same model) and
|
| 90 |
+
`variant axis=version` (dated snapshot of the same release, e.g.
|
| 91 |
+
`gpt-4o-2024-05-13` -> `gpt-4o`). NULL when self has no such
|
| 92 |
+
ancestor — i.e., self IS the identity root. Other variant axes
|
| 93 |
+
(size, mode, modality, domain) keep separate identity at the leaf.
|
| 94 |
- `lineage_origin_org_id`: walk through any non-`variant` edge
|
| 95 |
(quantized / finetune / merge / adapter) to the deepest ancestor,
|
| 96 |
then read its `org_id`. For Meta-originated models = self.org_id;
|
|
|
|
| 120 |
ow = row.get("open_weights")
|
| 121 |
open_by_id[cid] = None if _is_na(ow) else bool(ow)
|
| 122 |
|
| 123 |
+
def _walk(start: str, edge_ok) -> str:
|
| 124 |
+
"""Walk parents through edges where `edge_ok(edge)` is True.
|
| 125 |
Returns the deepest reachable id; stops on no-match or cycle."""
|
| 126 |
visited = {start}
|
| 127 |
current = start
|
|
|
|
| 131 |
for p in edges:
|
| 132 |
if not isinstance(p, dict):
|
| 133 |
continue
|
| 134 |
+
if edge_ok(p) and p.get("id"):
|
| 135 |
next_id = p["id"]
|
| 136 |
break
|
| 137 |
if not next_id or next_id in visited or next_id not in parents_by_id:
|
|
|
|
| 139 |
visited.add(next_id)
|
| 140 |
current = next_id
|
| 141 |
|
| 142 |
+
def _is_identity_edge(p: dict) -> bool:
    """Return True for parent edges that count as identity-preserving.

    That is a `quantized` relationship, or a `variant` relationship whose
    `axis` is `version`. Every other edge returns False.
    """
    relationship = p.get("relationship")
    version_variant = relationship == "variant" and p.get("axis") == "version"
    return relationship == "quantized" or version_variant
|
| 149 |
+
|
| 150 |
+
def _is_lineage_edge(p: dict) -> bool:
    """Return True when the edge's relationship is quantized, finetune,
    merge or adapter (i.e. anything listed here; `variant` is not)."""
    relationship = p.get("relationship")
    return relationship in {"quantized", "finetune", "merge", "adapter"}
|
| 152 |
+
|
| 153 |
def _inherit_open_from_ancestors(start: str) -> Optional[bool]:
|
| 154 |
"""Walk ONLY ancestors (skip self) through `variant` + `quantized`
|
| 155 |
edges and return the first explicit `open_weights` value found.
|
|
|
|
| 180 |
open_updates: dict[str, Optional[bool]] = {}
|
| 181 |
inherited_count = 0
|
| 182 |
for cid in parents_by_id:
|
| 183 |
+
# Identity root via quantized + variant-version walk (both treat
|
| 184 |
+
# the parent as the same model at the API level — see docstring).
|
| 185 |
+
root = _walk(cid, _is_identity_edge)
|
| 186 |
root_updates[cid] = root if root != cid else None
|
| 187 |
# Lineage origin via any non-variant edge; org of deepest ancestor
|
| 188 |
+
ancestor = _walk(cid, _is_lineage_edge)
|
| 189 |
lineage_updates[cid] = org_by_id.get(ancestor) or org_by_id.get(cid)
|
| 190 |
# Open weights — explicit self value WINS; only fall back to
|
| 191 |
# ancestor inheritance when self has no value set. Never overwrite
|