j-chim's picture
Upload folder using huggingface_hub
e60e7e0 verified
Raw
History Blame Contribute Delete
15.1 kB
"""Org-aware fold decision: does a minted off-HF / dev-org-slug canonical refer
to the SAME model as a real HF repo already in the registry?
This is the single source of truth for the cross-source same-model dedup used by
BOTH the generator (scripts/refresh_from_modelsdev.py reconciliation, so a mint
DEFERS to the real HF id at generate time) AND the gate
(tests/test_gate_invariants.py via scripts/fold_modelsdev_dupes.py, which verifies
none survive). Keeping it here means the two can never drift.
A name-based match is "confident" only with ORG AGREEMENT (after the curated
two-tier dev-org remap: meta-llama->meta, Qwen->alibaba, ...): a group with no
resolvable org, or whose name matches only under a DIFFERENT developer, NEVER
folds on name alone (no cross-vendor false merge). The exact-id tier is
unguarded, and the alias tier is skipped only when both orgs are known and
disagree. Tiers, strongest first: exact id == HF id; mint string already an
alias of an HF entry; normalized-name (all separators removed) + org agreement;
brand-prefix-stripped name + org agreement (so `qwen-qwq-32b` -> `QwQ-32B`).
`name_norm` removes ALL separators to ONE token, so models.dev's mangled
spellings collapse onto HF casing: `qwen-2-5-14b-instruct` == `Qwen2.5-14B-Instruct`.
"""
from __future__ import annotations
import re
from typing import Optional
from eval_entity_resolver.normalization import normalize as _nz
# Curated HF-namespace -> developer-org remap. The SINGLE owner of this map
# (moved here from strategies/fuzzy.py so both the resolver and the seed
# generators consume one source). HF namespaces that are alternate spellings of
# one developer fold to that developer's slug (org_id only — a canonical_id keeps
# its real HF repo prefix). Lowercased keys.
# Consumers in this module look entries up with the lowercased prefix (see
# `dev_org_of_prefix` / `canonicalize_org`); values are the developer slugs.
_ORG_ALIASES: dict[str, str] = {
    # DeepSeek and Cohere namespace spellings -> one slug per lab.
    "deepseek-ai": "deepseek",
    "cohereforai": "cohere",
    "cohere-labs": "cohere",
    # HF renamed the Cohere org `CohereForAI` -> `CohereLabs` (no hyphen);
    # both are the same lab, canonical `cohere`.
    "coherelabs": "cohere",
    # HF's SmolLM team `HuggingFaceTB` is part of Hugging Face.
    "huggingfacetb": "huggingface",
    # Baichuan is a curated lab; its HF namespace `baichuan-inc` folds into it.
    "baichuan-inc": "baichuan",
    # HF `MiniMaxAI` / `SarvamAI` namespaces -> the lab slug we already use.
    "minimaxai": "minimax",
    "sarvamai": "sarvam",
    # Hyphenated / suffixed spelling variants -> the developer slug in use.
    "tii-uae": "tiiuae",
    "meta-llama": "meta",
    "mistral-ai": "mistralai",
    "nvidia-nemo": "nvidia",
    # Zhipu/Z.ai → zai. `THUDM` is the legacy HF org for the GLM/ChatGLM
    # family (Tsinghua/Zhipu); HF now publishes under `zai-org`.
    "zhipu": "zai",
    "zhipu-ai": "zai",
    "z-ai": "zai",
    "zai-org": "zai",
    "thudm": "zai",
    # Moonshot → moonshotai
    "moonshot": "moonshotai",
    "moonshot-ai": "moonshotai",
    # Qwen models live under canonical org `alibaba` (Alibaba Cloud).
    # HF uploads use the `Qwen/` namespace (e.g. Qwen/Qwen2-VL-7B-Instruct).
    # The reverse mapping (alibaba → qwen) was rejected because
    # `alibaba__mineru2-pipeline` is a non-Qwen entry; this direction has
    # no analogous collision since every `qwen/<X>` upstream id we've seen
    # corresponds to an Alibaba/Qwen-family model.
    "qwen": "alibaba",
    # Alternate HF namespaces of a known developer fold to the one parent org
    # (org_id only — the canonical_id keeps the real HF repo prefix). These
    # consolidate the developer in downstream listings.
    "facebook": "meta",  # Meta's pre-Llama HF org (OPT, BART, ...)
    "mistral": "mistralai",
    "mosaicml": "databricks",  # MosaicML (MPT) acquired by Databricks
    "databricks-mosaic-research": "databricks",
    "alibaba-aidc": "alibaba",
    "alibaba-nlp": "alibaba",
    "aws-prototyping": "amazon",
    "ibm-research": "ibm",
    "ibm-granite": "ibm",  # Granite folded into IBM (curation decision)
    "bytedance-seed": "bytedance",
}
def build_curated_org_map(orgs_yaml_entries: list[dict]) -> dict[str, str]:
    """Build the single curated HF-namespace -> developer-org fold map.

    The result is `_ORG_ALIASES` UNION every curated org's `id`, `hf_org` and
    each string in its `aliases`, keyed lowercase -> curated id. Curated
    entries are applied after `_ORG_ALIASES`, so the curated seed (orgs.yaml)
    wins on conflict, and a later curated entry wins over an earlier one.
    Every generator and the resolver build their fold map here so a curated
    alias added to orgs.yaml automatically reaches every consumer (no drift).

    Args:
        orgs_yaml_entries: Curated org records (dicts with `id` and optional
            `hf_org` / `aliases`). `None` or empty is accepted; non-dict items
            and items without a non-empty string `id` are skipped.

    Returns:
        A new dict mapping lowercased namespace/alias spellings to org ids.
    """
    fold: dict[str, str] = {k.lower(): v for k, v in _ORG_ALIASES.items()}
    for entry in orgs_yaml_entries or []:
        if not isinstance(entry, dict):
            continue
        org_id = entry.get("id")
        if not isinstance(org_id, str) or not org_id:
            continue
        fold[org_id.lower()] = org_id
        # Same precedence as before: hf_org first, then each alias in order.
        for spelling in (entry.get("hf_org"), *(entry.get("aliases") or [])):
            if not isinstance(spelling, str):
                continue
            # Key on the STRIPPED spelling: blank strings are rejected by
            # their stripped form, so a padded value (" Qwen ") must be stored
            # under the key a lookup actually produces ("qwen").
            key = spelling.strip().lower()
            if key:
                fold[key] = org_id
    return fold
def build_org_dev_map_from_store(org_records, org_alias_pairs) -> dict[str, str]:
    """Dev-org fold map for STORE-backed callers that have no orgs.yaml at hand
    (the live resolution_service; seed-time lineage derivation; the deployed
    Space reads from the HF dataset, not seed files).

    Each org record contributes only its `id` and `hf_org` (read via `.get`)
    to `build_curated_org_map`; the alias tier is then layered on top from
    `org_alias_pairs`, an iterable of `(raw, canonical_id)` pairs (the org rows
    of the alias table, since `canonical_orgs` has no `aliases` column). Pairs
    whose members are not both non-empty strings are ignored. Either argument
    may be `None`/empty."""
    seed_like = []
    for record in org_records or []:
        seed_like.append({"id": record.get("id"), "hf_org": record.get("hf_org")})
    fold = build_curated_org_map(seed_like)

    for raw, canonical_id in org_alias_pairs or []:
        raw_ok = isinstance(raw, str) and bool(raw)
        cid_ok = isinstance(canonical_id, str) and bool(canonical_id)
        if not (raw_ok and cid_ok):
            continue
        fold[raw.lower()] = canonical_id
    return fold
def _norm_org_key(org: str) -> str:
"""Separator/case-insensitive org key (one token) for community-casing folds."""
return re.sub(r"[^a-z0-9]", "", org.lower())
def build_community_casing(org_prefixes: list[str]) -> dict[str, str]:
    """Map each separator/case-insensitive org key to ONE authoritative
    spelling chosen from the given org prefixes (real-HF repo org prefixes from
    the hf_oracle / hub_stats sources).

    Non-string and blank prefixes are ignored. When several spellings share a
    key (`Sao10K`/`sao10k`/`sao10K`), the lexicographically smallest one is
    picked, so the choice does not depend on input order and a refresh never
    flips a previously-chosen casing."""
    spellings_by_key: dict[str, set[str]] = {}
    for prefix in org_prefixes:
        if not isinstance(prefix, str) or not prefix.strip():
            continue
        key = _norm_org_key(prefix)
        if key not in spellings_by_key:
            spellings_by_key[key] = set()
        spellings_by_key[key].add(prefix)

    chosen: dict[str, str] = {}
    for key, seen in spellings_by_key.items():
        if seen:
            # min() of the spellings == first element of their sorted order.
            chosen[key] = min(seen)
    return chosen
def canonicalize_org(
    prefix: str,
    curated_map: dict[str, str],
    community_casing: Optional[dict[str, str]] = None,
    distinct_allowlist: Optional[set[str]] = None,
) -> str:
    """Resolve an HF org spelling to its canonical org id.

    Precedence:
      1. the curated developer id, when the lowercased prefix is a key of
         `curated_map`;
      2. the prefix verbatim, when it is on `distinct_allowlist` (exact,
         case-sensitive membership) — keeps verified separate uploaders apart;
      3. the community casing registered for the prefix's normalized key;
      4. the prefix verbatim.

    A falsy prefix is returned unchanged. This is the single org canonicalizer
    the generators and the reconcile step call."""
    if not prefix:
        return prefix

    folded = curated_map.get(prefix.lower())
    if folded is not None:
        return folded

    on_allowlist = bool(distinct_allowlist) and prefix in distinct_allowlist
    if on_allowlist or not community_casing:
        return prefix

    snapped = community_casing.get(_norm_org_key(prefix))
    return prefix if snapped is None else snapped
def dev_org_of_prefix(prefix: str, hf_to_dev: dict[str, str]) -> str:
    """Return the curated developer slug mapped from the lowercased id-prefix
    org, or the prefix itself (original casing) when it has no mapping."""
    lowered = prefix.lower()
    if lowered in hf_to_dev:
        return hf_to_dev[lowered]
    return prefix
def name_norm(name: str) -> str:
    """Single-token form of a model name: run the shared normalizer, then drop
    every space it left, so `Qwen2.5-14B-Instruct` and `qwen-2-5-14b-instruct`
    are intended to map to the same token."""
    normalized = _nz(name)
    # Splitting on the literal space and re-joining removes exactly the same
    # characters as replacing " " with "".
    return "".join(normalized.split(" "))
def brand_tokens_for(dev_org: str, hf_to_dev: dict[str, str]) -> set[str]:
    """Brand tokens a models.dev key may glue onto a model name for this
    developer.

    The set holds the developer slug itself, every key of `hf_to_dev` whose
    value equals the slug (case-insensitively), and a few hard-coded family
    names that are not in the org map. Every token is passed through
    `name_norm` (the hard-coded extras are already single tokens); empty
    tokens are dropped."""
    slug = dev_org.lower()
    # Family/brand spellings that do not appear as org aliases.
    family_brands = {
        "alibaba": {"qwen"},
        "meta": {"llama"},
        "minimax": {"minimax"},
        "google": {"gemini", "gemma"},
        "deepseek": {"deepseek"},
    }

    tokens: set[str] = {name_norm(slug)}
    tokens.update(
        name_norm(hf_alias)
        for hf_alias, dev in hf_to_dev.items()
        if dev.lower() == slug
    )
    tokens.update(family_brands.get(slug, set()))
    tokens.discard("")
    return tokens
def strip_brand_prefix(norm_name_tok: str, brands: set[str]) -> set[str]:
    """Candidate name tokens with leading brand tokens removed.

    Starting from `norm_name_tok`, repeatedly strip the LONGEST brand that is a
    proper prefix of the current token (defensive against `qwen-qwen-...`),
    collecting every intermediate result.

    Args:
        norm_name_tok: Normalized single-token model name.
        brands: Normalized brand tokens; empty strings are ignored.

    Returns:
        A set that always contains the original token plus each progressively
        stripped remainder. A brand is never stripped when it would leave an
        empty remainder.
    """
    # The brand set does not change while stripping, so order it once instead
    # of re-sorting on every pass. Longest first, so `metallama` wins over
    # `meta` when both prefix the token.
    ordered = sorted((b for b in brands if b), key=len, reverse=True)

    candidates = {norm_name_tok}
    remaining = norm_name_tok
    while True:
        hit = next(
            (b for b in ordered
             if len(remaining) > len(b) and remaining.startswith(b)),
            None,
        )
        if hit is None:
            return candidates
        # Each strip shortens `remaining` by at least one char, so this ends.
        remaining = remaining[len(hit):]
        candidates.add(remaining)
def build_hf_index(
    entries: list[dict],
    hf_to_dev: dict[str, str],
    fixed_ids: frozenset[str] = frozenset(),
):
    """Build the HF target authority from registry `entries` (+ any extra
    real-HF `fixed_ids`, e.g. from the frozen oracle).

    Args:
        entries: Registry entries; non-dict items and items without a string
            `id` are skipped.
        hf_to_dev: Lowercased HF namespace -> developer slug map.
        fixed_ids: Extra ids treated as real HF ids even without a registry
            entry.

    Returns:
        A 4-tuple `(hf_ids, alias_to_hf, by_org_name, hf_entry_by_id)`:
        - hf_ids: every real-HF canonical id (resolution_source == 'hf' OR in
          fixed_ids) — for exact id match.
        - alias_to_hf: every id/display/alias string on an HF entry -> that HF id.
        - by_org_name: (dev_org, name_norm) -> hf_id — for org-aware normalized match.
        - hf_entry_by_id: id -> entry (so callers can merge onto it).

        Only ids of the form `org/name` are indexed into `alias_to_hf` and
        `by_org_name`. Both are first-wins: registry entries are indexed in
        `entries` order, then the remaining `fixed_ids` in sorted order.
    """
    hf_entry_by_id: dict[str, dict] = {}
    hf_ids: set[str] = set(fixed_ids)
    for e in entries:
        if not isinstance(e, dict):
            continue
        cid = e.get("id")
        if not isinstance(cid, str):
            continue
        if e.get("resolution_source") == "hf" or cid in fixed_ids:
            hf_ids.add(cid)
            hf_entry_by_id[cid] = e

    alias_to_hf: dict[str, str] = {}
    by_org_name: dict[tuple[str, str], str] = {}

    def index_target(cid: str, entry: Optional[dict]) -> None:
        """Register one HF id (and its entry's display/alias strings)."""
        if "/" not in cid:
            return
        org, name = cid.split("/", 1)
        dev = dev_org_of_prefix(org, hf_to_dev)
        # setdefault everywhere: the first id to claim a key keeps it.
        by_org_name.setdefault((dev, name_norm(name)), cid)
        alias_to_hf.setdefault(cid, cid)
        if entry is not None:
            dn = entry.get("display_name")
            if isinstance(dn, str):
                alias_to_hf.setdefault(dn, cid)
            for a in entry.get("aliases") or []:
                if isinstance(a, str):
                    alias_to_hf.setdefault(a, cid)

    for cid, e in hf_entry_by_id.items():
        index_target(cid, e)
    # Iterate the oracle-only ids in SORTED order. A frozenset of strings has
    # no stable iteration order across processes (hash randomization), and the
    # index is first-wins, so two fixed ids that share a (dev, name) key would
    # otherwise pick a different winner from run to run.
    for cid in sorted(fixed_ids):
        if cid not in hf_entry_by_id:
            index_target(cid, None)
    return hf_ids, alias_to_hf, by_org_name, hf_entry_by_id
def decide_fold(mint: dict, hf_ids, alias_to_hf, by_org_name, hf_to_dev) -> Optional[dict]:
    """Decide whether `mint` refers to the same model as a real HF id.

    Tiers, tried strongest first; the first hit wins:
      1. exact      — a mint string (id, display_name, alias) is itself an HF id;
      2. alias      — a mint string is declared on an HF entry, unless both
                      orgs are known and disagree;
      3. normalized — same effective org + same `name_norm` token;
      4. fuzzy      — same effective org + brand-prefix-stripped token.

    Within a tier, candidates are tried in a fixed order (mint id, then
    display_name, then aliases in list order; stripped tokens least-stripped
    first), so the chosen target is the same on every run.

    Args:
        mint: The minted entry (`id`, optional `org_id`, `display_name`,
            `aliases`).
        hf_ids, alias_to_hf, by_org_name: Indexes from `build_hf_index`.
        hf_to_dev: Lowercased HF namespace -> developer slug map.

    Returns:
        The fold dict built by `_mk` (mint_id, hf_target, match_type, orgs,
        org agreement, evidence), or None when there is no confident match
        (never a cross-developer false merge on name alone).
    """
    cid = mint.get("id")
    if not isinstance(cid, str):
        return None

    # Effective org: the mint's own org_id when present, else the developer
    # its id prefix folds to.
    # NOTE(review): `org_id` is used verbatim (not passed through `hf_to_dev`)
    # and the `by_org_name` lookups below are case-sensitive on the org —
    # presumably org_id is already a curated developer id; confirm with callers.
    mint_org = mint.get("org_id")
    mint_org = mint_org if isinstance(mint_org, str) and mint_org else None
    prefix_org = dev_org_of_prefix(cid.split("/", 1)[0], hf_to_dev) if "/" in cid else None
    eff_org = mint_org or prefix_org

    mint_strings = [cid]
    dn = mint.get("display_name")
    if isinstance(dn, str):
        mint_strings.append(dn)
    mint_strings.extend(a for a in mint.get("aliases") or [] if isinstance(a, str))

    # exact id equality
    for s in mint_strings:
        if s in hf_ids and s != cid:
            return _mk(mint, s, "exact", eff_org, hf_to_dev, f"mint string {s!r} is a real HF id")

    # alias linkage (mint string already an alias of an HF entry). A mint string
    # can be a generic BARE name (e.g. `gemma-3-4b-it`) that DISTINCT developers
    # legitimately both carry (google's gemma AND unsloth's re-upload). Such a
    # shared alias must NOT link the mint across developers, so require org
    # agreement (after the dev remap) when BOTH the mint's effective org and the
    # target's developer are known and they DISAGREE — skip that match instead of
    # false-merging unsloth/gemma-3-4b-it into google/gemma-3-4b-it. (A full
    # org/model HF-id match is handled by the exact tier above, which is
    # unambiguous and stays unguarded.)
    for s in mint_strings:
        tgt = alias_to_hf.get(s)
        if tgt and tgt != cid and tgt in hf_ids:
            tgt_org = dev_org_of_prefix(tgt.split("/", 1)[0], hf_to_dev) if "/" in tgt else None
            if eff_org and tgt_org and eff_org.lower() != tgt_org.lower():
                continue
            return _mk(mint, tgt, "alias", eff_org, hf_to_dev, f"mint string {s!r} declared on {tgt!r}")

    if eff_org is None:
        return None  # no org agreement possible -> never fold

    # De-duplicate while KEEPING mint-string order. A plain set of strings
    # iterates in a hash-randomized order, which made the winning target vary
    # between runs whenever two candidate names matched different HF ids.
    cand_names = list(dict.fromkeys(
        name_norm(s.split("/", 1)[1] if "/" in s else s) for s in mint_strings
    ))

    # normalized-name equality + org agreement
    for nm in cand_names:
        tgt = by_org_name.get((eff_org, nm))
        if tgt and tgt != cid:
            return _mk(mint, tgt, "normalized", eff_org, hf_to_dev,
                       f"org={eff_org} + name {nm!r} == {tgt!r}")

    # fuzzy: brand-prefix-stripped name + org agreement. Only tokens that are
    # not already plain candidates are tried (those were checked above).
    brands = brand_tokens_for(eff_org, hf_to_dev)
    already_tried = set(cand_names)
    for base in cand_names:
        fresh = strip_brand_prefix(base, brands) - already_tried
        # Least-stripped (longest) token first; ties broken lexicographically.
        for nm in sorted(fresh, key=lambda tok: (-len(tok), tok)):
            tgt = by_org_name.get((eff_org, nm))
            if tgt and tgt != cid:
                return _mk(mint, tgt, "fuzzy", eff_org, hf_to_dev,
                           f"org={eff_org} + brand-stripped {nm!r} == {tgt!r}")
    return None
def _mk(mint, hf_target, match_type, mint_dev_org, hf_to_dev, evidence) -> dict:
hf_org = (hf_to_dev.get(hf_target.split("/", 1)[0].lower(), hf_target.split("/", 1)[0])
if "/" in hf_target else hf_target)
return {
"mint_id": mint["id"],
"hf_target": hf_target,
"match_type": match_type,
"mint_org": mint_dev_org or "",
"hf_org": hf_org,
"org_agreement": (mint_dev_org or "").lower() == (hf_org or "").lower(),
"evidence": evidence,
}