"""Org-aware fold decision: does a minted off-HF / dev-org-slug canonical refer to the SAME model as a real HF repo already in the registry? This is the single source of truth for the cross-source same-model dedup used by BOTH the generator (scripts/refresh_from_modelsdev.py reconciliation, so a mint DEFERS to the real HF id at generate time) AND the gate (tests/test_gate_invariants.py via scripts/fold_modelsdev_dupes.py, which verifies none survive). Keeping it here means the two can never drift. A match is "confident" only with ORG AGREEMENT (after the curated two-tier dev-org remap: meta-llama->meta, Qwen->alibaba, ...): a group with no resolvable org, or whose name matches only under a DIFFERENT developer, NEVER folds (no cross-vendor false merge). Tiers, strongest first: exact id == HF id; mint string already an alias of an HF entry; normalized-name (all separators removed) + org agreement; brand-prefix-stripped name + org agreement (so `qwen-qwq-32b` -> `QwQ-32B`). `name_norm` removes ALL separators to ONE token, so models.dev's mangled spellings collapse onto HF casing: `qwen-2-5-14b-instruct` == `Qwen2.5-14B-Instruct`. """ from __future__ import annotations import re from typing import Optional from eval_entity_resolver.normalization import normalize as _nz # Curated HF-namespace -> developer-org remap. The SINGLE owner of this map # (moved here from strategies/fuzzy.py so both the resolver and the seed # generators consume one source). HF namespaces that are alternate spellings of # one developer fold to that developer's slug (org_id only — a canonical_id keeps # its real HF repo prefix). Lowercased keys. _ORG_ALIASES: dict[str, str] = { "deepseek-ai": "deepseek", "cohereforai": "cohere", "cohere-labs": "cohere", # HF renamed the Cohere org `CohereForAI` -> `CohereLabs` (no hyphen); # both are the same lab, canonical `cohere`. "coherelabs": "cohere", # HF's SmolLM team `HuggingFaceTB` is part of Hugging Face. "huggingfacetb": "huggingface", # Baichuan is a curated lab; its HF namespace `baichuan-inc` folds into it. "baichuan-inc": "baichuan", # HF `MiniMaxAI` / `SarvamAI` namespaces -> the lab slug we already use. "minimaxai": "minimax", "sarvamai": "sarvam", "tii-uae": "tiiuae", "meta-llama": "meta", "mistral-ai": "mistralai", "nvidia-nemo": "nvidia", # Zhipu/Z.ai → zai. `THUDM` is the legacy HF org for the GLM/ChatGLM # family (Tsinghua/Zhipu); HF now publishes under `zai-org`. "zhipu": "zai", "zhipu-ai": "zai", "z-ai": "zai", "zai-org": "zai", "thudm": "zai", # Moonshot → moonshotai "moonshot": "moonshotai", "moonshot-ai": "moonshotai", # Qwen models live under canonical org `alibaba` (Alibaba Cloud). # HF uploads use the `Qwen/` namespace (e.g. Qwen/Qwen2-VL-7B-Instruct). # The reverse mapping (alibaba → qwen) was rejected because # `alibaba__mineru2-pipeline` is a non-Qwen entry; this direction has # no analogous collision since every `qwen/` upstream id we've seen # corresponds to an Alibaba/Qwen-family model. "qwen": "alibaba", # Alternate HF namespaces of a known developer fold to the one parent org # (org_id only — the canonical_id keeps the real HF repo prefix). These # consolidate the developer in downstream listings. "facebook": "meta", # Meta's pre-Llama HF org (OPT, BART, ...) "mistral": "mistralai", "mosaicml": "databricks", # MosaicML (MPT) acquired by Databricks "databricks-mosaic-research": "databricks", "alibaba-aidc": "alibaba", "alibaba-nlp": "alibaba", "aws-prototyping": "amazon", "ibm-research": "ibm", "ibm-granite": "ibm", # Granite folded into IBM (curation decision) "bytedance-seed": "bytedance", } def build_curated_org_map(orgs_yaml_entries: list[dict]) -> dict[str, str]: """The SINGLE curated HF-namespace -> developer-org map every generator and the resolver should use: `_ORG_ALIASES` UNION every curated org's id + `hf_org` + each entry in its `aliases`, keyed lowercase -> curated id. The curated seed (orgs.yaml) wins over `_ORG_ALIASES` on conflict. Every generator and the resolver build their fold map here so a curated alias added to orgs.yaml automatically reaches every consumer (no drift).""" m: dict[str, str] = {k.lower(): v for k, v in _ORG_ALIASES.items()} for e in orgs_yaml_entries or []: if not isinstance(e, dict): continue oid = e.get("id") if not isinstance(oid, str) or not oid: continue m[oid.lower()] = oid hf_org = e.get("hf_org") if isinstance(hf_org, str) and hf_org.strip(): m[hf_org.lower()] = oid for a in (e.get("aliases") or []): if isinstance(a, str) and a.strip(): m[a.lower()] = oid return m def build_org_dev_map_from_store(org_records, org_alias_pairs) -> dict[str, str]: """Same dev-org map as `build_curated_org_map`, for STORE-backed callers that don't have orgs.yaml at hand (the live resolution_service; seed-time lineage derivation; the deployed Space reads from the HF dataset, not seed files). `canonical_orgs` has no `aliases` column, so the alias tier lives as org rows in the alias table — feed it via `org_alias_pairs` ((raw, canonical_id) of entity_type=org). Identical result to build_curated_org_map over orgs.yaml, so every consumer folds orgs the same way (single source, no drift).""" m = build_curated_org_map([ {"id": r.get("id"), "hf_org": r.get("hf_org")} for r in (org_records or []) ]) for raw, cid in org_alias_pairs or []: if isinstance(raw, str) and raw and isinstance(cid, str) and cid: m[raw.lower()] = cid return m def _norm_org_key(org: str) -> str: """Separator/case-insensitive org key (one token) for community-casing folds.""" return re.sub(r"[^a-z0-9]", "", org.lower()) def build_community_casing(org_prefixes: list[str]) -> dict[str, str]: """Map a separator/case-insensitive org key -> the authoritative HF-true casing, derived from real-HF repo org prefixes (the hf_oracle / hub_stats sources). Lets every generator snap a community org (no curated id) to ONE canonical spelling so `Sao10K`/`sao10k`/`sao10K` collapse to one canonical_orgs row. Deterministic tie-break (sorted) when a key has >1 real spelling — so a refresh never flips a previously-chosen casing.""" by_key: dict[str, set[str]] = {} for p in org_prefixes: if isinstance(p, str) and p.strip(): by_key.setdefault(_norm_org_key(p), set()).add(p) return {k: sorted(v)[0] for k, v in by_key.items() if v} def canonicalize_org( prefix: str, curated_map: dict[str, str], community_casing: Optional[dict[str, str]] = None, distinct_allowlist: Optional[set[str]] = None, ) -> str: """Canonical org id for an HF org spelling: (1) curated developer id when the prefix folds to one; (2) else the authoritative HF-true community casing (unless the spelling is on the distinct-org allowlist, which keeps verified separate uploaders apart); (3) else the prefix verbatim. The single org canonicalizer all generators + the reconcile call.""" if not prefix: return prefix curated = curated_map.get(prefix.lower()) if curated is not None: return curated if distinct_allowlist and prefix in distinct_allowlist: return prefix if community_casing: cased = community_casing.get(_norm_org_key(prefix)) if cased is not None: return cased return prefix def dev_org_of_prefix(prefix: str, hf_to_dev: dict[str, str]) -> str: """Remap an id-prefix org to its curated developer slug (else itself).""" return hf_to_dev.get(prefix.lower(), prefix) def name_norm(name: str) -> str: """Normalized name with separators collapsed AND removed (one token), so `Qwen2.5-14B-Instruct` and `qwen-2-5-14b-instruct` map to the same token.""" return _nz(name).replace(" ", "") def brand_tokens_for(dev_org: str, hf_to_dev: dict[str, str]) -> set[str]: """Brand tokens a models.dev key may glue onto a model name for this developer: the dev slug + every HF alias mapping to it (alibaba -> {alibaba, qwen, ...}; meta -> {meta, meta-llama, facebook}), plus a few spelling variants not in the org map. Normalized to single tokens.""" toks: set[str] = set() d = dev_org.lower() toks.add(name_norm(d)) for hf_alias, dev in hf_to_dev.items(): if dev.lower() == d: toks.add(name_norm(hf_alias)) extra = { "alibaba": {"qwen"}, "meta": {"llama"}, "minimax": {"minimax"}, "google": {"gemini", "gemma"}, "deepseek": {"deepseek"}, } toks |= extra.get(d, set()) return {t for t in toks if t} def strip_brand_prefix(norm_name_tok: str, brands: set[str]) -> set[str]: """Candidate name tokens with a leading brand token removed (always includes the original). Strips repeatedly (defensive against `qwen-qwen-...`).""" out = {norm_name_tok} cur = norm_name_tok changed = True while changed: changed = False for b in sorted(brands, key=len, reverse=True): if b and cur.startswith(b) and len(cur) > len(b): cur = cur[len(b):] out.add(cur) changed = True break return out def build_hf_index( entries: list[dict], hf_to_dev: dict[str, str], fixed_ids: frozenset[str] = frozenset(), ): """Build the HF target authority from registry `entries` (+ any extra real-HF `fixed_ids`, e.g. from the frozen oracle): - hf_ids: every real-HF canonical id (resolution_source == 'hf' OR in fixed_ids) — for exact id match. - alias_to_hf: every id/display/alias string on an HF entry -> that HF id. - by_org_name: (dev_org, name_norm) -> hf_id — for org-aware normalized match. - hf_entry_by_id: id -> entry (so callers can merge onto it). """ hf_entry_by_id: dict[str, dict] = {} hf_ids: set[str] = set(fixed_ids) for e in entries: if not isinstance(e, dict): continue cid = e.get("id") if not isinstance(cid, str): continue if e.get("resolution_source") == "hf" or cid in fixed_ids: hf_ids.add(cid) hf_entry_by_id[cid] = e alias_to_hf: dict[str, str] = {} by_org_name: dict[tuple[str, str], str] = {} def index_target(cid: str, entry: Optional[dict]) -> None: if "/" not in cid: return org, name = cid.split("/", 1) dev = dev_org_of_prefix(org, hf_to_dev) by_org_name.setdefault((dev, name_norm(name)), cid) alias_to_hf.setdefault(cid, cid) if entry is not None: dn = entry.get("display_name") if isinstance(dn, str): alias_to_hf.setdefault(dn, cid) for a in entry.get("aliases") or []: if isinstance(a, str): alias_to_hf.setdefault(a, cid) for cid, e in hf_entry_by_id.items(): index_target(cid, e) for cid in fixed_ids: if cid not in hf_entry_by_id: index_target(cid, None) return hf_ids, alias_to_hf, by_org_name, hf_entry_by_id def decide_fold(mint: dict, hf_ids, alias_to_hf, by_org_name, hf_to_dev) -> Optional[dict]: """Return a fold dict (mint_id, hf_target, match_type, org agreement, evidence) when `mint` confidently refers to the same model as a real HF id; else None (never a cross-developer false merge).""" cid = mint.get("id") if not isinstance(cid, str): return None mint_org = mint.get("org_id") mint_org = mint_org if isinstance(mint_org, str) and mint_org else None prefix_org = dev_org_of_prefix(cid.split("/", 1)[0], hf_to_dev) if "/" in cid else None eff_org = mint_org or prefix_org mint_strings = [cid] dn = mint.get("display_name") if isinstance(dn, str): mint_strings.append(dn) for a in mint.get("aliases") or []: if isinstance(a, str): mint_strings.append(a) # exact id equality for s in mint_strings: if s in hf_ids and s != cid: return _mk(mint, s, "exact", eff_org, hf_to_dev, f"mint string {s!r} is a real HF id") # alias linkage (mint string already an alias of an HF entry). A mint string # can be a generic BARE name (e.g. `gemma-3-4b-it`) that DISTINCT developers # legitimately both carry (google's gemma AND unsloth's re-upload). Such a # shared alias must NOT link the mint across developers, so require org # agreement (after the dev remap) when BOTH the mint's effective org and the # target's developer are known and they DISAGREE — skip that match instead of # false-merging unsloth/gemma-3-4b-it into google/gemma-3-4b-it. (A full # org/model HF-id match is handled by the exact tier above, which is # unambiguous and stays unguarded.) for s in mint_strings: tgt = alias_to_hf.get(s) if tgt and tgt != cid and tgt in hf_ids: tgt_org = dev_org_of_prefix(tgt.split("/", 1)[0], hf_to_dev) if "/" in tgt else None if eff_org and tgt_org and eff_org.lower() != tgt_org.lower(): continue return _mk(mint, tgt, "alias", eff_org, hf_to_dev, f"mint string {s!r} declared on {tgt!r}") if eff_org is None: return None # no org agreement possible -> never fold cand_names = {(s.split("/", 1)[1] if "/" in s else s) for s in mint_strings} cand_names = {name_norm(n) for n in cand_names} # normalized-name equality + org agreement for nm in cand_names: tgt = by_org_name.get((eff_org, nm)) if tgt and tgt != cid: return _mk(mint, tgt, "normalized", eff_org, hf_to_dev, f"org={eff_org} + name {nm!r} == {tgt!r}") # fuzzy: brand-prefix-stripped name + org agreement brands = brand_tokens_for(eff_org, hf_to_dev) stripped: set[str] = set() for nm in cand_names: stripped |= strip_brand_prefix(nm, brands) for nm in stripped - cand_names: tgt = by_org_name.get((eff_org, nm)) if tgt and tgt != cid: return _mk(mint, tgt, "fuzzy", eff_org, hf_to_dev, f"org={eff_org} + brand-stripped {nm!r} == {tgt!r}") return None def _mk(mint, hf_target, match_type, mint_dev_org, hf_to_dev, evidence) -> dict: hf_org = (hf_to_dev.get(hf_target.split("/", 1)[0].lower(), hf_target.split("/", 1)[0]) if "/" in hf_target else hf_target) return { "mint_id": mint["id"], "hf_target": hf_target, "match_type": match_type, "mint_org": mint_dev_org or "", "hf_org": hf_org, "org_agreement": (mint_dev_org or "").lower() == (hf_org or "").lower(), "evidence": evidence, }