Spaces:

evaleval
/

entity-registry

Running

App Files Files Community

entity-registry / packages /eval-entity-resolver /src /eval_entity_resolver /fold.py

j-chim

Upload folder using huggingface_hub

e60e7e0 verified 27 days ago

Raw

History Blame Contribute Delete

15.1 kB

	"""Org-aware fold decision: does a minted off-HF / dev-org-slug canonical refer
	to the SAME model as a real HF repo already in the registry?

	This is the single source of truth for the cross-source same-model dedup used by
	BOTH the generator (scripts/refresh_from_modelsdev.py reconciliation, so a mint
	DEFERS to the real HF id at generate time) AND the gate
	(tests/test_gate_invariants.py via scripts/fold_modelsdev_dupes.py, which verifies
	none survive). Keeping it here means the two can never drift.

	A match is "confident" only with ORG AGREEMENT (after the curated two-tier dev-org
	remap: meta-llama->meta, Qwen->alibaba, ...): a group with no resolvable org, or
	whose name matches only under a DIFFERENT developer, NEVER folds (no cross-vendor
	false merge). Tiers, strongest first: exact id == HF id; mint string already an
	alias of an HF entry; normalized-name (all separators removed) + org agreement;
	brand-prefix-stripped name + org agreement (so `qwen-qwq-32b` -> `QwQ-32B`).

	`name_norm` removes ALL separators to ONE token, so models.dev's mangled
	spellings collapse onto HF casing: `qwen-2-5-14b-instruct` == `Qwen2.5-14B-Instruct`.
	"""
	from __future__ import annotations

	import re
	from typing import Optional

	from eval_entity_resolver.normalization import normalize as _nz


	# Curated HF-namespace -> developer-org remap. The SINGLE owner of this map
	# (moved here from strategies/fuzzy.py so both the resolver and the seed
	# generators consume one source). HF namespaces that are alternate spellings of
	# one developer fold to that developer's slug (org_id only — a canonical_id keeps
	# its real HF repo prefix).
	#
	# Keys are lowercase: the lookups in this module (`canonicalize_org`,
	# `dev_org_of_prefix`, `_mk`) lower-case the org prefix before calling `.get`
	# on a map derived from this table, so a mixed-case key would never be hit
	# directly. Values are the developer slug the namespace folds into.
	_ORG_ALIASES: dict[str, str] = {
	"deepseek-ai": "deepseek",
	"cohereforai": "cohere",
	"cohere-labs": "cohere",
	# HF renamed the Cohere org `CohereForAI` -> `CohereLabs` (no hyphen);
	# both are the same lab, canonical `cohere`.
	"coherelabs": "cohere",
	# HF's SmolLM team `HuggingFaceTB` is part of Hugging Face.
	"huggingfacetb": "huggingface",
	# Baichuan is a curated lab; its HF namespace `baichuan-inc` folds into it.
	"baichuan-inc": "baichuan",
	# HF `MiniMaxAI` / `SarvamAI` namespaces -> the lab slug we already use.
	"minimaxai": "minimax",
	"sarvamai": "sarvam",
	"tii-uae": "tiiuae",
	"meta-llama": "meta",
	"mistral-ai": "mistralai",
	"nvidia-nemo": "nvidia",
	# Zhipu/Z.ai → zai. `THUDM` is the legacy HF org for the GLM/ChatGLM
	# family (Tsinghua/Zhipu); HF now publishes under `zai-org`.
	"zhipu": "zai",
	"zhipu-ai": "zai",
	"z-ai": "zai",
	"zai-org": "zai",
	"thudm": "zai",
	# Moonshot → moonshotai
	"moonshot": "moonshotai",
	"moonshot-ai": "moonshotai",
	# Qwen models live under canonical org `alibaba` (Alibaba Cloud).
	# HF uploads use the `Qwen/` namespace (e.g. Qwen/Qwen2-VL-7B-Instruct).
	# The reverse mapping (alibaba → qwen) was rejected because
	# `alibaba__mineru2-pipeline` is a non-Qwen entry; this direction has
	# no analogous collision since every `qwen/<X>` upstream id we've seen
	# corresponds to an Alibaba/Qwen-family model.
	"qwen": "alibaba",
	# Alternate HF namespaces of a known developer fold to the one parent org
	# (org_id only — the canonical_id keeps the real HF repo prefix). These
	# consolidate the developer in downstream listings.
	"facebook": "meta", # Meta's pre-Llama HF org (OPT, BART, ...)
	"mistral": "mistralai",
	"mosaicml": "databricks", # MosaicML (MPT) acquired by Databricks
	"databricks-mosaic-research": "databricks",
	"alibaba-aidc": "alibaba",
	"alibaba-nlp": "alibaba",
	"aws-prototyping": "amazon",
	"ibm-research": "ibm",
	"ibm-granite": "ibm", # Granite folded into IBM (curation decision)
	"bytedance-seed": "bytedance",
	}


	def build_curated_org_map(orgs_yaml_entries: list[dict]) -> dict[str, str]:
	    """Build the one curated HF-namespace -> developer-org lookup table.

	    Starts from `_ORG_ALIASES` (keys lower-cased) and layers on top, for each
	    curated org entry, its own `id`, its `hf_org` and every string in its
	    `aliases` — each keyed lower-case and pointing at the curated `id`.
	    Curated entries are written after the built-in table, so the curated seed
	    wins over `_ORG_ALIASES` on conflict; among curated entries a later write
	    replaces an earlier one.

	    Entries that are not dicts, or whose `id` is not a non-empty string, are
	    ignored; so are `hf_org` / alias values that are not strings or are blank.
	    A falsy `orgs_yaml_entries` yields just the built-in table.

	    Every generator and the resolver are meant to build their fold map here,
	    so an alias added to the curated seed reaches every consumer (no drift).
	    """
	    org_map: dict[str, str] = {}
	    for namespace, developer in _ORG_ALIASES.items():
	        org_map[namespace.lower()] = developer

	    for entry in orgs_yaml_entries or []:
	        if not isinstance(entry, dict):
	            continue
	        org_id = entry.get("id")
	        if not (isinstance(org_id, str) and org_id):
	            continue
	        # The id always maps to itself (only emptiness is checked, not blanks).
	        org_map[org_id.lower()] = org_id
	        # Write order is hf_org first, then aliases — it decides last-write-wins.
	        for spelling in (entry.get("hf_org"), *(entry.get("aliases") or [])):
	            if isinstance(spelling, str) and spelling.strip():
	                org_map[spelling.lower()] = org_id
	    return org_map


	def build_org_dev_map_from_store(org_records, org_alias_pairs) -> dict[str, str]:
	    """Dev-org map for STORE-backed callers that have no orgs.yaml at hand.

	    Each org record is projected down to its `id` and `hf_org` and passed to
	    `build_curated_org_map`; org records carry no `aliases` field here, so the
	    alias tier is supplied separately through `org_alias_pairs`, an iterable of
	    `(raw, canonical_id)` pairs. Each pair whose two members are non-empty
	    strings is added as `raw.lower() -> canonical_id`, overriding any earlier
	    mapping for that key. Falsy arguments are treated as empty.

	    The intent is that store-backed consumers fold orgs the same way the
	    seed-file generators do (single source, no drift).
	    """
	    projected = []
	    for record in org_records or []:
	        projected.append({"id": record.get("id"), "hf_org": record.get("hf_org")})
	    org_map = build_curated_org_map(projected)

	    for raw, canonical_id in org_alias_pairs or []:
	        if not (isinstance(raw, str) and raw):
	            continue
	        if not (isinstance(canonical_id, str) and canonical_id):
	            continue
	        org_map[raw.lower()] = canonical_id
	    return org_map


	def _norm_org_key(org: str) -> str:
	"""Separator/case-insensitive org key (one token) for community-casing folds."""
	return re.sub(r"[^a-z0-9]", "", org.lower())


	def build_community_casing(org_prefixes: list[str]) -> dict[str, str]:
	    """Pick ONE authoritative spelling per separator/case-insensitive org key.

	    `org_prefixes` are org spellings taken from real HF repo ids. They are
	    grouped by `_norm_org_key`; non-string and blank items are skipped. When a
	    key has more than one spelling the lexicographically smallest one is
	    chosen, which makes the tie-break deterministic — a refresh never flips a
	    previously chosen casing as long as that spelling is still present.

	    Returns a dict of normalized key -> chosen spelling, so that e.g.
	    `Sao10K` / `sao10k` / `sao10K` collapse onto a single org row.
	    """
	    spellings_by_key: dict[str, set[str]] = {}
	    for prefix in org_prefixes:
	        if not isinstance(prefix, str) or not prefix.strip():
	            continue
	        key = _norm_org_key(prefix)
	        if key not in spellings_by_key:
	            spellings_by_key[key] = set()
	        spellings_by_key[key].add(prefix)
	    # Every stored set holds at least one spelling, so min() is always defined.
	    return {key: min(spellings) for key, spellings in spellings_by_key.items()}


	def canonicalize_org(
	    prefix: str,
	    curated_map: dict[str, str],
	    community_casing: Optional[dict[str, str]] = None,
	    distinct_allowlist: Optional[set[str]] = None,
	) -> str:
	    """Resolve an HF org spelling to its canonical org id.

	    Precedence, first hit wins:
	      1. the curated developer id, when `prefix.lower()` is a key of
	         `curated_map`;
	      2. the prefix verbatim, when it is on `distinct_allowlist` (exact,
	         case-sensitive membership) — keeps verified separate uploaders apart;
	      3. the authoritative community casing for the prefix's normalized key,
	         when `community_casing` has one;
	      4. the prefix verbatim.

	    A falsy `prefix` is returned unchanged. This is the single org
	    canonicalizer the generators and the reconcile step are meant to call.
	    """
	    if not prefix:
	        return prefix

	    developer = curated_map.get(prefix.lower())
	    if developer is not None:
	        return developer

	    kept_distinct = bool(distinct_allowlist) and prefix in distinct_allowlist
	    if kept_distinct or not community_casing:
	        return prefix

	    authoritative = community_casing.get(_norm_org_key(prefix))
	    return prefix if authoritative is None else authoritative


	def dev_org_of_prefix(prefix: str, hf_to_dev: dict[str, str]) -> str:
	    """Return the curated developer slug for an id-prefix org.

	    The lookup key is `prefix.lower()`; when `hf_to_dev` has no such key the
	    prefix is returned exactly as given (original casing preserved).
	    """
	    key = prefix.lower()
	    if key in hf_to_dev:
	        return hf_to_dev[key]
	    return prefix


	def name_norm(name: str) -> str:
	    """Collapse a model name to ONE token for separator-insensitive matching.

	    Runs the shared `normalize` helper (imported as `_nz`) and then deletes
	    every space from its result. The intent is that `Qwen2.5-14B-Instruct`
	    and `qwen-2-5-14b-instruct` end up as the same token.
	    """
	    normalized = _nz(name)
	    # Splitting on the single-space literal and re-joining drops every space.
	    return "".join(normalized.split(" "))


	def brand_tokens_for(dev_org: str, hf_to_dev: dict[str, str]) -> set[str]:
	    """Brand tokens a models.dev key may glue onto a model name for a developer.

	    The result contains, each passed through `name_norm` so it is a single
	    token:
	      - the developer slug itself;
	      - every key of `hf_to_dev` whose value equals the slug
	        (case-insensitively), e.g. alibaba -> {alibaba, qwen, ...};
	    plus a few hard-coded product-family spellings that are not org names
	    (`llama` for meta, `gemini` / `gemma` for google, ...). Empty tokens are
	    dropped.

	    Args:
	        dev_org: developer slug; compared lower-cased.
	        hf_to_dev: HF-namespace -> developer-slug map.

	    Returns:
	        The set of non-empty brand tokens for `dev_org`.
	    """
	    dev_key = dev_org.lower()
	    tokens: set[str] = {name_norm(dev_key)}
	    tokens.update(
	        name_norm(hf_alias)
	        for hf_alias, dev in hf_to_dev.items()
	        if dev.lower() == dev_key
	    )
	    # Family names that get glued onto model names but are not org spellings,
	    # so they cannot be derived from `hf_to_dev`. Already single tokens.
	    extra = {
	        "alibaba": {"qwen"},
	        "meta": {"llama"},
	        "minimax": {"minimax"},
	        "google": {"gemini", "gemma"},
	        "deepseek": {"deepseek"},
	    }
	    # In-place set union (the original line carried a stray backslash before
	    # the `|`, which is not valid Python).
	    tokens |= extra.get(dev_key, set())
	    return {tok for tok in tokens if tok}


	def strip_brand_prefix(norm_name_tok: str, brands: set[str]) -> set[str]:
	    """Return the name token plus every variant with leading brands removed.

	    Starting from `norm_name_tok`, the longest brand that is a strict prefix
	    of the current remainder is cut off and the remainder recorded; this
	    repeats until no brand applies (defensive against `qwen-qwen-...`). A
	    brand is never stripped when it would consume the whole remainder, and
	    empty brands are ignored. The original token is always in the result.
	    """
	    # Longest first, so a longer brand is preferred over a shorter one it
	    # starts with. The ordering does not change between rounds.
	    longest_first = [b for b in sorted(brands, key=len, reverse=True) if b]

	    variants = {norm_name_tok}
	    remainder = norm_name_tok
	    while True:
	        brand = next(
	            (
	                b for b in longest_first
	                if remainder.startswith(b) and len(remainder) > len(b)
	            ),
	            None,
	        )
	        if brand is None:
	            return variants
	        remainder = remainder[len(brand):]
	        variants.add(remainder)


	def build_hf_index(
	    entries: list[dict],
	    hf_to_dev: dict[str, str],
	    fixed_ids: frozenset[str] = frozenset(),
	):
	    """Build the HF target authority from registry `entries` (+ any extra
	    real-HF `fixed_ids`, e.g. from the frozen oracle).

	    Args:
	        entries: registry entries; items that are not dicts or have no string
	            `id` are skipped.
	        hf_to_dev: HF-namespace -> developer-slug map used to key targets by
	            developer rather than by raw namespace.
	        fixed_ids: ids to treat as real HF ids even without a matching entry.

	    Returns:
	        A 4-tuple `(hf_ids, alias_to_hf, by_org_name, hf_entry_by_id)`:
	        - hf_ids: every real-HF canonical id (resolution_source == 'hf' OR in
	          fixed_ids) — for exact id match.
	        - alias_to_hf: every id/display/alias string on an HF entry -> that HF id.
	        - by_org_name: (dev_org, name_norm) -> hf_id — for org-aware normalized
	          match.
	        - hf_entry_by_id: id -> entry (so callers can merge onto it). When two
	          entries share an id the later one is kept.

	    Only ids containing a `/` are indexed into `alias_to_hf` / `by_org_name`.
	    Both maps are first-wins: entries are indexed in input order, then
	    entry-less `fixed_ids` in sorted order, so a collision always resolves to
	    the same id regardless of set iteration order.
	    """
	    hf_entry_by_id: dict[str, dict] = {}
	    hf_ids: set[str] = set(fixed_ids)
	    for e in entries:
	        if not isinstance(e, dict):
	            continue
	        cid = e.get("id")
	        if not isinstance(cid, str):
	            continue
	        if e.get("resolution_source") == "hf" or cid in fixed_ids:
	            hf_ids.add(cid)
	            hf_entry_by_id[cid] = e

	    alias_to_hf: dict[str, str] = {}
	    by_org_name: dict[tuple[str, str], str] = {}

	    def index_target(cid: str, entry: Optional[dict]) -> None:
	        """Register one HF id (and its entry's display name / aliases)."""
	        if "/" not in cid:
	            return
	        org, name = cid.split("/", 1)
	        dev = dev_org_of_prefix(org, hf_to_dev)
	        # setdefault everywhere: the first id to claim a key keeps it.
	        by_org_name.setdefault((dev, name_norm(name)), cid)
	        alias_to_hf.setdefault(cid, cid)
	        if entry is not None:
	            dn = entry.get("display_name")
	            if isinstance(dn, str):
	                alias_to_hf.setdefault(dn, cid)
	            for a in entry.get("aliases") or []:
	                if isinstance(a, str):
	                    alias_to_hf.setdefault(a, cid)

	    for cid, e in hf_entry_by_id.items():
	        index_target(cid, e)
	    # Iterate in sorted order: `fixed_ids` is a set of strings, whose iteration
	    # order changes with the interpreter's hash seed. Since indexing is
	    # first-wins, an unsorted walk could pick a different target per run.
	    for cid in sorted(fixed_ids):
	        if cid not in hf_entry_by_id:
	            index_target(cid, None)

	    return hf_ids, alias_to_hf, by_org_name, hf_entry_by_id


	def decide_fold(mint: dict, hf_ids, alias_to_hf, by_org_name, hf_to_dev) -> Optional[dict]:
	    """Return a fold dict (mint_id, hf_target, match_type, org agreement,
	    evidence) when `mint` confidently refers to the same model as a real HF id;
	    else None (never a cross-developer false merge).

	    Args:
	        mint: the minted entry; reads `id`, `org_id`, `display_name`, `aliases`.
	        hf_ids, alias_to_hf, by_org_name: the index from `build_hf_index`.
	        hf_to_dev: HF-namespace -> developer-slug map.

	    Tiers, strongest first; the first hit is returned:
	      1. "exact": a mint string (other than the mint's own id) is a real HF id.
	      2. "alias": a mint string is already declared on an HF entry, skipped
	         when both orgs are known and disagree.
	      3. "normalized": same developer + same separator-free name.
	      4. "fuzzy": same developer + name equal after stripping a leading brand.

	    Mint strings are tried in a fixed order — id, display_name, then aliases
	    as listed — and tiers 3/4 walk their candidates in that same order
	    (longest brand-stripped remainder first), so the chosen target does not
	    depend on set iteration order.
	    """
	    cid = mint.get("id")
	    if not isinstance(cid, str):
	        return None
	    mint_org = mint.get("org_id")
	    mint_org = mint_org if isinstance(mint_org, str) and mint_org else None
	    prefix_org = dev_org_of_prefix(cid.split("/", 1)[0], hf_to_dev) if "/" in cid else None
	    # The entry's own org_id wins; the id prefix is only a fallback.
	    eff_org = mint_org or prefix_org

	    mint_strings = [cid]
	    dn = mint.get("display_name")
	    if isinstance(dn, str):
	        mint_strings.append(dn)
	    for a in mint.get("aliases") or []:
	        if isinstance(a, str):
	            mint_strings.append(a)

	    # exact id equality
	    for s in mint_strings:
	        if s in hf_ids and s != cid:
	            return _mk(mint, s, "exact", eff_org, hf_to_dev, f"mint string {s!r} is a real HF id")
	    # alias linkage (mint string already an alias of an HF entry). A mint string
	    # can be a generic BARE name (e.g. `gemma-3-4b-it`) that DISTINCT developers
	    # legitimately both carry (google's gemma AND unsloth's re-upload). Such a
	    # shared alias must NOT link the mint across developers, so require org
	    # agreement (after the dev remap) when BOTH the mint's effective org and the
	    # target's developer are known and they DISAGREE — skip that match instead of
	    # false-merging unsloth/gemma-3-4b-it into google/gemma-3-4b-it. (A full
	    # org/model HF-id match is handled by the exact tier above, which is
	    # unambiguous and stays unguarded.)
	    for s in mint_strings:
	        tgt = alias_to_hf.get(s)
	        if tgt and tgt != cid and tgt in hf_ids:
	            tgt_org = dev_org_of_prefix(tgt.split("/", 1)[0], hf_to_dev) if "/" in tgt else None
	            if eff_org and tgt_org and eff_org.lower() != tgt_org.lower():
	                continue
	            return _mk(mint, tgt, "alias", eff_org, hf_to_dev, f"mint string {s!r} declared on {tgt!r}")

	    if eff_org is None:
	        return None  # no org agreement possible -> never fold

	    # Name part of every mint string (text after the first `/`, else the whole
	    # string), normalized to one token. dict.fromkeys de-duplicates while
	    # keeping first-seen order, so the id's name is always tried first.
	    cand_names = list(dict.fromkeys(
	        name_norm(s.split("/", 1)[1] if "/" in s else s) for s in mint_strings
	    ))

	    # normalized-name equality + org agreement
	    for nm in cand_names:
	        tgt = by_org_name.get((eff_org, nm))
	        if tgt and tgt != cid:
	            return _mk(mint, tgt, "normalized", eff_org, hf_to_dev,
	                       f"org={eff_org} + name {nm!r} == {tgt!r}")

	    # fuzzy: brand-prefix-stripped name + org agreement
	    brands = brand_tokens_for(eff_org, hf_to_dev)
	    # Unstripped names were already tried above; `seen` also prevents retrying
	    # a remainder reachable from two different candidate names.
	    seen = set(cand_names)
	    for base in cand_names:
	        # Each strip shortens the token, so lengths within one chain are
	        # distinct and sorting by length gives a total, stable order.
	        for nm in sorted(strip_brand_prefix(base, brands), key=len, reverse=True):
	            if nm in seen:
	                continue
	            seen.add(nm)
	            tgt = by_org_name.get((eff_org, nm))
	            if tgt and tgt != cid:
	                return _mk(mint, tgt, "fuzzy", eff_org, hf_to_dev,
	                           f"org={eff_org} + brand-stripped {nm!r} == {tgt!r}")
	    return None


	def _mk(mint, hf_target, match_type, mint_dev_org, hf_to_dev, evidence) -> dict:
	hf_org = (hf_to_dev.get(hf_target.split("/", 1)[0].lower(), hf_target.split("/", 1)[0])
	if "/" in hf_target else hf_target)
	return {
	"mint_id": mint["id"],
	"hf_target": hf_target,
	"match_type": match_type,
	"mint_org": mint_dev_org or "",
	"hf_org": hf_org,
	"org_agreement": (mint_dev_org or "").lower() == (hf_org or "").lower(),
	"evidence": evidence,
	}