Spaces:

evaleval
/

entity-registry

Sleeping

App Files Files Community

entity-registry / packages /eval-entity-resolver /src /eval_entity_resolver /resolver.py

j-chim

Upload folder using huggingface_hub

266f01b verified 10 days ago

raw

history blame contribute delete

8.83 kB

	"""The bare resolver. Matches a raw value to a canonical id via the
	strategy chain (exact → normalized → fuzzy → no_match), and — when
	given a `CanonicalStore` — enriches the result with the matched
	canonical's metadata, parent edges, model-specific lineage fields,
	and quantized-chain root collapse.

	The enrichment matches the HTTP API's response shape exactly. Callers
	using the resolver standalone get the same `ResolutionResult` they'd
	get back from `POST /api/v1/resolve`."""
	from __future__ import annotations

	from pathlib import Path
	from typing import Optional

	from eval_entity_resolver.alias_store import AliasStore
	from eval_entity_resolver.canonical_store import CanonicalStore
	from eval_entity_resolver.models import ResolutionResult, ResolverConfig
	from eval_entity_resolver.strategies.exact import exact_match
	from eval_entity_resolver.strategies.normalized import normalized_match
	from eval_entity_resolver.strategies.fuzzy import fuzzy_match


	class Resolver:
	def __init__(
	self,
	store: AliasStore,
	config: Optional[ResolverConfig] = None,
	canonical_store: Optional[CanonicalStore] = None,
	) -> None:
	"""`store` is required (alias matching is the resolver's core job).
	`canonical_store` is optional — when provided, results are
	enriched with parent / lineage / metadata fields. Without it,
	only the basic match fields (canonical_id, strategy, confidence)
	are populated."""
	self.store = store
	self.config = config or ResolverConfig()
	self.canonical_store = canonical_store

	@classmethod
	def from_parquet(
	cls,
	path: str \| Path,
	config: Optional[ResolverConfig] = None,
	) -> "Resolver":
	"""Load both alias and canonical stores from a parquet directory
	(e.g. `./fixtures/`) and return a fully-enriching resolver. This
	is the recommended convenience for callers who want the same
	response shape as the HTTP API."""
	return cls(
	AliasStore.from_parquet(path),
	config=config,
	canonical_store=CanonicalStore.from_parquet(path),
	)

	@classmethod
	def from_hf(
	cls,
	repo_id: str,
	config: Optional[ResolverConfig] = None,
	) -> "Resolver":
	"""Load both stores from a HF Dataset repo and return a
	fully-enriching resolver."""
	return cls(
	AliasStore.from_hf(repo_id),
	config=config,
	canonical_store=CanonicalStore.from_hf(repo_id),
	)

	def resolve(
	self,
	raw_value: str,
	entity_type: str,
	source_config: Optional[str] = None,
	) -> ResolutionResult:
	# 1. Exact
	canonical_id = exact_match(raw_value, entity_type, source_config, self.store)
	if canonical_id is not None:
	return self._enrich(raw_value, entity_type, source_config, canonical_id, "exact", 1.0)

	# 2. Normalized (confidence 0.95 — only return if above threshold)
	_NORMALIZED_CONFIDENCE = 0.95
	if _NORMALIZED_CONFIDENCE >= self.config.threshold:
	canonical_id = normalized_match(raw_value, entity_type, self.store, source_config)
	if canonical_id is not None:
	return self._enrich(
	raw_value, entity_type, source_config,
	canonical_id, "normalized", _NORMALIZED_CONFIDENCE,
	)

	# 3. Fuzzy
	canonical_id, confidence = fuzzy_match(
	raw_value, entity_type, self.config.threshold, self.store, source_config
	)
	if canonical_id is not None:
	return self._enrich(
	raw_value, entity_type, source_config,
	canonical_id, "fuzzy", confidence,
	)

	# 4. No match
	return ResolutionResult(
	raw_value=raw_value,
	entity_type=entity_type,
	source_config=source_config,
	canonical_id=None,
	strategy="no_match",
	confidence=0.0,
	)

	# ------------------------------------------------------------------
	# Enrichment (no-op when no canonical_store is attached)
	# ------------------------------------------------------------------

	def build_result(
	self,
	raw_value: str,
	entity_type: str,
	source_config: Optional[str],
	canonical_id: str,
	strategy: str,
	confidence: float,
	) -> ResolutionResult:
	"""Construct an enriched `ResolutionResult` for a canonical_id
	the caller already knows — useful for callers that bypass the
	strategy chain (e.g. an alias-table cache hit, an auto-created
	draft) but want the same rich response shape. Identical to the
	enrichment that happens inside `resolve()`."""
	return self._enrich(raw_value, entity_type, source_config, canonical_id, strategy, confidence)

	def _enrich(
	self,
	raw_value: str,
	entity_type: str,
	source_config: Optional[str],
	matched_canonical_id: str,
	strategy: str,
	confidence: float,
	) -> ResolutionResult:
	"""Look up the matched canonical's row and populate the rich
	response fields. When no canonical_store is attached, the rich
	fields stay None and the result has just the basic match info."""
	if self.canonical_store is None:
	return ResolutionResult(
	raw_value=raw_value,
	entity_type=entity_type,
	source_config=source_config,
	canonical_id=matched_canonical_id,
	strategy=strategy,
	confidence=confidence,
	)

	cs = self.canonical_store
	matched_entity = cs.lookup(entity_type, matched_canonical_id)
	review_status = (matched_entity or {}).get("review_status") if matched_entity else None

	if entity_type == "model":
	fields = cs.model_metadata_fields(matched_canonical_id, matched_entity)
	# If the response collapses to a different canonical (root),
	# surface THAT canonical's review_status — keeps the response
	# internally consistent.
	if fields["canonical_id"] != matched_canonical_id:
	root_entity = cs.lookup("model", fields["canonical_id"])
	if root_entity:
	review_status = root_entity.get("review_status") or review_status
	return ResolutionResult(
	raw_value=raw_value,
	entity_type=entity_type,
	source_config=source_config,
	canonical_id=fields["canonical_id"],
	strategy=strategy,
	confidence=confidence,
	review_status=review_status,
	parent_canonical_id=cs.parent_canonical_id("model", matched_entity),
	resolved_leaf_id=fields["resolved_leaf_id"],
	root_model_id=fields["root_model_id"],
	lineage_origin_org_id=fields["lineage_origin_org_id"],
	parents=fields["parents"],
	open_weights=fields["open_weights"],
	release_date=fields["release_date"],
	params_billions=fields["params_billions"],
	)

	# Benchmark: fill in hierarchy-alignment fields (family_key,
	# category) by walking canonical_families. composite_keys stays
	# empty here — see CanonicalStore.benchmark_family_enrichment for
	# why composite computation belongs in the producer.
	if entity_type == "benchmark":
	fam = cs.benchmark_family_enrichment(matched_canonical_id)
	return ResolutionResult(
	raw_value=raw_value,
	entity_type=entity_type,
	source_config=source_config,
	canonical_id=matched_canonical_id,
	strategy=strategy,
	confidence=confidence,
	review_status=review_status,
	parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
	family_key=fam["family_key"],
	category=fam["category"],
	composite_keys=fam["composite_keys"],
	)

	# Other non-model types (metric, harness, org): only
	# parent_canonical_id and review_status are meaningful
	return ResolutionResult(
	raw_value=raw_value,
	entity_type=entity_type,
	source_config=source_config,
	canonical_id=matched_canonical_id,
	strategy=strategy,
	confidence=confidence,
	review_status=review_status,
	parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
	)