# Hugging Face Hub page residue (not part of the module):
#   uploader: j-chim
#   commit: "Upload folder using huggingface_hub" (266f01b, verified)
"""The bare resolver. Matches a raw value to a canonical id via the
strategy chain (exact → normalized → fuzzy → no_match), and — when
given a `CanonicalStore` β€” enriches the result with the matched
canonical's metadata, parent edges, model-specific lineage fields,
and quantized-chain root collapse.
The enrichment matches the HTTP API's response shape exactly. Callers
using the resolver standalone get the same `ResolutionResult` they'd
get back from `POST /api/v1/resolve`."""
from __future__ import annotations
from pathlib import Path
from typing import Optional
from eval_entity_resolver.alias_store import AliasStore
from eval_entity_resolver.canonical_store import CanonicalStore
from eval_entity_resolver.models import ResolutionResult, ResolverConfig
from eval_entity_resolver.strategies.exact import exact_match
from eval_entity_resolver.strategies.normalized import normalized_match
from eval_entity_resolver.strategies.fuzzy import fuzzy_match
class Resolver:
    """Match raw values to canonical ids and optionally enrich the result.

    Matching walks a fixed strategy chain: exact, then normalized, then
    fuzzy, and finally ``no_match``. When a ``CanonicalStore`` is attached,
    every successful match is enriched with fields read from that store;
    without one, only the basic match fields are populated.
    """

    # Fixed confidence reported for every normalized-strategy match.
    _NORMALIZED_CONFIDENCE = 0.95

    def __init__(
        self,
        store: AliasStore,
        config: Optional[ResolverConfig] = None,
        canonical_store: Optional[CanonicalStore] = None,
    ) -> None:
        """Create a resolver.

        Args:
            store: Alias store used by every matching strategy. Required,
                because alias matching is the resolver's core job.
            config: Resolver configuration. A default ``ResolverConfig()``
                is used when this is omitted (or otherwise falsy).
            canonical_store: Optional. When provided, results are enriched
                with parent / lineage / metadata fields. Without it, only
                the basic match fields (canonical_id, strategy, confidence)
                are populated.
        """
        self.store = store
        self.config = config or ResolverConfig()
        self.canonical_store = canonical_store

    @classmethod
    def from_parquet(
        cls,
        path: str | Path,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a parquet directory.

        Both the alias store and the canonical store are loaded from the
        same ``path`` (e.g. ``./fixtures/``). This is the recommended
        convenience for callers who want the same response shape as the
        HTTP API.
        """
        return cls(
            AliasStore.from_parquet(path),
            config=config,
            canonical_store=CanonicalStore.from_parquet(path),
        )

    @classmethod
    def from_hf(
        cls,
        repo_id: str,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a HF Dataset repo.

        Both the alias store and the canonical store are loaded from
        ``repo_id``.
        """
        return cls(
            AliasStore.from_hf(repo_id),
            config=config,
            canonical_store=CanonicalStore.from_hf(repo_id),
        )

    def resolve(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str] = None,
    ) -> ResolutionResult:
        """Resolve ``raw_value`` by trying each strategy in order.

        The first strategy that yields a canonical id wins:

        1. exact: reported with confidence 1.0.
        2. normalized: reported with a fixed confidence of 0.95; the
           strategy is skipped entirely when ``config.threshold`` is
           above that value.
        3. fuzzy: ``fuzzy_match`` receives ``config.threshold`` and
           returns the confidence that is reported.

        Returns:
            A ``ResolutionResult``. Hits go through ``_enrich``. When no
            strategy matches, the result has ``canonical_id=None``,
            ``strategy="no_match"`` and ``confidence=0.0``.
        """
        # 1. Exact. Note the argument order: exact_match is called with
        # source_config before the store, the other strategies with the
        # store first.
        canonical_id = exact_match(raw_value, entity_type, source_config, self.store)
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config, canonical_id, "exact", 1.0
            )

        # 2. Normalized. Only attempted when its fixed confidence clears
        # the configured threshold.
        normalized_confidence = self._NORMALIZED_CONFIDENCE
        if normalized_confidence >= self.config.threshold:
            canonical_id = normalized_match(
                raw_value, entity_type, self.store, source_config
            )
            if canonical_id is not None:
                return self._enrich(
                    raw_value, entity_type, source_config,
                    canonical_id, "normalized", normalized_confidence,
                )

        # 3. Fuzzy. The strategy returns (canonical_id, confidence).
        canonical_id, confidence = fuzzy_match(
            raw_value, entity_type, self.config.threshold, self.store, source_config
        )
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config,
                canonical_id, "fuzzy", confidence,
            )

        # 4. No match.
        return ResolutionResult(
            raw_value=raw_value,
            entity_type=entity_type,
            source_config=source_config,
            canonical_id=None,
            strategy="no_match",
            confidence=0.0,
        )

    # ------------------------------------------------------------------
    # Enrichment (no-op when no canonical_store is attached)
    # ------------------------------------------------------------------
    def build_result(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Build an enriched result for a canonical id the caller already has.

        Useful for callers that bypass the strategy chain (e.g. an
        alias-table cache hit, an auto-created draft) but want the same
        rich response shape. This delegates straight to ``_enrich``, so
        the output is identical to the enrichment done inside
        ``resolve()``.
        """
        return self._enrich(
            raw_value, entity_type, source_config, canonical_id, strategy, confidence
        )

    def _enrich(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        matched_canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Look up the matched canonical and populate the rich fields.

        When no canonical store is attached, the rich fields keep their
        defaults and the result carries just the basic match info.
        Otherwise the fields populated depend on ``entity_type``:

        * ``"model"``: lineage / metadata fields from
          ``model_metadata_fields``; the reported ``canonical_id`` is the
          one returned by that call, which may differ from the matched id.
        * ``"benchmark"``: ``family_key``, ``category`` and
          ``composite_keys`` from ``benchmark_family_enrichment``.
        * anything else: only ``review_status`` and
          ``parent_canonical_id``.
        """
        # Fields that are identical in every result this method returns.
        base = {
            "raw_value": raw_value,
            "entity_type": entity_type,
            "source_config": source_config,
            "strategy": strategy,
            "confidence": confidence,
        }

        cs = self.canonical_store
        if cs is None:
            return ResolutionResult(canonical_id=matched_canonical_id, **base)

        matched_entity = cs.lookup(entity_type, matched_canonical_id)
        review_status = matched_entity.get("review_status") if matched_entity else None

        if entity_type == "model":
            fields = cs.model_metadata_fields(matched_canonical_id, matched_entity)
            response_id = fields["canonical_id"]
            # If the response collapses to a different canonical (root),
            # surface THAT canonical's review_status so the response stays
            # internally consistent. Fall back to the matched entity's
            # status when the root has none.
            if response_id != matched_canonical_id:
                root_entity = cs.lookup("model", response_id)
                if root_entity:
                    review_status = root_entity.get("review_status") or review_status
            return ResolutionResult(
                canonical_id=response_id,
                review_status=review_status,
                # The parent edge is taken from the matched entity, not
                # from the collapsed root.
                parent_canonical_id=cs.parent_canonical_id("model", matched_entity),
                resolved_leaf_id=fields["resolved_leaf_id"],
                root_model_id=fields["root_model_id"],
                lineage_origin_org_id=fields["lineage_origin_org_id"],
                parents=fields["parents"],
                open_weights=fields["open_weights"],
                release_date=fields["release_date"],
                params_billions=fields["params_billions"],
                **base,
            )

        # Benchmark: fill in hierarchy-alignment fields (family_key,
        # category) by walking canonical_families. composite_keys stays
        # empty here; see CanonicalStore.benchmark_family_enrichment for
        # why composite computation belongs in the producer.
        if entity_type == "benchmark":
            fam = cs.benchmark_family_enrichment(matched_canonical_id)
            return ResolutionResult(
                canonical_id=matched_canonical_id,
                review_status=review_status,
                parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
                family_key=fam["family_key"],
                category=fam["category"],
                composite_keys=fam["composite_keys"],
                **base,
            )

        # Other non-model types (metric, harness, org): only
        # parent_canonical_id and review_status are meaningful.
        return ResolutionResult(
            canonical_id=matched_canonical_id,
            review_status=review_status,
            parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
            **base,
        )