Spaces:

evaleval
/

entity-registry

Sleeping

File size: 8,827 Bytes

"""The bare resolver. Matches a raw value to a canonical id via the
strategy chain (exact → normalized → fuzzy → no_match), and — when
given a `CanonicalStore` — enriches the result with the matched
canonical's metadata, parent edges, model-specific lineage fields,
and quantized-chain root collapse.

The enrichment matches the HTTP API's response shape exactly. Callers
using the resolver standalone get the same `ResolutionResult` they'd
get back from `POST /api/v1/resolve`."""
from __future__ import annotations

from pathlib import Path
from typing import Optional

from eval_entity_resolver.alias_store import AliasStore
from eval_entity_resolver.canonical_store import CanonicalStore
from eval_entity_resolver.models import ResolutionResult, ResolverConfig
from eval_entity_resolver.strategies.exact import exact_match
from eval_entity_resolver.strategies.normalized import normalized_match
from eval_entity_resolver.strategies.fuzzy import fuzzy_match


class Resolver:
    """Map raw values onto canonical ids, optionally enriching each hit.

    Matching walks the strategy chain in order — exact, normalized,
    fuzzy — and falls back to a `no_match` result when nothing hits.
    A hit is enriched with canonical metadata only when a
    `CanonicalStore` was supplied at construction time.
    """

    def __init__(
        self,
        store: AliasStore,
        config: Optional[ResolverConfig] = None,
        canonical_store: Optional[CanonicalStore] = None,
    ) -> None:
        """Wire up the stores and configuration.

        `store` is mandatory: it is handed to every matching strategy.
        `config` falls back to a default `ResolverConfig()` when omitted
        (or otherwise falsy). `canonical_store` is optional; with it,
        results carry parent / lineage / metadata fields, and without it
        only canonical_id, strategy and confidence are filled in.
        """
        self.store = store
        self.config = config if config else ResolverConfig()
        self.canonical_store = canonical_store

    @classmethod
    def from_parquet(
        cls,
        path: str | Path,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a parquet directory.

        Both the alias store and the canonical store are loaded from the
        same `path` (e.g. `./fixtures/`), so results have the same shape
        as the HTTP API's responses.
        """
        aliases = AliasStore.from_parquet(path)
        canonicals = CanonicalStore.from_parquet(path)
        return cls(aliases, config=config, canonical_store=canonicals)

    @classmethod
    def from_hf(
        cls,
        repo_id: str,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a HF Dataset repo.

        Both the alias store and the canonical store are loaded from
        `repo_id`.
        """
        aliases = AliasStore.from_hf(repo_id)
        canonicals = CanonicalStore.from_hf(repo_id)
        return cls(aliases, config=config, canonical_store=canonicals)

    def resolve(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str] = None,
    ) -> ResolutionResult:
        """Run the strategy chain for `raw_value` and return the outcome.

        The first strategy that yields a canonical id wins and its hit
        is passed through `_enrich`. If none does, a plain `no_match`
        result with `canonical_id=None` and confidence 0.0 is returned.
        """

        def matched(canonical_id: str, strategy: str, confidence: float) -> ResolutionResult:
            # Every hit shares the same request fields; only the match differs.
            return self._enrich(
                raw_value, entity_type, source_config,
                canonical_id, strategy, confidence,
            )

        # Step 1 — exact alias hit, always reported with full confidence.
        exact_id = exact_match(raw_value, entity_type, source_config, self.store)
        if exact_id is not None:
            return matched(exact_id, "exact", 1.0)

        # Step 2 — normalized hit. Its confidence is fixed at 0.95, so the
        # strategy is skipped outright when the threshold is above that.
        normalized_confidence = 0.95
        if normalized_confidence >= self.config.threshold:
            normalized_id = normalized_match(raw_value, entity_type, self.store, source_config)
            if normalized_id is not None:
                return matched(normalized_id, "normalized", normalized_confidence)

        # Step 3 — fuzzy hit; the strategy receives the threshold itself.
        fuzzy_id, fuzzy_confidence = fuzzy_match(
            raw_value, entity_type, self.config.threshold, self.store, source_config
        )
        if fuzzy_id is not None:
            return matched(fuzzy_id, "fuzzy", fuzzy_confidence)

        # Step 4 — nothing matched.
        return ResolutionResult(
            raw_value=raw_value,
            entity_type=entity_type,
            source_config=source_config,
            canonical_id=None,
            strategy="no_match",
            confidence=0.0,
        )

    # ------------------------------------------------------------------
    # Enrichment (no-op when no canonical_store is attached)
    # ------------------------------------------------------------------

    def build_result(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Enrich a canonical_id the caller has already determined.

        Intended for callers that skip the strategy chain (e.g. an
        alias-table cache hit, an auto-created draft) yet want the same
        rich response shape. It delegates straight to the enrichment
        step that `resolve()` uses.
        """
        return self._enrich(
            raw_value, entity_type, source_config,
            canonical_id, strategy, confidence,
        )

    def _enrich(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        matched_canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Turn a match into a `ResolutionResult`, adding rich fields.

        With no canonical_store attached, the result carries only the
        basic match info and the rich fields keep their defaults.
        Otherwise the matched canonical's row is looked up and the
        fields relevant to `entity_type` are populated.
        """
        # Fields that are identical in every result built below.
        base = {
            "raw_value": raw_value,
            "entity_type": entity_type,
            "source_config": source_config,
            "strategy": strategy,
            "confidence": confidence,
        }

        canonicals = self.canonical_store
        if canonicals is None:
            return ResolutionResult(canonical_id=matched_canonical_id, **base)

        entity = canonicals.lookup(entity_type, matched_canonical_id)
        if entity:
            review_status = entity.get("review_status")
        else:
            review_status = None

        if entity_type == "model":
            fields = canonicals.model_metadata_fields(matched_canonical_id, entity)
            response_id = fields["canonical_id"]
            # When the response collapses onto a different canonical (the
            # root), report THAT canonical's review_status so the response
            # stays internally consistent; keep the matched one as fallback.
            if response_id != matched_canonical_id:
                root = canonicals.lookup("model", response_id)
                if root:
                    review_status = root.get("review_status") or review_status
            passthrough_keys = (
                "resolved_leaf_id",
                "root_model_id",
                "lineage_origin_org_id",
                "parents",
                "open_weights",
                "release_date",
                "params_billions",
            )
            return ResolutionResult(
                **base,
                canonical_id=response_id,
                review_status=review_status,
                # The parent edge is taken from the matched row, not the root.
                parent_canonical_id=canonicals.parent_canonical_id("model", entity),
                **{key: fields[key] for key in passthrough_keys},
            )

        # Benchmark: hierarchy-alignment fields (family_key, category)
        # come from walking canonical_families. composite_keys stays
        # empty here — see CanonicalStore.benchmark_family_enrichment for
        # why composite computation belongs in the producer.
        if entity_type == "benchmark":
            family = canonicals.benchmark_family_enrichment(matched_canonical_id)
            return ResolutionResult(
                **base,
                canonical_id=matched_canonical_id,
                review_status=review_status,
                parent_canonical_id=canonicals.parent_canonical_id(entity_type, entity),
                family_key=family["family_key"],
                category=family["category"],
                composite_keys=family["composite_keys"],
            )

        # Remaining non-model types (metric, harness, org): only
        # parent_canonical_id and review_status are meaningful.
        return ResolutionResult(
            **base,
            canonical_id=matched_canonical_id,
            review_status=review_status,
            parent_canonical_id=canonicals.parent_canonical_id(entity_type, entity),
        )