File size: 8,827 Bytes
315cbe0
 
 
 
 
 
 
 
 
 
 
 
a969e99
 
 
315cbe0
a969e99
 
 
 
 
 
 
315cbe0
 
 
 
 
 
 
 
 
 
 
a969e99
 
315cbe0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a969e99
 
 
 
 
 
 
 
 
 
315cbe0
a969e99
 
 
 
 
 
315cbe0
 
 
a969e99
 
 
 
 
 
 
315cbe0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6d5cd41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315cbe0
 
 
 
 
 
 
 
 
 
 
 
 
a969e99
 
 
 
315cbe0
 
a969e99
 
 
315cbe0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266f01b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a969e99
 
 
 
315cbe0
 
 
 
 
a969e99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""The bare resolver. Matches a raw value to a canonical id via the
strategy chain (exact → normalized → fuzzy → no_match), and — when
given a `CanonicalStore` — enriches the result with the matched
canonical's metadata, parent edges, model-specific lineage fields,
and quantized-chain root collapse.

The enrichment matches the HTTP API's response shape exactly. Callers
using the resolver standalone get the same `ResolutionResult` they'd
get back from `POST /api/v1/resolve`."""
from __future__ import annotations

from pathlib import Path
from typing import Optional

from eval_entity_resolver.alias_store import AliasStore
from eval_entity_resolver.canonical_store import CanonicalStore
from eval_entity_resolver.models import ResolutionResult, ResolverConfig
from eval_entity_resolver.strategies.exact import exact_match
from eval_entity_resolver.strategies.normalized import normalized_match
from eval_entity_resolver.strategies.fuzzy import fuzzy_match


class Resolver:
    """Resolve raw values to canonical ids through a chain of strategies.

    `resolve()` tries exact, then normalized, then fuzzy matching against
    the alias store and reports "no_match" when all of them miss. When a
    `CanonicalStore` is attached, every hit is additionally enriched with
    the matched canonical's review status, parent edge and
    entity-type-specific fields.
    """

    # Confidence reported for every hit of the "normalized" strategy.
    # Kept on the class (rather than re-bound on each `resolve()` call)
    # so there is a single definition that subclasses may override.
    _NORMALIZED_CONFIDENCE = 0.95

    def __init__(
        self,
        store: AliasStore,
        config: Optional[ResolverConfig] = None,
        canonical_store: Optional[CanonicalStore] = None,
    ) -> None:
        """Create a resolver.

        Args:
            store: Alias store used by every match strategy. Required,
                since alias matching is the resolver's core job.
            config: Resolver settings; a default `ResolverConfig()` is
                created when this is omitted (or otherwise falsy).
            canonical_store: Optional. When provided, results are
                enriched with parent / lineage / metadata fields.
                Without it only the basic match fields (canonical_id,
                strategy, confidence) are populated.
        """
        self.store = store
        self.config = config or ResolverConfig()
        self.canonical_store = canonical_store

    @classmethod
    def from_parquet(
        cls,
        path: str | Path,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a parquet directory.

        Both the alias store and the canonical store are loaded from
        `path` (e.g. `./fixtures/`). This is the recommended convenience
        for callers who want the same response shape as the HTTP API.
        """
        return cls(
            AliasStore.from_parquet(path),
            config=config,
            canonical_store=CanonicalStore.from_parquet(path),
        )

    @classmethod
    def from_hf(
        cls,
        repo_id: str,
        config: Optional[ResolverConfig] = None,
    ) -> "Resolver":
        """Build a fully-enriching resolver from a HF Dataset repo.

        Both the alias store and the canonical store are loaded from
        `repo_id`.
        """
        return cls(
            AliasStore.from_hf(repo_id),
            config=config,
            canonical_store=CanonicalStore.from_hf(repo_id),
        )

    def resolve(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str] = None,
    ) -> ResolutionResult:
        """Run the strategy chain and return the first hit.

        Strategies are tried in order: exact (confidence 1.0), normalized
        (fixed confidence, see `_NORMALIZED_CONFIDENCE`), fuzzy
        (confidence supplied by `fuzzy_match`). If none of them yields a
        canonical id, a result with `canonical_id=None`,
        `strategy="no_match"` and `confidence=0.0` is returned.

        Args:
            raw_value: The value to resolve.
            entity_type: Entity type the value belongs to.
            source_config: Optional source configuration forwarded to
                each strategy.

        Returns:
            A `ResolutionResult`; enriched through `_enrich` on a hit.
        """
        # 1. Exact
        # NOTE(review): exact_match is called with (..., source_config,
        # store) while normalized_match / fuzzy_match take the store
        # before source_config - confirm against the strategy signatures.
        canonical_id = exact_match(raw_value, entity_type, source_config, self.store)
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config, canonical_id, "exact", 1.0
            )

        # 2. Normalized - the strategy has a fixed confidence, so it is
        # only attempted when that confidence clears the threshold.
        normalized_confidence = self._NORMALIZED_CONFIDENCE
        if normalized_confidence >= self.config.threshold:
            canonical_id = normalized_match(
                raw_value, entity_type, self.store, source_config
            )
            if canonical_id is not None:
                return self._enrich(
                    raw_value, entity_type, source_config,
                    canonical_id, "normalized", normalized_confidence,
                )

        # 3. Fuzzy
        canonical_id, confidence = fuzzy_match(
            raw_value, entity_type, self.config.threshold, self.store, source_config
        )
        if canonical_id is not None:
            return self._enrich(
                raw_value, entity_type, source_config,
                canonical_id, "fuzzy", confidence,
            )

        # 4. No match
        return ResolutionResult(
            raw_value=raw_value,
            entity_type=entity_type,
            source_config=source_config,
            canonical_id=None,
            strategy="no_match",
            confidence=0.0,
        )

    # ------------------------------------------------------------------
    # Enrichment (no-op when no canonical_store is attached)
    # ------------------------------------------------------------------

    def build_result(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Build an enriched `ResolutionResult` for a known canonical_id.

        Useful for callers that bypass the strategy chain (e.g. an
        alias-table cache hit, an auto-created draft) but want the same
        rich response shape. Delegates to the same enrichment that
        `resolve()` uses.
        """
        return self._enrich(
            raw_value, entity_type, source_config, canonical_id, strategy, confidence
        )

    def _enrich(
        self,
        raw_value: str,
        entity_type: str,
        source_config: Optional[str],
        matched_canonical_id: str,
        strategy: str,
        confidence: float,
    ) -> ResolutionResult:
        """Look up the matched canonical and populate the rich fields.

        When no canonical_store is attached, the rich fields are left at
        their defaults and the result carries just the basic match info.
        Otherwise the outcome depends on `entity_type`:

        * "model": canonical_id and lineage / metadata fields come from
          `CanonicalStore.model_metadata_fields`.
        * "benchmark": family_key, category and composite_keys come from
          `CanonicalStore.benchmark_family_enrichment`.
        * anything else: only review_status and parent_canonical_id are
          added.
        """
        # Fields every result shares, whatever the branch taken below.
        common = dict(
            raw_value=raw_value,
            entity_type=entity_type,
            source_config=source_config,
            strategy=strategy,
            confidence=confidence,
        )

        cs = self.canonical_store
        if cs is None:
            return ResolutionResult(canonical_id=matched_canonical_id, **common)

        matched_entity = cs.lookup(entity_type, matched_canonical_id)
        review_status = matched_entity.get("review_status") if matched_entity else None

        if entity_type == "model":
            fields = cs.model_metadata_fields(matched_canonical_id, matched_entity)
            # If the response collapses to a different canonical (root),
            # surface THAT canonical's review_status - keeps the response
            # internally consistent.
            if fields["canonical_id"] != matched_canonical_id:
                root_entity = cs.lookup("model", fields["canonical_id"])
                if root_entity:
                    review_status = root_entity.get("review_status") or review_status
            return ResolutionResult(
                canonical_id=fields["canonical_id"],
                review_status=review_status,
                parent_canonical_id=cs.parent_canonical_id("model", matched_entity),
                resolved_leaf_id=fields["resolved_leaf_id"],
                root_model_id=fields["root_model_id"],
                lineage_origin_org_id=fields["lineage_origin_org_id"],
                parents=fields["parents"],
                open_weights=fields["open_weights"],
                release_date=fields["release_date"],
                params_billions=fields["params_billions"],
                **common,
            )

        # Benchmark: fill in hierarchy-alignment fields (family_key,
        # category) by walking canonical_families. composite_keys stays
        # empty here - see CanonicalStore.benchmark_family_enrichment for
        # why composite computation belongs in the producer.
        if entity_type == "benchmark":
            fam = cs.benchmark_family_enrichment(matched_canonical_id)
            return ResolutionResult(
                canonical_id=matched_canonical_id,
                review_status=review_status,
                parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
                family_key=fam["family_key"],
                category=fam["category"],
                composite_keys=fam["composite_keys"],
                **common,
            )

        # Other non-model types (metric, harness, org): only
        # parent_canonical_id and review_status are meaningful.
        return ResolutionResult(
            canonical_id=matched_canonical_id,
            review_status=review_status,
            parent_canonical_id=cs.parent_canonical_id(entity_type, matched_entity),
            **common,
        )