Spaces:

evaleval
/

entity-registry

Sleeping

App Files Files Community

j-chim committed 25 days ago

Commit

4c87a8b

verified ·

1 Parent(s): 94d49c0

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

packages/eval-entity-resolver/src/eval_entity_resolver/strategies/fuzzy.py +34 -23
src/eval_card_registry/services/hub_stats.py +181 -3
src/eval_card_registry/services/resolution_service.py +145 -29

packages/eval-entity-resolver/src/eval_entity_resolver/strategies/fuzzy.py CHANGED Viewed

@@ -108,9 +108,15 @@ _STRIP_SUFFIX_PATTERNS: list[re.Pattern[str]] = [
     # exist; only when they don't does this strip's drop-thinking behavior
     # take over.
     re.compile(r"-thinking-\d+k$", re.IGNORECASE),
-    # Date version suffix (YYYYMMDD): "-20251101", "-20240315"
-    # Only strip dates (8 consecutive digits) to avoid touching version numbers.
-    re.compile(r"-\d{8}$"),
 ]
 # Strip just the `-Nk` budget tail, leaving `-thinking` intact. Used by
@@ -382,18 +388,28 @@ _ISO_DATE_YEAR_RE = re.compile(r"^(.+)-(\d{4})$")
 def _strip_openai_iso_date(value: str) -> list[str]:
-    """For OpenAI-shaped values ending in an ISO-format date, return a
-    list of progressively-truncated candidates (day → month → year → bare).
-    Each candidate gets looked up by the caller; the first hit wins.
-    Lookup is verifying — if no truncated form is aliased in the registry,
-    nothing changes (no false matches manufactured by the strip itself).
-    Examples:
-        openai/gpt-5-2025-08-07 → [openai/gpt-5-2025-08, openai/gpt-5-2025, openai/gpt-5]
-        openai/o3-mini-2025-01-31 → [openai/o3-mini-2025-01, openai/o3-mini-2025, openai/o3-mini]
-        openai/gpt-4o-mini-2024 → [openai/gpt-4o-mini]
-        meta/llama-3-2024-04-18 → []   (not OpenAI-shaped)
     """
     if not _is_openai_shaped(value):
         return []
@@ -415,7 +431,6 @@ def _strip_openai_iso_date(value: str) -> list[str]:
         if _is_release_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
             candidates.append(f"{prefix}-{y}-{mo}")
             candidates.append(f"{prefix}-{y}")
-            candidates.append(prefix)
             return candidates
     m = _ISO_DATE_MONTH_RE.match(value)
@@ -423,15 +438,11 @@ def _strip_openai_iso_date(value: str) -> list[str]:
         prefix, y, mo = m.groups()
         if _is_release_year(y) and 1 <= int(mo) <= 12:
             candidates.append(f"{prefix}-{y}")
-            candidates.append(prefix)
             return candidates
-    m = _ISO_DATE_YEAR_RE.match(value)
-    if m:
-        prefix, y = m.groups()
-        if _is_release_year(y):
-            candidates.append(prefix)
     return candidates

     # exist; only when they don't does this strip's drop-thinking behavior
     # take over.
     re.compile(r"-thinking-\d+k$", re.IGNORECASE),
+    # NB: trailing 8-digit date suffix (`-20251101`) is NOT stripped here.
+    # Stripping a packed YYYYMMDD ALWAYS produces the bare-family form,
+    # which silently aliases dated snapshots into their family pointer
+    # and loses the snapshot's `release_date`. The auto-create +
+    # hub-stats path produces a properly-linked snapshot canonical
+    # instead. See `infer_family_parent_edge` in
+    # services/hub_stats.py for the family-version edge inference.
+    # When a snapshot canonical is already aliased (exact / normalized
+    # match wins before fuzzy), the resolver returns it directly.
 ]
 # Strip just the `-Nk` budget tail, leaving `-thinking` intact. Used by
 def _strip_openai_iso_date(value: str) -> list[str]:
+    """For OpenAI-shaped values ending in an ISO-format date, return
+    progressively-truncated candidates that STILL retain at least one
+    date component. The bare-family candidate (everything stripped) is
+    intentionally omitted: collapsing a dated snapshot all the way to
+    its family pointer drops the per-snapshot identity and silently
+    loses the snapshot's `release_date`. The auto-create + hub-stats
+    path is the right home for that case — it creates a snapshot
+    canonical with a `variant axis=version` parent edge to the family.
+    When an INTERMEDIATE snapshot canonical is aliased in the registry
+    (e.g. `openai/gpt-5-2025-08`), this function still returns it as a
+    candidate so a more-specific raw value (`openai/gpt-5-2025-08-07`)
+    can resolve to the existing snapshot rather than auto-creating a
+    duplicate.
+    Examples (registry contents shape what hits — this just emits the
+    candidates that are tried in order):
+        openai/gpt-5-2025-08-07 → [openai/gpt-5-2025-08, openai/gpt-5-2025]
+        openai/o3-mini-2025-01-31 → [openai/o3-mini-2025-01, openai/o3-mini-2025]
+        openai/gpt-4o-mini-2024 → []       (year-only has no intermediate;
+                                            handled via auto-create path)
+        meta/llama-3-2024-04-18 → []       (not OpenAI-shaped)
     """
     if not _is_openai_shaped(value):
         return []
         if _is_release_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
             candidates.append(f"{prefix}-{y}-{mo}")
             candidates.append(f"{prefix}-{y}")
             return candidates
     m = _ISO_DATE_MONTH_RE.match(value)
         prefix, y, mo = m.groups()
         if _is_release_year(y) and 1 <= int(mo) <= 12:
             candidates.append(f"{prefix}-{y}")
             return candidates
+    # Year-only case (`-YYYY`) intentionally produces no candidates: the
+    # only possible peel is to bare family, which the auto-create path
+    # owns. Returning empty falls through to no_match cleanly.
     return candidates

src/eval_card_registry/services/hub_stats.py CHANGED Viewed

@@ -151,6 +151,158 @@ def filter_useful_tags(raw_tags) -> list[str]:
     return sorted(set(keep))
 def extract_base_models(base_models) -> list[dict]:
     """Decode the `baseModels` struct into a list of typed parent edges.
     Returns `[{id, relationship}, ...]` — caller resolves each id to our
@@ -295,14 +447,21 @@ class HubStatsClient:
         try:
             con = self._ensure_con()
             use_local = self._ensure_local_table(con)
-            escaped = hf_id.replace("'", "''")
             if use_local:
-                sql = f"SELECT * FROM hub_stats WHERE id = '{escaped}' LIMIT 1"
             else:
                 sql = (
                     f"SELECT {QUERY_COLUMNS} "
                     f"FROM read_parquet('{self.parquet_url}') "
-                    f"WHERE id = '{escaped}' LIMIT 1"
                 )
             cursor = con.execute(sql)
             cols = [d[0] for d in cursor.description]
@@ -330,6 +489,7 @@ def enrich_draft_from_row(
     row: dict,
     aliases_to_canonical: dict[str, str],
     org_alias_map: dict[str, str],
 ) -> dict:
     """Convert one hub-stats row into a partial canonical_models dict
     suitable for merging into an auto-created draft. Computes:
@@ -383,6 +543,24 @@ def enrich_draft_from_row(
             if lineage_origin_org_id is None and edge["relationship"] != "variant":
                 if "/" in parent_canonical:
                     lineage_origin_org_id = parent_canonical.split("/", 1)[0]
     if parents:
         out["parents"] = json.dumps(parents)
     if lineage_origin_org_id:

     return sorted(set(keep))
+# ---------------------------------------------------------------------------
+# Family-version parent inference
+# ---------------------------------------------------------------------------
+#
+# Hub-stats `baseModels` records *upstream* lineage (finetune / quantized /
+# merge / adapter), never the family-version relationship between a dated
+# snapshot and its moving pointer canonical (`Olmo-3-1125-32B` ↔ our
+# `allenai/olmo-3-32b`). The pointer isn't an HF id — it only exists in our
+# registry — so HF can't surface that edge. Without inference here, dated
+# snapshots auto-create as orphaned canonicals: `release_date` lands fine
+# but `parents`/`root_model_id` stay empty, root-collapse never fires, and
+# the snapshot shows up as a separate model in consumers.
+_INTERNAL_DATE_RE = re.compile(r"^(.+?)-(\d{4})-([^-].*)$")
+_TRAILING_4DIGIT_RE = re.compile(r"^(.+)-(\d{4})$")
+_TRAILING_6DIGIT_RE = re.compile(r"^(.+)-(\d{6})$")
+_TRAILING_8DIGIT_RE = re.compile(r"^(.+)-(\d{8})$")
+# ISO date patterns (anchored, full-string). Strict component widths
+# stop us from peeling tokens that aren't dates (a 5-digit numeric tail
+# won't match `\d{4}-\d{2}`).
+_ISO_FULL_DATE_RE = re.compile(r"^(.+)-(\d{4})-(\d{2})-(\d{2})$")
+_ISO_MONTH_DATE_RE = re.compile(r"^(.+)-(\d{4})-(\d{2})$")
+_ISO_YEAR_DATE_RE = re.compile(r"^(.+)-(\d{4})$")
+def _looks_like_mmdd(token: str) -> bool:
+    """4-digit MMDD where MM ∈ [01,12] and DD ∈ [01,31]. Used to gate
+    snapshot-token stripping on shapes that actually look like dates,
+    avoiding false-positives on numeric size/version tokens like `8000`."""
+    if len(token) != 4 or not token.isdigit():
+        return False
+    mm, dd = int(token[:2]), int(token[2:])
+    return 1 <= mm <= 12 and 1 <= dd <= 31
+def _looks_like_yyyymm(token: str) -> bool:
+    """6-digit YYYYMM (year+month). Stepfun and several Chinese-lab
+    release tags use this convention, e.g. `step-2-16k-202411`."""
+    if len(token) != 6 or not token.isdigit():
+        return False
+    yyyy, mm = int(token[:4]), int(token[4:])
+    return 2015 <= yyyy <= 2035 and 1 <= mm <= 12
+def _looks_like_yyyymmdd(token: str) -> bool:
+    if len(token) != 8 or not token.isdigit():
+        return False
+    yyyy, mm, dd = int(token[:4]), int(token[4:6]), int(token[6:])
+    return 2015 <= yyyy <= 2035 and 1 <= mm <= 12 and 1 <= dd <= 31
+def _looks_like_release_year(token: str) -> bool:
+    if len(token) != 4 or not token.isdigit():
+        return False
+    return 2015 <= int(token) <= 2035
def infer_family_parent_edge(
    hf_id: str,
    aliases_to_canonical: dict[str, str],
    target_canonical: Optional[str] = None,
) -> Optional[dict]:
    """Infer a version-axis parent edge for a date-stamped snapshot id.

    Builds an ordered list of "date removed" forms of `hf_id`, looks
    each one up (after `normalize`) in `aliases_to_canonical`, and
    returns `{"id": <canonical>, "relationship": "variant",
    "axis": "version"}` for the first form that resolves. Returns None
    when the id has no recognised date shape or none of the stripped
    forms is present in the alias index — stripping alone never
    produces an edge.

    Stripped forms are tried in this order (one strip per form; nothing
    here composes with mode/quant suffix stripping):
      1. internal MMDD token:  `Olmo-3-1125-32B` → `Olmo-3-32B`
      2. trailing ISO date, peeled one component at a time:
         `gpt-5-2025-08-07` → `gpt-5-2025-08` → `gpt-5-2025` → `gpt-5`
      3. trailing YYYYMMDD:    `claude-haiku-4-5-20251001` → `claude-haiku-4-5`
      4. trailing YYYYMM:      `step-2-16k-202411` → `step-2-16k`
      5. trailing MMDD:        `kimi-k2-0905` → `kimi-k2`

    Args:
        hf_id: The raw id to inspect.
        aliases_to_canonical: Maps normalized alias strings to canonical
            ids.
        target_canonical: Canonical id the edge would be attached to.
            When given, a stripped form that resolves to this same id is
            skipped so the result is never a self-edge.

    Returns:
        The edge dict, or None when nothing resolves.
    """
    stripped_forms: list[str] = []

    # 1. Internal MMDD token (only the first 4-digit internal token is
    #    examined, since the regex prefix is lazy).
    internal = _INTERNAL_DATE_RE.match(hf_id)
    if internal is not None and _looks_like_mmdd(internal.group(2)):
        stripped_forms.append(f"{internal.group(1)}-{internal.group(3)}")

    # 2. ISO ladder. A shorter shape is consulted only when the longer
    #    regex did not match at all; a longer match that fails range
    #    validation contributes nothing and does not fall through.
    iso_full = _ISO_FULL_DATE_RE.match(hf_id)
    iso_month = _ISO_MONTH_DATE_RE.match(hf_id) if iso_full is None else None
    iso_year = (
        _ISO_YEAR_DATE_RE.match(hf_id)
        if iso_full is None and iso_month is None
        else None
    )
    if iso_full is not None:
        stem, year, month, day = iso_full.groups()
        if (
            _looks_like_release_year(year)
            and 1 <= int(month) <= 12
            and 1 <= int(day) <= 31
        ):
            stripped_forms += [f"{stem}-{year}-{month}", f"{stem}-{year}", stem]
    elif iso_month is not None:
        stem, year, month = iso_month.groups()
        if _looks_like_release_year(year) and 1 <= int(month) <= 12:
            stripped_forms += [f"{stem}-{year}", stem]
    elif iso_year is not None:
        stem, year = iso_year.groups()
        if _looks_like_release_year(year):
            stripped_forms.append(stem)

    # 3-5. Packed numeric tails, widest first.
    for tail_re, is_date_token in (
        (_TRAILING_8DIGIT_RE, _looks_like_yyyymmdd),
        (_TRAILING_6DIGIT_RE, _looks_like_yyyymm),
        (_TRAILING_4DIGIT_RE, _looks_like_mmdd),
    ):
        tail = tail_re.match(hf_id)
        if tail is not None and is_date_token(tail.group(2)):
            stripped_forms.append(tail.group(1))

    # First stripped form that resolves to a canonical other than the
    # edge's own target wins.
    for form in stripped_forms:
        resolved = aliases_to_canonical.get(normalize(form))
        if resolved and resolved != target_canonical:
            return {"id": resolved, "relationship": "variant", "axis": "version"}
    return None
 def extract_base_models(base_models) -> list[dict]:
     """Decode the `baseModels` struct into a list of typed parent edges.
     Returns `[{id, relationship}, ...]` — caller resolves each id to our
         try:
             con = self._ensure_con()
             use_local = self._ensure_local_table(con)
+            # Case-insensitive match — HF stores ids with the upstream
+            # author's original casing (`allenai/Olmo-3-1125-32B`); EEE
+            # surfaces values in mixed conventions (some leaderboards
+            # lowercase, some preserve). An exact-case `=` filter
+            # silently misses any casing mismatch and the draft lands
+            # without enrichment metadata. LOWER() forces a match
+            # regardless of the surface form.
+            escaped = hf_id.lower().replace("'", "''")
             if use_local:
+                sql = f"SELECT * FROM hub_stats WHERE LOWER(id) = '{escaped}' LIMIT 1"
             else:
                 sql = (
                     f"SELECT {QUERY_COLUMNS} "
                     f"FROM read_parquet('{self.parquet_url}') "
+                    f"WHERE LOWER(id) = '{escaped}' LIMIT 1"
                 )
             cursor = con.execute(sql)
             cols = [d[0] for d in cursor.description]
     row: dict,
     aliases_to_canonical: dict[str, str],
     org_alias_map: dict[str, str],
+    target_canonical: Optional[str] = None,
 ) -> dict:
     """Convert one hub-stats row into a partial canonical_models dict
     suitable for merging into an auto-created draft. Computes:
             if lineage_origin_org_id is None and edge["relationship"] != "variant":
                 if "/" in parent_canonical:
                     lineage_origin_org_id = parent_canonical.split("/", 1)[0]
+    # Family-version inference: hub-stats `baseModels` only records
+    # upstream-lineage edges (finetune/quantized/merge/adapter), never
+    # the dated-snapshot ↔ moving-pointer relationship that lives only
+    # in our registry. Without this, snapshots like `Olmo-3-1125-32B`
+    # auto-create as orphan canonicals — release_date lands but parents
+    # stays empty and root-collapse never fires.
+    hf_id = row.get("id")
+    if isinstance(hf_id, str) and not any(
+        p.get("relationship") == "variant" and p.get("axis") == "version"
+        for p in parents
+    ):
+        version_edge = infer_family_parent_edge(
+            hf_id, aliases_to_canonical, target_canonical=target_canonical,
+        )
+        if version_edge is not None:
+            parents.append(version_edge)
     if parents:
         out["parents"] = json.dumps(parents)
     if lineage_origin_org_id:

src/eval_card_registry/services/resolution_service.py CHANGED Viewed

@@ -9,8 +9,11 @@ Responsibilities:
 """
 from __future__ import annotations
 import re
 import uuid
 from datetime import datetime, timezone
 from typing import Optional
@@ -49,6 +52,32 @@ def _now() -> str:
     return datetime.now(timezone.utc).isoformat()
 def _build_alias_store(registry_store: RegistryStore) -> AliasStore:
     """Build an AliasStore from the registry's in-memory aliases table."""
     aliases_df = registry_store.table("aliases")
@@ -60,13 +89,16 @@ def _build_canonical_store(registry_store: RegistryStore) -> CanonicalStore:
     tables. Lets the bare resolver enrich its results with the same
     metadata fields the HTTP API exposes — including benchmark
     `family_key` / `category` (which need families_df + composites_df
-    to populate; otherwise they fall back to the benchmark's own id)."""
     return CanonicalStore(
-        models_df=registry_store.table("canonical_models"),
-        benchmarks_df=registry_store.table("canonical_benchmarks"),
-        metrics_df=registry_store.table("canonical_metrics"),
-        harnesses_df=registry_store.table("eval_harnesses"),
-        orgs_df=registry_store.table("canonical_orgs") if registry_store.has_table("canonical_orgs") else None,
         families_df=registry_store.table("canonical_families") if registry_store.has_table("canonical_families") else None,
         composites_df=registry_store.table("canonical_composites") if registry_store.has_table("canonical_composites") else None,
     )
@@ -182,18 +214,36 @@ class ResolutionService:
             return result_dict
         # Check if alias already exists (skip resolver on rerun=False).
-        # Build the enriched response via `Resolver.build_result` so we
-        # preserve the original alias's strategy/confidence (audit trail)
-        # while still surfacing the same canonical-collapse / metadata
-        # fields a fresh resolve would produce.
         if not rerun:
             existing = queries.get_alias(self.store, raw_value, entity_type, source_config)
             if existing:
                 resolver = self._get_resolver()
-                enriched = resolver.build_result(
-                    raw_value, entity_type, source_config,
-                    existing["canonical_id"], existing["strategy"], existing["confidence"],
-                )
                 result_dict = _result_to_dict(enriched, created_new=False)
                 self._resolve_cache[cache_key] = result_dict
                 return result_dict
@@ -274,17 +324,33 @@ class ResolutionService:
         if created_new:
             self.invalidate_resolver()
-        # Build the enriched response via the resolver. For auto-drafts
-        # the freshly-created entity sits in the pending-write buffer and
-        # may NOT be visible to the canonical_store's DataFrame snapshot
-        # yet (`_auto_create_entity` writes with `buffered=True`). When
-        # the lookup misses, the resolver returns review_status=None;
-        # we know auto-drafts land at "draft" by definition, so override.
-        resolver = self._get_resolver()
-        enriched = resolver.build_result(
-            raw_value, entity_type, source_config,
-            canonical_id, strategy_used, result.confidence,
-        )
         result_dict = _result_to_dict(enriched, created_new=created_new)
         if created_new and result_dict.get("review_status") is None:
             result_dict["review_status"] = "draft"
@@ -322,7 +388,7 @@ class ResolutionService:
         # — `enrichment` is `{}` on lookup miss or any error.
         enrichment: dict = {}
         if entity_type == "model" and self._looks_like_hf_id(raw_value):
-            enrichment = self._lookup_hub_stats(raw_value) or {}
         if entity_type == "model":
             base.update({
                 "developer": None,
@@ -343,6 +409,16 @@ class ResolutionService:
             for k, v in enrichment.items():
                 if v is not None:
                     base[k] = v
         elif entity_type == "benchmark":
             base.update({"description": None, "dataset_repo": None, "parent_benchmark_id": None, "tags": "[]"})
         elif entity_type == "metric":
@@ -361,6 +437,37 @@ class ResolutionService:
         queries.upsert_entity(self.store, table, base, buffered=True)
         return candidate_id
     @staticmethod
     def _looks_like_hf_id(raw_value: str) -> bool:
         """HF id heuristic: contains a single `/` with non-empty parts on
@@ -372,12 +479,18 @@ class ResolutionService:
         org, name = raw_value.split("/", 1)
         return bool(org.strip()) and bool(name.strip())
-    def _lookup_hub_stats(self, hf_id: str) -> Optional[dict]:
         """Query hub-stats live for `hf_id` and return a partial draft
         dict (release_date, params_billions, parents, lineage_origin_org_id,
         tags, metadata) ready to merge. Returns None on miss or any error.
         Uses the `aliases` table to resolve baseModels parents to our
-        canonical ids, and `canonical_orgs` HF aliases to map authors."""
         if not settings.hub_stats_lookup_enabled:
             return None
         try:
@@ -390,7 +503,10 @@ class ResolutionService:
         from eval_card_registry.services import hub_stats as _hs
         try:
             aliases_to_canonical, org_alias_map = self._build_hub_stats_indices()
-            return _hs.enrich_draft_from_row(row, aliases_to_canonical, org_alias_map)
         except Exception:
             return None

 """
 from __future__ import annotations
+import json
 import re
+import threading
 import uuid
+from dataclasses import replace as _dc_replace
 from datetime import datetime, timezone
 from typing import Optional
     return datetime.now(timezone.utc).isoformat()
+def _table_with_pending(registry_store: RegistryStore, name: str) -> "pd.DataFrame":
+    """Return a table DataFrame with pending-buffer rows appended.
+    `_auto_create_entity` writes drafts with `buffered=True`, so they sit
+    in `store._pending[<table>]` until `flush_pending` runs at the end of
+    a sync. Without overlaying pending here, the resolver's
+    `CanonicalStore` snapshot can't see the just-created row, and
+    `build_result` for an auto-created entity returns null for every
+    metadata field that hub-stats just enriched.
+    Concat is safe because `upsert_entity` enforces id-uniqueness across
+    base + pending (existing rows go to in-place update; only genuinely
+    new ids land in pending), so no duplicate keys end up in the
+    CanonicalStore index.
+    """
+    import pandas as pd
+    base_df = registry_store.table(name) if registry_store.has_table(name) else pd.DataFrame()
+    pending = getattr(registry_store, "_pending", {}).get(name, [])
+    if not pending:
+        return base_df
+    pending_df = pd.DataFrame(pending)
+    if base_df.empty:
+        return pending_df
+    return pd.concat([base_df, pending_df], ignore_index=True)
 def _build_alias_store(registry_store: RegistryStore) -> AliasStore:
     """Build an AliasStore from the registry's in-memory aliases table."""
     aliases_df = registry_store.table("aliases")
     tables. Lets the bare resolver enrich its results with the same
     metadata fields the HTTP API exposes — including benchmark
     `family_key` / `category` (which need families_df + composites_df
+    to populate; otherwise they fall back to the benchmark's own id).
+    Pending-buffer rows are overlaid so the resolver sees auto-created
+    drafts before `flush_pending` runs. See `_table_with_pending`."""
     return CanonicalStore(
+        models_df=_table_with_pending(registry_store, "canonical_models"),
+        benchmarks_df=_table_with_pending(registry_store, "canonical_benchmarks"),
+        metrics_df=_table_with_pending(registry_store, "canonical_metrics"),
+        harnesses_df=_table_with_pending(registry_store, "eval_harnesses"),
+        orgs_df=_table_with_pending(registry_store, "canonical_orgs") if registry_store.has_table("canonical_orgs") else None,
         families_df=registry_store.table("canonical_families") if registry_store.has_table("canonical_families") else None,
         composites_df=registry_store.table("canonical_composites") if registry_store.has_table("canonical_composites") else None,
     )
             return result_dict
         # Check if alias already exists (skip resolver on rerun=False).
+        # Re-run the strategy chain so the response carries the correct
+        # `resolved_leaf_id` — the alias table only stores the
+        # root-collapsed `canonical_id`, so reconstructing the response
+        # via `build_result(root, ...)` would clobber the leaf to the
+        # root (model_metadata_fields can't recover leaf identity from
+        # a root row alone — there's no back-pointer). The strategy
+        # chain re-derives leaf cleanly; perf cost is one alias-index
+        # lookup since exact-match hits in O(1) for already-aliased
+        # values. Audit fields are overlaid from the alias entry so
+        # callers still see the original strategy/confidence.
         if not rerun:
             existing = queries.get_alias(self.store, raw_value, entity_type, source_config)
             if existing:
                 resolver = self._get_resolver()
+                fresh = resolver.resolve(raw_value, entity_type, source_config)
+                if fresh.canonical_id == existing["canonical_id"]:
+                    enriched = _dc_replace(
+                        fresh,
+                        strategy=existing["strategy"],
+                        confidence=existing["confidence"],
+                    )
+                else:
+                    # Rare: registry restructure has moved the canonical
+                    # for this raw_value since the alias was written.
+                    # The alias entry is the source of truth for "what
+                    # this raw resolved to" — accept the leaf clobber.
+                    enriched = resolver.build_result(
+                        raw_value, entity_type, source_config,
+                        existing["canonical_id"], existing["strategy"], existing["confidence"],
+                    )
                 result_dict = _result_to_dict(enriched, created_new=False)
                 self._resolve_cache[cache_key] = result_dict
                 return result_dict
         if created_new:
             self.invalidate_resolver()
+        # Build the enriched response. Two cases:
+        #   1. Match found — the original `result` already carries the
+        #      correct canonical_id (root-collapsed), resolved_leaf_id
+        #      (the matched leaf), parents, and metadata. Don't re-run
+        #      `build_result` here: it would call `model_metadata_fields`
+        #      with the ROOT id, which can't recover the leaf and ends
+        #      up returning resolved_leaf_id = canonical_id. The alias
+        #      write earlier doesn't change canonical_models — `result`
+        #      stays accurate.
+        #   2. Auto-create — `result.canonical_id` was None, the new
+        #      `canonical_id` came from `_auto_create_entity`. The new
+        #      canonical IS the leaf (its parents may point at family
+        #      via the inferred version-axis edge), so `build_result`
+        #      with the new id correctly preserves leaf info via
+        #      `model_metadata_fields`. The `invalidate_resolver()`
+        #      above ensures the canonical_store snapshot sees the new
+        #      row, but the entity may still sit in the pending-write
+        #      buffer; on lookup miss the review_status falls back to
+        #      None and we override to 'draft' below.
+        if created_new:
+            resolver = self._get_resolver()
+            enriched = resolver.build_result(
+                raw_value, entity_type, source_config,
+                canonical_id, strategy_used, result.confidence,
+            )
+        else:
+            enriched = result
         result_dict = _result_to_dict(enriched, created_new=created_new)
         if created_new and result_dict.get("review_status") is None:
             result_dict["review_status"] = "draft"
         # — `enrichment` is `{}` on lookup miss or any error.
         enrichment: dict = {}
         if entity_type == "model" and self._looks_like_hf_id(raw_value):
+            enrichment = self._lookup_hub_stats(raw_value, target_canonical=candidate_id) or {}
         if entity_type == "model":
             base.update({
                 "developer": None,
             for k, v in enrichment.items():
                 if v is not None:
                     base[k] = v
+            # Family-version inference fallback: when hub-stats misses
+            # (parquet stale, lookup disabled, rate-limited, or row
+            # absent), the snapshot still has its shape — try to infer a
+            # version-axis parent from just the alias index. The
+            # inference is alias-lookup-only, so it never manufactures
+            # a false parent. Idempotent with the inference inside
+            # enrich_draft_from_row: only fires when no version-axis
+            # edge is already present.
+            if self._looks_like_hf_id(raw_value):
+                self._maybe_infer_family_parent(base, raw_value, candidate_id)
         elif entity_type == "benchmark":
             base.update({"description": None, "dataset_repo": None, "parent_benchmark_id": None, "tags": "[]"})
         elif entity_type == "metric":
         queries.upsert_entity(self.store, table, base, buffered=True)
         return candidate_id
+    def _maybe_infer_family_parent(
+        self, base: dict, raw_value: str, candidate_id: str,
+    ) -> None:
+        """Mutate `base['parents']` to add a `{variant, axis: version}`
+        edge when the raw value's snapshot shape resolves to an existing
+        family canonical via the alias index. Runs independently of
+        hub-stats so brand-new releases not yet in the parquet still
+        get linked into the lineage graph."""
+        try:
+            existing = json.loads(base.get("parents") or "[]")
+        except (ValueError, TypeError):
+            existing = []
+        if any(
+            p.get("relationship") == "variant" and p.get("axis") == "version"
+            for p in existing
+            if isinstance(p, dict)
+        ):
+            return
+        from eval_card_registry.services.hub_stats import infer_family_parent_edge
+        try:
+            aliases_to_canonical, _ = self._build_hub_stats_indices()
+        except Exception:
+            return
+        edge = infer_family_parent_edge(
+            raw_value, aliases_to_canonical, target_canonical=candidate_id,
+        )
+        if edge is None:
+            return
+        existing.append(edge)
+        base["parents"] = json.dumps(existing)
     @staticmethod
     def _looks_like_hf_id(raw_value: str) -> bool:
         """HF id heuristic: contains a single `/` with non-empty parts on
         org, name = raw_value.split("/", 1)
         return bool(org.strip()) and bool(name.strip())
+    def _lookup_hub_stats(
+        self, hf_id: str, target_canonical: Optional[str] = None,
+    ) -> Optional[dict]:
         """Query hub-stats live for `hf_id` and return a partial draft
         dict (release_date, params_billions, parents, lineage_origin_org_id,
         tags, metadata) ready to merge. Returns None on miss or any error.
         Uses the `aliases` table to resolve baseModels parents to our
+        canonical ids, and `canonical_orgs` HF aliases to map authors.
+        `target_canonical` is the candidate canonical id of the draft
+        being created — passed through to enrich_draft_from_row so the
+        family-version inference can suppress a self-edge."""
         if not settings.hub_stats_lookup_enabled:
             return None
         try:
         from eval_card_registry.services import hub_stats as _hs
         try:
             aliases_to_canonical, org_alias_map = self._build_hub_stats_indices()
+            return _hs.enrich_draft_from_row(
+                row, aliases_to_canonical, org_alias_map,
+                target_canonical=target_canonical,
+            )
         except Exception:
             return None