j-chim committed on
Commit
4c87a8b
·
verified ·
1 Parent(s): 94d49c0

Upload folder using huggingface_hub

Browse files
packages/eval-entity-resolver/src/eval_entity_resolver/strategies/fuzzy.py CHANGED
@@ -108,9 +108,15 @@ _STRIP_SUFFIX_PATTERNS: list[re.Pattern[str]] = [
108
  # exist; only when they don't does this strip's drop-thinking behavior
109
  # take over.
110
  re.compile(r"-thinking-\d+k$", re.IGNORECASE),
111
- # Date version suffix (YYYYMMDD): "-20251101", "-20240315"
112
- # Only strip dates (8 consecutive digits) to avoid touching version numbers.
113
- re.compile(r"-\d{8}$"),
 
 
 
 
 
 
114
  ]
115
 
116
  # Strip just the `-Nk` budget tail, leaving `-thinking` intact. Used by
@@ -382,18 +388,28 @@ _ISO_DATE_YEAR_RE = re.compile(r"^(.+)-(\d{4})$")
382
 
383
 
384
  def _strip_openai_iso_date(value: str) -> list[str]:
385
- """For OpenAI-shaped values ending in an ISO-format date, return a
386
- list of progressively-truncated candidates (day → month → year → bare).
387
-
388
- Each candidate gets looked up by the caller; the first hit wins.
389
- Lookup is verifying — if no truncated form is aliased in the registry,
390
- nothing changes (no false matches manufactured by the strip itself).
391
-
392
- Examples:
393
- openai/gpt-5-2025-08-07 → [openai/gpt-5-2025-08, openai/gpt-5-2025, openai/gpt-5]
394
- openai/o3-mini-2025-01-31 → [openai/o3-mini-2025-01, openai/o3-mini-2025, openai/o3-mini]
395
- openai/gpt-4o-mini-2024 → [openai/gpt-4o-mini]
396
- meta/llama-3-2024-04-18 → [] (not OpenAI-shaped)
 
 
 
 
 
 
 
 
 
 
397
  """
398
  if not _is_openai_shaped(value):
399
  return []
@@ -415,7 +431,6 @@ def _strip_openai_iso_date(value: str) -> list[str]:
415
  if _is_release_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
416
  candidates.append(f"{prefix}-{y}-{mo}")
417
  candidates.append(f"{prefix}-{y}")
418
- candidates.append(prefix)
419
  return candidates
420
 
421
  m = _ISO_DATE_MONTH_RE.match(value)
@@ -423,15 +438,11 @@ def _strip_openai_iso_date(value: str) -> list[str]:
423
  prefix, y, mo = m.groups()
424
  if _is_release_year(y) and 1 <= int(mo) <= 12:
425
  candidates.append(f"{prefix}-{y}")
426
- candidates.append(prefix)
427
  return candidates
428
 
429
- m = _ISO_DATE_YEAR_RE.match(value)
430
- if m:
431
- prefix, y = m.groups()
432
- if _is_release_year(y):
433
- candidates.append(prefix)
434
-
435
  return candidates
436
 
437
 
 
108
  # exist; only when they don't does this strip's drop-thinking behavior
109
  # take over.
110
  re.compile(r"-thinking-\d+k$", re.IGNORECASE),
111
+ # NB: trailing 8-digit date suffix (`-20251101`) is NOT stripped here.
112
+ # Stripping a packed YYYYMMDD ALWAYS produces the bare-family form,
113
+ # which silently aliases dated snapshots into their family pointer
114
+ # and loses the snapshot's `release_date`. The auto-create +
115
+ # hub-stats path produces a properly-linked snapshot canonical
116
+ # instead. See `infer_family_parent_edge` in
117
+ # services/hub_stats.py for the family-version edge inference.
118
+ # When a snapshot canonical is already aliased (exact / normalized
119
+ # match wins before fuzzy), the resolver returns it directly.
120
  ]
121
 
122
  # Strip just the `-Nk` budget tail, leaving `-thinking` intact. Used by
 
388
 
389
 
390
  def _strip_openai_iso_date(value: str) -> list[str]:
391
+ """For OpenAI-shaped values ending in an ISO-format date, return
392
+ progressively-truncated candidates that STILL retain at least one
393
+ date component. The bare-family candidate (everything stripped) is
394
+ intentionally omitted: collapsing a dated snapshot all the way to
395
+ its family pointer drops the per-snapshot identity and silently
396
+ loses the snapshot's `release_date`. The auto-create + hub-stats
397
+ path is the right home for that case — it creates a snapshot
398
+ canonical with a `variant axis=version` parent edge to the family.
399
+
400
+ When an INTERMEDIATE snapshot canonical is aliased in the registry
401
+ (e.g. `openai/gpt-5-2025-08`), this function still returns it as a
402
+ candidate so a more-specific raw value (`openai/gpt-5-2025-08-07`)
403
+ can resolve to the existing snapshot rather than auto-creating a
404
+ duplicate.
405
+
406
+ Examples (registry contents shape what hits — this just emits the
407
+ candidates that are tried in order):
408
+ openai/gpt-5-2025-08-07 → [openai/gpt-5-2025-08, openai/gpt-5-2025]
409
+ openai/o3-mini-2025-01-31 → [openai/o3-mini-2025-01, openai/o3-mini-2025]
410
+ openai/gpt-4o-mini-2024 → [] (year-only has no intermediate;
411
+ handled via auto-create path)
412
+ meta/llama-3-2024-04-18 → [] (not OpenAI-shaped)
413
  """
414
  if not _is_openai_shaped(value):
415
  return []
 
431
  if _is_release_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
432
  candidates.append(f"{prefix}-{y}-{mo}")
433
  candidates.append(f"{prefix}-{y}")
 
434
  return candidates
435
 
436
  m = _ISO_DATE_MONTH_RE.match(value)
 
438
  prefix, y, mo = m.groups()
439
  if _is_release_year(y) and 1 <= int(mo) <= 12:
440
  candidates.append(f"{prefix}-{y}")
 
441
  return candidates
442
 
443
+ # Year-only case (`-YYYY`) intentionally produces no candidates: the
444
+ # only possible peel is to bare family, which the auto-create path
445
+ # owns. Returning empty falls through to no_match cleanly.
 
 
 
446
  return candidates
447
 
448
 
src/eval_card_registry/services/hub_stats.py CHANGED
@@ -151,6 +151,158 @@ def filter_useful_tags(raw_tags) -> list[str]:
151
  return sorted(set(keep))
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def extract_base_models(base_models) -> list[dict]:
155
  """Decode the `baseModels` struct into a list of typed parent edges.
156
  Returns `[{id, relationship}, ...]` — caller resolves each id to our
@@ -295,14 +447,21 @@ class HubStatsClient:
295
  try:
296
  con = self._ensure_con()
297
  use_local = self._ensure_local_table(con)
298
- escaped = hf_id.replace("'", "''")
 
 
 
 
 
 
 
299
  if use_local:
300
- sql = f"SELECT * FROM hub_stats WHERE id = '{escaped}' LIMIT 1"
301
  else:
302
  sql = (
303
  f"SELECT {QUERY_COLUMNS} "
304
  f"FROM read_parquet('{self.parquet_url}') "
305
- f"WHERE id = '{escaped}' LIMIT 1"
306
  )
307
  cursor = con.execute(sql)
308
  cols = [d[0] for d in cursor.description]
@@ -330,6 +489,7 @@ def enrich_draft_from_row(
330
  row: dict,
331
  aliases_to_canonical: dict[str, str],
332
  org_alias_map: dict[str, str],
 
333
  ) -> dict:
334
  """Convert one hub-stats row into a partial canonical_models dict
335
  suitable for merging into an auto-created draft. Computes:
@@ -383,6 +543,24 @@ def enrich_draft_from_row(
383
  if lineage_origin_org_id is None and edge["relationship"] != "variant":
384
  if "/" in parent_canonical:
385
  lineage_origin_org_id = parent_canonical.split("/", 1)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  if parents:
387
  out["parents"] = json.dumps(parents)
388
  if lineage_origin_org_id:
 
151
  return sorted(set(keep))
152
 
153
 
154
+ # ---------------------------------------------------------------------------
155
+ # Family-version parent inference
156
+ # ---------------------------------------------------------------------------
157
+ #
158
+ # Hub-stats `baseModels` records *upstream* lineage (finetune / quantized /
159
+ # merge / adapter), never the family-version relationship between a dated
160
+ # snapshot and its moving pointer canonical (`Olmo-3-1125-32B` ↔ our
161
+ # `allenai/olmo-3-32b`). The pointer isn't an HF id — it only exists in our
162
+ # registry β€” so HF can't surface that edge. Without inference here, dated
163
+ # snapshots auto-create as orphaned canonicals: `release_date` lands fine
164
+ # but `parents`/`root_model_id` stay empty, root-collapse never fires, and
165
+ # the snapshot shows up as a separate model in consumers.
166
+
167
+ _INTERNAL_DATE_RE = re.compile(r"^(.+?)-(\d{4})-([^-].*)$")
168
+ _TRAILING_4DIGIT_RE = re.compile(r"^(.+)-(\d{4})$")
169
+ _TRAILING_6DIGIT_RE = re.compile(r"^(.+)-(\d{6})$")
170
+ _TRAILING_8DIGIT_RE = re.compile(r"^(.+)-(\d{8})$")
171
+ # ISO date patterns (anchored, full-string). Strict component widths
172
+ # stop us from peeling tokens that aren't dates (a 5-digit numeric tail
173
+ # won't match `\d{4}-\d{2}`).
174
+ _ISO_FULL_DATE_RE = re.compile(r"^(.+)-(\d{4})-(\d{2})-(\d{2})$")
175
+ _ISO_MONTH_DATE_RE = re.compile(r"^(.+)-(\d{4})-(\d{2})$")
176
+ _ISO_YEAR_DATE_RE = re.compile(r"^(.+)-(\d{4})$")
177
+
178
+
179
+ def _looks_like_mmdd(token: str) -> bool:
180
+ """4-digit MMDD where MM ∈ [01,12] and DD ∈ [01,31]. Used to gate
181
+ snapshot-token stripping on shapes that actually look like dates,
182
+ avoiding false-positives on numeric size/version tokens like `8000`."""
183
+ if len(token) != 4 or not token.isdigit():
184
+ return False
185
+ mm, dd = int(token[:2]), int(token[2:])
186
+ return 1 <= mm <= 12 and 1 <= dd <= 31
187
+
188
+
189
+ def _looks_like_yyyymm(token: str) -> bool:
190
+ """6-digit YYYYMM (year+month). Stepfun and several Chinese-lab
191
+ release tags use this convention, e.g. `step-2-16k-202411`."""
192
+ if len(token) != 6 or not token.isdigit():
193
+ return False
194
+ yyyy, mm = int(token[:4]), int(token[4:])
195
+ return 2015 <= yyyy <= 2035 and 1 <= mm <= 12
196
+
197
+
198
+ def _looks_like_yyyymmdd(token: str) -> bool:
199
+ if len(token) != 8 or not token.isdigit():
200
+ return False
201
+ yyyy, mm, dd = int(token[:4]), int(token[4:6]), int(token[6:])
202
+ return 2015 <= yyyy <= 2035 and 1 <= mm <= 12 and 1 <= dd <= 31
203
+
204
+
205
+ def _looks_like_release_year(token: str) -> bool:
206
+ if len(token) != 4 or not token.isdigit():
207
+ return False
208
+ return 2015 <= int(token) <= 2035
209
+
210
+
211
+ def infer_family_parent_edge(
212
+ hf_id: str,
213
+ aliases_to_canonical: dict[str, str],
214
+ target_canonical: Optional[str] = None,
215
+ ) -> Optional[dict]:
216
+ """Detect snapshot-shape ids whose stripped form matches an existing
217
+ canonical, and return a `{id, relationship: variant, axis: version}`
218
+ edge pointing at it. Returns None when the id has no snapshot shape
219
+ or the stripped form doesn't match any known canonical/alias.
220
+
221
+ Patterns recognized (single-pass strip — does NOT compose with
222
+ mode/quant suffix stripping):
223
+ - internal MMDD token: `Olmo-3-1125-32B` → `Olmo-3-32B`
224
+ also `Olmo-3-1125-7B-Instruct` → `Olmo-3-7B-Instruct`
225
+ - trailing MMDD token: `kimi-k2-0905` → `kimi-k2`
226
+ - trailing YYYYMM token: `step-2-16k-202411` → `step-2-16k`
227
+ - trailing YYYYMMDD: `claude-haiku-4-5-20251001` → `claude-haiku-4-5`
228
+ - trailing ISO date ladder: `gpt-5-2025-08-07` →
229
+ `gpt-5-2025-08` → `gpt-5-2025` → `gpt-5`
230
+
231
+ Only fires when the candidate stripped form resolves through the
232
+ alias index — no false matches manufactured by stripping alone.
233
+ For compound mode+date inputs (`claude-4-5-thinking-20251001`), the
234
+ strip resolves to the mode-promoted canonical iff one exists; if
235
+ not, returns None (the snapshot still gets `release_date` from
236
+ hub-stats but lands without a parent edge).
237
+
238
+ `target_canonical` is the canonical id the inferred edge will be
239
+ attached to. When provided, suppresses self-edges (matters in the
240
+ bulk-refresh path where an HF id may be aliased directly to its
241
+ family pointer rather than a separate snapshot canonical — without
242
+ this guard the family pointer gains a parent edge to itself,
243
+ breaking the lineage walker). Live auto-create can also pass the
244
+ proposed draft id; it just makes the guard tighter.
245
+ """
246
+ candidates: list[str] = []
247
+
248
+ # Internal MMDD: `Olmo-3-1125-32B` shape. Tries first because
249
+ # internal-token strips give a more specific lookup target than
250
+ # trailing-token strips.
251
+ m = _INTERNAL_DATE_RE.match(hf_id)
252
+ if m and _looks_like_mmdd(m.group(2)):
253
+ prefix, _, suffix = m.groups()
254
+ candidates.append(f"{prefix}-{suffix}")
255
+
256
+ # ISO ladder (full → month → year). The three regexes match
257
+ # mutually exclusive tail shapes (`-YYYY-MM-DD` vs `-YYYY-MM` vs
258
+ # `-YYYY`), so each input fires at most one branch.
259
+ m = _ISO_FULL_DATE_RE.match(hf_id)
260
+ if m:
261
+ prefix, y, mo, d = m.groups()
262
+ if (_looks_like_release_year(y) and 1 <= int(mo) <= 12
263
+ and 1 <= int(d) <= 31):
264
+ candidates.append(f"{prefix}-{y}-{mo}")
265
+ candidates.append(f"{prefix}-{y}")
266
+ candidates.append(prefix)
267
+ else:
268
+ m = _ISO_MONTH_DATE_RE.match(hf_id)
269
+ if m:
270
+ prefix, y, mo = m.groups()
271
+ if _looks_like_release_year(y) and 1 <= int(mo) <= 12:
272
+ candidates.append(f"{prefix}-{y}")
273
+ candidates.append(prefix)
274
+ else:
275
+ m = _ISO_YEAR_DATE_RE.match(hf_id)
276
+ if m:
277
+ prefix, y = m.groups()
278
+ if _looks_like_release_year(y):
279
+ candidates.append(prefix)
280
+
281
+ # Trailing YYYYMMDD (Anthropic/xAI/Tencent style).
282
+ m = _TRAILING_8DIGIT_RE.match(hf_id)
283
+ if m and _looks_like_yyyymmdd(m.group(2)):
284
+ candidates.append(m.group(1))
285
+
286
+ # Trailing YYYYMM (Stepfun and several Chinese-lab release tags).
287
+ m = _TRAILING_6DIGIT_RE.match(hf_id)
288
+ if m and _looks_like_yyyymm(m.group(2)):
289
+ candidates.append(m.group(1))
290
+
291
+ # Trailing 4-digit MMDD (Moonshot/Kimi, Google -exp tags).
292
+ m = _TRAILING_4DIGIT_RE.match(hf_id)
293
+ if m and _looks_like_mmdd(m.group(2)):
294
+ candidates.append(m.group(1))
295
+
296
+ for cand in candidates:
297
+ canonical = aliases_to_canonical.get(normalize(cand))
298
+ if not canonical:
299
+ continue
300
+ if target_canonical is not None and canonical == target_canonical:
301
+ continue
302
+ return {"id": canonical, "relationship": "variant", "axis": "version"}
303
+ return None
304
+
305
+
306
  def extract_base_models(base_models) -> list[dict]:
307
  """Decode the `baseModels` struct into a list of typed parent edges.
308
  Returns `[{id, relationship}, ...]` — caller resolves each id to our
 
447
  try:
448
  con = self._ensure_con()
449
  use_local = self._ensure_local_table(con)
450
+ # Case-insensitive match — HF stores ids with the upstream
451
+ # author's original casing (`allenai/Olmo-3-1125-32B`); EEE
452
+ # surfaces values in mixed conventions (some leaderboards
453
+ # lowercase, some preserve). An exact-case `=` filter
454
+ # silently misses any casing mismatch and the draft lands
455
+ # without enrichment metadata. LOWER() forces a match
456
+ # regardless of the surface form.
457
+ escaped = hf_id.lower().replace("'", "''")
458
  if use_local:
459
+ sql = f"SELECT * FROM hub_stats WHERE LOWER(id) = '{escaped}' LIMIT 1"
460
  else:
461
  sql = (
462
  f"SELECT {QUERY_COLUMNS} "
463
  f"FROM read_parquet('{self.parquet_url}') "
464
+ f"WHERE LOWER(id) = '{escaped}' LIMIT 1"
465
  )
466
  cursor = con.execute(sql)
467
  cols = [d[0] for d in cursor.description]
 
489
  row: dict,
490
  aliases_to_canonical: dict[str, str],
491
  org_alias_map: dict[str, str],
492
+ target_canonical: Optional[str] = None,
493
  ) -> dict:
494
  """Convert one hub-stats row into a partial canonical_models dict
495
  suitable for merging into an auto-created draft. Computes:
 
543
  if lineage_origin_org_id is None and edge["relationship"] != "variant":
544
  if "/" in parent_canonical:
545
  lineage_origin_org_id = parent_canonical.split("/", 1)[0]
546
+
547
+ # Family-version inference: hub-stats `baseModels` only records
548
+ # upstream-lineage edges (finetune/quantized/merge/adapter), never
549
+ # the dated-snapshot ↔ moving-pointer relationship that lives only
550
+ # in our registry. Without this, snapshots like `Olmo-3-1125-32B`
551
+ # auto-create as orphan canonicals — release_date lands but parents
552
+ # stays empty and root-collapse never fires.
553
+ hf_id = row.get("id")
554
+ if isinstance(hf_id, str) and not any(
555
+ p.get("relationship") == "variant" and p.get("axis") == "version"
556
+ for p in parents
557
+ ):
558
+ version_edge = infer_family_parent_edge(
559
+ hf_id, aliases_to_canonical, target_canonical=target_canonical,
560
+ )
561
+ if version_edge is not None:
562
+ parents.append(version_edge)
563
+
564
  if parents:
565
  out["parents"] = json.dumps(parents)
566
  if lineage_origin_org_id:
src/eval_card_registry/services/resolution_service.py CHANGED
@@ -9,8 +9,11 @@ Responsibilities:
9
  """
10
  from __future__ import annotations
11
 
 
12
  import re
 
13
  import uuid
 
14
  from datetime import datetime, timezone
15
  from typing import Optional
16
 
@@ -49,6 +52,32 @@ def _now() -> str:
49
  return datetime.now(timezone.utc).isoformat()
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def _build_alias_store(registry_store: RegistryStore) -> AliasStore:
53
  """Build an AliasStore from the registry's in-memory aliases table."""
54
  aliases_df = registry_store.table("aliases")
@@ -60,13 +89,16 @@ def _build_canonical_store(registry_store: RegistryStore) -> CanonicalStore:
60
  tables. Lets the bare resolver enrich its results with the same
61
  metadata fields the HTTP API exposes β€” including benchmark
62
  `family_key` / `category` (which need families_df + composites_df
63
- to populate; otherwise they fall back to the benchmark's own id)."""
 
 
 
64
  return CanonicalStore(
65
- models_df=registry_store.table("canonical_models"),
66
- benchmarks_df=registry_store.table("canonical_benchmarks"),
67
- metrics_df=registry_store.table("canonical_metrics"),
68
- harnesses_df=registry_store.table("eval_harnesses"),
69
- orgs_df=registry_store.table("canonical_orgs") if registry_store.has_table("canonical_orgs") else None,
70
  families_df=registry_store.table("canonical_families") if registry_store.has_table("canonical_families") else None,
71
  composites_df=registry_store.table("canonical_composites") if registry_store.has_table("canonical_composites") else None,
72
  )
@@ -182,18 +214,36 @@ class ResolutionService:
182
  return result_dict
183
 
184
  # Check if alias already exists (skip resolver on rerun=False).
185
- # Build the enriched response via `Resolver.build_result` so we
186
- # preserve the original alias's strategy/confidence (audit trail)
187
- # while still surfacing the same canonical-collapse / metadata
188
- # fields a fresh resolve would produce.
 
 
 
 
 
 
189
  if not rerun:
190
  existing = queries.get_alias(self.store, raw_value, entity_type, source_config)
191
  if existing:
192
  resolver = self._get_resolver()
193
- enriched = resolver.build_result(
194
- raw_value, entity_type, source_config,
195
- existing["canonical_id"], existing["strategy"], existing["confidence"],
196
- )
 
 
 
 
 
 
 
 
 
 
 
 
197
  result_dict = _result_to_dict(enriched, created_new=False)
198
  self._resolve_cache[cache_key] = result_dict
199
  return result_dict
@@ -274,17 +324,33 @@ class ResolutionService:
274
  if created_new:
275
  self.invalidate_resolver()
276
 
277
- # Build the enriched response via the resolver. For auto-drafts
278
- # the freshly-created entity sits in the pending-write buffer and
279
- # may NOT be visible to the canonical_store's DataFrame snapshot
280
- # yet (`_auto_create_entity` writes with `buffered=True`). When
281
- # the lookup misses, the resolver returns review_status=None;
282
- # we know auto-drafts land at "draft" by definition, so override.
283
- resolver = self._get_resolver()
284
- enriched = resolver.build_result(
285
- raw_value, entity_type, source_config,
286
- canonical_id, strategy_used, result.confidence,
287
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  result_dict = _result_to_dict(enriched, created_new=created_new)
289
  if created_new and result_dict.get("review_status") is None:
290
  result_dict["review_status"] = "draft"
@@ -322,7 +388,7 @@ class ResolutionService:
322
  # — `enrichment` is `{}` on lookup miss or any error.
323
  enrichment: dict = {}
324
  if entity_type == "model" and self._looks_like_hf_id(raw_value):
325
- enrichment = self._lookup_hub_stats(raw_value) or {}
326
  if entity_type == "model":
327
  base.update({
328
  "developer": None,
@@ -343,6 +409,16 @@ class ResolutionService:
343
  for k, v in enrichment.items():
344
  if v is not None:
345
  base[k] = v
 
 
 
 
 
 
 
 
 
 
346
  elif entity_type == "benchmark":
347
  base.update({"description": None, "dataset_repo": None, "parent_benchmark_id": None, "tags": "[]"})
348
  elif entity_type == "metric":
@@ -361,6 +437,37 @@ class ResolutionService:
361
  queries.upsert_entity(self.store, table, base, buffered=True)
362
  return candidate_id
363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
  @staticmethod
365
  def _looks_like_hf_id(raw_value: str) -> bool:
366
  """HF id heuristic: contains a single `/` with non-empty parts on
@@ -372,12 +479,18 @@ class ResolutionService:
372
  org, name = raw_value.split("/", 1)
373
  return bool(org.strip()) and bool(name.strip())
374
 
375
- def _lookup_hub_stats(self, hf_id: str) -> Optional[dict]:
 
 
376
  """Query hub-stats live for `hf_id` and return a partial draft
377
  dict (release_date, params_billions, parents, lineage_origin_org_id,
378
  tags, metadata) ready to merge. Returns None on miss or any error.
379
  Uses the `aliases` table to resolve baseModels parents to our
380
- canonical ids, and `canonical_orgs` HF aliases to map authors."""
 
 
 
 
381
  if not settings.hub_stats_lookup_enabled:
382
  return None
383
  try:
@@ -390,7 +503,10 @@ class ResolutionService:
390
  from eval_card_registry.services import hub_stats as _hs
391
  try:
392
  aliases_to_canonical, org_alias_map = self._build_hub_stats_indices()
393
- return _hs.enrich_draft_from_row(row, aliases_to_canonical, org_alias_map)
 
 
 
394
  except Exception:
395
  return None
396
 
 
9
  """
10
  from __future__ import annotations
11
 
12
+ import json
13
  import re
14
+ import threading
15
  import uuid
16
+ from dataclasses import replace as _dc_replace
17
  from datetime import datetime, timezone
18
  from typing import Optional
19
 
 
52
  return datetime.now(timezone.utc).isoformat()
53
 
54
 
55
+ def _table_with_pending(registry_store: RegistryStore, name: str) -> "pd.DataFrame":
56
+ """Return a table DataFrame with pending-buffer rows appended.
57
+
58
+ `_auto_create_entity` writes drafts with `buffered=True`, so they sit
59
+ in `store._pending[<table>]` until `flush_pending` runs at the end of
60
+ a sync. Without overlaying pending here, the resolver's
61
+ `CanonicalStore` snapshot can't see the just-created row, and
62
+ `build_result` for an auto-created entity returns null for every
63
+ metadata field that hub-stats just enriched.
64
+
65
+ Concat is safe because `upsert_entity` enforces id-uniqueness across
66
+ base + pending (existing rows go to in-place update; only genuinely
67
+ new ids land in pending), so no duplicate keys end up in the
68
+ CanonicalStore index.
69
+ """
70
+ import pandas as pd
71
+ base_df = registry_store.table(name) if registry_store.has_table(name) else pd.DataFrame()
72
+ pending = getattr(registry_store, "_pending", {}).get(name, [])
73
+ if not pending:
74
+ return base_df
75
+ pending_df = pd.DataFrame(pending)
76
+ if base_df.empty:
77
+ return pending_df
78
+ return pd.concat([base_df, pending_df], ignore_index=True)
79
+
80
+
81
  def _build_alias_store(registry_store: RegistryStore) -> AliasStore:
82
  """Build an AliasStore from the registry's in-memory aliases table."""
83
  aliases_df = registry_store.table("aliases")
 
89
  tables. Lets the bare resolver enrich its results with the same
90
  metadata fields the HTTP API exposes — including benchmark
91
  `family_key` / `category` (which need families_df + composites_df
92
+ to populate; otherwise they fall back to the benchmark's own id).
93
+
94
+ Pending-buffer rows are overlaid so the resolver sees auto-created
95
+ drafts before `flush_pending` runs. See `_table_with_pending`."""
96
  return CanonicalStore(
97
+ models_df=_table_with_pending(registry_store, "canonical_models"),
98
+ benchmarks_df=_table_with_pending(registry_store, "canonical_benchmarks"),
99
+ metrics_df=_table_with_pending(registry_store, "canonical_metrics"),
100
+ harnesses_df=_table_with_pending(registry_store, "eval_harnesses"),
101
+ orgs_df=_table_with_pending(registry_store, "canonical_orgs") if registry_store.has_table("canonical_orgs") else None,
102
  families_df=registry_store.table("canonical_families") if registry_store.has_table("canonical_families") else None,
103
  composites_df=registry_store.table("canonical_composites") if registry_store.has_table("canonical_composites") else None,
104
  )
 
214
  return result_dict
215
 
216
  # Check if alias already exists (skip resolver on rerun=False).
217
+ # Re-run the strategy chain so the response carries the correct
218
+ # `resolved_leaf_id` — the alias table only stores the
219
+ # root-collapsed `canonical_id`, so reconstructing the response
220
+ # via `build_result(root, ...)` would clobber the leaf to the
221
+ # root (model_metadata_fields can't recover leaf identity from
222
+ # a root row alone — there's no back-pointer). The strategy
223
+ # chain re-derives leaf cleanly; perf cost is one alias-index
224
+ # lookup since exact-match hits in O(1) for already-aliased
225
+ # values. Audit fields are overlaid from the alias entry so
226
+ # callers still see the original strategy/confidence.
227
  if not rerun:
228
  existing = queries.get_alias(self.store, raw_value, entity_type, source_config)
229
  if existing:
230
  resolver = self._get_resolver()
231
+ fresh = resolver.resolve(raw_value, entity_type, source_config)
232
+ if fresh.canonical_id == existing["canonical_id"]:
233
+ enriched = _dc_replace(
234
+ fresh,
235
+ strategy=existing["strategy"],
236
+ confidence=existing["confidence"],
237
+ )
238
+ else:
239
+ # Rare: registry restructure has moved the canonical
240
+ # for this raw_value since the alias was written.
241
+ # The alias entry is the source of truth for "what
242
+ # this raw resolved to" — accept the leaf clobber.
243
+ enriched = resolver.build_result(
244
+ raw_value, entity_type, source_config,
245
+ existing["canonical_id"], existing["strategy"], existing["confidence"],
246
+ )
247
  result_dict = _result_to_dict(enriched, created_new=False)
248
  self._resolve_cache[cache_key] = result_dict
249
  return result_dict
 
324
  if created_new:
325
  self.invalidate_resolver()
326
 
327
+ # Build the enriched response. Two cases:
328
+ # 1. Match found — the original `result` already carries the
329
+ # correct canonical_id (root-collapsed), resolved_leaf_id
330
+ # (the matched leaf), parents, and metadata. Don't re-run
331
+ # `build_result` here: it would call `model_metadata_fields`
332
+ # with the ROOT id, which can't recover the leaf and ends
333
+ # up returning resolved_leaf_id = canonical_id. The alias
334
+ # write earlier doesn't change canonical_models — `result`
335
+ # stays accurate.
336
+ # 2. Auto-create — `result.canonical_id` was None, the new
337
+ # `canonical_id` came from `_auto_create_entity`. The new
338
+ # canonical IS the leaf (its parents may point at family
339
+ # via the inferred version-axis edge), so `build_result`
340
+ # with the new id correctly preserves leaf info via
341
+ # `model_metadata_fields`. The `invalidate_resolver()`
342
+ # above ensures the canonical_store snapshot sees the new
343
+ # row, but the entity may still sit in the pending-write
344
+ # buffer; on lookup miss the review_status falls back to
345
+ # None and we override to 'draft' below.
346
+ if created_new:
347
+ resolver = self._get_resolver()
348
+ enriched = resolver.build_result(
349
+ raw_value, entity_type, source_config,
350
+ canonical_id, strategy_used, result.confidence,
351
+ )
352
+ else:
353
+ enriched = result
354
  result_dict = _result_to_dict(enriched, created_new=created_new)
355
  if created_new and result_dict.get("review_status") is None:
356
  result_dict["review_status"] = "draft"
 
388
  # — `enrichment` is `{}` on lookup miss or any error.
389
  enrichment: dict = {}
390
  if entity_type == "model" and self._looks_like_hf_id(raw_value):
391
+ enrichment = self._lookup_hub_stats(raw_value, target_canonical=candidate_id) or {}
392
  if entity_type == "model":
393
  base.update({
394
  "developer": None,
 
409
  for k, v in enrichment.items():
410
  if v is not None:
411
  base[k] = v
412
+ # Family-version inference fallback: when hub-stats misses
413
+ # (parquet stale, lookup disabled, rate-limited, or row
414
+ # absent), the snapshot still has its shape — try to infer a
415
+ # version-axis parent from just the alias index. The
416
+ # inference is alias-lookup-only, so it never manufactures
417
+ # a false parent. Idempotent with the inference inside
418
+ # enrich_draft_from_row: only fires when no version-axis
419
+ # edge is already present.
420
+ if self._looks_like_hf_id(raw_value):
421
+ self._maybe_infer_family_parent(base, raw_value, candidate_id)
422
  elif entity_type == "benchmark":
423
  base.update({"description": None, "dataset_repo": None, "parent_benchmark_id": None, "tags": "[]"})
424
  elif entity_type == "metric":
 
437
  queries.upsert_entity(self.store, table, base, buffered=True)
438
  return candidate_id
439
 
440
+ def _maybe_infer_family_parent(
441
+ self, base: dict, raw_value: str, candidate_id: str,
442
+ ) -> None:
443
+ """Mutate `base['parents']` to add a `{variant, axis: version}`
444
+ edge when the raw value's snapshot shape resolves to an existing
445
+ family canonical via the alias index. Runs independently of
446
+ hub-stats so brand-new releases not yet in the parquet still
447
+ get linked into the lineage graph."""
448
+ try:
449
+ existing = json.loads(base.get("parents") or "[]")
450
+ except (ValueError, TypeError):
451
+ existing = []
452
+ if any(
453
+ p.get("relationship") == "variant" and p.get("axis") == "version"
454
+ for p in existing
455
+ if isinstance(p, dict)
456
+ ):
457
+ return
458
+ from eval_card_registry.services.hub_stats import infer_family_parent_edge
459
+ try:
460
+ aliases_to_canonical, _ = self._build_hub_stats_indices()
461
+ except Exception:
462
+ return
463
+ edge = infer_family_parent_edge(
464
+ raw_value, aliases_to_canonical, target_canonical=candidate_id,
465
+ )
466
+ if edge is None:
467
+ return
468
+ existing.append(edge)
469
+ base["parents"] = json.dumps(existing)
470
+
471
  @staticmethod
472
  def _looks_like_hf_id(raw_value: str) -> bool:
473
  """HF id heuristic: contains a single `/` with non-empty parts on
 
479
  org, name = raw_value.split("/", 1)
480
  return bool(org.strip()) and bool(name.strip())
481
 
482
+ def _lookup_hub_stats(
483
+ self, hf_id: str, target_canonical: Optional[str] = None,
484
+ ) -> Optional[dict]:
485
  """Query hub-stats live for `hf_id` and return a partial draft
486
  dict (release_date, params_billions, parents, lineage_origin_org_id,
487
  tags, metadata) ready to merge. Returns None on miss or any error.
488
  Uses the `aliases` table to resolve baseModels parents to our
489
+ canonical ids, and `canonical_orgs` HF aliases to map authors.
490
+
491
+ `target_canonical` is the candidate canonical id of the draft
492
+ being created — passed through to enrich_draft_from_row so the
493
+ family-version inference can suppress a self-edge."""
494
  if not settings.hub_stats_lookup_enabled:
495
  return None
496
  try:
 
503
  from eval_card_registry.services import hub_stats as _hs
504
  try:
505
  aliases_to_canonical, org_alias_map = self._build_hub_stats_indices()
506
+ return _hs.enrich_draft_from_row(
507
+ row, aliases_to_canonical, org_alias_map,
508
+ target_canonical=target_canonical,
509
+ )
510
  except Exception:
511
  return None
512