j-chim committed on
Commit
63cb6b2
·
verified ·
1 Parent(s): 2a16c24

Upload folder using huggingface_hub

Browse files
src/eval_card_registry/cli.py CHANGED
@@ -494,7 +494,8 @@ def seed(
494
  typer.echo(
495
  f" derived: root_model_id={lineage_counts['root_set']}, "
496
  f"lineage_origin_org_id={lineage_counts['lineage_set']}, "
497
- f"open_weights_inherited={lineage_counts['open_weights_inherited']}"
 
498
  )
499
 
500
  removed_entities = 0
 
494
  typer.echo(
495
  f" derived: root_model_id={lineage_counts['root_set']}, "
496
  f"lineage_origin_org_id={lineage_counts['lineage_set']}, "
497
+ f"open_weights_inherited={lineage_counts['open_weights_inherited']}, "
498
+ f"release_date_from_id={lineage_counts['release_date_derived_from_id']}"
499
  )
500
 
501
  removed_entities = 0
src/eval_card_registry/store/queries.py CHANGED
@@ -11,6 +11,7 @@ from __future__ import annotations
11
 
12
  import hashlib
13
  import json
 
14
  import uuid
15
  from datetime import datetime, timezone
16
  from typing import Any, Optional
@@ -25,6 +26,62 @@ def _now() -> str:
25
  return datetime.now(timezone.utc).isoformat()
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def _is_na(value) -> bool:
29
  try:
30
  return bool(pd.isna(value))
@@ -107,11 +164,17 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
107
  """
108
  df = store.table("canonical_models")
109
  if df.empty:
110
- return {"root_set": 0, "lineage_set": 0, "open_weights_inherited": 0}
 
 
 
 
 
111
 
112
  parents_by_id: dict[str, list[dict]] = {}
113
  org_by_id: dict[str, Optional[str]] = {}
114
  open_by_id: dict[str, Optional[bool]] = {}
 
115
  for _, row in df.iterrows():
116
  cid = row["id"]
117
  parents_by_id[cid] = decode_parents(row.get("parents"))
@@ -119,6 +182,8 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
119
  org_by_id[cid] = None if _is_na(org) else org
120
  ow = row.get("open_weights")
121
  open_by_id[cid] = None if _is_na(ow) else bool(ow)
 
 
122
 
123
  def _walk(start: str, edge_ok) -> str:
124
  """Walk parents through edges where `edge_ok(edge)` is True.
@@ -178,7 +243,9 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
178
  root_updates: dict[str, Optional[str]] = {}
179
  lineage_updates: dict[str, Optional[str]] = {}
180
  open_updates: dict[str, Optional[bool]] = {}
 
181
  inherited_count = 0
 
182
  for cid in parents_by_id:
183
  # Identity root via quantized + variant-version walk (both treat
184
  # the parent as the same model at the API level — see docstring).
@@ -199,16 +266,32 @@ def derive_model_lineage_fields(store: RegistryStore) -> dict[str, int]:
199
  if inherited is not None:
200
  inherited_count += 1
201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  df = df.copy()
203
  df["root_model_id"] = df["id"].map(root_updates).astype(pd.StringDtype())
204
  df["lineage_origin_org_id"] = df["id"].map(lineage_updates).astype(pd.StringDtype())
205
  df["open_weights"] = df["id"].map(open_updates).astype(pd.BooleanDtype())
 
206
  store.set_table("canonical_models", df)
207
 
208
  return {
209
  "root_set": int(df["root_model_id"].notna().sum()),
210
  "lineage_set": int(df["lineage_origin_org_id"].notna().sum()),
211
  "open_weights_inherited": inherited_count,
 
212
  }
213
 
214
 
 
11
 
12
  import hashlib
13
  import json
14
+ import re
15
  import uuid
16
  from datetime import datetime, timezone
17
  from typing import Any, Optional
 
26
  return datetime.now(timezone.utc).isoformat()
27
 
28
 
29
+ # Date-suffix patterns used by `_derive_release_date_from_id`. The id
30
+ # encodes the snapshot/release date in three common shapes:
31
+ # `-YYYY-MM-DD` — OpenAI / Google daily snapshot (gpt-4o-2024-08-06)
32
+ # `-YYYYMMDD` — Anthropic / xAI snapshot (claude-sonnet-4-20250514)
33
+ # `-YYYY-MM` — OpenAI monthly pointer (gpt-5-2025-08); day defaults
34
+ # to "01" since the snapshot is month-grained
35
+ _DATE_ISO_FULL_RE = re.compile(r"-(\d{4})-(\d{2})-(\d{2})$")
36
+ _DATE_PACKED_RE = re.compile(r"-(\d{4})(\d{2})(\d{2})$")
37
+ _DATE_ISO_MONTH_RE = re.compile(r"-(\d{4})-(\d{2})$")
38
+
39
+
40
+ def _derive_release_date_from_id(canonical_id: str) -> Optional[str]:
41
+ """Best-effort: parse a date suffix off the canonical id and return
42
+ ISO-8601 YYYY-MM-DD, or None when no recognisable suffix is present.
43
+
44
+ Year-range guard (2015-2035) keeps non-year 4-digit tails (parameter
45
+ counts, batch numbers, etc.) from being mis-interpreted as a release
46
+ year. The day/month components are validated to plausible ranges.
47
+ Returns None on guard failure rather than a malformed date.
48
+ """
49
+ if not canonical_id:
50
+ return None
51
+
52
+ def _ok_year(s: str) -> bool:
53
+ try:
54
+ return 2015 <= int(s) <= 2035
55
+ except ValueError:
56
+ return False
57
+
58
+ m = _DATE_ISO_FULL_RE.search(canonical_id)
59
+ if m:
60
+ y, mo, d = m.groups()
61
+ if _ok_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
62
+ return f"{y}-{mo}-{d}"
63
+ return None
64
+
65
+ m = _DATE_PACKED_RE.search(canonical_id)
66
+ if m:
67
+ y, mo, d = m.groups()
68
+ if _ok_year(y) and 1 <= int(mo) <= 12 and 1 <= int(d) <= 31:
69
+ return f"{y}-{mo}-{d}"
70
+ return None
71
+
72
+ m = _DATE_ISO_MONTH_RE.search(canonical_id)
73
+ if m:
74
+ y, mo = m.groups()
75
+ if _ok_year(y) and 1 <= int(mo) <= 12:
76
+ # Monthly pointer: day defaults to 01. Consumers wanting more
77
+ # precision should rely on hand-curated or hub-stats sourced
78
+ # release_dates, which always win over this derivation.
79
+ return f"{y}-{mo}-01"
80
+ return None
81
+
82
+ return None
83
+
84
+
85
  def _is_na(value) -> bool:
86
  try:
87
  return bool(pd.isna(value))
 
164
  """
165
  df = store.table("canonical_models")
166
  if df.empty:
167
+ return {
168
+ "root_set": 0,
169
+ "lineage_set": 0,
170
+ "open_weights_inherited": 0,
171
+ "release_date_derived_from_id": 0,
172
+ }
173
 
174
  parents_by_id: dict[str, list[dict]] = {}
175
  org_by_id: dict[str, Optional[str]] = {}
176
  open_by_id: dict[str, Optional[bool]] = {}
177
+ release_by_id: dict[str, Optional[str]] = {}
178
  for _, row in df.iterrows():
179
  cid = row["id"]
180
  parents_by_id[cid] = decode_parents(row.get("parents"))
 
182
  org_by_id[cid] = None if _is_na(org) else org
183
  ow = row.get("open_weights")
184
  open_by_id[cid] = None if _is_na(ow) else bool(ow)
185
+ rd = row.get("release_date")
186
+ release_by_id[cid] = None if _is_na(rd) else str(rd)
187
 
188
  def _walk(start: str, edge_ok) -> str:
189
  """Walk parents through edges where `edge_ok(edge)` is True.
 
243
  root_updates: dict[str, Optional[str]] = {}
244
  lineage_updates: dict[str, Optional[str]] = {}
245
  open_updates: dict[str, Optional[bool]] = {}
246
+ release_updates: dict[str, Optional[str]] = {}
247
  inherited_count = 0
248
+ release_derived_count = 0
249
  for cid in parents_by_id:
250
  # Identity root via quantized + variant-version walk (both treat
251
  # the parent as the same model at the API level — see docstring).
 
266
  if inherited is not None:
267
  inherited_count += 1
268
 
269
+ # Release date — explicit value (hand-curated, hub-stats createdAt,
270
+ # or models.dev release_dates) WINS. Fall back to parsing the date
271
+ # off the id when the canonical name encodes it (`-YYYY-MM-DD`,
272
+ # `-YYYYMMDD`, `-YYYY-MM`). Avoids the silly "id literally says
273
+ # 2025-04-14, registry says <NA>" gap on dated openai snapshots.
274
+ explicit_release = release_by_id.get(cid)
275
+ if explicit_release is not None and explicit_release.strip():
276
+ release_updates[cid] = explicit_release
277
+ else:
278
+ derived = _derive_release_date_from_id(cid)
279
+ release_updates[cid] = derived
280
+ if derived is not None:
281
+ release_derived_count += 1
282
+
283
  df = df.copy()
284
  df["root_model_id"] = df["id"].map(root_updates).astype(pd.StringDtype())
285
  df["lineage_origin_org_id"] = df["id"].map(lineage_updates).astype(pd.StringDtype())
286
  df["open_weights"] = df["id"].map(open_updates).astype(pd.BooleanDtype())
287
+ df["release_date"] = df["id"].map(release_updates).astype(pd.StringDtype())
288
  store.set_table("canonical_models", df)
289
 
290
  return {
291
  "root_set": int(df["root_model_id"].notna().sum()),
292
  "lineage_set": int(df["lineage_origin_org_id"].notna().sum()),
293
  "open_weights_inherited": inherited_count,
294
+ "release_date_derived_from_id": release_derived_count,
295
  }
296
 
297