Executor-Tyrant-Framework committed on
Commit
42aac76
·
verified ·
1 Parent(s): 0aef8d8

Sync from GitHub: 75c4d61881de90eca28c15d1b5d08cee9f1cd77e

Browse files
Files changed (1) hide show
  1. app.py +74 -20
app.py CHANGED
@@ -1127,13 +1127,62 @@ def on_interleaved_benchmark(
1127
  categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
1128
 
1129
  # Cross-run category registry on the organism. Maps node_id -> category
1130
- # for every deposit this benchmark has ever made on this Space lifetime.
1131
- # Needed because turn j's pith may pull nodes from PRIOR runs that share
1132
- # a category but aren't in this run's deposit_ids. Lazy-init on first use.
 
 
1133
  if not hasattr(nw_organism, "_benchmark_category_registry"):
1134
  nw_organism._benchmark_category_registry = {}
1135
  cat_registry = nw_organism._benchmark_category_registry
1136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1137
  N = len(INTERLEAVED_QUESTIONS)
1138
 
1139
  for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
@@ -1278,13 +1327,13 @@ def on_interleaved_benchmark(
1278
  if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
1279
  mat_B[j, i] = 1.0
1280
 
1281
- # ── Heatmap B (category-match — the displayed one) ──
1282
- # Cell (j, i) is bright when turn j's Pith contains ANY node tagged
1283
- # with category[i] in the cross-run registry. Causally valid for all
1284
- # i < j; we don't restrict to same-category (i,j) pairs because the
1285
- # cross-category cells answer a useful question too: "did j leak into
1286
- # i's category?" (low brightness off the diagonal of same-cat pairs
1287
- # = clean separation, which is also good).
1288
  mat_B_cat = np.zeros((N, N))
1289
  for j in range(N):
1290
  pith_set = set(pith_ids_per_turn[j])
@@ -1293,7 +1342,7 @@ def on_interleaved_benchmark(
1293
  continue
1294
  target_cat = categories_per_turn[i]
1295
  for pid in pith_set:
1296
- if cat_registry.get(pid) == target_cat:
1297
  mat_B_cat[j, i] = 1.0
1298
  break
1299
 
@@ -1365,28 +1414,33 @@ def on_interleaved_benchmark(
1365
  # that mat_B.sum() produces (previous reporting conflated the two).
1366
  same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
1367
 
1368
- # Category-match version: for each q2 turn, did its pith contain ANY
1369
- # node tagged with its own category in the cross-run registry? This
1370
- # is the looser, more biologically honest signal — surfacing what you
1371
- # KNOW about the topic, not the specific past instance.
 
1372
  q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
1373
  same_cat_pith_hits = 0
1374
  for j in q2_turns:
1375
  pith_set = set(pith_ids_per_turn[j])
1376
  j_cat = categories_per_turn[j]
1377
- if any(cat_registry.get(pid) == j_cat for pid in pith_set):
1378
  same_cat_pith_hits += 1
1379
  same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
1380
 
1381
  # Off-diagonal "category leak" diagnostic: how often did a q2 pith
1382
- # pull a node from a DIFFERENT category? Lower is cleaner separation.
 
 
1383
  cross_cat_leaks = 0
1384
  for j in q2_turns:
1385
  pith_set = set(pith_ids_per_turn[j])
1386
  j_cat = categories_per_turn[j]
1387
- if any(cat_registry.get(pid) and cat_registry.get(pid) != j_cat
1388
- for pid in pith_set):
1389
- cross_cat_leaks += 1
 
 
1390
 
1391
  # End-state substrate diagnostics — pair with the _start_ values
1392
  # captured at benchmark entry so consumers can confirm both A and B
 
1127
  categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
1128
 
1129
  # Cross-run category registry on the organism. Maps node_id -> category
1130
+ # for deposits this benchmark has tagged. Kept as a diagnostic — its
1131
+ # size proves the persistence path works — but the heatmap no longer
1132
+ # depends on it (Run 15 showed the substrate is stable but pith pulls
1133
+ # nodes that predate the registry, so registry-based tagging can never
1134
+ # match). Lazy-init on first use.
1135
  if not hasattr(nw_organism, "_benchmark_category_registry"):
1136
  nw_organism._benchmark_category_registry = {}
1137
  cat_registry = nw_organism._benchmark_category_registry
1138
 
1139
+ # ── Option G: similarity-based categorization ──────────────────────
1140
+ # Build one centroid per category by averaging the embeddings of all
1141
+ # INTERLEAVED_QUESTIONS prompts in that category (q1 + q2). Then any
1142
+ # node with a stored embedding can be categorized post-hoc by cosine
1143
+ # similarity, regardless of when it was deposited. Replaces the
1144
+ # registry-only logic that couldn't see pre-instrumentation nodes.
1145
+ _category_centroids: dict = {}
1146
+ _CATEGORY_SIM_THRESHOLD = 0.30 # cosine sim floor to assign category
1147
+ try:
1148
+ _per_cat_embs: dict = {}
1149
+ for _cat, _prompt in INTERLEAVED_QUESTIONS:
1150
+ _emb = np.asarray(nw_organism._embed_fn(_prompt), dtype=np.float32)
1151
+ _per_cat_embs.setdefault(_cat, []).append(_emb)
1152
+ for _cat, _embs in _per_cat_embs.items():
1153
+ _centroid = np.mean(_embs, axis=0)
1154
+ _norm = np.linalg.norm(_centroid) + 1e-8
1155
+ _category_centroids[_cat] = _centroid / _norm
1156
+ logger.info("Built %d category centroids for similarity tagging",
1157
+ len(_category_centroids))
1158
+ except Exception as exc:
1159
+ logger.warning("Category centroid build failed: %s", exc)
1160
+
1161
+ def _categorize_node(node_id: str) -> Optional[str]:
1162
+ """Return best-matching category for a node, or None.
1163
+
1164
+ Looks up the node's stored embedding in the organism's side-table,
1165
+ computes cosine similarity to each category centroid, returns the
1166
+ category with maximum similarity if it exceeds the threshold.
1167
+ Threshold prevents off-topic substrate nodes (e.g. residue from
1168
+ unrelated chat sessions) from being shoehorned into a category.
1169
+ """
1170
+ if not _category_centroids:
1171
+ return None
1172
+ emb = nw_organism._embeddings.get(node_id)
1173
+ if emb is None:
1174
+ return None
1175
+ norm = np.linalg.norm(emb) + 1e-8
1176
+ emb_n = emb / norm
1177
+ best_cat = None
1178
+ best_sim = _CATEGORY_SIM_THRESHOLD
1179
+ for cat, cent in _category_centroids.items():
1180
+ sim = float(np.dot(emb_n, cent))
1181
+ if sim > best_sim:
1182
+ best_sim = sim
1183
+ best_cat = cat
1184
+ return best_cat
1185
+
1186
  N = len(INTERLEAVED_QUESTIONS)
1187
 
1188
  for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
 
1327
  if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
1328
  mat_B[j, i] = 1.0
1329
 
1330
+ # ── Heatmap B (category-match via similarity tagging — Option G) ──
1331
+ # Cell (j, i) is bright when turn j's Pith contains ANY node whose
1332
+ # stored embedding cosine-matches category[i]'s centroid above the
1333
+ # threshold. Causally valid for all i < j. The similarity-based
1334
+ # version replaces the prior registry-based logic which could only
1335
+ # see nodes deposited by runs that called the new code path —
1336
+ # invisible against a substrate accumulated over many prior runs.
1337
  mat_B_cat = np.zeros((N, N))
1338
  for j in range(N):
1339
  pith_set = set(pith_ids_per_turn[j])
 
1342
  continue
1343
  target_cat = categories_per_turn[i]
1344
  for pid in pith_set:
1345
+ if _categorize_node(pid) == target_cat:
1346
  mat_B_cat[j, i] = 1.0
1347
  break
1348
 
 
1414
  # that mat_B.sum() produces (previous reporting conflated the two).
1415
  same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
1416
 
1417
+ # Category-match via similarity (Option G): for each q2 turn, did its
1418
+ # pith contain ANY node whose embedding cosine-matches the turn's
1419
+ # category centroid above threshold? This is the metric that actually
1420
+ # answers "is the substrate doing category-coherent retrieval" —
1421
+ # works on the entire substrate, not just nodes the registry has seen.
1422
  q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
1423
  same_cat_pith_hits = 0
1424
  for j in q2_turns:
1425
  pith_set = set(pith_ids_per_turn[j])
1426
  j_cat = categories_per_turn[j]
1427
+ if any(_categorize_node(pid) == j_cat for pid in pith_set):
1428
  same_cat_pith_hits += 1
1429
  same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
1430
 
1431
  # Off-diagonal "category leak" diagnostic: how often did a q2 pith
1432
+ # pull a node tagged with a DIFFERENT category? Lower is cleaner
1433
+ # separation. Untaggable nodes (no embedding, or below threshold) do
1434
+ # not count as leaks.
1435
  cross_cat_leaks = 0
1436
  for j in q2_turns:
1437
  pith_set = set(pith_ids_per_turn[j])
1438
  j_cat = categories_per_turn[j]
1439
+ for pid in pith_set:
1440
+ tagged = _categorize_node(pid)
1441
+ if tagged is not None and tagged != j_cat:
1442
+ cross_cat_leaks += 1
1443
+ break
1444
 
1445
  # End-state substrate diagnostics — pair with the _start_ values
1446
  # captured at benchmark entry so consumers can confirm both A and B