Sync from GitHub: 75c4d61881de90eca28c15d1b5d08cee9f1cd77e
Browse files
app.py
CHANGED
|
@@ -1127,13 +1127,62 @@ def on_interleaved_benchmark(
|
|
| 1127 |
categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
|
| 1128 |
|
| 1129 |
# Cross-run category registry on the organism. Maps node_id -> category
|
| 1130 |
-
# for
|
| 1131 |
-
#
|
| 1132 |
-
#
|
|
|
|
|
|
|
| 1133 |
if not hasattr(nw_organism, "_benchmark_category_registry"):
|
| 1134 |
nw_organism._benchmark_category_registry = {}
|
| 1135 |
cat_registry = nw_organism._benchmark_category_registry
|
| 1136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1137 |
N = len(INTERLEAVED_QUESTIONS)
|
| 1138 |
|
| 1139 |
for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
|
|
@@ -1278,13 +1327,13 @@ def on_interleaved_benchmark(
|
|
| 1278 |
if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
|
| 1279 |
mat_B[j, i] = 1.0
|
| 1280 |
|
| 1281 |
-
# ── Heatmap B (category-match
|
| 1282 |
-
# Cell (j, i) is bright when turn j's Pith contains ANY node
|
| 1283 |
-
#
|
| 1284 |
-
#
|
| 1285 |
-
#
|
| 1286 |
-
#
|
| 1287 |
-
#
|
| 1288 |
mat_B_cat = np.zeros((N, N))
|
| 1289 |
for j in range(N):
|
| 1290 |
pith_set = set(pith_ids_per_turn[j])
|
|
@@ -1293,7 +1342,7 @@ def on_interleaved_benchmark(
|
|
| 1293 |
continue
|
| 1294 |
target_cat = categories_per_turn[i]
|
| 1295 |
for pid in pith_set:
|
| 1296 |
-
if
|
| 1297 |
mat_B_cat[j, i] = 1.0
|
| 1298 |
break
|
| 1299 |
|
|
@@ -1365,28 +1414,33 @@ def on_interleaved_benchmark(
|
|
| 1365 |
# that mat_B.sum() produces (previous reporting conflated the two).
|
| 1366 |
same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
|
| 1367 |
|
| 1368 |
-
# Category-match
|
| 1369 |
-
#
|
| 1370 |
-
#
|
| 1371 |
-
#
|
|
|
|
| 1372 |
q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
|
| 1373 |
same_cat_pith_hits = 0
|
| 1374 |
for j in q2_turns:
|
| 1375 |
pith_set = set(pith_ids_per_turn[j])
|
| 1376 |
j_cat = categories_per_turn[j]
|
| 1377 |
-
if any(
|
| 1378 |
same_cat_pith_hits += 1
|
| 1379 |
same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
|
| 1380 |
|
| 1381 |
# Off-diagonal "category leak" diagnostic: how often did a q2 pith
|
| 1382 |
-
# pull a node
|
|
|
|
|
|
|
| 1383 |
cross_cat_leaks = 0
|
| 1384 |
for j in q2_turns:
|
| 1385 |
pith_set = set(pith_ids_per_turn[j])
|
| 1386 |
j_cat = categories_per_turn[j]
|
| 1387 |
-
|
| 1388 |
-
|
| 1389 |
-
|
|
|
|
|
|
|
| 1390 |
|
| 1391 |
# End-state substrate diagnostics — pair with the _start_ values
|
| 1392 |
# captured at benchmark entry so consumers can confirm both A and B
|
|
|
|
| 1127 |
categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
|
| 1128 |
|
| 1129 |
# Cross-run category registry on the organism. Maps node_id -> category
|
| 1130 |
+
# for deposits this benchmark has tagged. Kept as a diagnostic — its
|
| 1131 |
+
# size proves the persistence path works — but the heatmap no longer
|
| 1132 |
+
# depends on it (Run 15 showed the substrate is stable but pith pulls
|
| 1133 |
+
# nodes that predate the registry, so registry-based tagging can never
|
| 1134 |
+
# match). Lazy-init on first use.
|
| 1135 |
if not hasattr(nw_organism, "_benchmark_category_registry"):
|
| 1136 |
nw_organism._benchmark_category_registry = {}
|
| 1137 |
cat_registry = nw_organism._benchmark_category_registry
|
| 1138 |
|
| 1139 |
+
# ── Option G: similarity-based categorization ──────────────────────
|
| 1140 |
+
# Build one centroid per category by averaging the embeddings of all
|
| 1141 |
+
# INTERLEAVED_QUESTIONS prompts in that category (q1 + q2). Then any
|
| 1142 |
+
# node with a stored embedding can be categorized post-hoc by cosine
|
| 1143 |
+
# similarity, regardless of when it was deposited. Replaces the
|
| 1144 |
+
# registry-only logic that couldn't see pre-instrumentation nodes.
|
| 1145 |
+
_category_centroids: dict = {}
|
| 1146 |
+
_CATEGORY_SIM_THRESHOLD = 0.30 # cosine sim floor to assign category
|
| 1147 |
+
try:
|
| 1148 |
+
_per_cat_embs: dict = {}
|
| 1149 |
+
for _cat, _prompt in INTERLEAVED_QUESTIONS:
|
| 1150 |
+
_emb = np.asarray(nw_organism._embed_fn(_prompt), dtype=np.float32)
|
| 1151 |
+
_per_cat_embs.setdefault(_cat, []).append(_emb)
|
| 1152 |
+
for _cat, _embs in _per_cat_embs.items():
|
| 1153 |
+
_centroid = np.mean(_embs, axis=0)
|
| 1154 |
+
_norm = np.linalg.norm(_centroid) + 1e-8
|
| 1155 |
+
_category_centroids[_cat] = _centroid / _norm
|
| 1156 |
+
logger.info("Built %d category centroids for similarity tagging",
|
| 1157 |
+
len(_category_centroids))
|
| 1158 |
+
except Exception as exc:
|
| 1159 |
+
logger.warning("Category centroid build failed: %s", exc)
|
| 1160 |
+
|
| 1161 |
+
def _categorize_node(node_id: str) -> Optional[str]:
    """Return the best-matching benchmark category for a node, or None.

    Looks up the node's stored embedding in the organism's side-table
    (``nw_organism._embeddings``), L2-normalizes it, and takes its dot
    product with each normalized category centroid. The category with the
    highest similarity is returned, provided that similarity is strictly
    greater than ``_CATEGORY_SIM_THRESHOLD``. The threshold prevents
    off-topic substrate nodes (e.g. residue from unrelated chat sessions)
    from being shoehorned into a category.

    Args:
        node_id: Key into the organism's embedding side-table.

    Returns:
        The winning category, or None when no centroids were built, the
        node has no stored embedding, or no centroid clears the threshold.
    """
    if not _category_centroids:
        return None
    emb = nw_organism._embeddings.get(node_id)
    if emb is None:
        return None
    # Coerce the same way the centroid builder coerces `_embed_fn` output,
    # so an embedding stored as a plain list/tuple does not raise on the
    # division below.
    emb = np.asarray(emb, dtype=np.float32)
    # Epsilon keeps an all-zero embedding from dividing by zero.
    emb_n = emb / (np.linalg.norm(emb) + 1e-8)
    best_cat = None
    # Seeding with the threshold means only similarities strictly above it
    # can ever be selected.
    best_sim = _CATEGORY_SIM_THRESHOLD
    for cat, cent in _category_centroids.items():
        sim = float(np.dot(emb_n, cent))
        if sim > best_sim:
            best_sim = sim
            best_cat = cat
    return best_cat
|
| 1185 |
+
|
| 1186 |
N = len(INTERLEAVED_QUESTIONS)
|
| 1187 |
|
| 1188 |
for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
|
|
|
|
| 1327 |
if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
|
| 1328 |
mat_B[j, i] = 1.0
|
| 1329 |
|
| 1330 |
+
# ── Heatmap B (category-match via similarity tagging — Option G) ──
|
| 1331 |
+
# Cell (j, i) is bright when turn j's Pith contains ANY node whose
|
| 1332 |
+
# stored embedding cosine-matches category[i]'s centroid above the
|
| 1333 |
+
# threshold. Causally valid for all i < j. The similarity-based
|
| 1334 |
+
# version replaces the prior registry-based logic which could only
|
| 1335 |
+
# see nodes deposited by runs that called the new code path —
|
| 1336 |
+
# invisible against a substrate accumulated over many prior runs.
|
| 1337 |
mat_B_cat = np.zeros((N, N))
|
| 1338 |
for j in range(N):
|
| 1339 |
pith_set = set(pith_ids_per_turn[j])
|
|
|
|
| 1342 |
continue
|
| 1343 |
target_cat = categories_per_turn[i]
|
| 1344 |
for pid in pith_set:
|
| 1345 |
+
if _categorize_node(pid) == target_cat:
|
| 1346 |
mat_B_cat[j, i] = 1.0
|
| 1347 |
break
|
| 1348 |
|
|
|
|
| 1414 |
# that mat_B.sum() produces (previous reporting conflated the two).
|
| 1415 |
same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
|
| 1416 |
|
| 1417 |
+
# Category-match via similarity (Option G): for each q2 turn, did its
|
| 1418 |
+
# pith contain ANY node whose embedding cosine-matches the turn's
|
| 1419 |
+
# category centroid above threshold? This is the metric that actually
|
| 1420 |
+
# answers "is the substrate doing category-coherent retrieval" —
|
| 1421 |
+
# works on the entire substrate, not just nodes the registry has seen.
|
| 1422 |
q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
|
| 1423 |
same_cat_pith_hits = 0
|
| 1424 |
for j in q2_turns:
|
| 1425 |
pith_set = set(pith_ids_per_turn[j])
|
| 1426 |
j_cat = categories_per_turn[j]
|
| 1427 |
+
if any(_categorize_node(pid) == j_cat for pid in pith_set):
|
| 1428 |
same_cat_pith_hits += 1
|
| 1429 |
same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
|
| 1430 |
|
| 1431 |
# Off-diagonal "category leak" diagnostic: how often did a q2 pith
|
| 1432 |
+
# pull a node tagged with a DIFFERENT category? Lower is cleaner
|
| 1433 |
+
# separation. Untaggable nodes (no embedding, or below threshold) do
|
| 1434 |
+
# not count as leaks.
|
| 1435 |
cross_cat_leaks = 0
|
| 1436 |
for j in q2_turns:
|
| 1437 |
pith_set = set(pith_ids_per_turn[j])
|
| 1438 |
j_cat = categories_per_turn[j]
|
| 1439 |
+
for pid in pith_set:
|
| 1440 |
+
tagged = _categorize_node(pid)
|
| 1441 |
+
if tagged is not None and tagged != j_cat:
|
| 1442 |
+
cross_cat_leaks += 1
|
| 1443 |
+
break
|
| 1444 |
|
| 1445 |
# End-state substrate diagnostics — pair with the _start_ values
|
| 1446 |
# captured at benchmark entry so consumers can confirm both A and B
|