Spaces:

Executor-Tyrant-Framework
/

NuWave

Running

App Files Files Community

Executor-Tyrant-Framework committed on Apr 25

Commit

42aac76

verified ·

1 Parent(s): 0aef8d8

Sync from GitHub: 75c4d61881de90eca28c15d1b5d08cee9f1cd77e

Browse files

Files changed (1) hide show

app.py +74 -20

app.py CHANGED Viewed

@@ -1127,13 +1127,62 @@ def on_interleaved_benchmark(
     categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
     # Cross-run category registry on the organism. Maps node_id -> category
-    # for every deposit this benchmark has ever made on this Space lifetime.
-    # Needed because turn j's pith may pull nodes from PRIOR runs that share
-    # a category but aren't in this run's deposit_ids. Lazy-init on first use.
     if not hasattr(nw_organism, "_benchmark_category_registry"):
         nw_organism._benchmark_category_registry = {}
     cat_registry = nw_organism._benchmark_category_registry
     N = len(INTERLEAVED_QUESTIONS)
     for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
@@ -1278,13 +1327,13 @@ def on_interleaved_benchmark(
             if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
                 mat_B[j, i] = 1.0
-    # ── Heatmap B (category-match — the displayed one) ──
-    # Cell (j, i) is bright when turn j's Pith contains ANY node tagged
-    # with category[i] in the cross-run registry. Causally valid for all
-    # i < j; we don't restrict to same-category (i,j) pairs because the
-    # cross-category cells answer a useful question too: "did j leak into
-    # i's category?" (low brightness off the diagonal of same-cat pairs
-    # = clean separation, which is also good).
     mat_B_cat = np.zeros((N, N))
     for j in range(N):
         pith_set = set(pith_ids_per_turn[j])
@@ -1293,7 +1342,7 @@ def on_interleaved_benchmark(
                 continue
             target_cat = categories_per_turn[i]
             for pid in pith_set:
-                if cat_registry.get(pid) == target_cat:
                     mat_B_cat[j, i] = 1.0
                     break
@@ -1365,28 +1414,33 @@ def on_interleaved_benchmark(
     # that mat_B.sum() produces (previous reporting conflated the two).
     same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
-    # Category-match version: for each q2 turn, did its pith contain ANY
-    # node tagged with its own category in the cross-run registry? This
-    # is the looser, more biologically honest signal — surfacing what you
-    # KNOW about the topic, not the specific past instance.
     q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
     same_cat_pith_hits = 0
     for j in q2_turns:
         pith_set = set(pith_ids_per_turn[j])
         j_cat = categories_per_turn[j]
-        if any(cat_registry.get(pid) == j_cat for pid in pith_set):
             same_cat_pith_hits += 1
     same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
     # Off-diagonal "category leak" diagnostic: how often did a q2 pith
-    # pull a node from a DIFFERENT category? Lower is cleaner separation.
     cross_cat_leaks = 0
     for j in q2_turns:
         pith_set = set(pith_ids_per_turn[j])
         j_cat = categories_per_turn[j]
-        if any(cat_registry.get(pid) and cat_registry.get(pid) != j_cat
-               for pid in pith_set):
-            cross_cat_leaks += 1
     # End-state substrate diagnostics — pair with the _start_ values
     # captured at benchmark entry so consumers can confirm both A and B

     categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]
     # Cross-run category registry on the organism. Maps node_id -> category
+    # for deposits this benchmark has tagged. Kept as a diagnostic — its
+    # size proves the persistence path works — but the heatmap no longer
+    # depends on it (Run 15 showed the substrate is stable but pith pulls
+    # nodes that predate the registry, so registry-based tagging can never
+    # match). Lazy-init on first use.
     if not hasattr(nw_organism, "_benchmark_category_registry"):
         nw_organism._benchmark_category_registry = {}
     cat_registry = nw_organism._benchmark_category_registry
+    # ── Option G: similarity-based categorization ──────────────────────
+    # Build one centroid per category by averaging the embeddings of all
+    # INTERLEAVED_QUESTIONS prompts in that category (q1 + q2). Then any
+    # node with a stored embedding can be categorized post-hoc by cosine
+    # similarity, regardless of when it was deposited. Replaces the
+    # registry-only logic that couldn't see pre-instrumentation nodes.
+    _category_centroids: dict = {}
+    _CATEGORY_SIM_THRESHOLD = 0.30  # cosine sim floor to assign category
+    try:
+        _per_cat_embs: dict = {}
+        for _cat, _prompt in INTERLEAVED_QUESTIONS:
+            _emb = np.asarray(nw_organism._embed_fn(_prompt), dtype=np.float32)
+            _per_cat_embs.setdefault(_cat, []).append(_emb)
+        for _cat, _embs in _per_cat_embs.items():
+            _centroid = np.mean(_embs, axis=0)
+            _norm = np.linalg.norm(_centroid) + 1e-8
+            _category_centroids[_cat] = _centroid / _norm
+        logger.info("Built %d category centroids for similarity tagging",
+                    len(_category_centroids))
+    except Exception as exc:
+        logger.warning("Category centroid build failed: %s", exc)
+    def _categorize_node(node_id: str) -> Optional[str]:
+        """Return best-matching category for a node, or None.
+        Looks up the node's stored embedding in the organism's side-table,
+        computes cosine similarity to each category centroid, returns the
+        category with maximum similarity if it exceeds the threshold.
+        Threshold prevents off-topic substrate nodes (e.g. residue from
+        unrelated chat sessions) from being shoehorned into a category.
+        """
+        if not _category_centroids:
+            return None
+        emb = nw_organism._embeddings.get(node_id)
+        if emb is None:
+            return None
+        norm = np.linalg.norm(emb) + 1e-8
+        emb_n = emb / norm
+        best_cat = None
+        best_sim = _CATEGORY_SIM_THRESHOLD
+        for cat, cent in _category_centroids.items():
+            sim = float(np.dot(emb_n, cent))
+            if sim > best_sim:
+                best_sim = sim
+                best_cat = cat
+        return best_cat
     N = len(INTERLEAVED_QUESTIONS)
     for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
             if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
                 mat_B[j, i] = 1.0
+    # ── Heatmap B (category-match via similarity tagging — Option G) ──
+    # Cell (j, i) is bright when turn j's Pith contains ANY node whose
+    # stored embedding cosine-matches category[i]'s centroid above the
+    # threshold. Causally valid for all i < j. The similarity-based
+    # version replaces the prior registry-based logic which could only
+    # see nodes deposited by runs that called the new code path —
+    # invisible against a substrate accumulated over many prior runs.
     mat_B_cat = np.zeros((N, N))
     for j in range(N):
         pith_set = set(pith_ids_per_turn[j])
                 continue
             target_cat = categories_per_turn[i]
             for pid in pith_set:
+                if _categorize_node(pid) == target_cat:
                     mat_B_cat[j, i] = 1.0
                     break
     # that mat_B.sum() produces (previous reporting conflated the two).
     same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)
+    # Category-match via similarity (Option G): for each q2 turn, did its
+    # pith contain ANY node whose embedding cosine-matches the turn's
+    # category centroid above threshold? This is the metric that actually
+    # answers "is the substrate doing category-coherent retrieval" —
+    # works on the entire substrate, not just nodes the registry has seen.
     q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
     same_cat_pith_hits = 0
     for j in q2_turns:
         pith_set = set(pith_ids_per_turn[j])
         j_cat = categories_per_turn[j]
+        if any(_categorize_node(pid) == j_cat for pid in pith_set):
             same_cat_pith_hits += 1
     same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))
     # Off-diagonal "category leak" diagnostic: how often did a q2 pith
+    # pull a node tagged with a DIFFERENT category? Lower is cleaner
+    # separation. Untaggable nodes (no embedding, or below threshold) do
+    # not count as leaks.
     cross_cat_leaks = 0
     for j in q2_turns:
         pith_set = set(pith_ids_per_turn[j])
         j_cat = categories_per_turn[j]
+        for pid in pith_set:
+            tagged = _categorize_node(pid)
+            if tagged is not None and tagged != j_cat:
+                cross_cat_leaks += 1
+                break
     # End-state substrate diagnostics — pair with the _start_ values
     # captured at benchmark entry so consumers can confirm both A and B