Spaces:

Executor-Tyrant-Framework
/

NuWave

Sleeping

App Files Files Community

Executor-Tyrant-Framework committed on Apr 21

Commit

e4da2b6

verified ·

1 Parent(s): 4e714d6

Sync from GitHub: 89b8477d86cc8cc4ab59bdcf327c7c98a303df49

Browse files

Files changed (1) hide show

app.py +112 -5

app.py CHANGED Viewed

@@ -838,6 +838,78 @@ INTERLEAVED_QUESTIONS = [
 _INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
 SAMPLE_CONVERSATIONS = [
     "What is machine learning?",
     "How does it differ from traditional programming?",
@@ -956,7 +1028,7 @@ def on_benchmark(num_turns):
     return json.dumps(summary, indent=2), json.dumps(results, indent=2)
-def on_interleaved_benchmark(enable_dual_pass: bool = True):
     """Run the 4-category interleaved benchmark + build re-ignition heatmaps.
     Runs against the live organism (accumulated state), so re-ignition
@@ -1005,8 +1077,18 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
             "Dual-pass DISABLED for this benchmark run "
             "(drained %d pending concept entries)", drained_count,
         )
     else:
-        logger.info("Dual-pass ENABLED for this benchmark run")
     # Record starting substrate state for fair-comparison diagnostics
     _start_stats = nw_organism.get_stats()
@@ -1254,9 +1336,12 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
     # captures the actual state. If an exception crashes the benchmark
     # mid-flight the extractor stays detached until manual re-wiring
     # or Space restart — acceptable for a diagnostic tool.
-    if not enable_dual_pass and _saved_extractor is not None:
         nw_organism._concept_extractor = _saved_extractor
-        logger.info("Dual-pass RE-ENABLED after benchmark")
     return (
         json.dumps(summary, indent=2),
@@ -1374,6 +1459,22 @@ with gr.Blocks(
                 )
                 inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
             inter_summary = gr.Code(label="Summary", language="json")
             inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
@@ -1382,11 +1483,17 @@ with gr.Blocks(
                 inter_heatmap_b = gr.Plot(label="Pith Re-selection")
             inter_btn.click(
-                on_interleaved_benchmark,
                 inputs=[inter_enable_dualpass],
                 outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
             )
         with gr.Tab("Debug Extract"):
             gr.Markdown(
                 """

 _INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
+# ── Oracle Trees (experimental ceiling test) ─────────────────────────
+#
+# Hand-authored "ideal" mechanism concepts for the 8 interleaved prompts,
+# keyed by the prompt string that _oracle_concept_extractor looks up.
+# The oracle-mode benchmark uses them to test whether dual-pass CAN
+# succeed given perfect trees, regardless of extractor quality: if its
+# ignition metrics dramatically exceed run 3's no-tree baseline (15.3×
+# signal/noise), the extractor is the bottleneck and worth improving; if
+# it performs no better than runs 3-9, dual-pass itself is the dead end.
+#
+# Design: same-category q1/q2 lists intentionally share concepts (3 for
+# biology, 5 for physics, 2 for computing, 1 for math) to maximize
+# re-ignition probability. Example: "prime factorization" is in BOTH math
+# lists, so it should fire the same tree node on both turns.
+_ORACLE_TREES = {
+    # Biology
+    "How does photosynthesis work?": [
+        "chlorophyll", "photon absorption", "thylakoid membrane",
+        "Calvin cycle", "ATP synthesis", "carbon fixation",
+    ],
+    "What role does chlorophyll play in it?": [
+        "chlorophyll", "photon absorption", "thylakoid membrane",
+        "light-dependent reactions", "green pigment", "photosystem II",
+    ],
+    # Physics
+    "What is a black hole?": [
+        "event horizon", "Schwarzschild radius", "gravitational collapse",
+        "singularity", "escape velocity", "spacetime curvature",
+    ],
+    "How does its event horizon form?": [
+        "event horizon", "Schwarzschild radius", "gravitational collapse",
+        "spacetime curvature", "escape velocity", "null geodesic",
+    ],
+    # Computing
+    "How do CPU cache hierarchies work?": [
+        "cache hierarchy", "cache coherency", "memory access latency",
+        "cache line", "L1 cache", "L2 cache",
+    ],
+    "Why are L1 caches split into instruction and data?": [
+        "L1 cache", "instruction cache", "data cache",
+        "cache line", "Harvard architecture", "pipeline parallelism",
+    ],
+    # Math
+    "What are prime numbers?": [
+        "prime factorization", "integer divisibility", "Euclidean algorithm",
+        "fundamental theorem", "modular arithmetic", "prime distribution",
+    ],
+    "Why are they important in cryptography?": [
+        "prime factorization", "modular exponentiation", "RSA encryption",
+        "discrete logarithm", "trapdoor function", "integer factorization",
+    ],
+}
+def _oracle_concept_extractor(text: str) -> list:
+    """Return hand-authored ideal trees for interleaved benchmark prompts.
+
+    Oracle extraction: lookup-only, no LLM call. Used by the oracle-mode
+    benchmark to establish the ceiling of dual-pass performance. For
+    prompts NOT in the oracle dict, returns empty list (oracle mode only
+    supports the interleaved benchmark questions — running other text
+    through this would give misleading results).
+
+    Args:
+        text: Prompt to look up in _ORACLE_TREES. An exact match is tried
+            first; if that misses, the prompt is retried with surrounding
+            whitespace stripped.
+
+    Returns:
+        A new list of the prompt's concepts, lower-cased, or [] when the
+        prompt has no oracle entry.
+    """
+    concepts = _ORACLE_TREES.get(text)
+    if concepts is None and isinstance(text, str):
+        # A stray space/newline around the prompt must not silently turn
+        # an oracle turn into a no-tree turn and skew the ceiling test.
+        concepts = _ORACLE_TREES.get(text.strip())
+    if not concepts:
+        # Log which prompt missed: a miss is the case that needs diagnosing.
+        logger.info(
+            "Oracle extractor: no entry for prompt %r, returning []",
+            text[:60] if isinstance(text, str) else text,
+        )
+        return []
+    logger.info("Oracle extractor: returning %d concepts for %r",
+                len(concepts), text[:60])
+    # Build a fresh list so callers cannot mutate the shared oracle data.
+    return [c.lower() for c in concepts]
 SAMPLE_CONVERSATIONS = [
     "What is machine learning?",
     "How does it differ from traditional programming?",
     return json.dumps(summary, indent=2), json.dumps(results, indent=2)
+def on_interleaved_benchmark(enable_dual_pass: bool = True, oracle_trees: bool = False):
     """Run the 4-category interleaved benchmark + build re-ignition heatmaps.
     Runs against the live organism (accumulated state), so re-ignition
             "Dual-pass DISABLED for this benchmark run "
             "(drained %d pending concept entries)", drained_count,
         )
+    elif oracle_trees:
+        # Oracle mode — swap the LLM extractor for a dict-lookup oracle
+        # that returns hand-authored ideal trees. Tests the ceiling of
+        # dual-pass performance independent of extractor quality.
+        _saved_extractor = nw_organism._concept_extractor
+        nw_organism._concept_extractor = _oracle_concept_extractor
+        logger.info(
+            "ORACLE TREES mode for this benchmark run — using hand-authored "
+            "ideal concepts (%d prompts in oracle dict)", len(_ORACLE_TREES),
+        )
     else:
+        logger.info("Dual-pass ENABLED for this benchmark run (LLM extractor)")
     # Record starting substrate state for fair-comparison diagnostics
     _start_stats = nw_organism.get_stats()
     # captures the actual state. If an exception crashes the benchmark
     # mid-flight the extractor stays detached (or, in oracle mode,
     # swapped for the oracle lookup) until manual re-wiring or Space
     # restart — acceptable for a diagnostic tool.
+    if _saved_extractor is not None:
         nw_organism._concept_extractor = _saved_extractor
+        if oracle_trees:
+            logger.info("Oracle mode EXITED — LLM extractor restored")
+        else:
+            logger.info("Dual-pass RE-ENABLED after benchmark")
     return (
         json.dumps(summary, indent=2),
                 )
                 inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
+            gr.Markdown(
+                """
+                **Oracle Trees (ceiling test):** Run once with hand-authored
+                ideal mechanism concepts instead of the LLM extractor. Tests
+                whether dual-pass CAN succeed given perfect trees — regardless
+                of extractor quality. If ignition metrics dramatically exceed
+                the no-tree baseline, the extractor is the bottleneck.
+                If not, dual-pass itself is the dead end. Only works with
+                the 8 interleaved benchmark prompts.
+                """
+            )
+            oracle_btn = gr.Button(
+                "Run with Oracle Trees (experiment)",
+                variant="secondary",
+            )
             inter_summary = gr.Code(label="Summary", language="json")
             inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
                 inter_heatmap_b = gr.Plot(label="Pith Re-selection")
             inter_btn.click(
+                lambda enable: on_interleaved_benchmark(enable, False),
                 inputs=[inter_enable_dualpass],
                 outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
             )
+            oracle_btn.click(
+                lambda: on_interleaved_benchmark(True, True),
+                inputs=[],
+                outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
+            )
         with gr.Tab("Debug Extract"):
             gr.Markdown(
                 """