Executor-Tyrant-Framework committed on
Commit
e4da2b6
·
verified ·
1 Parent(s): 4e714d6

Sync from GitHub: 89b8477d86cc8cc4ab59bdcf327c7c98a303df49

Browse files
Files changed (1) hide show
  1. app.py +112 -5
app.py CHANGED
@@ -838,6 +838,78 @@ INTERLEAVED_QUESTIONS = [
838
  _INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
839
 
840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  SAMPLE_CONVERSATIONS = [
842
  "What is machine learning?",
843
  "How does it differ from traditional programming?",
@@ -956,7 +1028,7 @@ def on_benchmark(num_turns):
956
  return json.dumps(summary, indent=2), json.dumps(results, indent=2)
957
 
958
 
959
- def on_interleaved_benchmark(enable_dual_pass: bool = True):
960
  """Run the 4-category interleaved benchmark + build re-ignition heatmaps.
961
 
962
  Runs against the live organism (accumulated state), so re-ignition
@@ -1005,8 +1077,18 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
1005
  "Dual-pass DISABLED for this benchmark run "
1006
  "(drained %d pending concept entries)", drained_count,
1007
  )
 
 
 
 
 
 
 
 
 
 
1008
  else:
1009
- logger.info("Dual-pass ENABLED for this benchmark run")
1010
 
1011
  # Record starting substrate state for fair-comparison diagnostics
1012
  _start_stats = nw_organism.get_stats()
@@ -1254,9 +1336,12 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
1254
  # captures the actual state. If an exception crashes the benchmark
1255
  # mid-flight the extractor stays detached until manual re-wiring
1256
  # or Space restart — acceptable for a diagnostic tool.
1257
- if not enable_dual_pass and _saved_extractor is not None:
1258
  nw_organism._concept_extractor = _saved_extractor
1259
- logger.info("Dual-pass RE-ENABLED after benchmark")
 
 
 
1260
 
1261
  return (
1262
  json.dumps(summary, indent=2),
@@ -1374,6 +1459,22 @@ with gr.Blocks(
1374
  )
1375
  inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
1376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1377
  inter_summary = gr.Code(label="Summary", language="json")
1378
  inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
1379
 
@@ -1382,11 +1483,17 @@ with gr.Blocks(
1382
  inter_heatmap_b = gr.Plot(label="Pith Re-selection")
1383
 
1384
  inter_btn.click(
1385
- on_interleaved_benchmark,
1386
  inputs=[inter_enable_dualpass],
1387
  outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
1388
  )
1389
 
 
 
 
 
 
 
1390
  with gr.Tab("Debug Extract"):
1391
  gr.Markdown(
1392
  """
 
838
  _INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
839
 
840
 
841
+ # ── Oracle Trees (experimental ceiling test) ─────────────────────────
842
+ #
843
+ # Hand-authored "ideal" mechanism concepts for each interleaved prompt.
844
+ # Used by the oracle-mode benchmark to establish whether dual-pass CAN
845
+ # succeed given perfect trees — regardless of extractor quality. If
846
+ # oracle-mode ignition metrics dramatically exceed run 3's no-tree
847
+ # baseline (15.3× signal/noise), the extractor is the bottleneck
848
+ # and worth improving. If oracle-mode performs no better than runs
849
+ # 3-9, dual-pass itself is the dead end.
850
+ #
851
+ # Design: each q1 and q2 tree list intentionally shares 1-5 concepts
852
+ # with its same-category partner to maximize re-ignition probability.
853
+ # Example: "prime factorization" appears in BOTH math/q1 and math/q2
854
+ # so it should fire the same tree node on both turns.
855
+ _ORACLE_TREES = {
856
+ # Biology
857
+ "How does photosynthesis work?": [
858
+ "chlorophyll", "photon absorption", "thylakoid membrane",
859
+ "Calvin cycle", "ATP synthesis", "carbon fixation",
860
+ ],
861
+ "What role does chlorophyll play in it?": [
862
+ "chlorophyll", "photon absorption", "thylakoid membrane",
863
+ "light-dependent reactions", "green pigment", "photosystem II",
864
+ ],
865
+ # Physics
866
+ "What is a black hole?": [
867
+ "event horizon", "Schwarzschild radius", "gravitational collapse",
868
+ "singularity", "escape velocity", "spacetime curvature",
869
+ ],
870
+ "How does its event horizon form?": [
871
+ "event horizon", "Schwarzschild radius", "gravitational collapse",
872
+ "spacetime curvature", "escape velocity", "null geodesic",
873
+ ],
874
+ # Computing
875
+ "How do CPU cache hierarchies work?": [
876
+ "cache hierarchy", "cache coherency", "memory access latency",
877
+ "cache line", "L1 cache", "L2 cache",
878
+ ],
879
+ "Why are L1 caches split into instruction and data?": [
880
+ "L1 cache", "instruction cache", "data cache",
881
+ "cache line", "Harvard architecture", "pipeline parallelism",
882
+ ],
883
+ # Math
884
+ "What are prime numbers?": [
885
+ "prime factorization", "integer divisibility", "Euclidean algorithm",
886
+ "fundamental theorem", "modular arithmetic", "prime distribution",
887
+ ],
888
+ "Why are they important in cryptography?": [
889
+ "prime factorization", "modular exponentiation", "RSA encryption",
890
+ "discrete logarithm", "trapdoor function", "integer factorization",
891
+ ],
892
+ }
893
+
894
+
895
+ def _oracle_concept_extractor(text: str) -> list:
896
+ """Return hand-authored ideal trees for interleaved benchmark prompts.
897
+
898
+ Oracle extraction: lookup-only, no LLM call. Used by the oracle-mode
899
+ benchmark to establish the ceiling of dual-pass performance. For
900
+ prompts NOT in the oracle dict, returns empty list (oracle mode only
901
+ supports the interleaved benchmark questions — running other text
902
+ through this would give misleading results).
903
+ """
904
+ concepts = _ORACLE_TREES.get(text, [])
905
+ if not concepts:
906
+ logger.info("Oracle extractor: no entry for prompt, returning []")
907
+ else:
908
+ logger.info("Oracle extractor: returning %d concepts for %r",
909
+ len(concepts), text[:60])
910
+ return [c.lower() for c in concepts]
911
+
912
+
913
  SAMPLE_CONVERSATIONS = [
914
  "What is machine learning?",
915
  "How does it differ from traditional programming?",
 
1028
  return json.dumps(summary, indent=2), json.dumps(results, indent=2)
1029
 
1030
 
1031
+ def on_interleaved_benchmark(enable_dual_pass: bool = True, oracle_trees: bool = False):
1032
  """Run the 4-category interleaved benchmark + build re-ignition heatmaps.
1033
 
1034
  Runs against the live organism (accumulated state), so re-ignition
 
1077
  "Dual-pass DISABLED for this benchmark run "
1078
  "(drained %d pending concept entries)", drained_count,
1079
  )
1080
+ elif oracle_trees:
1081
+ # Oracle mode — swap the LLM extractor for a dict-lookup oracle
1082
+ # that returns hand-authored ideal trees. Tests the ceiling of
1083
+ # dual-pass performance independent of extractor quality.
1084
+ _saved_extractor = nw_organism._concept_extractor
1085
+ nw_organism._concept_extractor = _oracle_concept_extractor
1086
+ logger.info(
1087
+ "ORACLE TREES mode for this benchmark run β€” using hand-authored "
1088
+ "ideal concepts (%d prompts in oracle dict)", len(_ORACLE_TREES),
1089
+ )
1090
  else:
1091
+ logger.info("Dual-pass ENABLED for this benchmark run (LLM extractor)")
1092
 
1093
  # Record starting substrate state for fair-comparison diagnostics
1094
  _start_stats = nw_organism.get_stats()
 
1336
  # captures the actual state. If an exception crashes the benchmark
1337
  # mid-flight the extractor stays detached until manual re-wiring
1338
  # or Space restart — acceptable for a diagnostic tool.
1339
+ if _saved_extractor is not None:
1340
  nw_organism._concept_extractor = _saved_extractor
1341
+ if oracle_trees:
1342
+ logger.info("Oracle mode EXITED β€” LLM extractor restored")
1343
+ else:
1344
+ logger.info("Dual-pass RE-ENABLED after benchmark")
1345
 
1346
  return (
1347
  json.dumps(summary, indent=2),
 
1459
  )
1460
  inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
1461
 
1462
+ gr.Markdown(
1463
+ """
1464
+ **Oracle Trees (ceiling test):** Run once with hand-authored
1465
+ ideal mechanism concepts instead of the LLM extractor. Tests
1466
+ whether dual-pass CAN succeed given perfect trees β€” regardless
1467
+ of extractor quality. If ignition metrics dramatically exceed
1468
+ the no-tree baseline, the extractor is the bottleneck.
1469
+ If not, dual-pass itself is the dead end. Only works with
1470
+ the 8 interleaved benchmark prompts.
1471
+ """
1472
+ )
1473
+ oracle_btn = gr.Button(
1474
+ "Run with Oracle Trees (experiment)",
1475
+ variant="secondary",
1476
+ )
1477
+
1478
  inter_summary = gr.Code(label="Summary", language="json")
1479
  inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
1480
 
 
1483
  inter_heatmap_b = gr.Plot(label="Pith Re-selection")
1484
 
1485
  inter_btn.click(
1486
+ lambda enable: on_interleaved_benchmark(enable, False),
1487
  inputs=[inter_enable_dualpass],
1488
  outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
1489
  )
1490
 
1491
+ oracle_btn.click(
1492
+ lambda: on_interleaved_benchmark(True, True),
1493
+ inputs=[],
1494
+ outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
1495
+ )
1496
+
1497
  with gr.Tab("Debug Extract"):
1498
  gr.Markdown(
1499
  """