Spaces:
Sleeping
Sleeping
Sync from GitHub: 89b8477d86cc8cc4ab59bdcf327c7c98a303df49
Browse files
app.py
CHANGED
|
@@ -838,6 +838,78 @@ INTERLEAVED_QUESTIONS = [
|
|
| 838 |
_INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
|
| 839 |
|
| 840 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
SAMPLE_CONVERSATIONS = [
|
| 842 |
"What is machine learning?",
|
| 843 |
"How does it differ from traditional programming?",
|
|
@@ -956,7 +1028,7 @@ def on_benchmark(num_turns):
|
|
| 956 |
return json.dumps(summary, indent=2), json.dumps(results, indent=2)
|
| 957 |
|
| 958 |
|
| 959 |
-
def on_interleaved_benchmark(enable_dual_pass: bool = True):
|
| 960 |
"""Run the 4-category interleaved benchmark + build re-ignition heatmaps.
|
| 961 |
|
| 962 |
Runs against the live organism (accumulated state), so re-ignition
|
|
@@ -1005,8 +1077,18 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
|
|
| 1005 |
"Dual-pass DISABLED for this benchmark run "
|
| 1006 |
"(drained %d pending concept entries)", drained_count,
|
| 1007 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1008 |
else:
|
| 1009 |
-
logger.info("Dual-pass ENABLED for this benchmark run")
|
| 1010 |
|
| 1011 |
# Record starting substrate state for fair-comparison diagnostics
|
| 1012 |
_start_stats = nw_organism.get_stats()
|
|
@@ -1254,9 +1336,12 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
|
|
| 1254 |
# captures the actual state. If an exception crashes the benchmark
|
| 1255 |
# mid-flight the extractor stays detached until manual re-wiring
|
| 1256 |
# or Space restart — acceptable for a diagnostic tool.
|
| 1257 |
-
if
|
| 1258 |
nw_organism._concept_extractor = _saved_extractor
|
| 1259 |
-
|
|
|
|
|
|
|
|
|
|
| 1260 |
|
| 1261 |
return (
|
| 1262 |
json.dumps(summary, indent=2),
|
|
@@ -1374,6 +1459,22 @@ with gr.Blocks(
|
|
| 1374 |
)
|
| 1375 |
inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
|
| 1376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1377 |
inter_summary = gr.Code(label="Summary", language="json")
|
| 1378 |
inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
|
| 1379 |
|
|
@@ -1382,11 +1483,17 @@ with gr.Blocks(
|
|
| 1382 |
inter_heatmap_b = gr.Plot(label="Pith Re-selection")
|
| 1383 |
|
| 1384 |
inter_btn.click(
|
| 1385 |
-
on_interleaved_benchmark,
|
| 1386 |
inputs=[inter_enable_dualpass],
|
| 1387 |
outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
|
| 1388 |
)
|
| 1389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1390 |
with gr.Tab("Debug Extract"):
|
| 1391 |
gr.Markdown(
|
| 1392 |
"""
|
|
|
|
| 838 |
_INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]
|
| 839 |
|
| 840 |
|
| 841 |
+
# ── Oracle Trees (experimental ceiling test) ─────────────────────────
|
| 842 |
+
#
|
| 843 |
+
# Hand-authored "ideal" mechanism concepts for each interleaved prompt.
|
| 844 |
+
# Used by the oracle-mode benchmark to establish whether dual-pass CAN
|
| 845 |
+
# succeed given perfect trees — regardless of extractor quality. If
|
| 846 |
+
# oracle-mode ignition metrics dramatically exceed run 3's no-tree
|
| 847 |
+
# baseline (15.3× signal/noise), the extractor is the bottleneck
|
| 848 |
+
# and worth improving. If oracle-mode performs no better than runs
|
| 849 |
+
# 3-9, dual-pass itself is the dead end.
|
| 850 |
+
#
|
| 851 |
+
# Design: each q1 and q2 tree list intentionally shares 1-5 concepts
|
| 852 |
+
# with its same-category partner to maximize re-ignition probability.
|
| 853 |
+
# Example: "prime factorization" appears in BOTH math/q1 and math/q2
|
| 854 |
+
# so it should fire the same tree node on both turns.
|
| 855 |
+
_ORACLE_TREES = {
|
| 856 |
+
# Biology
|
| 857 |
+
"How does photosynthesis work?": [
|
| 858 |
+
"chlorophyll", "photon absorption", "thylakoid membrane",
|
| 859 |
+
"Calvin cycle", "ATP synthesis", "carbon fixation",
|
| 860 |
+
],
|
| 861 |
+
"What role does chlorophyll play in it?": [
|
| 862 |
+
"chlorophyll", "photon absorption", "thylakoid membrane",
|
| 863 |
+
"light-dependent reactions", "green pigment", "photosystem II",
|
| 864 |
+
],
|
| 865 |
+
# Physics
|
| 866 |
+
"What is a black hole?": [
|
| 867 |
+
"event horizon", "Schwarzschild radius", "gravitational collapse",
|
| 868 |
+
"singularity", "escape velocity", "spacetime curvature",
|
| 869 |
+
],
|
| 870 |
+
"How does its event horizon form?": [
|
| 871 |
+
"event horizon", "Schwarzschild radius", "gravitational collapse",
|
| 872 |
+
"spacetime curvature", "escape velocity", "null geodesic",
|
| 873 |
+
],
|
| 874 |
+
# Computing
|
| 875 |
+
"How do CPU cache hierarchies work?": [
|
| 876 |
+
"cache hierarchy", "cache coherency", "memory access latency",
|
| 877 |
+
"cache line", "L1 cache", "L2 cache",
|
| 878 |
+
],
|
| 879 |
+
"Why are L1 caches split into instruction and data?": [
|
| 880 |
+
"L1 cache", "instruction cache", "data cache",
|
| 881 |
+
"cache line", "Harvard architecture", "pipeline parallelism",
|
| 882 |
+
],
|
| 883 |
+
# Math
|
| 884 |
+
"What are prime numbers?": [
|
| 885 |
+
"prime factorization", "integer divisibility", "Euclidean algorithm",
|
| 886 |
+
"fundamental theorem", "modular arithmetic", "prime distribution",
|
| 887 |
+
],
|
| 888 |
+
"Why are they important in cryptography?": [
|
| 889 |
+
"prime factorization", "modular exponentiation", "RSA encryption",
|
| 890 |
+
"discrete logarithm", "trapdoor function", "integer factorization",
|
| 891 |
+
],
|
| 892 |
+
}
|
| 893 |
+
|
| 894 |
+
|
| 895 |
+
def _oracle_concept_extractor(text: str) -> list:
    """Look up the hand-authored concept list for a benchmark prompt.

    This is a pure dictionary lookup against ``_ORACLE_TREES`` keyed on the
    exact prompt string; no LLM call is made. It stands in for the real
    extractor during the oracle-mode benchmark so the ceiling of dual-pass
    performance can be measured.

    Only the interleaved benchmark questions have entries. Any other text
    yields an empty list, since feeding unrelated text through the oracle
    would give misleading results.

    Args:
        text: The prompt to look up.

    Returns:
        A new list holding the lower-cased concepts for ``text``, or an
        empty list when ``text`` has no oracle entry.
    """
    tree = _ORACLE_TREES.get(text, [])
    if tree:
        # Truncate the prompt so the log line stays short.
        logger.info("Oracle extractor: returning %d concepts for %r",
                    len(tree), text[:60])
    else:
        logger.info("Oracle extractor: no entry for prompt, returning []")
    return [concept.lower() for concept in tree]
|
| 911 |
+
|
| 912 |
+
|
| 913 |
SAMPLE_CONVERSATIONS = [
|
| 914 |
"What is machine learning?",
|
| 915 |
"How does it differ from traditional programming?",
|
|
|
|
| 1028 |
return json.dumps(summary, indent=2), json.dumps(results, indent=2)
|
| 1029 |
|
| 1030 |
|
| 1031 |
+
def on_interleaved_benchmark(enable_dual_pass: bool = True, oracle_trees: bool = False):
|
| 1032 |
"""Run the 4-category interleaved benchmark + build re-ignition heatmaps.
|
| 1033 |
|
| 1034 |
Runs against the live organism (accumulated state), so re-ignition
|
|
|
|
| 1077 |
"Dual-pass DISABLED for this benchmark run "
|
| 1078 |
"(drained %d pending concept entries)", drained_count,
|
| 1079 |
)
|
| 1080 |
+
elif oracle_trees:
|
| 1081 |
+
# Oracle mode — swap the LLM extractor for a dict-lookup oracle
|
| 1082 |
+
# that returns hand-authored ideal trees. Tests the ceiling of
|
| 1083 |
+
# dual-pass performance independent of extractor quality.
|
| 1084 |
+
_saved_extractor = nw_organism._concept_extractor
|
| 1085 |
+
nw_organism._concept_extractor = _oracle_concept_extractor
|
| 1086 |
+
logger.info(
|
| 1087 |
+
"ORACLE TREES mode for this benchmark run β using hand-authored "
|
| 1088 |
+
"ideal concepts (%d prompts in oracle dict)", len(_ORACLE_TREES),
|
| 1089 |
+
)
|
| 1090 |
else:
|
| 1091 |
+
logger.info("Dual-pass ENABLED for this benchmark run (LLM extractor)")
|
| 1092 |
|
| 1093 |
# Record starting substrate state for fair-comparison diagnostics
|
| 1094 |
_start_stats = nw_organism.get_stats()
|
|
|
|
| 1336 |
# captures the actual state. If an exception crashes the benchmark
|
| 1337 |
# mid-flight the extractor stays detached until manual re-wiring
|
| 1338 |
# or Space restart — acceptable for a diagnostic tool.
|
| 1339 |
+
if _saved_extractor is not None:
|
| 1340 |
nw_organism._concept_extractor = _saved_extractor
|
| 1341 |
+
if oracle_trees:
|
| 1342 |
+
logger.info("Oracle mode EXITED β LLM extractor restored")
|
| 1343 |
+
else:
|
| 1344 |
+
logger.info("Dual-pass RE-ENABLED after benchmark")
|
| 1345 |
|
| 1346 |
return (
|
| 1347 |
json.dumps(summary, indent=2),
|
|
|
|
| 1459 |
)
|
| 1460 |
inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
|
| 1461 |
|
| 1462 |
+
gr.Markdown(
|
| 1463 |
+
"""
|
| 1464 |
+
**Oracle Trees (ceiling test):** Run once with hand-authored
|
| 1465 |
+
ideal mechanism concepts instead of the LLM extractor. Tests
|
| 1466 |
+
whether dual-pass CAN succeed given perfect trees β regardless
|
| 1467 |
+
of extractor quality. If ignition metrics dramatically exceed
|
| 1468 |
+
the no-tree baseline, the extractor is the bottleneck.
|
| 1469 |
+
If not, dual-pass itself is the dead end. Only works with
|
| 1470 |
+
the 8 interleaved benchmark prompts.
|
| 1471 |
+
"""
|
| 1472 |
+
)
|
| 1473 |
+
oracle_btn = gr.Button(
|
| 1474 |
+
"Run with Oracle Trees (experiment)",
|
| 1475 |
+
variant="secondary",
|
| 1476 |
+
)
|
| 1477 |
+
|
| 1478 |
inter_summary = gr.Code(label="Summary", language="json")
|
| 1479 |
inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
|
| 1480 |
|
|
|
|
| 1483 |
inter_heatmap_b = gr.Plot(label="Pith Re-selection")
|
| 1484 |
|
| 1485 |
inter_btn.click(
|
| 1486 |
+
lambda enable: on_interleaved_benchmark(enable, False),
|
| 1487 |
inputs=[inter_enable_dualpass],
|
| 1488 |
outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
|
| 1489 |
)
|
| 1490 |
|
| 1491 |
+
oracle_btn.click(
|
| 1492 |
+
lambda: on_interleaved_benchmark(True, True),
|
| 1493 |
+
inputs=[],
|
| 1494 |
+
outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
|
| 1495 |
+
)
|
| 1496 |
+
|
| 1497 |
with gr.Tab("Debug Extract"):
|
| 1498 |
gr.Markdown(
|
| 1499 |
"""
|