Sync from GitHub: 9e469f21bece8d35a8d9e4e26e2d7eb6c8c55efe
Browse files
app.py
CHANGED
|
@@ -1312,27 +1312,60 @@ def on_interleaved_benchmark(
|
|
| 1312 |
resp_nw, in_nw, out_nw, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128)
|
| 1313 |
nw_msgs.append({"role": "assistant", "content": resp_nw})
|
| 1314 |
|
| 1315 |
-
# ── Per-turn correctness signal (Run
|
| 1316 |
-
#
|
| 1317 |
-
#
|
| 1318 |
-
#
|
| 1319 |
-
#
|
| 1320 |
-
# discriminating signal: every co-firing pair got LTP'd equally,
|
| 1321 |
-
# including cross-category contaminations.
|
| 1322 |
#
|
| 1323 |
-
#
|
| 1324 |
-
#
|
| 1325 |
-
#
|
| 1326 |
-
#
|
| 1327 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1328 |
_tagged_total = 0
|
| 1329 |
_tagged_same = 0
|
|
|
|
| 1330 |
for _pid in pith_ids:
|
| 1331 |
_tag = _categorize_node(_pid)
|
| 1332 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1333 |
continue
|
| 1334 |
_tagged_total += 1
|
| 1335 |
-
if
|
|
|
|
| 1336 |
_tagged_same += 1
|
| 1337 |
if _tagged_total >= 2:
|
| 1338 |
_same_cat_ratio = _tagged_same / _tagged_total
|
|
@@ -1439,9 +1472,12 @@ def on_interleaved_benchmark(
|
|
| 1439 |
# Run 31+ correctness-signal telemetry — what we fed the substrate
|
| 1440 |
# via record_outcome's success arg this turn, and the underlying
|
| 1441 |
# same-category proportion. ratio is None when fewer than 2 pith
|
| 1442 |
-
# ids were taggable (cold-start neutral).
|
|
|
|
|
|
|
| 1443 |
"success_signal": success_signal,
|
| 1444 |
"pith_same_cat_ratio": _same_cat_ratio,
|
|
|
|
| 1445 |
})
|
| 1446 |
|
| 1447 |
# ── Heatmap A: ignition-set Jaccard overlap (symmetric) ──
|
|
|
|
| 1312 |
resp_nw, in_nw, out_nw, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128)
|
| 1313 |
nw_msgs.append({"role": "assistant", "content": resp_nw})
|
| 1314 |
|
| 1315 |
+
# ── Per-turn correctness signal (Run 33+) ──────────────────────────
|
| 1316 |
+
# Did this turn's pith pull predominantly USEFUL same-category
|
| 1317 |
+
# content — excluding self-retrievals (pith ids whose embedding is
|
| 1318 |
+
# near-identical to the query, i.e., the substrate handing the query
|
| 1319 |
+
# back at us)?
|
|
|
|
|
|
|
| 1320 |
#
|
| 1321 |
+
# History:
|
| 1322 |
+
# - Run 30: hardcoded success=True → inverted ignition asymmetry
|
| 1323 |
+
# (cross-cat firing harder than same-cat).
|
| 1324 |
+
# - Run 31 (same-cat ratio threshold ≥ 0.5): ignition flipped sign
|
| 1325 |
+
# in one run; token regression collapsed +6.1% → +0.51%.
|
| 1326 |
+
# - Run 32: signal got gamed by question-repetition. Prior-run
|
| 1327 |
+
# deposits of the same query text are same-category-tagged, so
|
| 1328 |
+
# ratio = 1.0 on 5/8 turns. Substrate over-LTP'd at 5× normal
|
| 1329 |
+
# rate (~56K new synapses vs typical ~11K). Token regression
|
| 1330 |
+
# jumped to +12.1%, wall-clock +8.6% slower.
|
| 1331 |
+
#
|
| 1332 |
+
# Self-retrieval gate: for each pith id, cosine similarity between
|
| 1333 |
+
# its embedding and the current query's embedding. If above
|
| 1334 |
+
# _SELF_RETRIEVAL_THRESHOLD (0.92), node is a near-identical text
|
| 1335 |
+
# repeat — counts toward tagged_total but NOT tagged_same. Drives
|
| 1336 |
+
# ratio DOWN for question-repeat-heavy turns, so canonical STDP
|
| 1337 |
+
# depresses self-retrieval synapses via LTD over multiple runs.
|
| 1338 |
+
#
|
| 1339 |
+
# This is a feedback-path correction (refines the reward signal we
|
| 1340 |
+
# feed canonical inject_reward), NOT an extraction-path filter —
|
| 1341 |
+
# pith still goes to the LLM unchanged. Substrate's STDP retrieves
|
| 1342 |
+
# what it retrieves; we only refine our judgement of "did that
|
| 1343 |
+
# help" so the canonical reward channel has accurate ground truth
|
| 1344 |
+
# to learn against.
|
| 1345 |
+
_SELF_RETRIEVAL_THRESHOLD = 0.92
|
| 1346 |
+
_q_emb = np.asarray(nw_organism._embed_fn(prompt_text), dtype=np.float32)
|
| 1347 |
+
_q_norm = float(np.linalg.norm(_q_emb)) + 1e-8
|
| 1348 |
_tagged_total = 0
|
| 1349 |
_tagged_same = 0
|
| 1350 |
+
_self_retrievals = 0
|
| 1351 |
for _pid in pith_ids:
|
| 1352 |
_tag = _categorize_node(_pid)
|
| 1353 |
+
_node_emb = nw_organism._embeddings.get(_pid)
|
| 1354 |
+
_is_self = False
|
| 1355 |
+
if _node_emb is not None:
|
| 1356 |
+
_node_norm = float(np.linalg.norm(_node_emb)) + 1e-8
|
| 1357 |
+
_cos_to_query = float(
|
| 1358 |
+
np.dot(_q_emb, _node_emb) / (_q_norm * _node_norm)
|
| 1359 |
+
)
|
| 1360 |
+
_is_self = _cos_to_query > _SELF_RETRIEVAL_THRESHOLD
|
| 1361 |
+
if _is_self:
|
| 1362 |
+
_self_retrievals += 1
|
| 1363 |
+
# Skip only when BOTH untaggable AND not self-retrieval (no signal)
|
| 1364 |
+
if _tag is None and not _is_self:
|
| 1365 |
continue
|
| 1366 |
_tagged_total += 1
|
| 1367 |
+
# Same-cat credit only if tag matches AND not a self-retrieval
|
| 1368 |
+
if _tag == category and not _is_self:
|
| 1369 |
_tagged_same += 1
|
| 1370 |
if _tagged_total >= 2:
|
| 1371 |
_same_cat_ratio = _tagged_same / _tagged_total
|
|
|
|
| 1472 |
# Run 31+ correctness-signal telemetry — what we fed the substrate
|
| 1473 |
# via record_outcome's success arg this turn, and the underlying
|
| 1474 |
# same-category proportion. ratio is None when fewer than 2 pith
|
| 1475 |
+
# ids were taggable (cold-start neutral). pith_self_retrievals
|
| 1476 |
+
# added Run 33+: count of pith ids with cosine > 0.92 to query
|
| 1477 |
+
# (substrate handing the query back) — these count as misses.
|
| 1478 |
"success_signal": success_signal,
|
| 1479 |
"pith_same_cat_ratio": _same_cat_ratio,
|
| 1480 |
+
"pith_self_retrievals": _self_retrievals,
|
| 1481 |
})
|
| 1482 |
|
| 1483 |
# ── Heatmap A: ignition-set Jaccard overlap (symmetric) ──
|