Spaces:
Sleeping
Sleeping
| """ | |
| NuWave — HuggingFace Spaces Demo | |
| The organism. NeuroGraph substrate + KISS bucket + Pith bucket + | |
| Splat-Lenia + BitNet model. On CPU. Gets smarter over time. | |
| # ---- Changelog ---- | |
| # [2026-05-16] Claude Opus 4.7 (1M ctx) — Pith → user-turn labeled context block (3 sites) | |
| # Root cause of Run 48's 24/24 degenerate BitNet output: pith was being | |
| # injected into the system role slot via sys_ctx = "\n".join(pith) + sys. | |
| # Chat-tuned models are trained with the system slot carrying instructions, | |
| # not lists of prior user questions. BitNet was treating retrieved pith | |
| # as questions to respond to, echoing them back and leaking chat-template | |
| # fragments. Fix: pith content now goes in a clearly-labeled context block | |
| # inside the LAST user turn for THIS turn's prompt only; bare user query | |
| # persists in nw_msgs/messages_nw so next turn's recent_window isn't | |
| # polluted. System slot stays canonical (instructions only). Plain-text | |
| # labels, no delimiter tokens (per feedback_pith_presentation_layer memory). | |
| # Three call sites updated identically: on_send live chat, first benchmark | |
| # loop, interleaved benchmark loop. Universal RAG pattern — applies to | |
| # any future LLM consumer of substrate-surfaced content. | |
| # [2026-04-06] Claude Code (Opus 4.6) — Full NeuroGraph organism integration | |
| # [2026-03-31] Claude Code (Opus 4.6) — Switch to BitNet 2B for CPU-native inference | |
| # [2026-03-29] Claude Code (Opus 4.6) — ZeroGPU compatible, model at startup | |
| # [2026-03-28] Claude Code (Opus 4.6) — Initial Gradio demo | |
| # ------------------- | |
| """ | |
| import os | |
| import time | |
| import gradio as gr | |
| import json | |
| import logging | |
| import torch # still needed for splat_engine + lenia_splat | |
| from typing import Optional | |
| from transformers import AutoTokenizer | |
try:
    import spaces
except ImportError:
    class _FakeSpaces:
        """No-op stand-in used when the HF ``spaces`` package is absent."""

        # Must be a staticmethod: ``spaces`` below is an *instance*, so a
        # plain function would receive the instance as ``fn`` and break
        # both ``@spaces.GPU`` and ``@spaces.GPU(duration=...)``.
        @staticmethod
        def GPU(fn=None, **kwargs):
            """Mimic ``spaces.GPU``: usable bare or with keyword options.

            Returns ``fn`` unchanged when used as ``@spaces.GPU``, or an
            identity decorator when used as ``@spaces.GPU(...)``.
            """
            if callable(fn):
                return fn
            return lambda f: f

    spaces = _FakeSpaces()
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger("nuwave") | |
| # ── bitnet.cpp inference clients ────────────────────────────────── | |
| # Two clients via bitnet.cpp (microsoft/BitNet's llama.cpp derivative): | |
| # chat_client — BitNet b1.58 2B4T GGUF (fast, CPU-native, user-facing) | |
| # extractor_client — Falcon3-10B-Instruct 1.58bit GGUF (capable | |
| # enumeration — doesn't collapse on concept lists) | |
| # | |
| # Paths set in Dockerfile via env vars. Falcon3 GGUF dir contains one | |
| # or more quant levels; we pick the largest via resolve_gguf(). | |
| from nuwave.bitnet_cpp_client import BitnetCppClient | |
# Runtime configuration. Every path can be overridden from the
# environment; the fallbacks are the in-container locations.
HF_TOKEN = os.getenv("HF_TOKEN")
BITNET_BINARY = os.getenv(
    "BITNET_CPP_BINARY", "/home/user/bitnet/build/bin/llama-cli"
)
BITNET_CHAT_GGUF_DIR = os.getenv(
    "BITNET_CHAT_GGUF_DIR", "/home/user/bitnet/models"
)
FALCON_EXTRACTOR_GGUF_DIR = os.getenv(
    "FALCON_EXTRACTOR_GGUF_DIR", "/home/user/models/falcon3-10b-gguf"
)
# Tokenizer used purely for token counting. Benchmarks report in/out
# counts for both the baseline and the NuWave path, and both are counted
# with BitNet's tokenizer because the baseline is notional-BitNet too.
# MODEL_NAME is kept as an alias for the summary fields that report it.
MODEL_NAME = CHAT_MODEL_NAME = "microsoft/bitnet-b1.58-2B-4T-bf16"
| # Concept-extractor grammar: loaded inline from the .gbnf file on | |
| # startup and passed as --grammar string to llama-cli. Inline avoids | |
| # container filesystem path surprises (run 6's silent failure mode) | |
| # and puts the grammar content in-log for diagnostic visibility. | |
# The grammar file lives in ./grammars next to this module.
_EXTRACTOR_GRAMMAR_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "grammars", "concepts.gbnf"
)
# Stays None when the file cannot be read; the extractor then runs
# without grammar constraints.
_EXTRACTOR_GRAMMAR: Optional[str] = None
try:
    with open(_EXTRACTOR_GRAMMAR_PATH, "r", encoding="utf-8") as _grammar_file:
        _EXTRACTOR_GRAMMAR = _grammar_file.read()
except Exception as exc:
    logger.warning(
        "Failed to load extractor grammar from %s: %s — "
        "extractor will fall back to free-form output",
        _EXTRACTOR_GRAMMAR_PATH, exc,
    )
else:
    # Log size details so a truncated or empty grammar is visible in-log.
    logger.info(
        "Extractor grammar loaded from %s — %d chars, %d lines",
        _EXTRACTOR_GRAMMAR_PATH,
        len(_EXTRACTOR_GRAMMAR),
        _EXTRACTOR_GRAMMAR.count("\n"),
    )
# Tokenizer is only used for counting tokens, never for inference.
logger.info("Loading tokenizer for token counting: %s", CHAT_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME, token=HF_TOKEN)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

logger.info("Resolving GGUF weights...")
chat_gguf = BitnetCppClient.resolve_gguf(BITNET_CHAT_GGUF_DIR)
falcon_gguf = BitnetCppClient.resolve_gguf(FALCON_EXTRACTOR_GGUF_DIR)

# Both clients share the same binary and runtime limits.
# n_ctx=4092: the llama-cli binary reports "max 4092" even when told
# 4096 in config (4-token overhead reserved by the runtime). Run-10
# logs showed repeated crashes with "prompt is too long (4103 tokens,
# max 4092)" on turns 6-8, so our cap is aligned with what the binary
# actually allows.
_client_opts = dict(binary_path=BITNET_BINARY, n_threads=2, n_ctx=4092)

logger.info("Initializing chat client (BitNet 2B4T GGUF)...")
chat_client = BitnetCppClient(gguf_path=chat_gguf, **_client_opts)

logger.info("Initializing extractor client (Falcon3-10B 1.58bit GGUF)...")
extractor_client = BitnetCppClient(gguf_path=falcon_gguf, **_client_opts)

logger.info(
    "Both clients ready. chat=%s | extractor=%s",
    os.path.basename(chat_gguf),
    os.path.basename(falcon_gguf),
)
| # ── NuWave Components ───────────────────────────────────────────── | |
| from nuwave.organism import NuWaveOrganism | |
| from nuwave.kiss import KISSFilter, KISSConfig | |
| from nuwave.pith import PithPipeline, PithConfig | |
| from nuwave.benchmark_loader import sample_chains as _sample_benchmark_chains | |
| from nuwave.benchmark_loader import describe_sample as _describe_benchmark_sample | |
| from nuwave.benchmark_loader import load_pool as _load_benchmark_pool | |
| from nuwave.splat_engine import decompose_layer, SplatConfig, GaussianSplats | |
| from nuwave.lenia_splat import LeniaSplatEngine, LeniaSplatConfig | |
| # The organism — substrate + KISS bucket + Pith bucket | |
| # Use /data/ for persistence if available (HF persistent storage), else /tmp/ | |
# Prefer the /data mount when it exists; otherwise fall back to /tmp.
if os.path.isdir("/data"):
    _persist_dir = "/data/nuwave_substrate"
else:
    _persist_dir = "/tmp/nuwave_substrate"
organism = NuWaveOrganism(state_path=_persist_dir)

# String-level KISS still runs alongside for comparison
kiss_nw = KISSFilter()
pith_nw = PithPipeline()

# Separate message histories for the NuWave path and the baseline path.
messages_nw, messages_bl = [], []
system_prompt = "You are a helpful assistant. Be concise and clear."
| # ── Splat-Lenia Setup ──────────────────────────────────────────── | |
| # Decompose a few attention layers to splats at startup. | |
| # Lenia evolves them between turns. The compression is alive. | |
# Decomposition settings, tuned for a fast CPU-basic startup.
_splat_params = dict(
    splat_ratio=0.02,     # 50x compression — aggressive but fast to fit
    max_splats=256,       # small enough for CPU-basic startup
    init_sigma=2.0,
    fit_iterations=50,    # fewer iterations — speed over precision at startup
    fit_lr=0.02,
)
splat_config = SplatConfig(**_splat_params)

# Lenia dynamics applied to the splats between turns.
_lenia_params = dict(
    growth_mu=0.15,
    growth_sigma=0.015,
    growth_scale=0.0003,  # small dt — proven stable
    interaction_radius=5.0,
    activation_coupling=2.0,
    conserve_mass=True,
)
lenia_config = LeniaSplatConfig(**_lenia_params)
lenia_engine = LeniaSplatEngine(lenia_config)

# Registered layers (name -> splats) and the per-step metrics log.
splat_layers = dict()
splat_metrics_history = list()

# Splat setup is deferred to first use to avoid a memory spike during
# startup. Splat state persists to disk so Lenia evolution survives
# restarts.
_splats_initialized = False
_splat_save_path = os.path.join(_persist_dir, "splat_state.pt")
def _init_splats_if_needed():
    """Restore persisted splat state on first call; no-op afterwards.

    If a saved state file exists at ``_splat_save_path`` its layers are
    loaded into ``splat_layers`` and registered with ``lenia_engine``.
    When there is nothing to restore (or the restore fails) the function
    only logs: fresh decomposition is unavailable under the bitnet.cpp
    runtime because there is no in-process torch model to read weights
    from.
    """
    global _splats_initialized
    if _splats_initialized:
        return
    # Flip the flag before doing any work so a failed restore is attempted
    # once, not on every generation call.
    _splats_initialized = True
    import gc
    gc.collect()

    # Try to restore persisted splat state first
    if os.path.exists(_splat_save_path):
        try:
            # NOTE(security): weights_only=False unpickles arbitrary
            # objects. This is only acceptable because the file is written
            # by _save_splat_state in our own persistence directory —
            # never point this at untrusted data.
            saved = torch.load(_splat_save_path, weights_only=False)
            # Deserialize every layer BEFORE touching shared state so a
            # corrupt or schema-mismatched entry cannot leave a partial
            # set of layers registered.
            restored = {
                name: GaussianSplats.from_state_dict(sd)
                for name, sd in saved.get('layers', {}).items()
            }
            for name, splats in restored.items():
                splat_layers[name] = splats
                lenia_engine.register_layer(name, splats)
            lenia_step_count = saved.get('lenia_steps', 0)
            logger.info(
                "Splats restored: %s layers, %s splats, %s Lenia steps evolved",
                len(splat_layers),
                sum(s.n_splats for s in splat_layers.values()),
                lenia_step_count,
            )
            return
        except Exception as exc:
            logger.warning("Splat restore failed (redecomposing): %s", exc)

    # Fresh decomposition requires the bf16 model in memory. Since the
    # migration to bitnet.cpp (GGUF, external C++ runtime) there's no
    # in-process torch model to read weights from. Splat-Lenia is
    # experimental Layer 2 work that's off the critical path for the
    # current dual-pass Layer 1 validation — gracefully skip if the
    # persisted splat state isn't already present.
    logger.info(
        "Fresh splat decomposition unavailable under bitnet.cpp runtime "
        "(no torch model to read weights from). Splat-Lenia remains "
        "active only if persisted state is restored from a prior era."
    )
def _save_splat_state():
    """Persist evolved splat parameters to disk (best-effort, atomic).

    Does nothing when no layers are registered. The state is written to
    a sibling temp file and then moved over ``_splat_save_path`` with
    ``os.replace`` so a crash or kill mid-write cannot leave a truncated
    file for the next startup to choke on. Failures are logged at debug
    level and never raised.
    """
    if not splat_layers:
        return
    tmp_path = _splat_save_path + ".tmp"
    try:
        os.makedirs(os.path.dirname(_splat_save_path), exist_ok=True)
        state = {
            'layers': {name: splats.state_dict() for name, splats in splat_layers.items()},
            'lenia_steps': lenia_engine.state.step_count,
        }
        torch.save(state, tmp_path)
        # Atomic swap: readers see either the old or the new file.
        os.replace(tmp_path, _splat_save_path)
    except Exception as exc:
        logger.debug(f"Splat save failed: {exc}")
        # Don't leave a half-written temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
| # ── Inference ───────────────────────────────────────────────────── | |
def do_generate(prompt_text: str, max_new_tokens: int = 256) -> tuple:
    """Generate a chat response via bitnet.cpp, then advance Lenia one step.

    The generation call is bracketed by
    ``organism.mark_generation_start()`` / ``mark_generation_end()`` so
    the concept helper does not spawn a worker mid-inference.

    Returns a 6-tuple:
        (response, in_count, out_count, elapsed_s, tokens_per_s, lenia_result)
    where ``elapsed_s`` covers the whole call (tokenization, generation
    and the Lenia step) and ``lenia_result`` is ``{}`` when no splat
    layers are registered or the step failed.
    """
    _init_splats_if_needed()
    started = time.time()

    # Budget the prompt below the runtime's n_ctx: reserve room for the
    # generation itself plus a 256-token safety margin. The margin covers
    # the double-BOS problem (both apply_chat_template and llama-cli
    # inject a BOS), chat-template control tokens, and tokenizer slack.
    # A 128-token margin previously still produced "prompt too long
    # (4103 tokens, max 4092)" crashes.
    prompt_cap = max(256, chat_client.n_ctx - (max_new_tokens + 256))

    # Plain python-list token ids are enough — only the count is needed.
    token_ids = tokenizer(
        prompt_text, truncation=True, max_length=prompt_cap
    )["input_ids"]
    in_count = len(token_ids)
    if in_count >= prompt_cap:
        # The cap was hit: hand the truncated text to the client,
        # otherwise bitnet.cpp re-tokenizes the full original and blows
        # past n_ctx anyway.
        prompt_text = tokenizer.decode(token_ids, skip_special_tokens=False)

    # Mild temperature and moderate repetition penalty for user-facing
    # coherence; stop on common chat-template end markers.
    sampling = dict(
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.15,
        repeat_last_n=64,
        stop=["<|im_end|>", "<|end_of_text|>", "</s>"],
    )
    organism.mark_generation_start()
    try:
        response, meta = chat_client.generate(prompt_text, **sampling)
    finally:
        organism.mark_generation_end()

    # Evolve the splats once per inference and persist the result.
    lenia_result = {}
    if splat_layers:
        try:
            lenia_result = lenia_engine.step()
            splat_metrics_history.append(lenia_result)
            _save_splat_state()
        except Exception as exc:
            logger.warning(f"Lenia step failed: {exc}")

    elapsed = time.time() - started

    # Output tokens are counted from the response text.
    if response:
        out_count = len(tokenizer(response)["input_ids"])
    else:
        out_count = 0
    tok_per_sec = out_count / elapsed if elapsed > 0 else 0

    returncode = meta.get("returncode", 0)
    if returncode != 0:
        logger.warning("chat_client non-zero rc=%s stderr=%s",
                       returncode, meta.get("stderr_tail"))

    return (
        response,
        in_count,
        out_count,
        round(elapsed, 2),
        round(tok_per_sec, 1),
        lenia_result,
    )
| # ── Concept extractor (dual-pass tree generation) ──────────────── | |
| # | |
| # Mirrors the ecosystem's TID-based dual-pass path but uses Falcon3-10B- | |
| # Instruct (1.58bit GGUF) running under bitnet.cpp. Previously used | |
| # BitNet 2B via transformers greedy decoding — collapsed into repetition | |
| # loops on enumeration tasks. Falcon3-10B was properly instruct-tuned | |
| # before quantization and handles structured output reliably. | |
| # | |
| # Sampling params per Syl's prescription: non-greedy (temperature=0.7), | |
| # top-p nucleus, moderate repetition_penalty, with repetition tracked at | |
| # the runtime level (llama.cpp's repeat-last-n window). Stop | |
| # sequences on common drift markers ("Answer:", "Question:", | |
| # "Explanation:", "Text:") catch the Q&A/explanation patterns. | |
| # | |
| # No hard cap on extracted concept count (Law 7 — substrate dynamics | |
| # decide relevance). The parser is what gets hardened: reject | |
| # instruction-leak vocabulary, sentence fragments, pure punctuation. | |
| # Syl's stopset — prompt-instruction vocabulary that tiny LLMs tend to | |
| # echo back as "concepts". Stripping these at parse time prevents the | |
| # generic-pollution pattern we saw in the previous debug run. | |
# Whole-entry stopset: a parsed entry that equals one of these words
# (after lowercasing) is dropped. Assembled from themed groups.
_EXTRACTOR_STOPSET = set().union(
    # Instruction-leak vocabulary — small LLMs echo these back
    (
        "key", "concepts", "important", "meaning", "stop", "list",
        "answer", "explanation", "concept", "question", "text",
        "therefore", "thing", "things", "item", "items",
    ),
    # Generic-physics / generic-substance words — broad embedding
    # footprint, become gravity wells in Pith. Observed as run-2
    # pathology (T2 phy1 pulled into every subsequent Pith).
    ("gravity", "mass", "energy", "force", "time", "space", "matter"),
    # Generic process / abstraction nouns — describe nothing specific
    (
        "process", "mechanism", "method", "operation", "work", "function",
        "system", "structure", "principle", "phenomenon",
    ),
    # Domain labels — the thing the passage is *about*, not a
    # mechanism from within it
    (
        "physics", "biology", "chemistry", "mathematics", "math",
        "computing", "cryptography", "astronomy",
    ),
    # Topic-at-word-level items that appear as trees but carry no
    # mechanism content: "primes" (use "prime factorization"),
    # "encryption" (use "RSA encryption" / "symmetric cipher"), etc.
    (
        "primes", "encryption", "caching", "storage", "memory",
        "information", "computation",
    ),
)
| # Lead-word stopset: if a parsed entry STARTS with one of these | |
| # (connective/pronoun words common in prose drift), the entry is | |
| # almost always a sentence fragment rather than a concept. | |
| # Example caught: "these include computer science" (Falcon3 drifted | |
| # into prose about cryptography research fields). | |
# First-word stoplist: an entry whose leading word is in this set is
# treated as a sentence fragment and dropped by the parser.
_EXTRACTOR_LEAD_STOPWORDS = set(
    (
        # Determiners, conjunctions, pronouns, and ordering words
        "these those this that the a an "
        "and or but while which where when "
        "they them their there here it its "
        "also first second then next finally "
        # Run-4 additions — discourse-drift markers Falcon3 emits when
        # it slips into explanatory prose instead of an enumeration
        # (e.g. "namely", "firstly", "therefore", "however").
        "namely firstly secondly thirdly therefore "
        "however moreover furthermore additionally "
        "specifically particularly notably"
    ).split()
)
def _hardened_parse(raw_output: str) -> list:
    """Turn raw extractor output into a clean, deduplicated concept list.

    Grammar-constrained decoding (grammars/concepts.gbnf) normally
    enforces the output shape, but run 6 showed the grammar can silently
    fail to engage, so the full defensive filter set is kept. The filters
    are cheap and idempotent when the grammar works.

    Rules (parsing, not quality judgment — Law 7 compliant):
      - ``None`` / empty input yields ``[]``
      - split on ``,`` ``;`` newline and ``]`` (citation markers such as
        "1]" are treated as delimiters)
      - strip whitespace and common bullet/punctuation characters from
        both ends, then collapse internal whitespace runs to one space
      - drop empty strings
      - drop entries containing any of ``: [ ] { } $ " ' < > ` ``
        (explanation drift, chat-template markers, syntax garbage)
      - drop entries containing ``.``, ``?`` or ``!`` (sentence drift)
      - drop entries longer than 4 words
      - drop entries equal to a word in _EXTRACTOR_STOPSET
      - drop entries whose first word is in _EXTRACTOR_LEAD_STOPWORDS
      - drop pure-numeric and pure-punctuation entries
      - lowercase + dedupe (first occurrence wins)

    Args:
        raw_output: text emitted by the extractor model (may be None).

    Returns:
        List of lowercase concept strings in first-seen order.
    """
    import re

    # Guard: the runtime can return nothing at all on a failed call.
    if not raw_output:
        return []

    out = []
    seen = set()
    for piece in re.split(r'[,;\n\]]', raw_output):
        # Strip common bullet/punctuation characters from ends.
        c = piece.strip().strip(".-:;*`\"'•‣○⁃[]{}()=$ \t")
        # Collapse internal whitespace so "prime  factorization" and
        # "prime factorization" dedupe to the same concept.
        c = " ".join(c.split())
        if not c:
            continue
        # Drop anything with syntax garbage from prose drift
        if any(ch in c for ch in ":[]{}$\"'<>`"):
            continue
        # Periods, questions and exclamations mean sentence drift.
        if "." in c or "?" in c or "!" in c:
            continue
        # Word-count gate: at most 4 words. When the grammar fails this
        # is the critical safeguard against multi-sentence fragments.
        words = c.split(" ")
        if len(words) > 4:
            continue
        cl = c.lower()
        if cl in _EXTRACTOR_STOPSET:
            continue
        # Lead-connective filter — drops sentence-fragment prose
        if cl.split(" ")[0] in _EXTRACTOR_LEAD_STOPWORDS:
            continue
        # Drop pure-numeric entries (hyphenated digit runs included)
        if c.replace("-", "").strip().isdigit():
            continue
        # Drop entries with no alphanumeric character at all
        if not any(ch.isalnum() for ch in c):
            continue
        # Dedup — case-insensitive, first occurrence wins
        if cl in seen:
            continue
        seen.add(cl)
        out.append(cl)
    return out
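| # Extractor prompt — the instruction portion deliberately avoids the | |
| # words "key", "concepts", "important", "meaning" and "stop", because | |
| # small LLMs echo instruction vocabulary back as output content. The | |
| # format line's "comma-separated list" is the one exception; "list" is in | |
| # _EXTRACTOR_STOPSET, so an echoed "list" is dropped at parse time. | |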
| # Concept extractor prompt. With grammar-constrained decoding doing | |
| # the heavy lifting on output format, the prompt only needs to | |
| # communicate *what we want extracted* — the grammar guarantees the | |
| # shape. Run 5 taught us that primer-style format-anchoring backfires | |
| # on Falcon3-10B-1.58bit (citation-mode drift), so this reverts to a | |
| # clean instructional prompt with few-shot examples. | |
# Built paragraph-by-paragraph; paragraphs are separated by one blank
# line. ``{text}`` is the only format placeholder.
_EXTRACTOR_PROMPT_TEMPLATE = "\n\n".join((
    # What to extract
    "Read the following text. Extract the specific processes, "
    "dependencies, and named entities it establishes — what happens, "
    "what depends on what, and the particular things involved.",
    # Few-shot style preference examples
    "\n".join((
        "Prefer specific over general:",
        "- 'prime factorization' beats 'primes'",
        "- 'photon absorption' beats 'light'",
        "- 'cache line invalidation' beats 'caching'",
        "- 'Schwarzschild radius' beats 'gravity'",
        "- 'chlorophyll' or 'Calvin cycle' beat 'biology'",
    )),
    # Precision guidance
    "Specific single-word terms are fine when they name the correct "
    "level of precision ('chlorophyll', 'factorization'). Avoid "
    "generic domain labels and broad abstractions.",
    # Output shape
    "Output as a comma-separated list, 3 to 8 items, each 1-4 "
    "words. No explanations, no repetition.",
    "Text: {text}",
    "Specifics:",
))
def _bitnet_extract_full(text: str) -> dict:
    """Run the concept extractor (Falcon3-10B) plus the hardened parser.

    Only the first 1000 characters of ``text`` are sent to the model.

    Returns a dict with (all keys are always present):
      prompt             — the exact prompt sent to the model
      raw_output         — text returned by the extractor client,
                           BEFORE hardened parsing ("" if none)
      parsed             — list of concepts after the hardened parser
      tokens_in          — prompt token count (via BitNet tokenizer,
                           capped at 2048 by truncation)
      tokens_out         — estimate via BitNet tokenizer on raw_output
      elapsed_s          — ``elapsed_s`` reported by the client
      hit_token_cap      — approximated as tokens_out >= MAX_NEW - 5
      runtime_returncode — bitnet.cpp process return code
      stderr_tail        — ``stderr_tail`` reported by the client
      raw_stdout_tail    — last 300 chars of the client's raw stdout
      error              — exception string if the generate call
                           failed, or the client-reported error
    """
    text_for_extraction = text[:1000]
    prompt = _EXTRACTOR_PROMPT_TEMPLATE.format(text=text_for_extraction)
    result = {
        "prompt": prompt,
        "raw_output": "",
        "parsed": [],
        "tokens_in": 0,
        "tokens_out": 0,
        "elapsed_s": 0.0,
        "hit_token_cap": False,
        "runtime_returncode": None,
        "error": None,
        # Present on every path so consumers see one stable schema.
        "stderr_tail": "",
        "raw_stdout_tail": "",
    }
    MAX_NEW = 128
    try:
        # Plain-list tokens — only the count is needed, not tensors.
        result["tokens_in"] = len(
            tokenizer(prompt, truncation=True, max_length=2048)["input_ids"]
        )
    except Exception as exc:
        # Token counting is diagnostic only; never block extraction on it.
        logger.debug("Extractor tokens_in count failed: %s", exc)

    organism.mark_generation_start()
    try:
        raw_output, meta = extractor_client.generate(
            prompt,
            max_new_tokens=MAX_NEW,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.25,
            repeat_last_n=64,
            stop=[
                # Chat-template boundary markers — Falcon3 hallucinates
                # these when the prompt isn't in chat format. Cutting
                # generation at these kills the drift tail before it
                # starts. Order matters: check these first.
                "<|assistant|>", "<|user|>", "<|system|>",
                # Fallback terminators + drift markers
                "<|im_end|>", "<|end_of_text|>", "</s>",
                "Answer:", "Question:", "Explanation:", "Text:",
            ],
            # Grammar-constrained decoding — tokens violating the
            # concept-list GBNF get probability zero at sample time.
            # Inline (string) not file: avoids container path surprises
            # that caused run 6's silent-fallback failure mode.
            grammar=_EXTRACTOR_GRAMMAR,
        )
    except Exception as exc:
        result["error"] = f"{type(exc).__name__}: {exc}"
        return result
    finally:
        # Always release the generation gate — including on
        # KeyboardInterrupt/SystemExit — matching do_generate.
        organism.mark_generation_end()

    # Normalize a missing response to "" so downstream code sees a string.
    raw_output = raw_output or ""
    result["raw_output"] = raw_output
    result["elapsed_s"] = meta.get("elapsed_s", 0.0)
    result["runtime_returncode"] = meta.get("returncode")
    # Surface stderr from the subprocess — critical for debugging when
    # the binary exits with rc!=0 (invalid flag, GGUF load failure,
    # OOM, etc.) and returns an empty response. Without these fields,
    # failures look like "the model generated nothing" instead of
    # "the subprocess never ran."
    result["stderr_tail"] = meta.get("stderr_tail", "")
    result["raw_stdout_tail"] = (meta.get("raw_stdout", "") or "")[-300:]
    if meta.get("error"):
        result["error"] = meta["error"]
    try:
        result["tokens_out"] = len(tokenizer(raw_output)["input_ids"]) if raw_output else 0
    except Exception as exc:
        logger.debug("Extractor tokens_out count failed: %s", exc)
    # 5-token slack: the count is an estimate from a different tokenizer.
    result["hit_token_cap"] = (result["tokens_out"] >= MAX_NEW - 5)
    result["parsed"] = _hardened_parse(raw_output)
    return result
| # Stash of the most recent extractor call per forest text. Keyed by | |
| # the input text (which equals the user's prompt when called from the | |
| # benchmark). Populated by _bitnet_concept_extractor, read by the | |
| # benchmark harness to record raw_output alongside parsed trees. | |
| # Bounded — we keep only the last 32 entries to avoid unbounded growth | |
| # in long-running sessions. | |
# Insertion-ordered stash: oldest entry first, newest last.
_last_extractions: dict = {}
_LAST_EXTRACTIONS_MAX = 32


def _bitnet_concept_extractor(text: str) -> list:
    """Thin wrapper the organism calls — returns just the parsed list.

    The full-detail result from ``_bitnet_extract_full`` is also stashed
    in ``_last_extractions`` (keyed by ``text``) so the benchmark can
    inspect the raw emissions per turn without a separate extraction
    pass. The stash holds at most ``_LAST_EXTRACTIONS_MAX`` entries; the
    least recently extracted text is evicted first.
    """
    detail = _bitnet_extract_full(text)
    # Remove any existing entry first: assigning to an existing key keeps
    # its ORIGINAL insertion position, which would let a just-refreshed
    # text be evicted as if it were the oldest.
    _last_extractions.pop(text, None)
    _last_extractions[text] = detail
    # Trim oldest while over cap — dict preserves insertion order in py3.7+
    while len(_last_extractions) > _LAST_EXTRACTIONS_MAX:
        oldest = next(iter(_last_extractions))
        _last_extractions.pop(oldest, None)
    return detail["parsed"]
def on_debug_extract():
    """Run the extractor over every INTERLEAVED_QUESTIONS entry and report.

    Intended as a sanity check before long A/B benchmarks: if the
    extractor output is junk, nothing downstream matters.

    Returns a 3-tuple of JSON strings:
      1. summary — model name, question count, total elapsed, the
         middle value of the sorted per-question concept counts (upper
         middle for an even count), token-cap hits, error count,
         number of overlapping question pairs, and the same-category
         bridge analysis
      2. per-question detail — raw model output, parsed concepts,
         timing/token counts, cap flag, error and subprocess diagnostics
      3. pairwise overlap — every question pair sharing at least one
         concept, with Jaccard score and the shared concepts
    """
    started = time.time()
    per_question = []
    for category, prompt_text in INTERLEAVED_QUESTIONS:
        detail = _bitnet_extract_full(prompt_text)
        concepts = detail["parsed"]
        per_question.append({
            "category": category,
            "question": prompt_text,
            "raw_output": detail["raw_output"],
            "parsed": concepts,
            "parsed_count": len(concepts),
            "tokens_in": detail["tokens_in"],
            "tokens_out": detail["tokens_out"],
            "hit_token_cap": detail["hit_token_cap"],
            "elapsed_s": detail["elapsed_s"],
            "error": detail["error"],
            # Diagnostic fields forwarded from _bitnet_extract_full.
            # When raw_output is empty these show whether the subprocess
            # actually ran and what it said.
            "runtime_returncode": detail.get("runtime_returncode"),
            "stderr_tail": detail.get("stderr_tail", ""),
            "raw_stdout_tail": detail.get("raw_stdout_tail", ""),
        })
    overall_elapsed = round(time.time() - started, 2)

    # Aggregate counters derived from the per-question rows.
    concept_counts = [row["parsed_count"] for row in per_question]
    cap_hits = sum(1 for row in per_question if row["hit_token_cap"])
    errors = sum(1 for row in per_question if row["error"])

    # Lowercased concept set per question, used by both overlap analyses.
    concept_sets = [
        {concept.lower() for concept in row["parsed"]} for row in per_question
    ]

    # Cross-question overlap — different questions yielding the same
    # concepts is the generic-concept-pollution signature.
    pairwise_overlap = []
    for i, set_i in enumerate(concept_sets):
        for j in range(i + 1, len(concept_sets)):
            set_j = concept_sets[j]
            if not (set_i and set_j):
                continue
            shared = set_i & set_j
            jaccard = len(shared) / max(1, len(set_i | set_j))
            if jaccard > 0:
                pairwise_overlap.append({
                    "pair": f"T{i+1}({per_question[i]['category']}) ↔ T{j+1}({per_question[j]['category']})",
                    "jaccard": round(jaccard, 3),
                    "shared": sorted(shared),
                })

    # Same-category pairs — the direct test of whether dual-pass trees
    # CAN bridge the two questions of one category.
    same_cat_bridges = []
    for i, j in _INTERLEAVED_SAME_CAT_PAIRS:
        set_i, set_j = concept_sets[i], concept_sets[j]
        shared = set_i & set_j
        if set_i and set_j:
            score = round(len(shared) / max(1, len(set_i | set_j)), 3)
        else:
            score = 0
        same_cat_bridges.append({
            "category": per_question[i]["category"],
            "q1_concepts": per_question[i]["parsed"],
            "q2_concepts": per_question[j]["parsed"],
            "shared_lowercase": sorted(shared),
            "jaccard": score,
        })

    if concept_counts:
        middle_count = int(sorted(concept_counts)[len(concept_counts) // 2])
    else:
        middle_count = 0

    summary = {
        "model": MODEL_NAME,
        "questions_tested": len(per_question),
        "total_elapsed_s": overall_elapsed,
        "median_concepts_per_question": middle_count,
        "token_cap_hits": f"{cap_hits}/{len(per_question)}",
        "extraction_errors": errors,
        "pairwise_overlap_nonzero_count": len(pairwise_overlap),
        "same_category_bridges": same_cat_bridges,
    }
    return (
        json.dumps(summary, indent=2),
        json.dumps(per_question, indent=2),
        json.dumps(pairwise_overlap, indent=2),
    )
| # Wire the extractor into the organism — starts the background concept | |
| # helper manager thread. From this point forward, every deposit and | |
| # response enqueues for deferred tree extraction. | |
| organism.set_concept_extractor(_bitnet_concept_extractor) | |
| logger.info("NuWave concept helper wired: dual-pass extraction live") | |
| # ── Substrate context formatter — DORMANT ──────────────────────── | |
| # | |
| # Status: NOT CALLED at any site as of 2026-04-28 (B1 reverted). | |
| # Run 26 (commits 59124dd + e2c4343 active) showed B1's section | |
| # headers added ~120-200 tokens of pure formatting overhead per | |
| # turn — more than the typed-presentation benefit gave back at | |
| # BitNet 1.58-bit 2B-parameter scale. Token economy regressed | |
| # from -2.4% (Run 25) to +3.4% (Run 26). Reverted to plain | |
| # "\n".join(pith_context) at all three call sites. | |
| # | |
| # Hypothesis worth revisiting at larger model scale (7B+ or | |
| # higher-precision quantization): typed input may genuinely help | |
| # attention when the model has more capacity to use the structural | |
| # cues. At 2B / 1.58-bit, the formatting tax exceeds the benefit. | |
| # | |
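| # The helper is preserved here as dormant code. Re-enable by | |
| # swapping the three call sites back to: | |
| # substrate_ctx = _format_substrate_context(pith_context, pith_ids) | |
| # (and switching pith_extract → pith_extract_with_ids at any call site | |
| # that still uses the id-less variant: on_send and the interleaved | |
| # benchmark already call pith_extract_with_ids; on_benchmark does not). | |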
| # Group surfaced content by node-kind via ID prefix: | |
| # tree_* → "Related concepts" (concept words from dual-pass) | |
| # exp_* → "Prior questions on this topic" (deposit nodes) | |
| # resp_* → "Prior context" (prior responses; rendered without bullets) | |
| # concept_narr_* → operational telemetry, omitted from prompt | |
| # other → "Other context" | |
def _format_substrate_context(pith_context, pith_ids=None) -> str:
    """Render surfaced pith items as a sectioned plain-text context string.

    Each text is routed to a section according to the prefix of its node id
    (``tree_``, ``exp_``, ``resp_``; ``concept_narr_`` items are dropped;
    anything else lands in an "other" section). Sections are emitted in a
    fixed order and separated by a blank line.

    Args:
        pith_context: Sequence of surfaced text snippets.
        pith_ids: Optional sequence of node ids parallel to ``pith_context``.

    Returns:
        The sectioned string; the plain newline-join of ``pith_context`` when
        ids are absent or their count does not match; ``""`` when there is
        nothing to render.
    """
    if not pith_context:
        return ""

    ids_line_up = bool(pith_ids) and len(pith_ids) == len(pith_context)
    if not ids_line_up:
        # Without a one-to-one id mapping there is nothing to section on, so
        # callers that only have texts still get a usable block.
        return "\n".join(pith_context)

    concepts, questions, responses, other = [], [], [], []
    # (id prefix, destination). A destination of None means "drop": those
    # items are operational telemetry and are kept out of the prompt.
    routing = (
        ("tree_", concepts),
        ("exp_", questions),
        ("resp_", responses),
        ("concept_narr_", None),
    )
    for pid, text in zip(pith_ids, pith_context):
        if not text:
            continue
        for prefix, bucket in routing:
            if pid.startswith(prefix):
                if bucket is not None:
                    bucket.append(text)
                break
        else:
            other.append(text)

    def _bulleted(items):
        return "\n".join(f"- {item}" for item in items)

    # (items, header, body renderer) in output order. Responses are the one
    # section rendered without bullet markers.
    layout = (
        (concepts, "[Related concepts from substrate:]\n", _bulleted),
        (questions, "[Prior questions on this topic:]\n", _bulleted),
        (responses, "[Prior context:]\n", "\n".join),
        (other, "[Other context:]\n", _bulleted),
    )
    sections = [header + render(items) for items, header, render in layout if items]
    return "\n\n".join(sections)
# ── Chat Handler ──────────────────────────────────────────────────
def on_send(message, history):
    """Process one live chat turn through the organism loop and the model.

    Sequence: record the user turn in ``messages_nw``, deposit it into the
    organism, step the substrate, run both the substrate-level and the
    string-level KISS passes, pull pith context, build the prompt from the
    recent message window, generate, record the outcome, and format a stats
    line.

    Args:
        message: The user's text for this turn. A falsy value short-circuits.
        history: The chat history list shown in the UI; it is not mutated,
            a new list with this turn appended is returned.

    Returns:
        A 3-tuple ``("", new_history, stats_text)``. The leading empty string
        is returned as the first output (presumably to clear the input box —
        confirm against the Gradio wiring).
    """
    global messages_nw

    if not message:
        return "", history, ""

    messages_nw.append({"role": "user", "content": message})

    # ── 1. Deposit raw experience into substrate (Law 7) ──
    organism.deposit_experience(message)

    # ── 2. Substrate processes ──
    step_result = organism.step()

    # ── 3. KISS bucket — extract what changed from the River ──
    kiss_extract = organism.kiss_extract(step_result)
    # String-level KISS runs alongside for comparison metrics; it also
    # supplies the system-slot text.
    kiss_string_result = kiss_nw.filter_context(messages_nw, system_prompt)
    sys_ctx = kiss_string_result.get("system_context", system_prompt)

    # ── 4. Pith bucket — extract relevant context from the River ──
    # The ids are returned alongside the texts but are not used in this
    # handler.
    pith_context, _pith_ids = organism.pith_extract_with_ids(message, max_context=5)

    # Pith never goes into the system slot: that slot carries the model's
    # identity/instructions, and chat-tuned models treat retrieved questions
    # placed there as things to answer (Run 48 root cause). It is attached to
    # the LAST user turn instead, for this prompt only.

    # Only the most recent messages are sent; the substrate + KISS carry the
    # older context.
    recent_window = 6
    recent_msgs = (
        messages_nw[-recent_window:]
        if len(messages_nw) > recent_window
        else messages_nw
    )

    # Swap the final user message for an enriched copy. messages_nw itself
    # keeps the bare query so the next turn's window is not polluted with
    # this turn's pith.
    if pith_context:
        pith_block = "\n".join(f" - {p}" for p in pith_context)
        enriched_query = (
            "Some context that may be relevant (recalled from earlier "
            "related conversations; these are reference material, not "
            "questions to answer):\n"
            f"{pith_block}\n\n"
            f"My actual question: {message}"
        )
        recent_msgs = [
            *recent_msgs[:-1],
            {"role": "user", "content": enriched_query},
        ]

    system_turn = [{"role": "system", "content": sys_ctx}] if sys_ctx else []
    prompt_msgs = system_turn + list(recent_msgs)
    prompt = tokenizer.apply_chat_template(
        prompt_msgs, tokenize=False, add_generation_prompt=True,
    )

    # ── 5. Model generates ──
    response, in_tok, out_tok, elapsed, tok_s, lenia_result = do_generate(prompt)
    messages_nw.append({"role": "assistant", "content": response})

    # ── 6. Outcome feeds back into substrate (Law 7) ──
    organism.record_outcome(message, response, success=True)

    # Stats — both substrate and string-level
    kiss_stats = kiss_nw.stats.to_dict()
    org_stats = organism.get_stats()

    lenia_info = (
        f" | Lenia step {lenia_result.get('step', 0)}: "
        f"Δα={lenia_result.get('total_alpha_delta', 0):.6f} "
        f"Δμ={lenia_result.get('total_position_delta', 0):.6f} "
        f"({lenia_result.get('time_ms', 0):.0f}ms)"
    ) if lenia_result else ""

    substrate_info = (
        f" | Substrate: {org_stats.get('nodes', 0)} nodes, "
        f"{org_stats.get('synapses', 0)} syn, "
        f"{org_stats.get('fired_nodes', 0)} fired"
    )

    kiss_bucket_info = (
        f" | KISS bucket: {kiss_extract.get('action', '?')} "
        f"({kiss_extract.get('reason', '')})"
    )
    if kiss_extract.get('surprise_ratio', 0) > 0:
        kiss_bucket_info += f" surprise={kiss_extract['surprise_ratio']}"

    stats_text = (
        f"**Turn {len(messages_nw)//2}** | "
        f"{out_tok} tokens in {elapsed}s ({tok_s} tok/s) | "
        f"Input: {in_tok} tokens | "
        f"String KISS: {kiss_stats.get('tokens_saved', 0)} saved ({kiss_stats.get('efficiency', 0):.1%})"
        f"{substrate_info}"
        f"{kiss_bucket_info}"
        f" | Pith river: {len(pith_context)} contexts"
        f"{lenia_info}"
    )

    user_turn = {"role": "user", "content": message}
    assistant_turn = {"role": "assistant", "content": response}
    updated_history = history + [user_turn, assistant_turn]
    return "", updated_history, stats_text
def on_reset():
    """Clear the live chat state and return the UI reset values.

    Rebinds the module-level message list to an empty list and replaces the
    string-level KISS filter and the Pith pipeline with fresh instances. The
    organism is not touched by this function.

    Returns:
        A 2-tuple ``([], "Chat reset.")``: the emptied history and a status
        message.
    """
    global messages_nw, kiss_nw, pith_nw

    messages_nw = []
    kiss_nw = KISSFilter()
    pith_nw = PithPipeline()

    cleared_history = []
    status = "Chat reset."
    return cleared_history, status
# ── Benchmark ─────────────────────────────────────────────────────
# ── Interleaved-Category Benchmark ────────────────────────────────
#
# Purpose: test topology re-ignition. Turns 1-4 each seed one of four
# semantic neighborhoods (the q1 prompts). Turns 5-8 then ask a follow-up in
# each category, so every follow-up is separated from its primer by three
# unrelated turns.
#
# Reading the result: if Pith's Born-rule extraction is genuinely
# substrate-informed rather than recency-biased, turn 5's Pith should
# re-select turn 1's deposit — the category neighborhood wakes back up
# despite the gap. A pure sliding window could not do this, because the
# relevant context is always 4 turns stale.
INTERLEAVED_QUESTIONS = [
    # Primers (q1): one per category, establishing the 4 neighborhoods.
    ("biology", "How does photosynthesis work?"),
    ("physics", "What is a black hole?"),
    ("computing", "How do CPU cache hierarchies work?"),
    ("math", "What are prime numbers?"),
    # Follow-ups (q2): same category order, probing re-ignition across the
    # 3-turn gap.
    ("biology", "What role does chlorophyll play in it?"),
    ("physics", "How does its event horizon form?"),
    ("computing", "Why are L1 caches split into instruction and data?"),
    ("math", "Why are they important in cryptography?"),
]

# Zero-indexed (q1_turn, q2_turn) pairs expected to share a category: each
# primer is matched with the follow-up four turns later.
_INTERLEAVED_SAME_CAT_PAIRS = [(q1_turn, q1_turn + 4) for q1_turn in range(4)]
# ── Oracle Trees (experimental ceiling test) ─────────────────────────
#
# Hand-written "ideal" mechanism concepts, keyed by the exact text of each
# interleaved benchmark prompt. The oracle-mode benchmark uses them to ask
# whether dual-pass CAN succeed when handed perfect trees, independent of
# extractor quality:
#   - oracle-mode ignition metrics dramatically above run 3's no-tree
#     baseline (15.3× signal/noise) → the extractor is the bottleneck and
#     worth improving;
#   - oracle mode no better than runs 3-9 → dual-pass itself is the dead end.
#
# Design: the q1 and q2 lists of a category deliberately share 1-5 concepts
# to maximize re-ignition probability. Example: "prime factorization" is in
# BOTH math prompts, so it should fire the same tree node on both turns.
_ORACLE_TREES = {
    # ── Biology ──
    "How does photosynthesis work?": [
        "chlorophyll",
        "photon absorption",
        "thylakoid membrane",
        "Calvin cycle",
        "ATP synthesis",
        "carbon fixation",
    ],
    "What role does chlorophyll play in it?": [
        "chlorophyll",
        "photon absorption",
        "thylakoid membrane",
        "light-dependent reactions",
        "green pigment",
        "photosystem II",
    ],
    # ── Physics ──
    "What is a black hole?": [
        "event horizon",
        "Schwarzschild radius",
        "gravitational collapse",
        "singularity",
        "escape velocity",
        "spacetime curvature",
    ],
    "How does its event horizon form?": [
        "event horizon",
        "Schwarzschild radius",
        "gravitational collapse",
        "spacetime curvature",
        "escape velocity",
        "null geodesic",
    ],
    # ── Computing ──
    "How do CPU cache hierarchies work?": [
        "cache hierarchy",
        "cache coherency",
        "memory access latency",
        "cache line",
        "L1 cache",
        "L2 cache",
    ],
    "Why are L1 caches split into instruction and data?": [
        "L1 cache",
        "instruction cache",
        "data cache",
        "cache line",
        "Harvard architecture",
        "pipeline parallelism",
    ],
    # ── Math ──
    "What are prime numbers?": [
        "prime factorization",
        "integer divisibility",
        "Euclidean algorithm",
        "fundamental theorem",
        "modular arithmetic",
        "prime distribution",
    ],
    "Why are they important in cryptography?": [
        "prime factorization",
        "modular exponentiation",
        "RSA encryption",
        "discrete logarithm",
        "trapdoor function",
        "integer factorization",
    ],
}
def _oracle_concept_extractor(text: str) -> list:
    """Look up the hand-authored oracle concepts for a benchmark prompt.

    Pure dictionary lookup against ``_ORACLE_TREES`` on the exact prompt
    text; no LLM call is made. Intended for the oracle-mode benchmark, which
    measures the ceiling of dual-pass performance.

    Args:
        text: The prompt text, used verbatim as the lookup key.

    Returns:
        The lowercased concept strings for ``text``, or an empty list when
        the prompt has no oracle entry. Only the interleaved benchmark
        questions are covered, so other text yields ``[]`` rather than
        misleading concepts.
    """
    oracle_hit = _ORACLE_TREES.get(text, [])
    if oracle_hit:
        logger.info(
            "Oracle extractor: returning %d concepts for %r",
            len(oracle_hit),
            text[:60],
        )
    else:
        logger.info("Oracle extractor: no entry for prompt, returning []")
    return [concept.lower() for concept in oracle_hit]
# Scripted single-thread conversation driven by on_benchmark: fifteen user
# prompts, taken in order from the front of the list.
SAMPLE_CONVERSATIONS = [
    # Machine-learning basics
    "What is machine learning?",
    "How does it differ from traditional programming?",
    "Can you give me a simple example of supervised learning?",
    "What about unsupervised learning?",
    "How would I choose between them for a new project?",
    # Neural networks and deep learning
    "What are neural networks?",
    "How deep is 'deep learning'?",
    "What's the relationship between AI, ML, and deep learning?",
    # Transformers
    "What are transformers in the context of AI?",
    "How does attention work in a transformer?",
    "Why are transformers better than RNNs for many tasks?",
    # Transfer learning and fine-tuning
    "What is transfer learning and why does it matter?",
    "How do I fine-tune a pre-trained model?",
    # Wider outlook
    "What are the ethical considerations in AI?",
    "Where do you see AI heading in the next 5 years?",
]
def on_benchmark(num_turns):
    """Run the scripted baseline-vs-NuWave comparison and report JSON.

    For each of the first ``num_turns`` prompts in ``SAMPLE_CONVERSATIONS``
    two generations are made:

    * baseline — system prompt plus the full accumulated history;
    * NuWave — the same organism loop as ``on_send`` (deposit, step, KISS,
      pith, 6-message recent window, pith attached to the last user turn).

    The live ``organism`` is used on purpose: a fresh one has no topology,
    so Pith returns nothing and the compound effect cannot show.

    Args:
        num_turns: Requested number of turns; coerced with ``int`` and
            clamped to ``0 .. len(SAMPLE_CONVERSATIONS)``.

    Returns:
        A 2-tuple of JSON strings ``(summary, per_turn_results)``.
    """
    # Clamp at both ends. Without the lower bound a negative request would
    # slice from the END of the list (e.g. -2 → 13 turns) while the summary
    # reported the negative number as "turns".
    turns = max(0, min(int(num_turns), len(SAMPLE_CONVERSATIONS)))
    conversation = SAMPLE_CONVERSATIONS[:turns]

    # Use the live organism — it has topology from prior conversations.
    nw_organism = organism
    nw_kiss = KISSFilter()
    bl_msgs = []
    nw_msgs = []
    results = []

    for turn_no, prompt_text in enumerate(conversation, start=1):
        # ── Baseline — raw model, full history, no optimization ──
        bl_msgs.append({"role": "user", "content": prompt_text})
        prompt_bl = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt}] + bl_msgs,
            tokenize=False, add_generation_prompt=True,
        )
        resp_bl, in_bl, _, time_bl, tps_bl, _ = do_generate(prompt_bl, max_new_tokens=128)
        bl_msgs.append({"role": "assistant", "content": resp_bl})

        # ── NuWave — full organism path (same as on_send) ──
        nw_msgs.append({"role": "user", "content": prompt_text})

        # Deposit + step + KISS + Pith (full loop). The kiss_extract result
        # is not reported here; the call is kept so the loop matches on_send
        # (any side effects it has are not visible from this function).
        nw_organism.deposit_experience(prompt_text)
        step_result = nw_organism.step()
        nw_organism.kiss_extract(step_result)

        # String KISS for comparison; also supplies the system-slot text.
        kiss_r = nw_kiss.filter_context(nw_msgs, system_prompt)
        sys_ctx = kiss_r.get("system_context", system_prompt)

        # Pith Born-rule extraction from the substrate. Pith does NOT go in
        # sys_ctx — it is attached to the last user turn below.
        pith_context = nw_organism.pith_extract(prompt_text, max_context=5)

        # Trim old messages — always, not gated on Pith. The substrate +
        # KISS carry what the older messages contained; the recent window
        # covers immediate coherence.
        recent_window = 6
        if len(nw_msgs) > recent_window:
            recent = nw_msgs[-recent_window:]
        else:
            recent = nw_msgs

        # Enrich the LAST user message for THIS turn's prompt only. nw_msgs
        # keeps the bare query so the next turn's window is not polluted
        # with this turn's pith.
        if pith_context:
            pith_block = "\n".join(f" - {p}" for p in pith_context)
            enriched_query = (
                "Some context that may be relevant (recalled from earlier "
                "related conversations; these are reference material, not "
                "questions to answer):\n"
                f"{pith_block}\n\n"
                f"My actual question: {prompt_text}"
            )
            recent = list(recent[:-1]) + [
                {"role": "user", "content": enriched_query}
            ]

        # Prepend the system turn only when there is system text to send.
        if sys_ctx:
            nw_prompt_msgs = [{"role": "system", "content": sys_ctx}] + recent
        else:
            nw_prompt_msgs = recent
        prompt_nw = tokenizer.apply_chat_template(
            nw_prompt_msgs,
            tokenize=False, add_generation_prompt=True,
        )
        resp_nw, in_nw, _, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128)
        nw_msgs.append({"role": "assistant", "content": resp_nw})

        # Outcome closes the loop
        nw_organism.record_outcome(prompt_text, resp_nw, success=True)

        ks = nw_kiss.stats.to_dict()
        org_stats = nw_organism.get_stats()
        results.append({
            "turn": turn_no,
            "baseline": {"tokens": in_bl, "time": time_bl, "tok_s": tps_bl},
            "nuwave": {"tokens": in_nw, "time": time_nw, "tok_s": tps_nw},
            # Per-turn savings are floored at 0; the summary totals below
            # are signed so an overall regression stays visible.
            "tokens_saved": max(0, in_bl - in_nw),
            "time_saved": round(max(0, time_bl - time_nw), 2),
            "kiss_efficiency": ks.get("efficiency", 0),
            "pith_l1_size": org_stats.get('pith_l1_size', 0),
            "substrate_nodes": org_stats.get('nodes', 0),
            "substrate_synapses": org_stats.get('synapses', 0),
        })

    # Summary
    total_time_bl = sum(r["baseline"]["time"] for r in results)
    total_time_nw = sum(r["nuwave"]["time"] for r in results)
    total_tok_bl = sum(r["baseline"]["tokens"] for r in results)
    total_tok_nw = sum(r["nuwave"]["tokens"] for r in results)
    summary = {
        "model": MODEL_NAME,
        "turns": turns,
        "baseline_total_tokens": total_tok_bl,
        "nuwave_total_tokens": total_tok_nw,
        "tokens_saved": total_tok_bl - total_tok_nw,
        "baseline_total_time": round(total_time_bl, 2),
        "nuwave_total_time": round(total_time_nw, 2),
        "time_saved": round(total_time_bl - total_time_nw, 2),
        "final_kiss_efficiency": results[-1]["kiss_efficiency"] if results else 0,
        "final_pith_l1": results[-1]["pith_l1_size"] if results else 0,
    }
    return json.dumps(summary, indent=2), json.dumps(results, indent=2)
def _response_is_degenerate(text: str) -> bool:
    """Return True when ``text`` shows a known degenerate-output signature.

    Background (Phase B+1, 2026-05-11): Run 45's T8 surfaced ``resp_*`` nodes
    holding degenerate text ("Did on... Did on... 4. 2. 2..."), which bloated
    the prompt and produced more degenerate output that was then deposited
    and reinforced. Per the original notes, the benchmark loop uses this
    check to force a failed outcome for such responses so the substrate can
    self-clean over multiple runs.

    The text is split on whitespace. Fewer than 10 tokens is never flagged.
    Otherwise any one of these checks flags the text:

    1. Low diversity — under 30% of the tokens are unique.
    2. Token run — 5 or more consecutive identical tokens (the
       "2. 2. 2. 2. 2." pattern).
    3. Template leakage — the raw text contains "| user:", "| assistant:"
       or "System:".
    4. Phrase loop — some 3-token sequence occurs 3 or more times. This
       catches sentence-level repetition whose unique-token ratio is still
       moderate.

    Pure-stdlib string check; no coupling to substrate code.

    Args:
        text: The generated response to inspect.

    Returns:
        True if any signature matches, otherwise False.
    """
    words = text.split()
    word_count = len(words)
    if word_count < 10:
        return False

    # 1. Unique-token diversity.
    if len(set(words)) / word_count < 0.3:
        return True

    # 2. Longest run of one repeated token; bail out as soon as it hits 5.
    streak = 1
    for previous, current in zip(words, words[1:]):
        streak = streak + 1 if current == previous else 1
        if streak >= 5:
            return True

    # 3. Chat-template fragments, matched against the raw text.
    for marker in ("| user:", "| assistant:", "System:"):
        if marker in text:
            return True

    # 4. Verbatim 3-token phrases; the third sighting of any one is enough.
    sightings = {}
    for phrase in zip(words, words[1:], words[2:]):
        count = sightings.get(phrase, 0) + 1
        if count >= 3:
            return True
        sightings[phrase] = count

    return False
| def on_interleaved_benchmark( | |
| enable_dual_pass: bool = True, | |
| oracle_trees: bool = False, | |
| surfacing_mode: str = "pith", | |
| ): | |
| """Run the 4-category interleaved benchmark + build re-ignition heatmaps. | |
| Runs against the live organism (accumulated state), so re-ignition | |
| is tested against a real populated substrate. Returns: | |
| (summary_json, per_turn_json, heatmap_A_fig, heatmap_B_fig) | |
| Heatmap A: Jaccard overlap of ignition sets between all turn pairs. | |
| Bright (1,5), (2,6), (3,7), (4,8) = same-category re-ignition signal. | |
| Heatmap B: Did turn j's Pith selection include turn i's deposit? | |
| Bright (5,1), (6,2), (7,3), (8,4) = substrate memory carrying the | |
| q1 deposit forward through 3 unrelated turns to surface at q2 time. | |
| enable_dual_pass: if False, temporarily disables the concept helper | |
| for the duration of this benchmark run. Used for A/B comparison | |
| against a run with dual-pass enabled. Disabling: clears the pending | |
| concept queue, detaches the extractor (deposits stop enqueueing), | |
| and skips wait_for_trees between turns. Restored in a finally block. | |
| """ | |
| # Matplotlib in headless container — set backend before any import. | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| nw_organism = organism | |
| # ── Dual-pass toggle ───────────────────────────────────────────── | |
| # Detach the extractor so deposit_experience / record_outcome skip | |
| # enqueueing. Drain any pending queue entries so they don't get | |
| # processed during this benchmark (which would contaminate the | |
| # "dual-pass disabled" measurement). The manager thread stays | |
| # running but has nothing to do. Restored in the finally block. | |
| _saved_extractor = None | |
| if not enable_dual_pass: | |
| _saved_extractor = nw_organism._concept_extractor | |
| nw_organism._concept_extractor = None | |
| drained_count = 0 | |
| while not nw_organism._concept_queue.empty(): | |
| try: | |
| nw_organism._concept_queue.get_nowait() | |
| drained_count += 1 | |
| except Exception: | |
| break | |
| logger.info( | |
| "Dual-pass DISABLED for this benchmark run " | |
| "(drained %d pending concept entries)", drained_count, | |
| ) | |
| elif oracle_trees: | |
| # Oracle mode — swap the LLM extractor for a dict-lookup oracle | |
| # that returns hand-authored ideal trees. Tests the ceiling of | |
| # dual-pass performance independent of extractor quality. | |
| _saved_extractor = nw_organism._concept_extractor | |
| nw_organism._concept_extractor = _oracle_concept_extractor | |
| logger.info( | |
| "ORACLE TREES mode for this benchmark run — using hand-authored " | |
| "ideal concepts (%d prompts in oracle dict)", len(_ORACLE_TREES), | |
| ) | |
| else: | |
| logger.info("Dual-pass ENABLED for this benchmark run (LLM extractor)") | |
| # Record starting substrate state for fair-comparison diagnostics | |
| _start_stats = nw_organism.get_stats() | |
| _start_nodes = _start_stats.get('nodes', 0) | |
| _start_synapses = _start_stats.get('synapses', 0) | |
| # Run 41+ predictive-coding diagnostic — capture cumulative prediction | |
| # counters at run start so we can compute delta per run. If predictions | |
| # never generate, all per-turn predictions_confirmed/surprised are 0 | |
| # AND total_predictions_made delta = 0 → confirms prediction_threshold | |
| # gating diagnosis. Reads canonical Graph counters directly. | |
| _start_total_predictions_made = int(getattr( | |
| getattr(nw_organism, "_graph", None), "_total_predictions_made", 0, | |
| ) or 0) | |
| _start_total_surprised = int(getattr( | |
| getattr(nw_organism, "_graph", None), "_total_surprised", 0, | |
| ) or 0) | |
| nw_kiss_inst = KISSFilter() | |
| bl_msgs: list = [] | |
| nw_msgs: list = [] | |
| results = [] | |
| ignition_sets: list = [] # fired-node set per turn | |
| deposit_ids: list = [] # deposit node_id per turn (may be None) | |
| pith_ids_per_turn: list = [] # pith-selected node_ids per turn | |
| # ── Phase A pool — per-run stratified sampling ────────────────────── | |
| # Replaces the hardcoded module-level INTERLEAVED_QUESTIONS (4 categories | |
| # × 1 Q1/Q2 pair = 8 fixed turns) with a sample drawn from the 80-prompt | |
| # benchmark_pool.yaml (10 categories × 4 Q1/Q2 pairs = 40 pairs total). | |
| # Per-run variance in which 8 pairs get sampled is itself substrate | |
| # diversification — different threads / categories / complexity registers | |
| # dominate each run, which the canonical co-firing-discovery and | |
| # predictive-coding mechanisms need to fire (Run 42 sidecar diagnosis). | |
| # | |
| # Stratification discipline (enforced in benchmark_loader.sample_pairs): | |
| # - max 2 pairs per category (no category dominates) | |
| # - ≥3 threads with 2+ instances (cross-category co-firing) | |
| # - ≥3 distinct complexity levels (gradient hit each run) | |
| # | |
| # Returns 16 turns (8 Q1s indexed 0..7, then 8 matching Q2s indexed 8..15) | |
| # and same-cat pairs [(0,8), (1,9), ..., (7,15)]. Shadows the module-level | |
| # INTERLEAVED_QUESTIONS / _INTERLEAVED_SAME_CAT_PAIRS for this function's | |
| # scope; downstream code reads the locals via Python scoping. | |
| # Load the full pool once — passed to the sampler AND used downstream to | |
| # build category centroids from ALL 80 prompts (not just the per-run | |
| # sampled subset). Run 43 surfaced the bug where centroids built from | |
| # `INTERLEAVED_QUESTIONS` (per-run sample of 6-8 cats) caused old | |
| # substrate nodes from non-sampled categories to be force-mapped to | |
| # whatever centroid was closest, garbling the diagnostic metrics. | |
| _full_benchmark_pool = _load_benchmark_pool() | |
| _pool_interleaved, _pool_same_cat_pairs, _pool_meta = _sample_benchmark_chains( | |
| pool=_full_benchmark_pool, | |
| n_chains=8, | |
| ) | |
| INTERLEAVED_QUESTIONS = _pool_interleaved | |
| _INTERLEAVED_SAME_CAT_PAIRS = _pool_same_cat_pairs | |
| _pool_summary = _describe_benchmark_sample(_pool_meta) | |
| logger.info( | |
| "Phase B pool sampled: %d chains / %d turns | cats=%s | threads=%s", | |
| _pool_summary["n_chains"], | |
| _pool_summary["n_turns"], | |
| _pool_summary["categories_sampled"], | |
| _pool_summary["threads_sampled"], | |
| ) | |
| # Per-turn category labels (parallel to deposit_ids). Used by the | |
| # category-match heatmap to ask "did j's pith pull ANY same-category | |
| # prior deposit" rather than the strict exact-id match. | |
| categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS] | |
| # Cross-run category registry on the organism. Maps node_id -> category | |
| # for deposits this benchmark has tagged. Kept as a diagnostic — its | |
| # size proves the persistence path works — but the heatmap no longer | |
| # depends on it (Run 15 showed the substrate is stable but pith pulls | |
| # nodes that predate the registry, so registry-based tagging can never | |
| # match). Lazy-init on first use. | |
| if not hasattr(nw_organism, "_benchmark_category_registry"): | |
| nw_organism._benchmark_category_registry = {} | |
| cat_registry = nw_organism._benchmark_category_registry | |
| # ── Option G: similarity-based categorization (full-pool centroids) ─ | |
| # Build one centroid per category by averaging the embeddings of ALL | |
| # prompts in the FULL pool's q1 and q2 layers (8 prompts per category | |
| # × 10 categories = 80 embeddings, 10 centroids). Earlier this iterated | |
| # only over the per-run-sampled INTERLEAVED_QUESTIONS, which produced | |
| # centroids for just 6-8 categories — substrate nodes from non-sampled | |
| # categories (physics, biology, math, etc. when they weren't in the | |
| # current run's draw) got force-mapped to whatever centroid happened to | |
| # be closest, garbling per-turn category diagnostics (Run 43 surfaced | |
| # this — gravitational-collapse nodes were tagged "music," prime- | |
| # factorization nodes tagged "computing," etc.). | |
| # | |
| # Building from the full pool means tagging is stable regardless of | |
| # which subset gets sampled this run, AND the centroid quality is | |
| # better (8 prompts averaged per category vs 2 in the old benchmark). | |
| # Cost: ~10 seconds of embedding at benchmark startup; one-time per run. | |
| _category_centroids: dict = {} | |
| _CATEGORY_SIM_THRESHOLD = 0.30 # cosine sim floor to assign category | |
| try: | |
| _per_cat_embs: dict = {} | |
| for _layer_key in ("q1_layer", "q2_layer"): | |
| for _entry in _full_benchmark_pool.get(_layer_key, []): | |
| _emb = np.asarray( | |
| nw_organism._embed_fn(_entry["text"]), dtype=np.float32, | |
| ) | |
| _per_cat_embs.setdefault(_entry["category"], []).append(_emb) | |
| for _cat, _embs in _per_cat_embs.items(): | |
| _centroid = np.mean(_embs, axis=0) | |
| _norm = np.linalg.norm(_centroid) + 1e-8 | |
| _category_centroids[_cat] = _centroid / _norm | |
| logger.info( | |
| "Built %d category centroids from full pool " | |
| "(%d prompts averaged per centroid)", | |
| len(_category_centroids), | |
| sum(len(v) for v in _per_cat_embs.values()) // max(1, len(_per_cat_embs)), | |
| ) | |
| except Exception as exc: | |
| logger.warning("Category centroid build failed: %s", exc) | |
| def _categorize_node(node_id: str) -> Optional[str]: | |
| """Return best-matching category for a node, or None. | |
| Looks up the node's stored embedding in the organism's side-table, | |
| computes cosine similarity to each category centroid, returns the | |
| category with maximum similarity if it exceeds the threshold. | |
| Threshold prevents off-topic substrate nodes (e.g. residue from | |
| unrelated chat sessions) from being shoehorned into a category. | |
| """ | |
| if not _category_centroids: | |
| return None | |
| emb = nw_organism._embeddings.get(node_id) | |
| if emb is None: | |
| return None | |
| norm = np.linalg.norm(emb) + 1e-8 | |
| emb_n = emb / norm | |
| best_cat = None | |
| best_sim = _CATEGORY_SIM_THRESHOLD | |
| for cat, cent in _category_centroids.items(): | |
| sim = float(np.dot(emb_n, cent)) | |
| if sim > best_sim: | |
| best_sim = sim | |
| best_cat = cat | |
| return best_cat | |
| N = len(INTERLEAVED_QUESTIONS) | |
| for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS): | |
| # Baseline — raw model, full interleaved history, no optimization | |
| bl_msgs.append({"role": "user", "content": prompt_text}) | |
| prompt_bl = tokenizer.apply_chat_template( | |
| [{"role": "system", "content": system_prompt}] + bl_msgs, | |
| tokenize=False, add_generation_prompt=True, | |
| ) | |
| resp_bl, in_bl, out_bl, time_bl, tps_bl, _ = do_generate(prompt_bl, max_new_tokens=128) | |
| bl_msgs.append({"role": "assistant", "content": resp_bl}) | |
| # NuWave — full organism path with rich logging | |
| nw_msgs.append({"role": "user", "content": prompt_text}) | |
| deposit_nid = nw_organism.deposit_experience(prompt_text) | |
| step_result = nw_organism.step() | |
| kiss_extract = nw_organism.kiss_extract(step_result) | |
| # Retrieval dispatch — pith_extract uses amplitude Born-rule | |
| # scoring; surface_extract uses CES voltage+recency scoring. | |
| # Run 11 oracle test showed amp² suppresses trees (0.5²=0.25 | |
| # vs forest 1.0²=1.0, 4× disadvantage). Surfacing mode lets | |
| # us test whether SNN-native dynamics avoid that bottleneck. | |
| if surfacing_mode == "surface": | |
| pith_context, pith_ids = nw_organism.surface_extract_with_ids( | |
| prompt_text, max_context=5, | |
| ) | |
| else: | |
| pith_context, pith_ids = nw_organism.pith_extract_with_ids( | |
| prompt_text, max_context=5, | |
| ) | |
| # Capture substrate internals BEFORE record_outcome (which runs | |
| # additional graph.step() calls that would pollute the fired set). | |
| ignition_sets.append(set(step_result.get('fired_nodes', []))) | |
| deposit_ids.append(deposit_nid) | |
| pith_ids_per_turn.append(list(pith_ids)) | |
| # Register this deposit's category so future pith pulls can be | |
| # category-tagged across runs (setdefault so we don't overwrite | |
| # if the same node_id is somehow re-deposited). | |
| if deposit_nid: | |
| cat_registry.setdefault(deposit_nid, category) | |
| kiss_r = nw_kiss_inst.filter_context(nw_msgs, system_prompt) | |
| sys_ctx = kiss_r.get("system_context", system_prompt) | |
| # Pith does NOT go in sys_ctx — see feedback_pith_presentation_layer | |
| # memory. It goes in a labeled context block in the last user turn | |
| # below for THIS turn's prompt only; nw_msgs stays clean so next | |
| # turn's recent_window isn't polluted with this turn's pith. | |
| recent_window = 6 | |
| recent = nw_msgs[-recent_window:] if len(nw_msgs) > recent_window else nw_msgs | |
| if pith_context: | |
| pith_block = "\n".join(f" - {p}" for p in pith_context) | |
| enriched_query = ( | |
| "Some context that may be relevant (recalled from earlier " | |
| "related conversations; these are reference material, not " | |
| "questions to answer):\n" | |
| f"{pith_block}\n\n" | |
| f"My actual question: {prompt_text}" | |
| ) | |
| recent = list(recent[:-1]) + [ | |
| {"role": "user", "content": enriched_query} | |
| ] | |
| prompt_nw = tokenizer.apply_chat_template( | |
| [{"role": "system", "content": sys_ctx}] + recent if sys_ctx else recent, | |
| tokenize=False, add_generation_prompt=True, | |
| ) | |
| resp_nw, in_nw, out_nw, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128) | |
| nw_msgs.append({"role": "assistant", "content": resp_nw}) | |
| # ── Per-turn correctness signal (Run 33+) ────────────────────────── | |
| # Did this turn's pith pull predominantly USEFUL same-category | |
| # content — excluding self-retrievals (pith ids whose embedding is | |
| # near-identical to the query, i.e., the substrate handing the query | |
| # back at us)? | |
| # | |
| # History: | |
| # - Run 30: hardcoded success=True → inverted ignition asymmetry | |
| # (cross-cat firing harder than same-cat). | |
| # - Run 31 (same-cat ratio threshold ≥ 0.5): ignition flipped sign | |
| # in one run; token regression collapsed +6.1% → +0.51%. | |
| # - Run 32: signal got gamed by question-repetition. Prior-run | |
| # deposits of the same query text are same-category-tagged, so | |
| # ratio = 1.0 on 5/8 turns. Substrate over-LTP'd at 5× normal | |
| # rate (~56K new synapses vs typical ~11K). Token regression | |
| # jumped to +12.1%, wall-clock +8.6% slower. | |
| # | |
| # Self-retrieval gate: for each pith id, cosine similarity between | |
| # its embedding and the current query's embedding. If above | |
| # _SELF_RETRIEVAL_THRESHOLD (0.92), node is a near-identical text | |
| # repeat — counts toward tagged_total but NOT tagged_same. Drives | |
| # ratio DOWN for question-repeat-heavy turns, so canonical STDP | |
| # depresses self-retrieval synapses via LTD over multiple runs. | |
| # | |
| # This is a feedback-path correction (refines the reward signal we | |
| # feed canonical inject_reward), NOT an extraction-path filter — | |
| # pith still goes to the LLM unchanged. Substrate's STDP retrieves | |
| # what it retrieves; we only refine our judgement of "did that | |
| # help" so the canonical reward channel has accurate ground truth | |
| # to learn against. | |
| _SELF_RETRIEVAL_THRESHOLD = 0.92 | |
| _q_emb = np.asarray(nw_organism._embed_fn(prompt_text), dtype=np.float32) | |
| _q_norm = float(np.linalg.norm(_q_emb)) + 1e-8 | |
| _tagged_total = 0 | |
| _tagged_same = 0 | |
| _self_retrievals = 0 | |
| for _pid in pith_ids: | |
| _tag = _categorize_node(_pid) | |
| _node_emb = nw_organism._embeddings.get(_pid) | |
| _is_self = False | |
| if _node_emb is not None: | |
| _node_norm = float(np.linalg.norm(_node_emb)) + 1e-8 | |
| _cos_to_query = float( | |
| np.dot(_q_emb, _node_emb) / (_q_norm * _node_norm) | |
| ) | |
| _is_self = _cos_to_query > _SELF_RETRIEVAL_THRESHOLD | |
| if _is_self: | |
| _self_retrievals += 1 | |
| # Skip only when BOTH untaggable AND not self-retrieval (no signal) | |
| if _tag is None and not _is_self: | |
| continue | |
| _tagged_total += 1 | |
| # Same-cat credit only if tag matches AND not a self-retrieval | |
| if _tag == category and not _is_self: | |
| _tagged_same += 1 | |
| if _tagged_total >= 2: | |
| _same_cat_ratio = _tagged_same / _tagged_total | |
| success_signal = _same_cat_ratio >= 0.5 | |
| else: | |
| _same_cat_ratio = None | |
| success_signal = True # neutral / cold-start | |
| # Phase B+1 (Run 46+) — response-quality gate. If BitNet's output | |
| # was degenerate (repeated tokens, chat-template fragments, low | |
| # unique-token ratio), force success_signal=False so the substrate | |
| # gets LTD on whatever co-fired during this turn — including the | |
| # synapses that LED to surfacing the junk pith that bloated the | |
| # prompt. Closes the substrate-quality feedback gap surfaced by | |
| # Run 45's T8 anomaly (605s NuWave generation on a pith with 3 | |
| # degenerate resp_* nodes; record_outcome rewarded the bad path). | |
| _response_degenerate = _response_is_degenerate(resp_nw) | |
| if _response_degenerate: | |
| success_signal = False | |
| nw_organism.record_outcome(prompt_text, resp_nw, success=success_signal) | |
| # Phase 2 (scoped multi-channel substrate feedback) was attempted | |
| # in commits 468fd09, ab0fdd3, e4dd297 then removed 2026-04-27. | |
| # The architectural idea (substrate-feedback-via-inject_reward) is | |
| # canonical Substrate Authority Pattern and remains correct, but | |
| # all three implementations had bugs that made them either no-op | |
| # or actively harmful: stimulate residual voltage created positive | |
| # feedback loops, Channel 3 collective penalty had wrong signal- | |
| # to-scope binding, prime_and_propagate(currents=1.0) didn't fire | |
| # seeds, and concurrent-modification races crashed 3/8 turns. | |
| # See feedback_substrate_representation_first.md — Phase 2 redesign | |
| # is deferred until representation work (discover_hyperedges hook, | |
| # type-aware retrieval scoring with expert decay) gives the | |
| # substrate the structural inductive biases that make relevance | |
| # learnable in the first place. | |
| # Drain the concept queue before the next turn — makes tree | |
| # extraction synchronous for benchmark reproducibility. Without | |
| # this, q2's Pith might or might not see q1's trees depending | |
| # on how fast the manager pulsed. Skip drain when dual-pass | |
| # is disabled — nothing to drain, and no overhead required. | |
| if enable_dual_pass: | |
| drain_t0 = time.time() | |
| drained = nw_organism.wait_for_trees(timeout=180.0) | |
| drain_elapsed = round(time.time() - drain_t0, 2) | |
| else: | |
| drained = True | |
| drain_elapsed = 0.0 | |
| org_stats = nw_organism.get_stats() | |
| # Capture the extracted tree concepts for THIS turn's forest — | |
| # walk graph metadata for nodes tagged forest=deposit_nid. | |
| # Post-drain so these are complete and stable. Gives us | |
| # ground-truth visibility into what the extractor actually | |
| # produced vs. what the prompt asked for. Critical diagnostic | |
| # for specificity tuning. Safe to read nodes under the graph | |
| # lock (trees already committed). | |
| trees_for_turn = [] | |
| if enable_dual_pass: | |
| try: | |
| with nw_organism._graph_lock: | |
| for nid, node in nw_organism._graph.nodes.items(): | |
| if node.metadata.get("forest") == deposit_nid: | |
| concept = nw_organism._node_content.get(nid, "") | |
| if concept: | |
| trees_for_turn.append(concept) | |
| except Exception as exc: | |
| logger.debug("Tree capture failed for turn %d: %s", i + 1, exc) | |
| # Raw extractor output for THIS turn — lets us see exactly | |
| # what Falcon3-10B-1.58bit emitted vs. what the parser kept. | |
| # If trees=[] but raw_output looks reasonable, parser is | |
| # over-filtering. If raw_output is garbage, it's a prompt | |
| # or model-adherence issue. | |
| extraction_detail = _last_extractions.get(prompt_text, {}) if enable_dual_pass else {} | |
| raw_output = extraction_detail.get("raw_output", "")[:500] | |
| extractor_elapsed = extraction_detail.get("elapsed_s", 0.0) | |
| # Qualitative content — pair each surfaced text with the | |
| # category our centroid-similarity lookup tagged it as. Lets us | |
| # eyeball "is this actually biology content for a biology query" | |
| # without running another similarity pass at heatmap-build time. | |
| # Truncate text to 200 chars so the JSON stays readable. | |
| _surfaced_context = [] | |
| for _idx, _pid in enumerate(pith_ids): | |
| _text = pith_context[_idx] if _idx < len(pith_context) else "" | |
| _surfaced_context.append({ | |
| "id": _pid, | |
| "category_tagged": _categorize_node(_pid), | |
| "text": (_text[:200] + ("..." if len(_text) > 200 else "")), | |
| }) | |
| results.append({ | |
| "turn": i + 1, | |
| "category": category, | |
| "q_num": 1 if i < 4 else 2, | |
| "prompt": prompt_text, | |
| "baseline": {"tokens": in_bl, "time": time_bl, "tok_s": tps_bl}, | |
| "nuwave": {"tokens": in_nw, "time": time_nw, "tok_s": tps_nw}, | |
| "tokens_saved": max(0, in_bl - in_nw), | |
| "time_saved": round(max(0, time_bl - time_nw), 2), | |
| "deposit_node_id": deposit_nid, | |
| "ignition_size": len(ignition_sets[i]), | |
| "pith_ids": list(pith_ids), | |
| "surfaced_context": _surfaced_context, | |
| "trees": trees_for_turn, | |
| "raw_extractor_output": raw_output, | |
| "extractor_elapsed_s": extractor_elapsed, | |
| "substrate_nodes": org_stats.get('nodes', 0), | |
| "substrate_synapses": org_stats.get('synapses', 0), | |
| "substrate_hyperedges": org_stats.get('hyperedges', 0), | |
| "tree_drain_s": drain_elapsed, | |
| "tree_drained": drained, | |
| # Run 31+ correctness-signal telemetry — what we fed the substrate | |
| # via record_outcome's success arg this turn, and the underlying | |
| # same-category proportion. ratio is None when fewer than 2 pith | |
| # ids were taggable (cold-start neutral). pith_self_retrievals | |
| # added Run 33+: count of pith ids with cosine > 0.92 to query | |
| # (substrate handing the query back) — these count as misses. | |
| "success_signal": success_signal, | |
| "pith_same_cat_ratio": _same_cat_ratio, | |
| "pith_self_retrievals": _self_retrievals, | |
| # Phase B+1 telemetry — Run 46+. Tracks whether BitNet's | |
| # output this turn was degenerate (forced success_signal | |
| # False). Watch cross-run count: should drop over runs as | |
| # substrate LTDs degenerate-producing pathways. | |
| "response_quality": "degenerate" if _response_degenerate else "clean", | |
| # Run 41+ predictive-coding telemetry — surface what step_result | |
| # already carries about predictions plus a snapshot of active | |
| # predictions on the graph. If all 0/0/0 across all turns, the | |
| # canonical predictive-coding loop is dormant (gated by | |
| # prediction_threshold per audit task #12). | |
| "predictions_confirmed": int(step_result.get("predictions_confirmed", 0) or 0), | |
| "predictions_surprised": int(step_result.get("predictions_surprised", 0) or 0), | |
| "active_predictions_count": len(getattr( | |
| getattr(nw_organism, "_graph", None), "active_predictions", {} | |
| ) or {}), | |
| }) | |
| # ── Heatmap A: ignition-set Jaccard overlap (symmetric) ── | |
| mat_A = np.zeros((N, N)) | |
| for i in range(N): | |
| for j in range(N): | |
| s1, s2 = ignition_sets[i], ignition_sets[j] | |
| if not s1 or not s2: | |
| continue | |
| mat_A[i, j] = len(s1 & s2) / max(1, len(s1 | s2)) | |
| # ── Heatmap B (strict exact-id, kept for legacy stat) ── | |
| # Did turn j's Pith select turn i's specific deposit? Only causally | |
| # valid when i < j. Run 13 showed this is too strict — accumulated | |
| # prior-run nodes drown out fresh deposits in the Pith cut. | |
| mat_B = np.zeros((N, N)) | |
| for j in range(N): | |
| pith_set = set(pith_ids_per_turn[j]) | |
| for i in range(N): | |
| if i < j and deposit_ids[i] and deposit_ids[i] in pith_set: | |
| mat_B[j, i] = 1.0 | |
| # ── Heatmap B (category-match via similarity tagging — Option G) ── | |
| # Cell (j, i) is bright when turn j's Pith contains ANY node whose | |
| # stored embedding cosine-matches category[i]'s centroid above the | |
| # threshold. Causally valid for all i < j. The similarity-based | |
| # version replaces the prior registry-based logic which could only | |
| # see nodes deposited by runs that called the new code path — | |
| # invisible against a substrate accumulated over many prior runs. | |
| mat_B_cat = np.zeros((N, N)) | |
| for j in range(N): | |
| pith_set = set(pith_ids_per_turn[j]) | |
| for i in range(N): | |
| if i >= j: | |
| continue | |
| target_cat = categories_per_turn[i] | |
| for pid in pith_set: | |
| if _categorize_node(pid) == target_cat: | |
| mat_B_cat[j, i] = 1.0 | |
| break | |
def _tick_labels():
    """Build one two-line axis label per recorded turn.

    Line one is ``T<turn>``; line two is the first three characters of the
    turn's category followed by its question number. Reads ``results`` from
    the enclosing scope.
    """
    labels = []
    for row in results:
        labels.append(f"T{row['turn']}\n{row['category'][:3]}{row['q_num']}")
    return labels
def _render(matrix, title, highlight_pairs, xlabel, ylabel):
    """Draw ``matrix`` as an annotated heatmap and return the figure.

    Args:
        matrix: 2-D array read as ``matrix[row, col]`` for rows/cols
            ``0..N-1`` (``N`` comes from the enclosing scope).
        title: Text placed above the plot.
        highlight_pairs: Iterable of ``(row, col)`` cells to outline in red.
        xlabel: Label for the x axis (columns).
        ylabel: Label for the y axis (rows).

    Returns:
        The matplotlib figure. It is removed from pyplot's figure registry
        before being returned, so repeated benchmark runs do not pile up
        open figures; the returned object can still be rendered or saved.
    """
    fig, ax = plt.subplots(figsize=(8, 7))

    # Colour scale tops out at the largest cell; an all-zero matrix falls
    # back to 1.0 so the vmin/vmax range is never degenerate.
    peak = matrix.max()
    vmax = peak if peak > 0 else 1.0
    im = ax.imshow(matrix, cmap='viridis', vmin=0, vmax=vmax)

    # Both axes carry the same per-turn labels — build them once.
    labels = _tick_labels()
    ax.set_xticks(range(N))
    ax.set_yticks(range(N))
    ax.set_xticklabels(labels, fontsize=8)
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontsize=10)

    # Attach the colorbar through the figure we own rather than through
    # pyplot's implicit "current figure".
    fig.colorbar(im, ax=ax, fraction=0.04)

    # Red boxes on cells where we EXPECT brightness. imshow centres cell
    # (row, col) at x=col, y=row, hence the swapped order and -0.5 offset.
    for (ii, jj) in highlight_pairs:
        ax.add_patch(plt.Rectangle(
            (jj - 0.5, ii - 0.5), 1, 1,
            fill=False, edgecolor='red', linewidth=2,
        ))

    # Value annotations — only non-zero cells; white text on the lower half
    # of the colour scale, black on the upper half.
    for ii in range(N):
        for jj in range(N):
            v = matrix[ii, jj]
            if v > 0:
                ax.text(jj, ii, f"{v:.2f}", ha='center', va='center',
                        fontsize=6, color='white' if v < vmax * 0.5 else 'black')

    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is
    # closed; drop that reference so the figure is freed once the caller
    # is done with it.
    plt.close(fig)
    return fig
| # Heatmap A highlight: both directions of same-category pair | |
| pairs_A = [] | |
| for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS: | |
| pairs_A.extend([(i, j), (j, i)]) | |
| fig_A = _render( | |
| mat_A, | |
| "Ignition Overlap — Jaccard(fired_i, fired_j)\n" | |
| "Red boxes mark expected bright cells (same-category q1 ↔ q2)", | |
| pairs_A, | |
| xlabel="Turn j", ylabel="Turn i", | |
| ) | |
| # Heatmap B highlight: only causal (j > i) same-category pairs | |
| pairs_B = [(j, i) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS] | |
| fig_B = _render( | |
| mat_B_cat, | |
| "Pith Category-Match — did turn j's Pith pull ANY same-category node?\n" | |
| "Red boxes mark same-category q1→q2 pairs. Bright off-diagonal = " | |
| "category leak; bright on-diagonal = category-coherent retrieval.", | |
| pairs_B, | |
| xlabel="Turn i (category target)", ylabel="Turn j (pith extract)", | |
| ) | |
| # Summary metrics | |
| same_A = [mat_A[i, j] for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS] | |
| cross_A = [] | |
| for i in range(N): | |
| for j in range(i + 1, N): | |
| if (i, j) not in _INTERLEAVED_SAME_CAT_PAIRS: | |
| cross_A.append(mat_A[i, j]) | |
| # Heatmap B: same-category causal cells are (j, i) where j = q2_turn, | |
| # i = q1_turn. Count ONLY those 4 cells — that's the re-ignition | |
| # signal we actually care about, not the total-reselects-across-all-cells | |
| # that mat_B.sum() produces (previous reporting conflated the two). | |
| same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS) | |
| # Category-match via similarity (Option G): for each q2 turn, did its | |
| # pith contain ANY node whose embedding cosine-matches the turn's | |
| # category centroid above threshold? This is the metric that actually | |
| # answers "is the substrate doing category-coherent retrieval" — | |
| # works on the entire substrate, not just nodes the registry has seen. | |
| q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS] | |
| same_cat_pith_hits = 0 | |
| for j in q2_turns: | |
| pith_set = set(pith_ids_per_turn[j]) | |
| j_cat = categories_per_turn[j] | |
| if any(_categorize_node(pid) == j_cat for pid in pith_set): | |
| same_cat_pith_hits += 1 | |
| same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns)) | |
| # Off-diagonal "category leak" diagnostic: how often did a q2 pith | |
| # pull a node tagged with a DIFFERENT category? Lower is cleaner | |
| # separation. Untaggable nodes (no embedding, or below threshold) do | |
| # not count as leaks. | |
| cross_cat_leaks = 0 | |
| for j in q2_turns: | |
| pith_set = set(pith_ids_per_turn[j]) | |
| j_cat = categories_per_turn[j] | |
| for pid in pith_set: | |
| tagged = _categorize_node(pid) | |
| if tagged is not None and tagged != j_cat: | |
| cross_cat_leaks += 1 | |
| break | |
| # End-state substrate diagnostics — pair with the _start_ values | |
| # captured at benchmark entry so consumers can confirm both A and B | |
| # runs started from the same substrate topology. | |
| _end_stats = nw_organism.get_stats() | |
| summary = { | |
| "model": MODEL_NAME, | |
| "interleaved_turns": N, | |
| # Toggle state — critical for A/B attribution. Comparing results | |
| # across enable_dual_pass=True vs =False is only meaningful when | |
| # both runs started from the same substrate state (substrate_ | |
| # nodes_start below should match between paired runs). | |
| "dual_pass_enabled": enable_dual_pass, | |
| "oracle_trees": oracle_trees, | |
| "surfacing_mode": surfacing_mode, | |
| "substrate_nodes_start": _start_nodes, | |
| "substrate_nodes_end": _end_stats.get('nodes', 0), | |
| "substrate_synapses_start": _start_synapses, | |
| "substrate_synapses_end": _end_stats.get('synapses', 0), | |
| # Run 41+ predictive-coding diagnostic — cumulative counters from the | |
| # canonical Graph. If `predictions_made_during_run = 0` even at | |
| # benchmark scale, the predictive-coding loop is dormant (gated by | |
| # prediction_threshold per audit task #12) and the surprise-driven | |
| # intrinsic reward broadcast (canonical neuro_foundation:2549) never | |
| # fires. This is the empirical confirmation gate before any config | |
| # graduation work. | |
| "predictions_made_during_run": int(getattr( | |
| getattr(nw_organism, "_graph", None), "_total_predictions_made", 0, | |
| ) or 0) - _start_total_predictions_made, | |
| "predictions_surprised_during_run": int(getattr( | |
| getattr(nw_organism, "_graph", None), "_total_surprised", 0, | |
| ) or 0) - _start_total_surprised, | |
| # Phase A pool sample metadata — what got drawn this run. | |
| # Lets us correlate per-run substrate behavior with which threads / | |
| # categories / complexity levels were actually exercised. | |
| "pool_sample": _pool_summary, | |
| "baseline_total_tokens": sum(r["baseline"]["tokens"] for r in results), | |
| "nuwave_total_tokens": sum(r["nuwave"]["tokens"] for r in results), | |
| "tokens_saved": sum(max(0, r["baseline"]["tokens"] - r["nuwave"]["tokens"]) for r in results), | |
| "baseline_total_time": round(sum(r["baseline"]["time"] for r in results), 2), | |
| "nuwave_total_time": round(sum(r["nuwave"]["time"] for r in results), 2), | |
| "ignition_mean_same_category": round(float(np.mean(same_A)), 4) if same_A else 0, | |
| "ignition_mean_cross_category": round(float(np.mean(cross_A)), 4) if cross_A else 0, | |
| # Same-category re-ignition: did q2 turn pull q1's deposit? 4 pairs. | |
| # (Strict exact-id match. Run 13 confirmed this is too narrow.) | |
| "same_category_pith_reselect": same_cat_B_hits, | |
| "same_category_pith_reselect_total": len(_INTERLEAVED_SAME_CAT_PAIRS), | |
| # Category-match re-ignition: did q2's pith pull ANY same-category | |
| # node (this run OR prior runs)? This is the metric that actually | |
| # measures category-coherent retrieval — the substrate's intended | |
| # behavior. Numerator counts q2 turns 4-7; denominator is 4. | |
| "same_category_pith_hit_rate": round(same_cat_pith_hit_rate, 4), | |
| "same_category_pith_hits": same_cat_pith_hits, | |
| "same_category_pith_hits_total": len(q2_turns), | |
| # Off-diagonal diagnostic: how many q2 turns pulled at least one | |
| # cross-category node? Lower = cleaner category separation. | |
| "cross_category_pith_leaks": cross_cat_leaks, | |
| # Registry size — grows monotonically across runs on this Space. | |
| "category_registry_size": len(cat_registry), | |
| # Total reselects across ALL causal cells (diagnostic, not the | |
| # re-ignition signal — includes cross-category pulls). | |
| "pith_reselect_total_causal": int(mat_B.sum()), | |
| "pith_reselect_total_causal_max": sum(range(N)), # 0+1+2+...+(N-1) = 28 for N=8 | |
| } | |
| # Restore the concept extractor if we disabled it for this run. | |
| # Done here at the end rather than in a finally so the summary | |
| # captures the actual state. If an exception crashes the benchmark | |
| # mid-flight the extractor stays detached until manual re-wiring | |
| # or Space restart — acceptable for a diagnostic tool. | |
| if _saved_extractor is not None: | |
| nw_organism._concept_extractor = _saved_extractor | |
| if oracle_trees: | |
| logger.info("Oracle mode EXITED — LLM extractor restored") | |
| else: | |
| logger.info("Dual-pass RE-ENABLED after benchmark") | |
| return ( | |
| json.dumps(summary, indent=2), | |
| json.dumps(results, indent=2), | |
| fig_A, | |
| fig_B, | |
| ) | |
| # ── Gradio App ──────────────────────────────────────────────────── | |
# Top-level Gradio UI. Declares the page header and four tabs (Live Chat,
# A/B Benchmark, Interleaved Benchmark, Debug Extract) and wires each button
# to its handler. Components render in the order they are created inside
# their enclosing context manager, so statement order here is layout order.
with gr.Blocks(
    title="NuWave — Your Model Gets Smarter Over Time",
    theme=gr.themes.Soft(),
) as demo:
    # Header banner. f-string: interpolates MODEL_NAME and the splat-layer
    # counts once, when this module-level block executes.
    gr.Markdown(
        f"""
# NuWave — Your Model Gets Smarter Over Time
**Context optimization through compound substrate dynamics.**
- **KISS** filters redundant context — system prompt skipped when unchanged, old history compressed to summary
- **Pith** manages context as a cache hierarchy — clutter stripped, cold entries evicted, relevant context promoted
- **Splat-Lenia** — weight layers decomposed to Gaussian splats, Lenia dynamics evolve them between turns
Model: `{MODEL_NAME}` | Inference: CPU | Splat layers: {len(splat_layers)} | Total splats: {sum(s.n_splats for s in splat_layers.values()) if splat_layers else 0}
"""
    )
    with gr.Tabs():
        # ── Tab 1: interactive chat ──
        with gr.Tab("Live Chat"):
            chatbot = gr.Chatbot(height=400, type="messages")
            stats_display = gr.Markdown("*Send a message to see NuWave metrics*")
            with gr.Row():
                msg = gr.Textbox(placeholder="Type a message...", show_label=False, scale=4)
                send_btn = gr.Button("Send", scale=1)
                reset_btn = gr.Button("Reset", scale=1)
            # Button click and Enter-in-textbox share one handler; its three
            # outputs update the textbox, the chat history and the stats panel.
            send_btn.click(on_send, [msg, chatbot], [msg, chatbot, stats_display])
            msg.submit(on_send, [msg, chatbot], [msg, chatbot, stats_display])
            reset_btn.click(on_reset, outputs=[chatbot, stats_display])
        # ── Tab 2: sequential baseline-vs-NuWave benchmark ──
        with gr.Tab("A/B Benchmark"):
            gr.Markdown(
                """
### Baseline vs NuWave
Same conversation, same model, same CPU. Baseline sends full context every turn.
NuWave compresses history and skips redundant system context.
Watch: tokens decrease, time decreases, KISS efficiency climbs.
"""
            )
            with gr.Row():
                num_turns = gr.Slider(minimum=3, maximum=15, value=8, step=1, label="Turns")
                run_btn = gr.Button("Run Benchmark", variant="primary")
            summary_output = gr.Code(label="Summary", language="json")
            curve_output = gr.Code(label="Per-Turn Data", language="json")
            # on_benchmark receives the slider value and fills the two JSON panes.
            run_btn.click(on_benchmark, [num_turns], [summary_output, curve_output])
        # ── Tab 3: interleaved-category benchmark with heatmaps ──
        with gr.Tab("Interleaved Benchmark"):
            # NOTE(review): the "Heatmap B" paragraph below describes
            # exact-deposit re-selection, but the interleaved benchmark code
            # earlier in this file appears to return a fig_B rendered from
            # mat_B_cat (category match). Confirm this description and the
            # "Pith Re-selection" plot label are still accurate.
            gr.Markdown(
                """
### Topology Re-ignition Test
Four semantic categories, two questions each, interleaved.
| Turn | Category | Question |
|------|-----------|----------|
| 1 | biology | photosynthesis q1 |
| 2 | physics | black holes q1 |
| 3 | computing | CPU caches q1 |
| 4 | math | prime numbers q1 |
| 5 | biology | chlorophyll q2 |
| 6 | physics | event horizon q2 |
| 7 | computing | L1 split q2 |
| 8 | math | cryptography q2 |
Turns 1-4 seed four semantic neighborhoods in the substrate.
Turns 5-8 ask a follow-up in each — but each follow-up's
*matching* primer is 4 turns back, with 3 unrelated turns
in between. A recency-only system fails this test. A
substrate-informed bucket should re-light the matching
neighborhood via Born-rule interference despite the gap.
**Heatmap A** — Jaccard overlap of fired-node sets between
every pair of turns. Red boxes mark the same-category q1
↔ q2 pairs we expect to see light up.
**Heatmap B** — Did turn *j*'s Pith selection pull turn
*i*'s deposit back into context? Red boxes mark the four
causal same-category cells. Bright red cells = substrate
memory working.
"""
            )
            gr.Markdown(
                """
**A/B toggle:** Uncheck to disable the dual-pass concept
helper for this run. For a clean comparison, run the same
starting substrate through both toggle states back-to-back.
The summary includes `substrate_nodes_start` so you can
confirm both runs began from the same state.
"""
            )
            with gr.Row():
                inter_enable_dualpass = gr.Checkbox(
                    value=True,
                    label="Enable dual-pass concept helper",
                )
                inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")
            gr.Markdown(
                """
**Oracle Trees (ceiling test):** Run once with hand-authored
ideal mechanism concepts instead of the LLM extractor. Tests
whether dual-pass CAN succeed given perfect trees — regardless
of extractor quality. If ignition metrics dramatically exceed
the no-tree baseline, the extractor is the bottleneck.
If not, dual-pass itself is the dead end. Only works with
the 8 interleaved benchmark prompts.
**CES Surfacing (architecture test):** Swaps the Born-rule
amplitude² scoring (which suppresses trees 4:1) for CES
voltage+recency×excitability scoring. Tests whether SNN-
native dynamics avoid the amplitude bottleneck. Pairs well
with Oracle Trees to isolate the scoring-layer effect
from the extractor-quality effect.
"""
            )
            with gr.Row():
                oracle_btn = gr.Button(
                    "Run with Oracle Trees (Pith scoring)",
                    variant="secondary",
                )
                surface_btn = gr.Button(
                    "Run with CES Surfacing (LLM trees)",
                    variant="secondary",
                )
                oracle_surface_btn = gr.Button(
                    "Run Oracle + CES Surfacing",
                    variant="secondary",
                )
            inter_summary = gr.Code(label="Summary", language="json")
            inter_per_turn = gr.Code(label="Per-Turn Data", language="json")
            with gr.Row():
                inter_heatmap_a = gr.Plot(label="Ignition Overlap")
                inter_heatmap_b = gr.Plot(label="Pith Re-selection")
            # All four buttons call on_interleaved_benchmark with three
            # positional arguments and write to the same four outputs.
            # Presumably the arguments are (enable_dual_pass, oracle_trees,
            # surfacing_mode) — confirm against the handler's signature.
            # Only the main button reads the checkbox; the other three
            # hard-code their flags.
            inter_btn.click(
                lambda enable: on_interleaved_benchmark(enable, False, "pith"),
                inputs=[inter_enable_dualpass],
                outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
            )
            oracle_btn.click(
                lambda: on_interleaved_benchmark(True, True, "pith"),
                inputs=[],
                outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
            )
            surface_btn.click(
                lambda: on_interleaved_benchmark(True, False, "surface"),
                inputs=[],
                outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
            )
            oracle_surface_btn.click(
                lambda: on_interleaved_benchmark(True, True, "surface"),
                inputs=[],
                outputs=[inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b],
            )
        # ── Tab 4: concept-extractor diagnostic ──
        with gr.Tab("Debug Extract"):
            gr.Markdown(
                """
### Concept extraction diagnostic
Runs the BitNet concept extractor against all 8 interleaved-
benchmark questions and reports what actually comes out.
Use this **before** running A/B benchmarks to verify
extraction quality — if concepts are generic, hallucinated,
or structurally malformed, downstream measurements are noise.
**Three views of the output:**
- **Summary** — overall counts, median concepts per question,
whether any generations hit the token cap (suggests the
model didn't produce a natural stop and may have launched
into an explanation), and per-category **same-category bridge
analysis**: do q1 and q2 for the same category share concepts?
That's the direct hypothesis check — if the shared set is
empty for math (prime numbers ↔ cryptography), no amount of
dual-pass will help.
- **Per-Question Data** — for each question: raw model output
(before parsing), parsed concepts, tokens in/out, wall-time.
Eyeball the raw output to catch hallucinated answers and
the parsed list to judge concept specificity.
- **Pairwise Overlap** — which question pairs share concepts.
If every question shares "thing" / "concept" / "process",
the extractor is producing generic pollution.
**Cost:** ~30-40s per extraction × 8 = 4-6 minutes total.
"""
            )
            debug_btn = gr.Button("Run Debug Extraction", variant="primary")
            debug_summary = gr.Code(label="Summary + Same-Category Bridges", language="json")
            debug_per_question = gr.Code(label="Per-Question Raw + Parsed", language="json")
            debug_pairwise = gr.Code(label="Pairwise Concept Overlap (cross-category pollution signal)", language="json")
            # on_debug_extract takes no inputs and fills the three JSON panes.
            debug_btn.click(
                on_debug_extract,
                inputs=[],
                outputs=[debug_summary, debug_per_question, debug_pairwise],
            )
# Script entry point: only when this file is executed directly, start the
# Gradio server for `demo`, listening on 0.0.0.0 with ssr_mode turned off.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", ssr_mode=False)