Spaces:

Executor-Tyrant-Framework
/

NuWave

Running

App Files Community

Executor-Tyrant-Framework committed on Apr 20

Commit

1a60a7b

verified ·

1 Parent(s): 6cb9c01

Sync from GitHub: f14bff5f1a4cd6df770e238630bf7f96e869e925

Browse files

Files changed (1) hide show

app.py +56 -10

app.py CHANGED Viewed

@@ -211,8 +211,21 @@ def do_generate(prompt_text: str, max_new_tokens: int = 256) -> tuple:
     # backend because BitNet requirements pinned torch 2.2 < the 2.4
     # transformers wants. We don't need tensors anyway; len() on the
     # input_ids list is all we want.
-    encoded = tokenizer(prompt_text, truncation=True, max_length=4096)
     in_count = len(encoded["input_ids"])
     organism.mark_generation_start()
     try:
@@ -325,7 +338,14 @@ def _hardened_parse(raw_output: str) -> list:
             continue
         if "<|" in c or "</s>" in c or "</" in c:
             continue
-        if len(c.split()) > 4:
             continue
         cl = c.lower().strip()
         if cl in _EXTRACTOR_STOPSET:
@@ -352,15 +372,20 @@ def _hardened_parse(raw_output: str) -> list:
 # portion, because small LLMs echo instruction vocabulary back as
 # output content.
 _EXTRACTOR_PROMPT_TEMPLATE = (
-    "Read the following text. Extract the specific mechanisms, "
-    "operations, and relationships it establishes — the things the "
-    "text says happen, connect, or depend on each other. Prefer "
-    "specific over general: 'prime factorization' not 'number theory', "
-    "'membrane depolarization' not 'biology'. Output as a comma-"
-    "separated enumeration. Each item 1-4 words. No sentences, no "
-    "explanations, no repetition.\n\n"
     "Text: {text}\n\n"
-    "Mechanisms:"
 )
@@ -952,6 +977,26 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
             drain_elapsed = 0.0
         org_stats = nw_organism.get_stats()
         results.append({
             "turn": i + 1,
             "category": category,
@@ -964,6 +1009,7 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
             "deposit_node_id": deposit_nid,
             "ignition_size":   len(ignition_sets[i]),
             "pith_ids":        list(pith_ids),
             "substrate_nodes":    org_stats.get('nodes', 0),
             "substrate_synapses": org_stats.get('synapses', 0),
             "tree_drain_s": drain_elapsed,

     # backend because BitNet requirements pinned torch 2.2 < the 2.4
     # transformers wants. We don't need tensors anyway; len() on the
     # input_ids list is all we want.
+    #
+    # Truncate with headroom below bitnet.cpp's n_ctx so the runtime
+    # has room to generate. Without this, a prompt tokenized to exactly
+    # 4096 collides with the client's n_ctx=4096 and the subprocess
+    # exits after ~1s with zero output (see benchmark runs 2026-04-20 turns
+    # 6-8 on both sides). Headroom = max_new_tokens + safety buffer.
+    _CTX_HEADROOM = max_new_tokens + 128
+    _PROMPT_CAP = max(256, chat_client.n_ctx - _CTX_HEADROOM)
+    encoded = tokenizer(prompt_text, truncation=True, max_length=_PROMPT_CAP)
     in_count = len(encoded["input_ids"])
+    # If truncation occurred, feed the truncated text to the client —
+    # otherwise bitnet.cpp will re-tokenize the full original and blow
+    # past n_ctx anyway.
+    if in_count >= _PROMPT_CAP:
+        prompt_text = tokenizer.decode(encoded["input_ids"], skip_special_tokens=False)
     organism.mark_generation_start()
     try:
             continue
         if "<|" in c or "</s>" in c or "</" in c:
             continue
+        # Process-shape enforcement: require 2-4 words. Single-word
+        # entries ("gravity", "encryption", "caching") are topic labels,
+        # not processes — they have broad embedding footprint and become
+        # gravity wells in Pith. Mechanism concepts that actually bridge
+        # passages are process-shaped: 2+ words naming an action or
+        # dependency. Backstops the prompt's negative examples.
+        n_words = len(c.split())
+        if n_words < 2 or n_words > 4:
             continue
         cl = c.lower().strip()
         if cl in _EXTRACTOR_STOPSET:
 # portion, because small LLMs echo instruction vocabulary back as
 # output content.
 _EXTRACTOR_PROMPT_TEMPLATE = (
+    "Read the following text. Extract the specific processes and "
+    "dependencies it describes — name each as an action or relationship, "
+    "not as a topic label.\n\n"
+    "Good examples: 'prime factorization', 'photon absorption', "
+    "'cache line invalidation', 'modular exponentiation', "
+    "'membrane depolarization'.\n"
+    "Bad examples: 'gravity', 'primes', 'caching', 'encryption', "
+    "'biology' — these are single-word topics that describe what "
+    "exists, not what happens or how things depend on each other.\n\n"
+    "Output as a comma-separated enumeration. Each item must be 2-4 "
+    "words describing a process or dependency. No sentences, no "
+    "explanations, no single-word topic labels, no repetition.\n\n"
     "Text: {text}\n\n"
+    "Processes:"
 )
             drain_elapsed = 0.0
         org_stats = nw_organism.get_stats()
+        # Capture the extracted tree concepts for THIS turn's forest —
+        # walk graph metadata for nodes tagged forest=deposit_nid.
+        # Post-drain so these are complete and stable. Gives us
+        # ground-truth visibility into what the extractor actually
+        # produced vs. what the prompt asked for. Critical diagnostic
+        # for specificity tuning. Safe to read nodes under the graph
+        # lock (trees already committed).
+        trees_for_turn = []
+        if enable_dual_pass:
+            try:
+                with nw_organism._graph_lock:
+                    for nid, node in nw_organism._graph.nodes.items():
+                        if node.metadata.get("forest") == deposit_nid:
+                            concept = nw_organism._node_content.get(nid, "")
+                            if concept:
+                                trees_for_turn.append(concept)
+            except Exception as exc:
+                logger.debug("Tree capture failed for turn %d: %s", i + 1, exc)
         results.append({
             "turn": i + 1,
             "category": category,
             "deposit_node_id": deposit_nid,
             "ignition_size":   len(ignition_sets[i]),
             "pith_ids":        list(pith_ids),
+            "trees":           trees_for_turn,
             "substrate_nodes":    org_stats.get('nodes', 0),
             "substrate_synapses": org_stats.get('synapses', 0),
             "tree_drain_s": drain_elapsed,