Executor-Tyrant-Framework committed on
Commit
1a60a7b
·
verified ·
1 Parent(s): 6cb9c01

Sync from GitHub: f14bff5f1a4cd6df770e238630bf7f96e869e925

Browse files
Files changed (1) hide show
  1. app.py +56 -10
app.py CHANGED
@@ -211,8 +211,21 @@ def do_generate(prompt_text: str, max_new_tokens: int = 256) -> tuple:
211
  # backend because BitNet requirements pinned torch 2.2 < the 2.4
212
  # transformers wants. We don't need tensors anyway; len() on the
213
  # input_ids list is all we want.
214
- encoded = tokenizer(prompt_text, truncation=True, max_length=4096)
 
 
 
 
 
 
 
 
215
  in_count = len(encoded["input_ids"])
 
 
 
 
 
216
 
217
  organism.mark_generation_start()
218
  try:
@@ -325,7 +338,14 @@ def _hardened_parse(raw_output: str) -> list:
325
  continue
326
  if "<|" in c or "</s>" in c or "</" in c:
327
  continue
328
- if len(c.split()) > 4:
 
 
 
 
 
 
 
329
  continue
330
  cl = c.lower().strip()
331
  if cl in _EXTRACTOR_STOPSET:
@@ -352,15 +372,20 @@ def _hardened_parse(raw_output: str) -> list:
352
  # portion, because small LLMs echo instruction vocabulary back as
353
  # output content.
354
  _EXTRACTOR_PROMPT_TEMPLATE = (
355
- "Read the following text. Extract the specific mechanisms, "
356
- "operations, and relationships it establishes — the things the "
357
- "text says happen, connect, or depend on each other. Prefer "
358
- "specific over general: 'prime factorization' not 'number theory', "
359
- "'membrane depolarization' not 'biology'. Output as a comma-"
360
- "separated enumeration. Each item 1-4 words. No sentences, no "
361
- "explanations, no repetition.\n\n"
 
 
 
 
 
362
  "Text: {text}\n\n"
363
- "Mechanisms:"
364
  )
365
 
366
 
@@ -952,6 +977,26 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
952
  drain_elapsed = 0.0
953
 
954
  org_stats = nw_organism.get_stats()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
  results.append({
956
  "turn": i + 1,
957
  "category": category,
@@ -964,6 +1009,7 @@ def on_interleaved_benchmark(enable_dual_pass: bool = True):
964
  "deposit_node_id": deposit_nid,
965
  "ignition_size": len(ignition_sets[i]),
966
  "pith_ids": list(pith_ids),
 
967
  "substrate_nodes": org_stats.get('nodes', 0),
968
  "substrate_synapses": org_stats.get('synapses', 0),
969
  "tree_drain_s": drain_elapsed,
 
211
  # backend because BitNet requirements pinned torch 2.2 < the 2.4
212
  # transformers wants. We don't need tensors anyway; len() on the
213
  # input_ids list is all we want.
214
+ #
215
+ # Truncate with headroom below bitnet.cpp's n_ctx so the runtime
216
+ # has room to generate. Without this, a prompt tokenized to exactly
217
+ # 4096 collides with the client's n_ctx=4096 and the subprocess
218
+ # exits ~1s with zero output (see benchmark runs 2026-04-20 turns
219
+ # 6-8 on both sides). Headroom = max_new_tokens + safety buffer.
220
+ _CTX_HEADROOM = max_new_tokens + 128
221
+ _PROMPT_CAP = max(256, chat_client.n_ctx - _CTX_HEADROOM)
222
+ encoded = tokenizer(prompt_text, truncation=True, max_length=_PROMPT_CAP)
223
  in_count = len(encoded["input_ids"])
224
+ # If truncation occurred, feed the truncated text to the client —
225
+ # otherwise bitnet.cpp will re-tokenize the full original and blow
226
+ # past n_ctx anyway.
227
+ if in_count >= _PROMPT_CAP:
228
+ prompt_text = tokenizer.decode(encoded["input_ids"], skip_special_tokens=False)
229
 
230
  organism.mark_generation_start()
231
  try:
 
338
  continue
339
  if "<|" in c or "</s>" in c or "</" in c:
340
  continue
341
+ # Process-shape enforcement: require 2-4 words. Single-word
342
+ # entries ("gravity", "encryption", "caching") are topic labels,
343
+ # not processes — they have broad embedding footprint and become
344
+ # gravity wells in Pith. Mechanism concepts that actually bridge
345
+ # passages are process-shaped: 2+ words naming an action or
346
+ # dependency. Backstops the prompt's negative examples.
347
+ n_words = len(c.split())
348
+ if n_words < 2 or n_words > 4:
349
  continue
350
  cl = c.lower().strip()
351
  if cl in _EXTRACTOR_STOPSET:
 
372
  # portion, because small LLMs echo instruction vocabulary back as
373
  # output content.
374
  _EXTRACTOR_PROMPT_TEMPLATE = (
375
+ "Read the following text. Extract the specific processes and "
376
+ "dependencies it describes — name each as an action or relationship, "
377
+ "not as a topic label.\n\n"
378
+ "Good examples: 'prime factorization', 'photon absorption', "
379
+ "'cache line invalidation', 'modular exponentiation', "
380
+ "'membrane depolarization'.\n"
381
+ "Bad examples: 'gravity', 'primes', 'caching', 'encryption', "
382
+ "'biology' — these are single-word topics that describe what "
383
+ "exists, not what happens or how things depend on each other.\n\n"
384
+ "Output as a comma-separated enumeration. Each item must be 2-4 "
385
+ "words describing a process or dependency. No sentences, no "
386
+ "explanations, no single-word topic labels, no repetition.\n\n"
387
  "Text: {text}\n\n"
388
+ "Processes:"
389
  )
390
 
391
 
 
977
  drain_elapsed = 0.0
978
 
979
  org_stats = nw_organism.get_stats()
980
+
981
+ # Capture the extracted tree concepts for THIS turn's forest —
982
+ # walk graph metadata for nodes tagged forest=deposit_nid.
983
+ # Post-drain so these are complete and stable. Gives us
984
+ # ground-truth visibility into what the extractor actually
985
+ # produced vs. what the prompt asked for. Critical diagnostic
986
+ # for specificity tuning. Safe to read nodes under the graph
987
+ # lock (trees already committed).
988
+ trees_for_turn = []
989
+ if enable_dual_pass:
990
+ try:
991
+ with nw_organism._graph_lock:
992
+ for nid, node in nw_organism._graph.nodes.items():
993
+ if node.metadata.get("forest") == deposit_nid:
994
+ concept = nw_organism._node_content.get(nid, "")
995
+ if concept:
996
+ trees_for_turn.append(concept)
997
+ except Exception as exc:
998
+ logger.debug("Tree capture failed for turn %d: %s", i + 1, exc)
999
+
1000
  results.append({
1001
  "turn": i + 1,
1002
  "category": category,
 
1009
  "deposit_node_id": deposit_nid,
1010
  "ignition_size": len(ignition_sets[i]),
1011
  "pith_ids": list(pith_ids),
1012
+ "trees": trees_for_turn,
1013
  "substrate_nodes": org_stats.get('nodes', 0),
1014
  "substrate_synapses": org_stats.get('synapses', 0),
1015
  "tree_drain_s": drain_elapsed,