Add 18 prebuilt variants and unified eval harness
variants/ holds every (8|16|32)-bit x (none|registers|scratchpad|small|reduced|full)
build (~325 MB total) so users can pull weights from HF without running build.py.
eval_all.py is variant-agnostic: reads each safetensors' manifest, runs the
BatchedFitnessEvaluator, and with --cpu-program also runs an assembled program
through the threshold CPU sized to the variant plus a chained N-bit ALU test
for 16/32-bit data widths.
build.py: fix infer_combinational_inputs N-bit handling. The barrel shifter
case used 1 << (2 - layer), valid only for 3-layer (8-bit) shifters; 16/32-bit
versions have 4-5 layers and crashed at the .inputs step. Priority encoder also
hardcoded 8 inputs and the legacy any_ge naming. Both now parse the bit width
from the gate name and emit correct shift amounts and signal references.
build_all.py orchestrates building + evaluating every named profile.
play.py is a standalone demo (Boolean gates, 8-bit ALU, mod-5, threshold CPU).
- build.py +41 -21
- build_all.py +181 -0
- eval_all.py +613 -0
- play.py +484 -0
- variants/neural_alu16.safetensors +3 -0
- variants/neural_alu32.safetensors +3 -0
- variants/neural_alu8.safetensors +3 -0
- variants/neural_computer16.safetensors +3 -0
- variants/neural_computer16_reduced.safetensors +3 -0
- variants/neural_computer16_registers.safetensors +3 -0
- variants/neural_computer16_scratchpad.safetensors +3 -0
- variants/neural_computer16_small.safetensors +3 -0
- variants/neural_computer32.safetensors +3 -0
- variants/neural_computer32_reduced.safetensors +3 -0
- variants/neural_computer32_registers.safetensors +3 -0
- variants/neural_computer32_scratchpad.safetensors +3 -0
- variants/neural_computer32_small.safetensors +3 -0
- variants/neural_computer8.safetensors +3 -0
- variants/neural_computer8_reduced.safetensors +3 -0
- variants/neural_computer8_registers.safetensors +3 -0
- variants/neural_computer8_scratchpad.safetensors +3 -0
- variants/neural_computer8_small.safetensors +3 -0
|
@@ -2505,7 +2505,7 @@ def infer_error_detection_inputs(gate: str, reg: SignalRegistry) -> List[int]:
|
|
| 2505 |
return [reg.get_id(f"$x[{i}]") for i in range(8)]
|
| 2506 |
|
| 2507 |
|
| 2508 |
-
def infer_combinational_inputs(gate: str, reg: SignalRegistry) -> List[int]:
|
| 2509 |
if 'decoder3to8' in gate:
|
| 2510 |
for i in range(3):
|
| 2511 |
reg.register(f"$sel[{i}]")
|
|
@@ -2550,41 +2550,57 @@ def infer_combinational_inputs(gate: str, reg: SignalRegistry) -> List[int]:
|
|
| 2550 |
return [reg.register(f"combinational.regmux4to1.bit{bit}.and{i}") for i in range(4)]
|
| 2551 |
return []
|
| 2552 |
if 'barrelshifter' in gate:
|
| 2553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2554 |
reg.register(f"$x[{i}]")
|
| 2555 |
-
for i in range(
|
| 2556 |
reg.register(f"$shift[{i}]")
|
| 2557 |
m = re.search(r'layer(\d+)\.bit(\d+)', gate)
|
| 2558 |
if m:
|
| 2559 |
layer, bit = int(m.group(1)), int(m.group(2))
|
| 2560 |
-
shift_amount = 1 << (
|
| 2561 |
-
prefix = f"
|
|
|
|
| 2562 |
if '.not_sel' in gate:
|
| 2563 |
-
return [reg.get_id(f"$shift[{
|
| 2564 |
if '.and_a' in gate:
|
| 2565 |
if layer == 0:
|
| 2566 |
return [reg.get_id(f"$x[{bit}]"), reg.register(f"{prefix}.not_sel")]
|
| 2567 |
else:
|
| 2568 |
-
prev_prefix = f"
|
| 2569 |
return [reg.register(f"{prev_prefix}.or"), reg.register(f"{prefix}.not_sel")]
|
| 2570 |
if '.and_b' in gate:
|
| 2571 |
-
src = (bit + shift_amount) %
|
| 2572 |
if layer == 0:
|
| 2573 |
-
return [reg.get_id(f"$x[{src}]"), reg.get_id(f"$shift[{
|
| 2574 |
else:
|
| 2575 |
-
prev_prefix = f"
|
| 2576 |
-
return [reg.register(f"{prev_prefix}.or"), reg.get_id(f"$shift[{
|
| 2577 |
if '.or' in gate:
|
| 2578 |
return [reg.register(f"{prefix}.and_a"), reg.register(f"{prefix}.and_b")]
|
| 2579 |
-
return [reg.get_id(f"$x[{i}]") for i in range(
|
| 2580 |
if 'priorityencoder' in gate:
|
| 2581 |
-
|
|
|
|
|
|
|
|
|
|
| 2582 |
reg.register(f"$x[{i}]")
|
|
|
|
| 2583 |
if '.any_ge' in gate:
|
| 2584 |
m = re.search(r'any_ge(\d+)', gate)
|
| 2585 |
if m:
|
| 2586 |
pos = int(m.group(1))
|
| 2587 |
-
return [reg.get_id(f"$x[{i}]") for i in range(pos,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2588 |
if '.is_highest' in gate:
|
| 2589 |
m = re.search(r'is_highest(\d+)', gate)
|
| 2590 |
if m:
|
|
@@ -2593,21 +2609,25 @@ def infer_combinational_inputs(gate: str, reg: SignalRegistry) -> List[int]:
|
|
| 2593 |
if pos == 0:
|
| 2594 |
return [reg.get_id("#0")]
|
| 2595 |
else:
|
| 2596 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2597 |
if '.and' in gate:
|
| 2598 |
-
return [reg.get_id(f"$x[{pos}]"), reg.register(f"
|
| 2599 |
if '.out' in gate:
|
| 2600 |
m = re.search(r'out(\d+)', gate)
|
| 2601 |
if m:
|
| 2602 |
out_bit = int(m.group(1))
|
| 2603 |
inputs = []
|
| 2604 |
-
for pos in range(
|
| 2605 |
if (pos >> out_bit) & 1:
|
| 2606 |
-
inputs.append(reg.register(f"
|
| 2607 |
return inputs
|
| 2608 |
if '.valid' in gate:
|
| 2609 |
-
return [reg.get_id(f"$x[{i}]") for i in range(
|
| 2610 |
-
return [reg.get_id(f"$x[{i}]") for i in range(
|
| 2611 |
return []
|
| 2612 |
|
| 2613 |
|
|
@@ -2706,7 +2726,7 @@ def infer_inputs_for_gate(gate: str, reg: SignalRegistry, tensors: Dict[str, tor
|
|
| 2706 |
if gate.startswith('error_detection.'):
|
| 2707 |
return infer_error_detection_inputs(gate, reg)
|
| 2708 |
if gate.startswith('combinational.'):
|
| 2709 |
-
return infer_combinational_inputs(gate, reg)
|
| 2710 |
weight_key = f"{gate}.weight"
|
| 2711 |
if weight_key in tensors:
|
| 2712 |
w = tensors[weight_key]
|
|
|
|
| 2505 |
return [reg.get_id(f"$x[{i}]") for i in range(8)]
|
| 2506 |
|
| 2507 |
|
| 2508 |
+
def infer_combinational_inputs(gate: str, reg: SignalRegistry, tensors: Dict[str, torch.Tensor] = None) -> List[int]:
|
| 2509 |
if 'decoder3to8' in gate:
|
| 2510 |
for i in range(3):
|
| 2511 |
reg.register(f"$sel[{i}]")
|
|
|
|
| 2550 |
return [reg.register(f"combinational.regmux4to1.bit{bit}.and{i}") for i in range(4)]
|
| 2551 |
return []
|
| 2552 |
if 'barrelshifter' in gate:
|
| 2553 |
+
import math as _math
|
| 2554 |
+
bs_match = re.search(r'barrelshifter(\d*)', gate)
|
| 2555 |
+
bits = int(bs_match.group(1)) if bs_match and bs_match.group(1) else 8
|
| 2556 |
+
bs_prefix = f"combinational.barrelshifter{bs_match.group(1) if bs_match else ''}"
|
| 2557 |
+
num_layers = max(1, _math.ceil(_math.log2(bits))) if bits > 1 else 1
|
| 2558 |
+
for i in range(bits):
|
| 2559 |
reg.register(f"$x[{i}]")
|
| 2560 |
+
for i in range(num_layers):
|
| 2561 |
reg.register(f"$shift[{i}]")
|
| 2562 |
m = re.search(r'layer(\d+)\.bit(\d+)', gate)
|
| 2563 |
if m:
|
| 2564 |
layer, bit = int(m.group(1)), int(m.group(2))
|
| 2565 |
+
shift_amount = 1 << (num_layers - 1 - layer)
|
| 2566 |
+
prefix = f"{bs_prefix}.layer{layer}.bit{bit}"
|
| 2567 |
+
sel_idx = num_layers - 1 - layer
|
| 2568 |
if '.not_sel' in gate:
|
| 2569 |
+
return [reg.get_id(f"$shift[{sel_idx}]")]
|
| 2570 |
if '.and_a' in gate:
|
| 2571 |
if layer == 0:
|
| 2572 |
return [reg.get_id(f"$x[{bit}]"), reg.register(f"{prefix}.not_sel")]
|
| 2573 |
else:
|
| 2574 |
+
prev_prefix = f"{bs_prefix}.layer{layer-1}.bit{bit}"
|
| 2575 |
return [reg.register(f"{prev_prefix}.or"), reg.register(f"{prefix}.not_sel")]
|
| 2576 |
if '.and_b' in gate:
|
| 2577 |
+
src = (bit + shift_amount) % bits
|
| 2578 |
if layer == 0:
|
| 2579 |
+
return [reg.get_id(f"$x[{src}]"), reg.get_id(f"$shift[{sel_idx}]")]
|
| 2580 |
else:
|
| 2581 |
+
prev_prefix = f"{bs_prefix}.layer{layer-1}.bit{src}"
|
| 2582 |
+
return [reg.register(f"{prev_prefix}.or"), reg.get_id(f"$shift[{sel_idx}]")]
|
| 2583 |
if '.or' in gate:
|
| 2584 |
return [reg.register(f"{prefix}.and_a"), reg.register(f"{prefix}.and_b")]
|
| 2585 |
+
return [reg.get_id(f"$x[{i}]") for i in range(bits)]
|
| 2586 |
if 'priorityencoder' in gate:
|
| 2587 |
+
pe_match = re.search(r'priorityencoder(\d*)', gate)
|
| 2588 |
+
bits = int(pe_match.group(1)) if pe_match and pe_match.group(1) else 8
|
| 2589 |
+
pe_prefix = f"combinational.priorityencoder{pe_match.group(1) if pe_match else ''}"
|
| 2590 |
+
for i in range(bits):
|
| 2591 |
reg.register(f"$x[{i}]")
|
| 2592 |
+
# Legacy 8-bit naming: any_ge{pos} = OR of bits at positions [pos..bits-1]
|
| 2593 |
if '.any_ge' in gate:
|
| 2594 |
m = re.search(r'any_ge(\d+)', gate)
|
| 2595 |
if m:
|
| 2596 |
pos = int(m.group(1))
|
| 2597 |
+
return [reg.get_id(f"$x[{i}]") for i in range(pos, bits)]
|
| 2598 |
+
# N-bit naming: any_higher{pos} = OR of bits 0..pos-1
|
| 2599 |
+
if '.any_higher' in gate:
|
| 2600 |
+
m = re.search(r'any_higher(\d+)', gate)
|
| 2601 |
+
if m:
|
| 2602 |
+
pos = int(m.group(1))
|
| 2603 |
+
return [reg.get_id(f"$x[{i}]") for i in range(pos)]
|
| 2604 |
if '.is_highest' in gate:
|
| 2605 |
m = re.search(r'is_highest(\d+)', gate)
|
| 2606 |
if m:
|
|
|
|
| 2609 |
if pos == 0:
|
| 2610 |
return [reg.get_id("#0")]
|
| 2611 |
else:
|
| 2612 |
+
# Try N-bit any_higher first, fall back to legacy any_ge
|
| 2613 |
+
ah_key = f"{pe_prefix}.any_higher{pos}"
|
| 2614 |
+
if tensors is not None and f"{ah_key}.weight" in tensors:
|
| 2615 |
+
return [reg.register(ah_key)]
|
| 2616 |
+
return [reg.register(f"{pe_prefix}.any_ge{pos-1}")]
|
| 2617 |
if '.and' in gate:
|
| 2618 |
+
return [reg.get_id(f"$x[{pos}]"), reg.register(f"{pe_prefix}.is_highest{pos}.not_higher")]
|
| 2619 |
if '.out' in gate:
|
| 2620 |
m = re.search(r'out(\d+)', gate)
|
| 2621 |
if m:
|
| 2622 |
out_bit = int(m.group(1))
|
| 2623 |
inputs = []
|
| 2624 |
+
for pos in range(bits):
|
| 2625 |
if (pos >> out_bit) & 1:
|
| 2626 |
+
inputs.append(reg.register(f"{pe_prefix}.is_highest{pos}.and"))
|
| 2627 |
return inputs
|
| 2628 |
if '.valid' in gate:
|
| 2629 |
+
return [reg.get_id(f"$x[{i}]") for i in range(bits)]
|
| 2630 |
+
return [reg.get_id(f"$x[{i}]") for i in range(bits)]
|
| 2631 |
return []
|
| 2632 |
|
| 2633 |
|
|
|
|
| 2726 |
if gate.startswith('error_detection.'):
|
| 2727 |
return infer_error_detection_inputs(gate, reg)
|
| 2728 |
if gate.startswith('combinational.'):
|
| 2729 |
+
return infer_combinational_inputs(gate, reg, tensors)
|
| 2730 |
weight_key = f"{gate}.weight"
|
| 2731 |
if weight_key in tensors:
|
| 2732 |
w = tensors[weight_key]
|
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Build and verify every named (bits, memory_profile) variant.
|
| 3 |
+
|
| 4 |
+
Outputs:
|
| 5 |
+
variants/neural_alu{8,16,32}.safetensors - no memory
|
| 6 |
+
variants/neural_computer{8,16,32}_registers.safetensors - 16 B
|
| 7 |
+
variants/neural_computer{8,16,32}_scratchpad.safetensors - 256 B
|
| 8 |
+
variants/neural_computer{8,16,32}_small.safetensors - 1 KB
|
| 9 |
+
variants/neural_computer{8,16,32}_reduced.safetensors - 4 KB
|
| 10 |
+
variants/neural_computer{8,16,32}.safetensors - 64 KB
|
| 11 |
+
|
| 12 |
+
For each, runs eval.py via the BatchedFitnessEvaluator and records
|
| 13 |
+
(tensor count, params, file size, fitness, total_tests, seconds).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
import os
|
| 18 |
+
import shutil
|
| 19 |
+
import subprocess
|
| 20 |
+
import sys
|
| 21 |
+
import time
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import torch
|
| 25 |
+
from safetensors import safe_open
|
| 26 |
+
|
| 27 |
+
ROOT = Path(__file__).resolve().parent
|
| 28 |
+
SEED = ROOT / "neural_computer.safetensors"
|
| 29 |
+
OUT_DIR = ROOT / "variants"
|
| 30 |
+
OUT_DIR.mkdir(exist_ok=True)
|
| 31 |
+
|
| 32 |
+
PROFILES = ["none", "registers", "scratchpad", "small", "reduced", "full"]
|
| 33 |
+
BITS = [8, 16, 32]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def variant_filename(bits: int, profile: str) -> str:
    """Return the safetensors file name for a (bits, profile) variant.

    "none" maps to a pure-ALU name, "full" to the unsuffixed computer name,
    and every other profile to a profile-suffixed computer name.
    """
    special = {
        "none": f"neural_alu{bits}.safetensors",
        "full": f"neural_computer{bits}.safetensors",
    }
    return special.get(profile, f"neural_computer{bits}_{profile}.safetensors")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run(cmd: list[str], timeout: int = 600) -> tuple[int, str]:
    """Execute *cmd*, returning (exit code, combined stdout+stderr text)."""
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    combined = (proc.stdout or "") + (proc.stderr or "")
    return proc.returncode, combined
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def build_variant(bits: int, profile: str) -> Path:
    """Copy the seed model into variants/ and run build.py on it in place.

    Returns the path to the built variant. Raises RuntimeError (carrying the
    log tail) when build.py exits non-zero.
    """
    dest = OUT_DIR / variant_filename(bits, profile)
    # Start from the seed model; build.py rewrites it via --apply.
    shutil.copy2(SEED, dest)
    build_cmd = [
        sys.executable, str(ROOT / "build.py"),
        "--bits", str(bits),
        "-m", profile,
        "--apply",
        "--model", str(dest),
        "all",
    ]
    rc, log = run(build_cmd, timeout=900)
    if rc != 0:
        raise RuntimeError(f"build failed for bits={bits} profile={profile}:\n{log[-1500:]}")
    return dest
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def measure_variant(path: Path) -> dict:
    """Read tensor count, params, and scalar manifest values from a variant."""
    with safe_open(str(path), framework="pt") as handle:
        keys = list(handle.keys())
        params = 0
        manifest = {}
        for key in keys:
            tensor = handle.get_tensor(key)
            params += tensor.numel()
            # Manifest entries are single-element tensors named manifest.<field>.
            if key.startswith("manifest.") and tensor.numel() == 1:
                manifest[key.split(".", 1)[1]] = tensor.item()
    return {
        "tensors": len(keys),
        "params": params,
        "size_mb": path.stat().st_size / (1024 * 1024),
        "manifest": manifest,
    }
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def eval_variant(path: Path, device: str = "cpu", timeout: int = 600) -> dict:
    """Run eval.py against a variant and parse its summary output.

    Returns rc, fitness, total_tests, status, elapsed_s, plus the last 15
    log lines for post-mortem when a run fails.
    """
    eval_cmd = [
        sys.executable, str(ROOT / "eval.py"),
        "--model", str(path),
        "--device", device,
        "--quiet",
    ]
    started = time.time()
    rc, log = run(eval_cmd, timeout=timeout)
    elapsed = time.time() - started

    fitness = None
    total_tests = None
    status = "ERROR"  # overwritten only if a STATUS: line is found
    for raw in log.splitlines():
        stripped = raw.strip()
        if stripped.startswith("Fitness:"):
            try:
                fitness = float(stripped.split()[1])
            except Exception:
                pass
        elif stripped.startswith("Total tests:"):
            try:
                total_tests = int(stripped.split()[2])
            except Exception:
                pass
        elif stripped.startswith("STATUS:"):
            status = stripped.split()[1]
    return {
        "rc": rc,
        "fitness": fitness,
        "total_tests": total_tests,
        "status": status,
        "elapsed_s": elapsed,
        "log_tail": "\n".join(log.splitlines()[-15:]),
    }
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main() -> None:
    """Build every (bits, profile) variant, evaluate each, print a summary.

    Iterates BITS x PROFILES, building and evaluating each combination;
    failures are captured per-row so one bad variant never aborts the sweep.
    """
    rows = []
    # Derive the count from the config lists instead of hard-coding 18,
    # so adding a width or profile keeps the banner accurate.
    total = len(BITS) * len(PROFILES)
    print(f"Building {total} variants into {OUT_DIR}\n")
    for bits in BITS:
        for profile in PROFILES:
            label = f"bits={bits} profile={profile}"
            print(f"=== {label} ===", flush=True)
            t0 = time.time()
            try:
                path = build_variant(bits, profile)
                bt = time.time() - t0
                meta = measure_variant(path)
                ev = eval_variant(path, device="cpu", timeout=900)
                rows.append({
                    "bits": bits, "profile": profile,
                    "filename": path.name,
                    "build_s": bt,
                    **meta,
                    **{k: ev[k] for k in ("fitness", "total_tests", "status", "elapsed_s")},
                    # Keep the log tail only on failure to keep rows small.
                    "log_tail": ev["log_tail"] if ev["status"] != "PASS" else "",
                })
                print(f"  built in {bt:.1f}s size={meta['size_mb']:.1f}MB"
                      f" params={meta['params']:,} tensors={meta['tensors']:,}")
                print(f"  eval: fitness={ev['fitness']} tests={ev['total_tests']}"
                      f" status={ev['status']} ({ev['elapsed_s']:.1f}s)")
                if ev["status"] != "PASS":
                    print("  --- failure tail ---")
                    print("  " + "\n  ".join(ev["log_tail"].splitlines()))
                    print("  --------------------")
            except Exception as e:
                # Record the failure and continue with the remaining variants.
                print(f"  EXCEPTION: {e}")
                rows.append({"bits": bits, "profile": profile, "error": str(e)})
            print()

    print("=" * 88)
    print(" SUMMARY")
    print("=" * 88)
    header = f"{'bits':>4} {'profile':<11} {'size_MB':>8} {'tensors':>8} {'params':>11} {'fitness':>9} {'tests':>6} {'status':>7}"
    print(header)
    print("-" * len(header))
    for r in rows:
        if "error" in r:
            print(f"{r['bits']:>4} {r['profile']:<11} ERROR: {r['error'][:60]}")
            continue
        fit = f"{r['fitness']:.4f}" if r['fitness'] is not None else "n/a"
        tests = r['total_tests'] if r['total_tests'] is not None else "?"
        print(f"{r['bits']:>4} {r['profile']:<11} {r['size_mb']:>8.1f} "
              f"{r['tensors']:>8,} {r['params']:>11,} "
              f"{fit:>9} {tests:>6} {r['status']:>7}")

    fail = [r for r in rows if r.get("status") != "PASS" or "error" in r]
    print()
    if fail:
        print(f"FAILURES: {len(fail)}/{len(rows)}")
    else:
        print(f"ALL {len(rows)} VARIANTS PASS")
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
main()
|
|
@@ -0,0 +1,613 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified evaluation harness for any threshold-computer variant.
|
| 3 |
+
|
| 4 |
+
Drops the `--cpu-test` smoke test (which was hardcoded to 16-bit/64KB) and
|
| 5 |
+
adds variant-aware sweep modes. The same harness handles every (data_bits,
|
| 6 |
+
addr_bits) configuration: it reads the manifest from each safetensors file,
|
| 7 |
+
runs the BatchedFitnessEvaluator at the right device, and reports per-file
|
| 8 |
+
plus per-category results.
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python eval_all.py path/to/file.safetensors # one file
|
| 12 |
+
python eval_all.py variants/ # every .safetensors in dir
|
| 13 |
+
python eval_all.py --device cpu variants/ # CPU only (default)
|
| 14 |
+
python eval_all.py --pop_size 32 variants/ # batched pop eval
|
| 15 |
+
python eval_all.py --debug path/to/file.safetensors # per-circuit detail
|
| 16 |
+
python eval_all.py --cpu-program PATH # also run an assembled program
|
| 17 |
+
# through the threshold CPU
|
| 18 |
+
# sized to the file's manifest
|
| 19 |
+
|
| 20 |
+
Exit code:
|
| 21 |
+
0 if all files PASS (fitness >= 0.9999)
|
| 22 |
+
N where N is the number of FAILing files
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import argparse
|
| 28 |
+
import json
|
| 29 |
+
import os
|
| 30 |
+
import sys
|
| 31 |
+
import time
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from typing import Dict, List, Optional, Tuple
|
| 34 |
+
|
| 35 |
+
import torch
|
| 36 |
+
from safetensors import safe_open
|
| 37 |
+
|
| 38 |
+
# Reuse eval.py's evaluator (variant-aware)
|
| 39 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 40 |
+
from eval import (
|
| 41 |
+
BatchedFitnessEvaluator,
|
| 42 |
+
create_population,
|
| 43 |
+
load_model,
|
| 44 |
+
get_manifest,
|
| 45 |
+
heaviside,
|
| 46 |
+
int_to_bits,
|
| 47 |
+
bits_to_int,
|
| 48 |
+
bits_msb_to_lsb,
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
# Variant-aware threshold ALU + CPU
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
|
| 56 |
+
class GenericThresholdALU:
|
| 57 |
+
"""Variant-aware threshold ALU. Reads manifest, runs ADD/SUB/CMP/MUL etc.
|
| 58 |
+
|
| 59 |
+
Currently supports the 8-bit ALU primitives (ripplecarry8bit, sub8bit,
|
| 60 |
+
cmp8bit, mul/div). For wider data paths, use the BatchedFitnessEvaluator
|
| 61 |
+
which already handles 16/32-bit comparators, subtractors, etc.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
def __init__(self, tensors: Dict[str, torch.Tensor], data_bits: int):
|
| 65 |
+
self.T = tensors
|
| 66 |
+
self.data_bits = data_bits
|
| 67 |
+
|
| 68 |
+
def _g(self, name, inputs):
|
| 69 |
+
w = self.T[name + ".weight"].view(-1)
|
| 70 |
+
b = self.T[name + ".bias"].view(-1)
|
| 71 |
+
return int(heaviside((torch.tensor(inputs, dtype=torch.float32) * w).sum() + b).item())
|
| 72 |
+
|
| 73 |
+
def _xor_or_nand(self, prefix, inputs):
|
| 74 |
+
a, b_ = inputs
|
| 75 |
+
h_or = self._g(f"{prefix}.layer1.or", [a, b_])
|
| 76 |
+
h_nand = self._g(f"{prefix}.layer1.nand", [a, b_])
|
| 77 |
+
return self._g(f"{prefix}.layer2", [h_or, h_nand])
|
| 78 |
+
|
| 79 |
+
def _fa(self, prefix, a, b, cin):
|
| 80 |
+
s1 = self._xor_or_nand(f"{prefix}.ha1.sum", [a, b])
|
| 81 |
+
c1 = self._g(f"{prefix}.ha1.carry", [a, b])
|
| 82 |
+
s2 = self._xor_or_nand(f"{prefix}.ha2.sum", [s1, cin])
|
| 83 |
+
c2 = self._g(f"{prefix}.ha2.carry", [s1, cin])
|
| 84 |
+
cout = self._g(f"{prefix}.carry_or", [c1, c2])
|
| 85 |
+
return s2, cout
|
| 86 |
+
|
| 87 |
+
def add8(self, a, b):
|
| 88 |
+
a_lsb = list(reversed(int_to_bits(a, 8)))
|
| 89 |
+
b_lsb = list(reversed(int_to_bits(b, 8)))
|
| 90 |
+
carry = 0
|
| 91 |
+
s_lsb = []
|
| 92 |
+
for i in range(8):
|
| 93 |
+
s, carry = self._fa(f"arithmetic.ripplecarry8bit.fa{i}", a_lsb[i], b_lsb[i], carry)
|
| 94 |
+
s_lsb.append(s)
|
| 95 |
+
return bits_to_int(list(reversed(s_lsb))), carry
|
| 96 |
+
|
| 97 |
+
def sub8(self, a, b):
|
| 98 |
+
a_lsb = list(reversed(int_to_bits(a, 8)))
|
| 99 |
+
b_lsb = list(reversed(int_to_bits(b, 8)))
|
| 100 |
+
carry = 1
|
| 101 |
+
d_lsb = []
|
| 102 |
+
for i in range(8):
|
| 103 |
+
notb = self._g(f"arithmetic.sub8bit.notb{i}", [b_lsb[i]])
|
| 104 |
+
x1 = self._xor_or_nand(f"arithmetic.sub8bit.fa{i}.xor1", [a_lsb[i], notb])
|
| 105 |
+
x2 = self._xor_or_nand(f"arithmetic.sub8bit.fa{i}.xor2", [x1, carry])
|
| 106 |
+
and1 = self._g(f"arithmetic.sub8bit.fa{i}.and1", [a_lsb[i], notb])
|
| 107 |
+
and2 = self._g(f"arithmetic.sub8bit.fa{i}.and2", [x1, carry])
|
| 108 |
+
carry = self._g(f"arithmetic.sub8bit.fa{i}.or_carry", [and1, and2])
|
| 109 |
+
d_lsb.append(x2)
|
| 110 |
+
return bits_to_int(list(reversed(d_lsb))), carry
|
| 111 |
+
|
| 112 |
+
def cmp8(self, a, b, kind):
|
| 113 |
+
inp = int_to_bits(a, 8) + int_to_bits(b, 8)
|
| 114 |
+
if kind == "eq":
|
| 115 |
+
h_geq = self._g("arithmetic.equality8bit.layer1.geq", inp)
|
| 116 |
+
h_leq = self._g("arithmetic.equality8bit.layer1.leq", inp)
|
| 117 |
+
return self._g("arithmetic.equality8bit.layer2", [h_geq, h_leq])
|
| 118 |
+
return self._g(f"arithmetic.{kind}8bit", inp)
|
| 119 |
+
|
| 120 |
+
def mul8(self, a, b):
|
| 121 |
+
ab = int_to_bits(a, 8)
|
| 122 |
+
bb = int_to_bits(b, 8)
|
| 123 |
+
result = 0
|
| 124 |
+
for j in range(8):
|
| 125 |
+
if bb[j] == 0:
|
| 126 |
+
continue
|
| 127 |
+
row = 0
|
| 128 |
+
for i in range(8):
|
| 129 |
+
pp = self._g(f"alu.alu8bit.mul.pp.a{i}b{j}", [ab[i], bb[j]])
|
| 130 |
+
row |= (pp << (7 - i))
|
| 131 |
+
shift = 7 - j
|
| 132 |
+
result, _ = self.add8(result & 0xFF, (row << shift) & 0xFF)
|
| 133 |
+
return result & 0xFF
|
| 134 |
+
|
| 135 |
+
# ----- N-bit primitives (for 16-bit and 32-bit variants) ----------------
|
| 136 |
+
|
| 137 |
+
def add_n(self, a: int, b: int, bits: int):
|
| 138 |
+
"""Width-generic ripple-carry add via arithmetic.ripplecarry{N}bit."""
|
| 139 |
+
prefix = f"arithmetic.ripplecarry{bits}bit"
|
| 140 |
+
a_lsb = list(reversed(int_to_bits(a, bits)))
|
| 141 |
+
b_lsb = list(reversed(int_to_bits(b, bits)))
|
| 142 |
+
carry = 0
|
| 143 |
+
s_lsb = []
|
| 144 |
+
for i in range(bits):
|
| 145 |
+
s, carry = self._fa(f"{prefix}.fa{i}", a_lsb[i], b_lsb[i], carry)
|
| 146 |
+
s_lsb.append(s)
|
| 147 |
+
return bits_to_int(list(reversed(s_lsb))), carry
|
| 148 |
+
|
| 149 |
+
def sub_n(self, a: int, b: int, bits: int):
|
| 150 |
+
"""N-bit two's-complement subtract via arithmetic.sub{N}bit (N >= 16).
|
| 151 |
+
|
| 152 |
+
Structure (per build.add_sub_nbits): N NOT gates + N standard full adders.
|
| 153 |
+
"""
|
| 154 |
+
prefix = f"arithmetic.sub{bits}bit"
|
| 155 |
+
a_lsb = list(reversed(int_to_bits(a, bits)))
|
| 156 |
+
b_lsb = list(reversed(int_to_bits(b, bits)))
|
| 157 |
+
# NOT each B bit
|
| 158 |
+
notb = [self._g(f"{prefix}.not_b.bit{i}", [b_lsb[i]]) for i in range(bits)]
|
| 159 |
+
carry = 1 # carry-in = 1 for two's-complement
|
| 160 |
+
d_lsb = []
|
| 161 |
+
for i in range(bits):
|
| 162 |
+
s, carry = self._fa(f"{prefix}.fa{i}", a_lsb[i], notb[i], carry)
|
| 163 |
+
d_lsb.append(s)
|
| 164 |
+
return bits_to_int(list(reversed(d_lsb))), carry
|
| 165 |
+
|
| 166 |
+
def cmp_n(self, a: int, b: int, kind: str, bits: int):
|
| 167 |
+
"""N-bit comparator. For bits <= 16 single-layer; bits == 32 cascaded."""
|
| 168 |
+
a_bits = int_to_bits(a, bits)
|
| 169 |
+
b_bits = int_to_bits(b, bits)
|
| 170 |
+
if bits <= 16:
|
| 171 |
+
inp = a_bits + b_bits
|
| 172 |
+
if kind == "eq":
|
| 173 |
+
h_geq = self._g(f"arithmetic.equality{bits}bit.layer1.geq", inp)
|
| 174 |
+
h_leq = self._g(f"arithmetic.equality{bits}bit.layer1.leq", inp)
|
| 175 |
+
return self._g(f"arithmetic.equality{bits}bit.layer2", [h_geq, h_leq])
|
| 176 |
+
return self._g(f"arithmetic.{kind}{bits}bit", inp)
|
| 177 |
+
# 32-bit: cascaded byte-wise
|
| 178 |
+
prefix = f"arithmetic.cmp{bits}bit"
|
| 179 |
+
num_bytes = bits // 8
|
| 180 |
+
# per-byte gt/lt/eq
|
| 181 |
+
byte_gt, byte_lt, byte_eq = [], [], []
|
| 182 |
+
for bn in range(num_bytes):
|
| 183 |
+
ab = a_bits[bn*8:(bn+1)*8]
|
| 184 |
+
bb = b_bits[bn*8:(bn+1)*8]
|
| 185 |
+
byte_gt.append(self._g(f"{prefix}.byte{bn}.gt", ab + bb))
|
| 186 |
+
byte_lt.append(self._g(f"{prefix}.byte{bn}.lt", ab + bb))
|
| 187 |
+
geq = self._g(f"{prefix}.byte{bn}.eq.geq", ab + bb)
|
| 188 |
+
leq = self._g(f"{prefix}.byte{bn}.eq.leq", ab + bb)
|
| 189 |
+
byte_eq.append(self._g(f"{prefix}.byte{bn}.eq.and", [geq, leq]))
|
| 190 |
+
if kind == "equality":
|
| 191 |
+
# OR of all eq's, but the gate is `arithmetic.equality{bits}bit` with weight=[1,1,..,1]/bias=-num_bytes
|
| 192 |
+
return self._g(f"arithmetic.equality{bits}bit", byte_eq)
|
| 193 |
+
# cascade
|
| 194 |
+
cascade_gt = [byte_gt[0]]
|
| 195 |
+
cascade_lt = [byte_lt[0]]
|
| 196 |
+
for bn in range(1, num_bytes):
|
| 197 |
+
all_eq = self._g(f"{prefix}.cascade.gt.stage{bn}.all_eq", byte_eq[:bn])
|
| 198 |
+
cascade_gt.append(self._g(f"{prefix}.cascade.gt.stage{bn}.and", [all_eq, byte_gt[bn]]))
|
| 199 |
+
all_eq2 = self._g(f"{prefix}.cascade.lt.stage{bn}.all_eq", byte_eq[:bn])
|
| 200 |
+
cascade_lt.append(self._g(f"{prefix}.cascade.lt.stage{bn}.and", [all_eq2, byte_lt[bn]]))
|
| 201 |
+
if kind == "greaterthan":
|
| 202 |
+
return self._g(f"arithmetic.greaterthan{bits}bit", cascade_gt)
|
| 203 |
+
if kind == "lessthan":
|
| 204 |
+
return self._g(f"arithmetic.lessthan{bits}bit", cascade_lt)
|
| 205 |
+
raise ValueError(f"unsupported cmp kind {kind} for bits={bits}")
|
| 206 |
+
|
| 207 |
+
def mul_n(self, a: int, b: int, bits: int):
    """N-bit shift-add multiply (low N bits only)."""
    mask = (1 << bits) - 1
    a_bits = int_to_bits(a, bits)
    b_bits = int_to_bits(b, bits)
    acc = 0
    for j, bj in enumerate(b_bits):
        # Skip zero multiplier bits: their partial-product row is all zero.
        if bj == 0:
            continue
        row = 0
        for i, ai in enumerate(a_bits):
            # Partial product bit through the threshold AND gate.
            pp = self._g(f"alu.alu{bits}bit.mul.pp.a{i}b{j}", [ai, bj])
            row |= pp << (bits - 1 - i)
        # Bits are MSB-first: index j carries weight 2**(bits-1-j).
        acc, _ = self.add_n(acc & mask, (row << (bits - 1 - j)) & mask, bits)
    return acc & mask
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
class GenericThresholdCPU:
    """Variant-aware CPU runtime. Sized from the variant's manifest.

    Instruction word is 16 bits, big-endian in memory:
    [15:12] opcode, [11:10] rd, [9:8] rs, [7:0] imm.
    Opcodes 0xA-0xE are followed by a 16-bit big-endian address word.
    Opcode map (see step()): 0x0 ADD, 0x1 SUB, 0x7 MUL, 0x9 CMP (flags only),
    0xA LOAD, 0xB STORE, 0xC JMP, 0xD conditional branch, 0xF HALT.
    """

    def __init__(self, tensors: Dict[str, torch.Tensor]):
        # Keep the raw tensor dict; all memory ops index into it by gate name.
        self.T = tensors
        m = get_manifest(tensors)
        self.data_bits = m["data_bits"]
        self.addr_bits = m["addr_bits"]
        self.mem_bytes = m["memory_bytes"]
        # 8-bit CPU primitives (ripplecarry8bit, sub8bit, alu.alu8bit.*, memory.*,
        # control.*) are present in every variant regardless of manifest data_bits.
        # Wider data widths simply add additional standalone ALU primitives.
        if self.mem_bytes == 0:
            raise NotImplementedError(
                "Pure-ALU variants have no memory; cannot run CPU programs"
            )
        self.alu = GenericThresholdALU(tensors, 8)

    def _addr_decode(self, addr):
        # One-hot select vector (one line per memory byte) from the threshold
        # decode layer; weight is broadcast against the MSB-first address bits.
        bits = torch.tensor(int_to_bits(addr, self.addr_bits), dtype=torch.float32)
        w = self.T["memory.addr_decode.weight"]
        b = self.T["memory.addr_decode.bias"]
        return heaviside((w * bits).sum(dim=1) + b)

    def mem_read(self, mem, addr):
        """Read one byte at `addr` through the threshold read mux (AND-OR per bit)."""
        sel = self._addr_decode(addr)
        mem_bits = torch.tensor(
            [int_to_bits(byte, 8) for byte in mem], dtype=torch.float32
        )
        and_w = self.T["memory.read.and.weight"]
        and_b = self.T["memory.read.and.bias"]
        or_w = self.T["memory.read.or.weight"]
        or_b = self.T["memory.read.or.bias"]
        out = []
        for bit in range(8):
            # AND each byte's bit with its select line, then OR across all bytes.
            inp = torch.stack([mem_bits[:, bit], sel], dim=1)
            and_out = heaviside((inp * and_w[bit]).sum(dim=1) + and_b[bit])
            out.append(int(heaviside((and_out * or_w[bit]).sum() + or_b[bit]).item()))
        return bits_to_int(out)

    def mem_write(self, mem, addr, value):
        """Return a new memory list with `value` stored at `addr` (input not mutated)."""
        sel = self._addr_decode(addr)
        data_bits = torch.tensor(int_to_bits(value, 8), dtype=torch.float32)
        mem_bits = torch.tensor(
            [int_to_bits(byte, 8) for byte in mem], dtype=torch.float32
        )
        sel_w = self.T["memory.write.sel.weight"]
        sel_b = self.T["memory.write.sel.bias"]
        nsel_w = self.T["memory.write.nsel.weight"].squeeze(1)
        nsel_b = self.T["memory.write.nsel.bias"]
        and_old_w = self.T["memory.write.and_old.weight"]
        and_old_b = self.T["memory.write.and_old.bias"]
        and_new_w = self.T["memory.write.and_new.weight"]
        and_new_b = self.T["memory.write.and_new.bias"]
        or_w = self.T["memory.write.or.weight"]
        or_b = self.T["memory.write.or.bias"]
        we = torch.ones_like(sel)  # write-enable: always asserted by this runtime
        sel_inp = torch.stack([sel, we], dim=1)
        write_sel = heaviside((sel_inp * sel_w).sum(dim=1) + sel_b)
        nsel = heaviside(write_sel * nsel_w + nsel_b)  # inverted select line
        for bit in range(8):
            # new = (old AND not-selected) OR (data AND selected), per byte.
            old = mem_bits[:, bit]
            data_bit = data_bits[bit].expand(self.mem_bytes)
            inp_old = torch.stack([old, nsel], dim=1)
            inp_new = torch.stack([data_bit, write_sel], dim=1)
            and_old = heaviside((inp_old * and_old_w[:, bit]).sum(dim=1) + and_old_b[:, bit])
            and_new = heaviside((inp_new * and_new_w[:, bit]).sum(dim=1) + and_new_b[:, bit])
            or_inp = torch.stack([and_old, and_new], dim=1)
            new_bit = heaviside((or_inp * or_w[:, bit]).sum(dim=1) + or_b[:, bit])
            mem_bits[:, bit] = new_bit
        return [bits_to_int([int(b) for b in mem_bits[i].tolist()]) for i in range(self.mem_bytes)]

    def step(self, state):
        """Execute one instruction and return a new state dict (input not mutated)."""
        if state["halted"]:
            return state
        # Shallow-copy the state plus the mutable lists we may rewrite.
        s = dict(state)
        s["mem"] = state["mem"][:]
        s["regs"] = state["regs"][:]
        s["flags"] = state["flags"][:]
        addr_mask = (1 << self.addr_bits) - 1
        pc = s["pc"]
        # Fetch the 16-bit big-endian instruction word through threshold reads.
        hi = self.mem_read(s["mem"], pc & addr_mask)
        lo = self.mem_read(s["mem"], (pc + 1) & addr_mask)
        ir = ((hi & 0xFF) << 8) | (lo & 0xFF)
        opcode = (ir >> 12) & 0xF
        rd = (ir >> 10) & 0x3
        rs = (ir >> 8) & 0x3
        imm = ir & 0xFF
        next_pc = (pc + 2) & addr_mask
        addr_full = None
        # Memory/control opcodes carry a trailing 16-bit address word.
        if opcode in (0xA, 0xB, 0xC, 0xD, 0xE):
            ah = self.mem_read(s["mem"], next_pc)
            al = self.mem_read(s["mem"], (next_pc + 1) & addr_mask)
            addr_full = ((ah & 0xFF) << 8) | (al & 0xFF)
            next_pc = (next_pc + 2) & addr_mask
        addr = (addr_full & addr_mask) if addr_full is not None else None
        a = s["regs"][rd]
        b = s["regs"][rs]
        result = a
        carry = 0
        write_result = True
        if opcode == 0x0:    # ADD rd, rs
            result, carry = self.alu.add8(a, b)
        elif opcode == 0x1:  # SUB rd, rs
            result, carry = self.alu.sub8(a, b)
        elif opcode == 0x7:  # MUL rd, rs (low 8 bits)
            result = self.alu.mul8(a, b)
        elif opcode == 0x9:  # CMP rd, rs -- updates flags only
            r2, carry = self.alu.sub8(a, b)
            z = 1 if r2 == 0 else 0
            n = 1 if (r2 & 0x80) else 0
            s["flags"] = [z, n, carry, 0]
            write_result = False
        elif opcode == 0xA:  # LOAD rd, [addr]
            result = self.mem_read(s["mem"], addr)
        elif opcode == 0xB:  # STORE [addr], rs
            s["mem"] = self.mem_write(s["mem"], addr, b & 0xFF)
            write_result = False
        elif opcode == 0xC:  # JMP addr
            s["pc"] = addr
            return s
        elif opcode == 0xD:  # Bcc addr -- condition selected by imm[2:0]
            cond = imm & 0x7
            z, n, c, v = s["flags"]
            # cond: 0=Z 1=!Z 2=C 3=!C 4=N 5=!N 6=V 7=!V
            take = [z == 1, z == 0, c == 1, c == 0,
                    n == 1, n == 0, v == 1, v == 0][cond]
            s["pc"] = addr if take else next_pc
            return s
        elif opcode == 0xF:  # HALT
            s["halted"] = True
            return s

        if write_result and opcode != 0x9:
            s["regs"][rd] = result & 0xFF
        if opcode in (0x0, 0x1, 0x7):
            # Arithmetic updates Z/N/C; the V flag is never set by this core.
            z = 1 if (result & 0xFF) == 0 else 0
            n = 1 if (result & 0x80) else 0
            s["flags"] = [z, n, carry, 0]
        s["pc"] = next_pc
        return s

    def run(self, state, max_cycles=200):
        """Step until halted or `max_cycles`; returns (final_state, cycles_executed)."""
        s = state
        cycles = 0
        while not s["halted"] and cycles < max_cycles:
            s = self.step(s)
            cycles += 1
        return s, cycles
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def _encode_instr(opcode, rd, rs, imm):
|
| 376 |
+
return ((opcode & 0xF) << 12) | ((rd & 0x3) << 10) | ((rs & 0x3) << 8) | (imm & 0xFF)
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def _w16(mem, addr, value):
|
| 380 |
+
mem[addr] = (value >> 8) & 0xFF
|
| 381 |
+
mem[addr + 1] = value & 0xFF
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
PROGRAM_MIN_BYTES = 0x84 # code 0x00..0x1F + data 0x80..0x83
|
| 385 |
+
|
| 386 |
+
|
| 387 |
+
def builtin_program(addr_bits: int) -> Tuple[List[int], int]:
    """Sum 5+4+3+2+1 via a loop. Returns (mem, expected_result_at_0x83).

    Compact layout: code at 0x00..0x1F (32 bytes), data at 0x80..0x83 (4 bytes).
    Total footprint 132 bytes -- fits within scratchpad (256 B) and larger.
    Requires addr_bits >= 8.
    """
    if (1 << addr_bits) < PROGRAM_MIN_BYTES:
        raise ValueError(f"addr_bits={addr_bits} too small for builtin program")
    mem = [0] * (1 << addr_bits)
    mem[0x80] = 5 # initial counter
    mem[0x81] = 1 # decrement
    mem[0x82] = 0 # zero (for compare and accumulator init)
    # mem[0x83] is the output
    # Register use: r0 = accumulator, r1 = counter, r2 = decrement, r3 = zero.
    _w16(mem, 0x0000, _encode_instr(0xA, 1, 0, 0)); _w16(mem, 0x0002, 0x0080)    # LOAD r1, [0x80]
    _w16(mem, 0x0004, _encode_instr(0xA, 2, 0, 0)); _w16(mem, 0x0006, 0x0081)    # LOAD r2, [0x81]
    _w16(mem, 0x0008, _encode_instr(0xA, 3, 0, 0)); _w16(mem, 0x000A, 0x0082)    # LOAD r3, [0x82]
    _w16(mem, 0x000C, _encode_instr(0xA, 0, 0, 0)); _w16(mem, 0x000E, 0x0082)    # LOAD r0, [0x82]
    _w16(mem, 0x0010, _encode_instr(0x0, 0, 1, 0))                               # loop: ADD r0, r1
    _w16(mem, 0x0012, _encode_instr(0x1, 1, 2, 0))                               # SUB r1, r2
    _w16(mem, 0x0014, _encode_instr(0x9, 1, 3, 0))                               # CMP r1, r3
    _w16(mem, 0x0016, _encode_instr(0xD, 0, 0, 0x01)); _w16(mem, 0x0018, 0x0010) # branch if !Z -> loop
    _w16(mem, 0x001A, _encode_instr(0xB, 0, 0, 0)); _w16(mem, 0x001C, 0x0083)    # STORE [0x83], r0
    _w16(mem, 0x001E, _encode_instr(0xF, 0, 0, 0))                               # HALT
    return mem, 15
|
| 412 |
+
|
| 413 |
+
|
| 414 |
+
# ---------------------------------------------------------------------------
|
| 415 |
+
# Eval driver
|
| 416 |
+
# ---------------------------------------------------------------------------
|
| 417 |
+
|
| 418 |
+
def list_safetensors(path: Path) -> List[Path]:
    """Return the .safetensors files for `path`.

    A file path yields [path]; a directory yields its sorted *.safetensors
    entries; anything else (missing path) yields [].
    """
    if path.is_file():
        return [path]
    if path.is_dir():
        matches = [p for p in path.glob("*.safetensors") if p.is_file()]
        return sorted(matches)
    return []
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
def evaluate_one(path: Path, device: str, pop_size: int, debug: bool, run_cpu_program: bool) -> Dict:
    """Evaluate one safetensors variant and return a result dict.

    The dict always carries "path", "filename" and (unless loading failed)
    "manifest"; "status" is "PASS", "FAIL" or "ERROR".  With
    run_cpu_program=True it additionally runs the builtin CPU program (when
    the variant has enough memory) and, for 16/32-bit variants, a chained
    N-bit ALU smoke test.
    """
    out: Dict = {"path": str(path), "filename": path.name}
    try:
        tensors = load_model(str(path))
    except Exception as e:
        out.update(error=f"load failed: {e}", status="ERROR")
        return out

    manifest = get_manifest(tensors)
    out.update(
        size_mb=path.stat().st_size / (1024 * 1024),
        tensors=len(tensors),
        params=sum(t.numel() for t in tensors.values()),
        manifest=manifest,
    )

    # Move to device
    tensors = {k: v.to(device) for k, v in tensors.items()}

    try:
        evaluator = BatchedFitnessEvaluator(device=device, model_path=str(path), tensors=tensors)
        population = create_population(tensors, pop_size=pop_size, device=device)
        t0 = time.perf_counter()
        fitness = evaluator.evaluate(population, debug=debug)
        elapsed = time.perf_counter() - t0
        # Single-member populations report member 0; otherwise the mean.
        f0 = float(fitness[0].item()) if pop_size == 1 else float(fitness.mean().item())
        out.update(
            fitness=f0,
            total_tests=evaluator.total_tests,
            elapsed_s=elapsed,
            categories={k: (float(v[0]), int(v[1])) for k, v in evaluator.category_scores.items()},
            status="PASS" if f0 >= 0.9999 else "FAIL",
        )
    except Exception as e:
        out.update(error=f"eval failed: {type(e).__name__}: {e}", status="ERROR")
        return out

    # Optional: CPU program test (8-bit CPU primitives are in every variant)
    if run_cpu_program:
        if manifest["memory_bytes"] >= PROGRAM_MIN_BYTES:
            try:
                # The CPU runtime runs on host tensors regardless of eval device.
                cpu_tensors = {k: v.cpu() for k, v in tensors.items()}
                cpu = GenericThresholdCPU(cpu_tensors)
                mem, expected = builtin_program(manifest["addr_bits"])
                state = {"pc": 0, "regs": [0] * 4, "flags": [0] * 4, "mem": mem, "halted": False}
                t0 = time.perf_counter()
                final, cycles = cpu.run(state, max_cycles=200)
                cpu_elapsed = time.perf_counter() - t0
                got = final["mem"][0x83]  # builtin program writes its result here
                out["cpu_program"] = {
                    "ok": got == expected,
                    "got": got,
                    "expected": expected,
                    "cycles": cycles,
                    "elapsed_s": cpu_elapsed,
                }
                if got != expected:
                    out["status"] = "FAIL"
            except Exception as e:
                out["cpu_program"] = {"error": str(e)}
        else:
            out["cpu_program"] = {"skipped": f"mem={manifest['memory_bytes']}B < {PROGRAM_MIN_BYTES}"}

    # Wider-ALU chain test for 16/32-bit variants
    bits = manifest["data_bits"]
    if bits in (16, 32):
        try:
            alu_tensors = {k: v.cpu() for k, v in tensors.items()}
            alu = GenericThresholdALU(alu_tensors, bits)
            t0 = time.perf_counter()
            if bits == 16:
                x, y = 1234, 5678
                z, _ = alu.add_n(x, y, 16); assert z == (x + y) & 0xFFFF
                w, _ = alu.sub_n(z, x, 16); assert w == (z - x) & 0xFFFF, (w, z - x)
                gt = alu.cmp_n(z, x, "greaterthan", 16); assert gt == 1
                lt = alu.cmp_n(x, z, "lessthan", 16); assert lt == 1
                # NOTE(review): 16-bit uses kind "eq" while 32-bit uses
                # "equality" -- confirm cmp_n accepts both spellings at 16 bits.
                eq = alu.cmp_n(w, y, "eq", 16); assert eq == 1
                p = alu.mul_n(123, 5, 16); assert p == (123 * 5) & 0xFFFF
            else:  # 32
                x, y = 1_000_000, 999_000
                z, _ = alu.sub_n(x, y, 32); assert z == 1_000
                s, _ = alu.add_n(z, x, 32); assert s == 1_001_000
                p = alu.mul_n(z, 100, 32); assert p == 100_000
                gt = alu.cmp_n(x, y, "greaterthan", 32); assert gt == 1
                lt = alu.cmp_n(y, x, "lessthan", 32); assert lt == 1
                eq = alu.cmp_n(p, 100_000, "equality", 32); assert eq == 1
            chain_dt = time.perf_counter() - t0
            out[f"alu_chain_{bits}"] = {"ok": True, "elapsed_s": chain_dt}
        except AssertionError as e:
            out[f"alu_chain_{bits}"] = {"ok": False, "error": f"chain mismatch: {e}"}
            out["status"] = "FAIL"
        except Exception as e:
            out[f"alu_chain_{bits}"] = {"ok": False, "error": f"{type(e).__name__}: {e}"}
            out["status"] = "FAIL"

    return out
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
def print_row(r: Dict, show_cpu: bool) -> None:
    """Print one formatted summary line for a single evaluation result dict."""
    if "error" in r:
        print(f" {r['filename']:<48} ERROR: {r['error'][:80]}")
        return
    m = r["manifest"]
    fit = "n/a" if r.get("fitness") is None else f"{r['fitness']:.4f}"
    cpu_col = ""
    if show_cpu and "cpu_program" in r:
        cp = r["cpu_program"]
        if cp.get("ok"):
            cpu_col = f" CPU OK ({cp['cycles']}cyc/{cp['elapsed_s']:.1f}s)"
        elif "skipped" in cp:
            cpu_col = " CPU SKIP"
        elif "error" in cp:
            cpu_col = " CPU ERR"
        else:
            cpu_col = f" CPU FAIL ({cp.get('got')}!={cp.get('expected')})"
    chain_col = ""
    if show_cpu:
        # At most one of the two widths is present for a given variant.
        for width in (16, 32):
            ch = r.get(f"alu_chain_{width}")
            if ch is None:
                continue
            if ch.get("ok"):
                chain_col = f" ALU{width} OK ({ch['elapsed_s']:.2f}s)"
            else:
                chain_col = f" ALU{width} FAIL"
    print(
        f" {r['filename']:<48} d={m['data_bits']:>2}b a={m['addr_bits']:>2}b "
        f"mem={m['memory_bytes']:>6}B size={r['size_mb']:>6.1f}MB "
        f"params={r['params']:>10,} fit={fit:>6} tests={r['total_tests']:>5} "
        f"{r['status']:>5}{cpu_col}{chain_col}"
    )
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def main() -> int:
    """CLI entry point for the variant-agnostic eval harness.

    Returns the number of failing variants (0 means all passed) or 2 when no
    .safetensors files were found, so the return value doubles as exit code.
    """
    parser = argparse.ArgumentParser(description="Variant-agnostic eval harness")
    parser.add_argument("path", help="Path to .safetensors file or directory of files")
    parser.add_argument("--device", default="cpu", help="cpu (default) or cuda")
    parser.add_argument("--pop_size", type=int, default=1)
    parser.add_argument("--debug", action="store_true", help="Per-circuit detail per file")
    # Fixed help text: the CPU program runs on any variant whose memory holds
    # the builtin program (evaluate_one checks memory_bytes >= PROGRAM_MIN_BYTES,
    # i.e. 132 B), not only "8-bit variants with >= 512 B memory" -- the 8-bit
    # CPU primitives exist in every variant.
    parser.add_argument("--cpu-program", action="store_true",
                        help="Also run a small assembled program through the threshold CPU "
                             f"(any variant with >= {PROGRAM_MIN_BYTES} B of memory)")
    parser.add_argument("--json", action="store_true", help="Emit JSON results to stdout instead of a table")
    args = parser.parse_args()

    files = list_safetensors(Path(args.path))
    if not files:
        print(f"No .safetensors files found under {args.path}", file=sys.stderr)
        return 2

    print(f"Evaluating {len(files)} file(s) on {args.device}\n")
    results = []
    fail_count = 0
    for f in files:
        print(f"=== {f.name}")
        r = evaluate_one(f, device=args.device, pop_size=args.pop_size,
                         debug=args.debug, run_cpu_program=args.cpu_program)
        results.append(r)
        print_row(r, show_cpu=args.cpu_program)
        if r.get("status") != "PASS":
            fail_count += 1

    if args.json:
        # Make it serialisable: manifest values may be floats holding integers.
        for r in results:
            r["manifest"] = {k: (int(v) if isinstance(v, float) and v.is_integer() else v)
                             for k, v in r.get("manifest", {}).items()}
        print(json.dumps(results, indent=2, default=str))
        return fail_count

    # Summary table
    print()
    print("=" * 100)
    print(" SUMMARY")
    print("=" * 100)
    for r in results:
        print_row(r, show_cpu=args.cpu_program)

    print()
    if fail_count == 0:
        print(f"ALL {len(files)} variants PASS")
    else:
        print(f"{fail_count}/{len(files)} variants FAIL")
    return fail_count


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,484 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hands-on playground for the 8bit-threshold-computer.
|
| 3 |
+
|
| 4 |
+
Loads the bundled safetensors model, reads its manifest, and exercises
|
| 5 |
+
threshold circuits at every level: raw Boolean gates, ALU arithmetic,
|
| 6 |
+
comparators, then a CPU runtime sized to the actual manifest that runs
|
| 7 |
+
a small assembled program end-to-end through the threshold weights.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import torch
|
| 14 |
+
from safetensors import safe_open
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 17 |
+
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
# Load model + manifest
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
MODEL_PATH = os.path.join(os.path.dirname(__file__), "neural_computer.safetensors")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def heaviside(x):
    """Threshold activation: 1.0 where x >= 0, else 0.0."""
    mask = x >= 0
    return mask.float()
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def load_tensors(path):
    """Load every tensor in a safetensors file, upcast to float32."""
    with safe_open(path, framework="pt") as f:
        return {name: f.get_tensor(name).float() for name in f.keys()}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Load the bundled weights once at import time and read the manifest scalars
# (stored as tensors inside the safetensors file) that size the demos below.
print("Loading", MODEL_PATH)
T = load_tensors(MODEL_PATH)

DATA_BITS = int(T["manifest.data_bits"].item())
ADDR_BITS = int(T["manifest.addr_bits"].item())
MEM_BYTES = int(T["manifest.memory_bytes"].item())
REGISTERS = int(T["manifest.registers"].item())
print(f"Manifest: data={DATA_BITS}-bit, addr={ADDR_BITS}-bit, mem={MEM_BYTES}B, regs={REGISTERS}")
print(f"Tensors: {len(T):,}")
print(f"Total params: {sum(t.numel() for t in T.values()):,}")
print()
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def gate(name, inputs):
    """Run one threshold gate identified by `name` (no .weight/.bias suffix)."""
    weight = T[name + ".weight"].view(-1)
    bias = T[name + ".bias"].view(-1)
    x = torch.tensor(inputs, dtype=torch.float32)
    activation = (x * weight).sum() + bias
    return int(heaviside(activation).item())
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def xor(prefix, inputs):
    """Run a 2-layer XOR-style gate (or/nand naming, e.g. ripple-carry adders)."""
    left, right = inputs
    hidden = [
        gate(f"{prefix}.layer1.or", [left, right]),
        gate(f"{prefix}.layer1.nand", [left, right]),
    ]
    return gate(f"{prefix}.layer2", hidden)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def xor_neuron(prefix, inputs):
    """Run a 2-layer XOR-style gate (neuron1/neuron2 naming, e.g. boolean.xor)."""
    left, right = inputs
    hidden = [
        gate(f"{prefix}.layer1.neuron1", [left, right]),
        gate(f"{prefix}.layer1.neuron2", [left, right]),
    ]
    return gate(f"{prefix}.layer2", hidden)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def int_to_bits_msb(v, n):
    """Return the n low bits of v as a list, most-significant bit first."""
    bits = []
    for shift in range(n - 1, -1, -1):
        bits.append((v >> shift) & 1)
    return bits
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def bits_msb_to_int(bits):
    """Fold an MSB-first bit list back into an integer (empty list -> 0)."""
    width = len(bits)
    return sum(int(b) << (width - 1 - i) for i, b in enumerate(bits))
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# ---------------------------------------------------------------------------
# Demo 1: Boolean gates (README Usage example)
# ---------------------------------------------------------------------------

print("=" * 64)
print(" Demo 1: Boolean threshold gates")
print("=" * 64)
# All four input combinations for the 2-input truth tables below.
truth_2 = [(0, 0), (0, 1), (1, 0), (1, 1)]
# Single-layer gates (linearly separable functions).
for gname in ["and", "or", "nand", "nor", "implies"]:
    row = " ".join(f"{a}{b}->{gate(f'boolean.{gname}', [a, b])}" for a, b in truth_2)
    print(f" {gname:8} {row}")
# 2-layer gates (boolean.* uses neuron1/neuron2 naming)
for gname in ["xor", "xnor", "biimplies"]:
    row = " ".join(f"{a}{b}->{xor_neuron(f'boolean.{gname}', [a, b])}" for a, b in truth_2)
    print(f" {gname:8} {row}")
# NOT (1-input)
print(f" not 0->{gate('boolean.not', [0])} 1->{gate('boolean.not', [1])}")
print()
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
# Demo 2: 8-bit ALU operations via threshold weights
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
|
| 109 |
+
print("=" * 64)
|
| 110 |
+
print(" Demo 2: 8-bit ALU arithmetic (every gate is threshold logic)")
|
| 111 |
+
print("=" * 64)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def fa(prefix, a, b, cin):
    """Full adder built from two threshold half-adders plus a carry OR."""
    sum1 = xor(f"{prefix}.ha1.sum", [a, b])
    carry1 = gate(f"{prefix}.ha1.carry", [a, b])
    total = xor(f"{prefix}.ha2.sum", [sum1, cin])
    carry2 = gate(f"{prefix}.ha2.carry", [sum1, cin])
    carry_out = gate(f"{prefix}.carry_or", [carry1, carry2])
    return total, carry_out
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def alu_add(a, b):
    """8-bit ripple carry add via threshold full-adders."""
    # fa{i} cells are wired LSB-first, so reverse the MSB-first bit lists.
    a_lsb = int_to_bits_msb(a, 8)[::-1]
    b_lsb = int_to_bits_msb(b, 8)[::-1]
    carry = 0
    out_lsb = []
    for i in range(8):
        bit, carry = fa(f"arithmetic.ripplecarry8bit.fa{i}", a_lsb[i], b_lsb[i], carry)
        out_lsb.append(bit)
    return bits_msb_to_int(out_lsb[::-1]), carry
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def alu_sub(a, b):
    """A - B via two's complement; uses sub8bit circuit family.

    Computes A + NOT(B) + 1 with the ripple carry seeded to 1; the final
    carry out is the "no borrow" flag (1 when A >= B).
    """
    a_lsb = list(reversed(int_to_bits_msb(a, 8)))
    b_lsb = list(reversed(int_to_bits_msb(b, 8)))
    carry = 1  # the +1 of the two's complement
    diff_lsb = []
    for i in range(8):
        # Invert the subtrahend bit, then a standard full-adder cell.
        notb = gate(f"arithmetic.sub8bit.notb{i}", [b_lsb[i]])
        x1 = xor(f"arithmetic.sub8bit.fa{i}.xor1", [a_lsb[i], notb])
        x2 = xor(f"arithmetic.sub8bit.fa{i}.xor2", [x1, carry])
        and1 = gate(f"arithmetic.sub8bit.fa{i}.and1", [a_lsb[i], notb])
        and2 = gate(f"arithmetic.sub8bit.fa{i}.and2", [x1, carry])
        carry = gate(f"arithmetic.sub8bit.fa{i}.or_carry", [and1, and2])
        diff_lsb.append(x2)
    return bits_msb_to_int(list(reversed(diff_lsb))), carry
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def alu_compare(a, b, kind):
    """8-bit comparators (single-layer GT/LT, two-layer EQ)."""
    inp = int_to_bits_msb(a, 8) + int_to_bits_msb(b, 8)
    if kind != "eq":
        # greaterthan / lessthan are single threshold gates.
        return gate(f"arithmetic.{kind}8bit", inp)
    # Equality = (a >= b) AND (a <= b) across two layers.
    geq = gate("arithmetic.equality8bit.layer1.geq", inp)
    leq = gate("arithmetic.equality8bit.layer1.leq", inp)
    return gate("arithmetic.equality8bit.layer2", [geq, leq])
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def alu_mul(a, b):
    """Shift-add multiply via partial-product threshold AND gates + repeated add."""
    a_bits = int_to_bits_msb(a, 8)
    b_bits = int_to_bits_msb(b, 8)
    # Partial-product matrix: pp[i][j] = a_bits[i] AND b_bits[j].
    pp = [[gate(f"alu.alu8bit.mul.pp.a{i}b{j}", [a_bits[i], b_bits[j]])
           for j in range(8)] for i in range(8)]
    # Accumulate weighted partial products in 8 bits (overflow above bit 7 drops).
    acc = 0
    for j in range(8):  # j=0 is MSB of b -> weight 7-j
        if b_bits[j] == 0:
            continue
        row = 0
        for i in range(8):
            row |= pp[i][j] << (7 - i)
        acc, _ = alu_add(acc & 0xFF, (row << (7 - j)) & 0xFF)
    return acc & 0xFF
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# Exercise the threshold ALU on a spread of operand pairs, including carry,
# borrow and wrap-around cases; each line self-checks against Python ints.
cases_arith = [(5, 3), (37, 100), (200, 99), (255, 1), (127, 128), (15, 17)]
print("ADD:")
for a, b in cases_arith:
    r, c = alu_add(a, b)
    expect = (a + b) & 0xFF
    ok = "OK" if r == expect else "FAIL"
    print(f" {a:3} + {b:3} = {r:3} (carry={c}) expected {expect:3} [{ok}]")

print("SUB:")
for a, b in cases_arith:
    r, c = alu_sub(a, b)
    expect = (a - b) & 0xFF
    ok = "OK" if r == expect else "FAIL"
    print(f" {a:3} - {b:3} = {r:3} (no_borrow={c}) expected {expect:3} [{ok}]")

print("CMP:")
cmp_cases = [(50, 30), (30, 50), (77, 77), (255, 0), (0, 255), (128, 127)]
for a, b in cmp_cases:
    gt = alu_compare(a, b, "greaterthan")
    lt = alu_compare(a, b, "lessthan")
    eq = alu_compare(a, b, "eq")
    print(f" {a:3} vs {b:3} -> GT={gt} LT={lt} EQ={eq}")

print("MUL (low 8 bits):")
for a, b in [(12, 11), (15, 17), (8, 32), (200, 3), (0, 99), (1, 255)]:
    r = alu_mul(a, b)
    expect = (a * b) & 0xFF
    ok = "OK" if r == expect else "FAIL"
    print(f" {a:3} * {b:3} = {r:3} expected {expect:3} [{ok}]")
print()
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# ---------------------------------------------------------------------------
|
| 220 |
+
# Demo 3: A 4-bit divisibility test (mod 5) - non-linearly-separable
|
| 221 |
+
# ---------------------------------------------------------------------------
|
| 222 |
+
|
| 223 |
+
print("=" * 64)
|
| 224 |
+
print(" Demo 3: mod-5 divisibility (multi-layer, hand-constructed)")
|
| 225 |
+
print("=" * 64)
|
| 226 |
+
# layer1: per-residue geq/leq -> layer2: eq -> layer3: OR all eq's
|
| 227 |
+
def mod5(v):
    """Return 1 when the mod-5 threshold network fires for 8-bit value v."""
    bits = int_to_bits_msb(v, 8)
    # Discover how many geq/leq neuron pairs the network has.
    count = 0
    while f"modular.mod5.layer1.geq{count}.weight" in T:
        count += 1
    eq_outputs = []
    for i in range(count):
        geq = gate(f"modular.mod5.layer1.geq{i}", bits)
        leq = gate(f"modular.mod5.layer1.leq{i}", bits)
        eq_outputs.append(gate(f"modular.mod5.layer2.eq{i}", [geq, leq]))
    return gate("modular.mod5.layer3.or", eq_outputs)
|
| 239 |
+
|
| 240 |
+
# Evaluate the circuit over the full 8-bit range; the sanity line should
# print all zeros if the network matches true divisibility by 5.
hits = [v for v in range(256) if mod5(v)]
print(f" v in [0,255] with mod5(v)==1: {len(hits)} hits, first 12: {hits[:12]}")
print(f" Sanity: {[h % 5 for h in hits[:12]]}")
print()
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
# ---------------------------------------------------------------------------
|
| 247 |
+
# Demo 4: Manifest-aware threshold CPU - run a real program
|
| 248 |
+
# ---------------------------------------------------------------------------
|
| 249 |
+
|
| 250 |
+
print("=" * 64)
|
| 251 |
+
print(" Demo 4: Threshold CPU running an assembled program")
|
| 252 |
+
print("=" * 64)
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
class ThresholdCPU10:
    """CPU runtime matching the bundled small-profile manifest (10-bit addr).

    All memory accesses go through the threshold-gate weights stored in the
    module-level tensor dict ``T``; ALU operations reuse the threshold ALU
    helpers (``alu_add``/``alu_sub``/``alu_mul``) defined earlier.
    State is a dict with keys ``pc``, ``regs`` (4 bytes), ``flags``
    ([Z, N, C, V]), ``mem`` (list of ints), and ``halted``.
    """

    def __init__(self, addr_bits, mem_bytes):
        self.addr_bits = addr_bits  # address width in bits (10 -> 1 KiB space)
        self.mem_bytes = mem_bytes  # number of addressable bytes

    # --- memory primitives, fully through threshold weights ---
    def addr_decode(self, addr):
        """Return a one-hot [mem_bytes] selector vector for *addr*."""
        bits = torch.tensor(int_to_bits_msb(addr, self.addr_bits), dtype=torch.float32)
        w = T["memory.addr_decode.weight"]
        b = T["memory.addr_decode.bias"]
        return heaviside((w * bits).sum(dim=1) + b)  # [mem_bytes]

    def mem_read(self, mem, addr):
        """Read one byte from *mem* at *addr* via AND/OR threshold layers."""
        sel = self.addr_decode(addr)
        mem_bits = torch.tensor(
            [int_to_bits_msb(byte, 8) for byte in mem], dtype=torch.float32
        )
        and_w = T["memory.read.and.weight"]
        and_b = T["memory.read.and.bias"]
        or_w = T["memory.read.or.weight"]
        or_b = T["memory.read.or.bias"]
        out_bits = []
        for bit in range(8):
            # AND each cell's bit with the selector, then OR across all cells.
            inp = torch.stack([mem_bits[:, bit], sel], dim=1)
            and_out = heaviside((inp * and_w[bit]).sum(dim=1) + and_b[bit])
            out_bits.append(int(heaviside((and_out * or_w[bit]).sum() + or_b[bit]).item()))
        return bits_msb_to_int(out_bits)

    def mem_write(self, mem, addr, value):
        """Return a new memory list with *value* written at *addr*.

        *mem* itself is not mutated; the per-bit mux is
        new = (old AND NOT sel) OR (data AND sel), all through thresholds.
        """
        sel = self.addr_decode(addr)
        data_bits = torch.tensor(int_to_bits_msb(value, 8), dtype=torch.float32)
        mem_bits = torch.tensor(
            [int_to_bits_msb(byte, 8) for byte in mem], dtype=torch.float32
        )
        sel_w = T["memory.write.sel.weight"]
        sel_b = T["memory.write.sel.bias"]
        nsel_w = T["memory.write.nsel.weight"].squeeze(1)
        nsel_b = T["memory.write.nsel.bias"]
        and_old_w = T["memory.write.and_old.weight"]
        and_old_b = T["memory.write.and_old.bias"]
        and_new_w = T["memory.write.and_new.weight"]
        and_new_b = T["memory.write.and_new.bias"]
        or_w = T["memory.write.or.weight"]
        or_b = T["memory.write.or.bias"]

        # write-enable is constant 1 here (every call is an actual write).
        we = torch.ones_like(sel)
        sel_inp = torch.stack([sel, we], dim=1)
        write_sel = heaviside((sel_inp * sel_w).sum(dim=1) + sel_b)
        nsel = heaviside(write_sel * nsel_w + nsel_b)

        # NOTE: the dead `new_mem = mem[:]` copy from the original was removed;
        # the result is rebuilt from mem_bits below.
        for bit in range(8):
            old = mem_bits[:, bit]
            data_bit = data_bits[bit].expand(self.mem_bytes)
            inp_old = torch.stack([old, nsel], dim=1)
            inp_new = torch.stack([data_bit, write_sel], dim=1)
            and_old = heaviside((inp_old * and_old_w[:, bit]).sum(dim=1) + and_old_b[:, bit])
            and_new = heaviside((inp_new * and_new_w[:, bit]).sum(dim=1) + and_new_b[:, bit])
            or_inp = torch.stack([and_old, and_new], dim=1)
            mem_bits[:, bit] = heaviside((or_inp * or_w[:, bit]).sum(dim=1) + or_b[:, bit])
        return [bits_msb_to_int([int(b) for b in mem_bits[i].tolist()]) for i in range(self.mem_bytes)]

    # --- helper to use threshold ALU functions defined above ---
    def step(self, state):
        """Execute one instruction and return a NEW state dict.

        The input state is never mutated; a halted state is returned as-is.
        """
        if state["halted"]:
            return state
        s = dict(state)
        s["mem"] = state["mem"][:]
        s["regs"] = state["regs"][:]
        s["flags"] = state["flags"][:]

        pc = s["pc"]
        addr_mask = (1 << self.addr_bits) - 1
        # Fetch the 16-bit instruction word, big-endian.
        hi = self.mem_read(s["mem"], pc & addr_mask)
        lo = self.mem_read(s["mem"], (pc + 1) & addr_mask)
        ir = ((hi & 0xFF) << 8) | (lo & 0xFF)
        opcode = (ir >> 12) & 0xF
        rd = (ir >> 10) & 0x3
        rs = (ir >> 8) & 0x3
        imm = ir & 0xFF

        next_pc = (pc + 2) & addr_mask
        addr16 = None
        # Memory/branch opcodes carry a 16-bit operand word after the instruction.
        if opcode in (0xA, 0xB, 0xC, 0xD, 0xE):
            ah = self.mem_read(s["mem"], next_pc)
            al = self.mem_read(s["mem"], (next_pc + 1) & addr_mask)
            addr16 = ((ah & 0xFF) << 8) | (al & 0xFF)
            next_pc = (next_pc + 2) & addr_mask
        addr10 = (addr16 & addr_mask) if addr16 is not None else None

        a = s["regs"][rd]
        b = s["regs"][rs]
        write = True
        result = a
        carry = 0

        if opcode == 0x0:  # ADD
            result, carry = alu_add(a, b)
        elif opcode == 0x1:  # SUB
            result, carry = alu_sub(a, b)
        elif opcode == 0x7:  # MUL
            result = alu_mul(a, b)
        elif opcode == 0x9:  # CMP: set flags from a-b, discard the result
            _r, carry = alu_sub(a, b)
            z = 1 if _r == 0 else 0
            n = 1 if (_r & 0x80) else 0
            s["flags"] = [z, n, carry, 0]  # V flag is never computed here
            write = False
        elif opcode == 0xA:  # LOAD
            result = self.mem_read(s["mem"], addr10)
        elif opcode == 0xB:  # STORE
            s["mem"] = self.mem_write(s["mem"], addr10, b & 0xFF)
            write = False
        elif opcode == 0xC:  # JMP
            s["pc"] = addr10
            return s
        elif opcode == 0xD:  # Jcc: condition selected by imm[2:0]
            cond = imm & 0x7
            z, n, c, v = s["flags"]
            # cond: 0=Z 1=NZ 2=C 3=NC 4=N 5=NN 6=V 7=NV
            take = [z == 1, z == 0, c == 1, c == 0, n == 1, n == 0, v == 1, v == 0][cond]
            s["pc"] = addr10 if take else next_pc
            return s
        elif opcode == 0xF:  # HALT
            s["halted"] = True
            return s

        # CMP and STORE clear `write`, so the original's extra `opcode != 0x9`
        # test was redundant; the dead `opcode_was_cmp` local was removed.
        if write:
            s["regs"][rd] = result & 0xFF
        if opcode in (0x0, 0x1, 0x7):
            z = 1 if (result & 0xFF) == 0 else 0
            n = 1 if (result & 0x80) else 0
            s["flags"] = [z, n, carry, 0]
        s["pc"] = next_pc
        return s

    def run(self, state, max_cycles=64):
        """Step until HALT or *max_cycles*; return (final_state, cycles_used)."""
        s = state
        cycles = 0
        while not s["halted"] and cycles < max_cycles:
            s = self.step(s)
            cycles += 1
        return s, cycles
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def encode_instr(opcode, rd, rs, imm):
    """Pack a 16-bit instruction word: [15:12]=opcode [11:10]=rd [9:8]=rs [7:0]=imm."""
    word = (opcode & 0xF) << 12
    word |= (rd & 0x3) << 10
    word |= (rs & 0x3) << 8
    return word | (imm & 0xFF)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
def write_word(mem, addr, value):
    """Store a 16-bit *value* big-endian into two consecutive bytes of *mem*."""
    hi, lo = (value >> 8) & 0xFF, value & 0xFF
    mem[addr] = hi
    mem[addr + 1] = lo
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
# Program: count down from 5 to 0 with a loop, accumulate sum into R0.
#
#   R1 = 5
#   R0 = 0
#   loop:
#     R0 = R0 + R1   ; ADD R0, R1
#     R1 = R1 - 1    ; no immediate decrement; SUB R1, R2 with R2 = 1
#     CMP R1, R3     ; R3 = 0
#     JNZ loop
#   HALT
#
# Memory layout (1KB): code from 0x0000, data constants at 0x0100-0x0102,
# result stored at 0x0103.

mem = [0] * 1024
mem[0x100] = 5  # loop counter initial value
mem[0x101] = 1  # decrement constant
mem[0x102] = 0  # zero constant (also initializes R0)

# (address, 16-bit word) image: instruction words plus their operand words.
_image = [
    (0x0000, encode_instr(0xA, 1, 0, 0)), (0x0002, 0x0100),    # LOAD R1 <- M[0x0100]
    (0x0004, encode_instr(0xA, 2, 0, 0)), (0x0006, 0x0101),    # LOAD R2 <- M[0x0101]
    (0x0008, encode_instr(0xA, 3, 0, 0)), (0x000A, 0x0102),    # LOAD R3 <- M[0x0102]
    (0x000C, encode_instr(0xA, 0, 0, 0)), (0x000E, 0x0102),    # LOAD R0 <- M[0x0102]
    (0x0010, encode_instr(0x0, 0, 1, 0)),                      # ADD R0, R1
    (0x0012, encode_instr(0x1, 1, 2, 0)),                      # SUB R1, R2
    (0x0014, encode_instr(0x9, 1, 3, 0)),                      # CMP R1, R3
    (0x0016, encode_instr(0xD, 0, 0, 0x01)), (0x0018, 0x0010), # JNZ 0x0010 (cond=1 = NZ)
    (0x001A, encode_instr(0xB, 0, 0, 0)), (0x001C, 0x0103),    # STORE R0 -> M[0x0103]
    (0x001E, encode_instr(0xF, 0, 0, 0)),                      # HALT
]
for _addr, _word in _image:
    write_word(mem, _addr, _word)
|
| 468 |
+
|
| 469 |
+
cpu = ThresholdCPU10(addr_bits=ADDR_BITS, mem_bytes=MEM_BYTES)
state = {
    "pc": 0,
    "regs": [0] * 4,
    "flags": [0] * 4,
    "mem": mem,
    "halted": False,
}
print(" Program: sum 5+4+3+2+1 via loop (uses ADD/SUB/CMP/Jcc/LOAD/STORE/HALT, all threshold-gated)")
print(" Running ...")
final, cycles = cpu.run(state, max_cycles=200)
r0, r1, r2, r3 = final["regs"]
print(f" Halted after {cycles} cycles")
print(f" R0={r0} R1={r1} R2={r2} R3={r3}")
print(f" M[0x0103] = {final['mem'][0x103]} (expected 15)")
print()
print("Done.")
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f702736cd85124aac22602bf44617698309c03739a254b338409df87e22344c9
|
| 3 |
+
size 12434484
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c6761fa0366a19cdb9abb7c1c72f53b3a3a07032056b6d17dbed4131cc5e21d
|
| 3 |
+
size 14378864
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:246546bba4668a80a81e32b115d883d57b6b49bdfe8254034090089d5bf168cf
|
| 3 |
+
size 11561076
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d2daa9ab42ab63534010e363adbb3423502ebbe94a4b354797c25dece5eb5948
|
| 3 |
+
size 45730164
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1808dd34084e68120bccd277310749e047c357274440901baf2b01ca64e9e41
|
| 3 |
+
size 14640476
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7487bbfe4da343bb2072c190e33b7861b452c947efd424a068927c413595049
|
| 3 |
+
size 12534076
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:58dfbdf0a987c1675a68439d86a57a7631a7657ea60b6b0d3e568dfdeee88f2e
|
| 3 |
+
size 12704876
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:759920dcb38a340ee31f4d116df4983322258b796ac5d6021f7ca165986f5f5b
|
| 3 |
+
size 13104212
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18f4f3420fb307d90ea7a8fe356c196a59d7a0f2ed4ec57679d87b209a7fec22
|
| 3 |
+
size 47693920
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51e14c8819de3402881ce2ffe3cdd7e94a801c038c6ef8495110144e9348e2e7
|
| 3 |
+
size 16604104
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13ac4eb1c793a6331a2ecfa13d3372edc9f4649163883244847ca6616062de05
|
| 3 |
+
size 14497800
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8c389b1730cc297f40944815aebda1b1a71b79bff738e9da767c82609e9d9bd
|
| 3 |
+
size 14668512
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3c67a2047c0cf7370e802727b9be51d8b7185dedfe108409968f0d838157e04
|
| 3 |
+
size 15067856
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acde9e66a5bae870b5684ddc8592a206f00b518e088e90965a73bfa35274ba2a
|
| 3 |
+
size 44846164
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e318727316bfb34f82cdc4a2b627d9f8475c3282cab67a6424ba642350dc823
|
| 3 |
+
size 13756476
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b2c49af2b18786699351235d4d051afd7452e17616f0f06a87b3e5e9820da66
|
| 3 |
+
size 11649932
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40fe6db0454dd6ba33072a18f6c81ed1463830b270b708b9ae45f976e32cfc50
|
| 3 |
+
size 11820860
|
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:547aef648729c49dc106c14d05bfcdf12a6f1aca5de5b7d1c475fce65aef1373
|
| 3 |
+
size 12220204
|