Add NEG circuit (two's complement negate) - 76 tensors, 256/256 tests pass

Browse files

Files changed (7) hide show

16bitupgrade.md +904 -904
llm/circuit_llm.py +606 -606
llm/guide.md +615 -615
llm/train_circuit_interface.py +306 -306
neural_computer.safetensors +2 -2
tensors.txt +0 -0
todo.md +1 -1

16bitupgrade.md CHANGED Viewed

@@ -1,905 +1,905 @@
-16-BIT TENSOR MANIFEST
-  ---
-  ARITHMETIC
-  ripplecarry16bit
-  arithmetic.ripplecarry16bit.fa0.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa0.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa1.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa1.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa2.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa2.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa3.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa3.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa4.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa4.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa5.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa5.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa6.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa6.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa7.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa7.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa8.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa8.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa9.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa9.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa10.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa10.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa11.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa11.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa12.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa12.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa13.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa13.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa14.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa14.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa15.carry_or.bias [1]
-  arithmetic.ripplecarry16bit.fa15.carry_or.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha1.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha1.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer2.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha2.carry.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha2.carry.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.nand.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.nand.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.or.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.or.weight [2]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer2.bias [1]
-  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer2.weight [2]
-  16-bit comparators
-  arithmetic.greaterthan16bit.comparator [16]
-  arithmetic.lessthan16bit.comparator [16]
-  16x16 multiplier (14 stages; stage N covers bits 0 through 16+N)
-  Stage 0: bits 0-16
-  Stage 1: bits 0-17
-  Stage 2: bits 0-18
-  ...
-  Stage 13: bits 0-29
-  Each bit position has the same full adder structure. Representative enumeration for one bit (stage0.bit0):
-  arithmetic.multiplier16x16.stage0.bit0.carry_or.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.carry_or.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.carry.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.carry.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.nand.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.nand.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.or.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.or.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer2.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer2.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.carry.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.carry.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.nand.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.nand.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.or.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.or.weight [2]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer2.bias [1]
-  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer2.weight [2]
-  Pattern repeats for:
-  - stage0: bit0-bit16 (17 bits)
-  - stage1: bit0-bit17 (18 bits)
-  - stage2: bit0-bit18 (19 bits)
-  - stage3: bit0-bit19 (20 bits)
-  - stage4: bit0-bit20 (21 bits)
-  - stage5: bit0-bit21 (22 bits)
-  - stage6: bit0-bit22 (23 bits)
-  - stage7: bit0-bit23 (24 bits)
-  - stage8: bit0-bit24 (25 bits)
-  - stage9: bit0-bit25 (26 bits)
-  - stage10: bit0-bit26 (27 bits)
-  - stage11: bit0-bit27 (28 bits)
-  - stage12: bit0-bit28 (29 bits)
-  - stage13: bit0-bit29 (30 bits)
-  18 tensors per bit × (17+18+19+20+21+22+23+24+25+26+27+28+29+30) = 18 × 329 = 5922 tensors for multiplier stages.
-  Plus 256 AND gates for partial products (16×16):
-  arithmetic.multiplier16x16.partial.r0c0.bias [1]
-  arithmetic.multiplier16x16.partial.r0c0.weight [2]
-  ...through...
-  arithmetic.multiplier16x16.partial.r15c15.bias [1]
-  arithmetic.multiplier16x16.partial.r15c15.weight [2]
-  256 × 2 = 512 tensors for partial products.
-  ---
-  COMBINATIONAL
-  Barrel shifter 16-bit
-  combinational.barrelshifter16bit.shift [20]
-  Decoder 4-to-16
-  combinational.decoder4to16.out0.bias [1]
-  combinational.decoder4to16.out0.weight [4]
-  combinational.decoder4to16.out1.bias [1]
-  combinational.decoder4to16.out1.weight [4]
-  combinational.decoder4to16.out2.bias [1]
-  combinational.decoder4to16.out2.weight [4]
-  combinational.decoder4to16.out3.bias [1]
-  combinational.decoder4to16.out3.weight [4]
-  combinational.decoder4to16.out4.bias [1]
-  combinational.decoder4to16.out4.weight [4]
-  combinational.decoder4to16.out5.bias [1]
-  combinational.decoder4to16.out5.weight [4]
-  combinational.decoder4to16.out6.bias [1]
-  combinational.decoder4to16.out6.weight [4]
-  combinational.decoder4to16.out7.bias [1]
-  combinational.decoder4to16.out7.weight [4]
-  combinational.decoder4to16.out8.bias [1]
-  combinational.decoder4to16.out8.weight [4]
-  combinational.decoder4to16.out9.bias [1]
-  combinational.decoder4to16.out9.weight [4]
-  combinational.decoder4to16.out10.bias [1]
-  combinational.decoder4to16.out10.weight [4]
-  combinational.decoder4to16.out11.bias [1]
-  combinational.decoder4to16.out11.weight [4]
-  combinational.decoder4to16.out12.bias [1]
-  combinational.decoder4to16.out12.weight [4]
-  combinational.decoder4to16.out13.bias [1]
-  combinational.decoder4to16.out13.weight [4]
-  combinational.decoder4to16.out14.bias [1]
-  combinational.decoder4to16.out14.weight [4]
-  combinational.decoder4to16.out15.bias [1]
-  combinational.decoder4to16.out15.weight [4]
-  Encoder 16-to-4
-  combinational.encoder16to4.bit0.bias [1]
-  combinational.encoder16to4.bit0.weight [16]
-  combinational.encoder16to4.bit1.bias [1]
-  combinational.encoder16to4.bit1.weight [16]
-  combinational.encoder16to4.bit2.bias [1]
-  combinational.encoder16to4.bit2.weight [16]
-  combinational.encoder16to4.bit3.bias [1]
-  combinational.encoder16to4.bit3.weight [16]
-  Multiplexer 16-to-1
-  combinational.multiplexer16to1.select [20]
-  Demultiplexer 1-to-16
-  combinational.demultiplexer1to16.decode [5]
-  Priority encoder 16-bit
-  combinational.priorityencoder16bit.priority [16]
-  ---
-  CONTROL
-  Unconditional jump 16-bit
-  control.jump.bit0.bias [1]
-  control.jump.bit0.weight [1]
-  control.jump.bit1.bias [1]
-  control.jump.bit1.weight [1]
-  control.jump.bit2.bias [1]
-  control.jump.bit2.weight [1]
-  control.jump.bit3.bias [1]
-  control.jump.bit3.weight [1]
-  control.jump.bit4.bias [1]
-  control.jump.bit4.weight [1]
-  control.jump.bit5.bias [1]
-  control.jump.bit5.weight [1]
-  control.jump.bit6.bias [1]
-  control.jump.bit6.weight [1]
-  control.jump.bit7.bias [1]
-  control.jump.bit7.weight [1]
-  control.jump.bit8.bias [1]
-  control.jump.bit8.weight [1]
-  control.jump.bit9.bias [1]
-  control.jump.bit9.weight [1]
-  control.jump.bit10.bias [1]
-  control.jump.bit10.weight [1]
-  control.jump.bit11.bias [1]
-  control.jump.bit11.weight [1]
-  control.jump.bit12.bias [1]
-  control.jump.bit12.weight [1]
-  control.jump.bit13.bias [1]
-  control.jump.bit13.weight [1]
-  control.jump.bit14.bias [1]
-  control.jump.bit14.weight [1]
-  control.jump.bit15.bias [1]
-  control.jump.bit15.weight [1]
-  Conditional jump 16-bit (template for JZ, JNZ, JC, JNC, JN, JP, JV, JNV, and generic conditionaljump)
-  Each conditional jump type follows this pattern for bits 0-15:
-  control.{jtype}.bit{N}.and_a.bias [1]
-  control.{jtype}.bit{N}.and_a.weight [2]
-  control.{jtype}.bit{N}.and_b.bias [1]
-  control.{jtype}.bit{N}.and_b.weight [2]
-  control.{jtype}.bit{N}.not_sel.bias [1]
-  control.{jtype}.bit{N}.not_sel.weight [1]
-  control.{jtype}.bit{N}.or.bias [1]
-  control.{jtype}.bit{N}.or.weight [2]
-  Where {jtype} ∈ {conditionaljump, jz, jnz, jc, jnc, jn, jp, jv, jnv} and N ∈ {0..15}
-  Full expansion for control.jz (others follow same pattern):
-  control.jz.bit0.and_a.bias [1]
-  control.jz.bit0.and_a.weight [2]
-  control.jz.bit0.and_b.bias [1]
-  control.jz.bit0.and_b.weight [2]
-  control.jz.bit0.not_sel.bias [1]
-  control.jz.bit0.not_sel.weight [1]
-  control.jz.bit0.or.bias [1]
-  control.jz.bit0.or.weight [2]
-  control.jz.bit1.and_a.bias [1]
-  control.jz.bit1.and_a.weight [2]
-  control.jz.bit1.and_b.bias [1]
-  control.jz.bit1.and_b.weight [2]
-  control.jz.bit1.not_sel.bias [1]
-  control.jz.bit1.not_sel.weight [1]
-  control.jz.bit1.or.bias [1]
-  control.jz.bit1.or.weight [2]
-  control.jz.bit2.and_a.bias [1]
-  control.jz.bit2.and_a.weight [2]
-  control.jz.bit2.and_b.bias [1]
-  control.jz.bit2.and_b.weight [2]
-  control.jz.bit2.not_sel.bias [1]
-  control.jz.bit2.not_sel.weight [1]
-  control.jz.bit2.or.bias [1]
-  control.jz.bit2.or.weight [2]
-  control.jz.bit3.and_a.bias [1]
-  control.jz.bit3.and_a.weight [2]
-  control.jz.bit3.and_b.bias [1]
-  control.jz.bit3.and_b.weight [2]
-  control.jz.bit3.not_sel.bias [1]
-  control.jz.bit3.not_sel.weight [1]
-  control.jz.bit3.or.bias [1]
-  control.jz.bit3.or.weight [2]
-  control.jz.bit4.and_a.bias [1]
-  control.jz.bit4.and_a.weight [2]
-  control.jz.bit4.and_b.bias [1]
-  control.jz.bit4.and_b.weight [2]
-  control.jz.bit4.not_sel.bias [1]
-  control.jz.bit4.not_sel.weight [1]
-  control.jz.bit4.or.bias [1]
-  control.jz.bit4.or.weight [2]
-  control.jz.bit5.and_a.bias [1]
-  control.jz.bit5.and_a.weight [2]
-  control.jz.bit5.and_b.bias [1]
-  control.jz.bit5.and_b.weight [2]
-  control.jz.bit5.not_sel.bias [1]
-  control.jz.bit5.not_sel.weight [1]
-  control.jz.bit5.or.bias [1]
-  control.jz.bit5.or.weight [2]
-  control.jz.bit6.and_a.bias [1]
-  control.jz.bit6.and_a.weight [2]
-  control.jz.bit6.and_b.bias [1]
-  control.jz.bit6.and_b.weight [2]
-  control.jz.bit6.not_sel.bias [1]
-  control.jz.bit6.not_sel.weight [1]
-  control.jz.bit6.or.bias [1]
-  control.jz.bit6.or.weight [2]
-  control.jz.bit7.and_a.bias [1]
-  control.jz.bit7.and_a.weight [2]
-  control.jz.bit7.and_b.bias [1]
-  control.jz.bit7.and_b.weight [2]
-  control.jz.bit7.not_sel.bias [1]
-  control.jz.bit7.not_sel.weight [1]
-  control.jz.bit7.or.bias [1]
-  control.jz.bit7.or.weight [2]
-  control.jz.bit8.and_a.bias [1]
-  control.jz.bit8.and_a.weight [2]
-  control.jz.bit8.and_b.bias [1]
-  control.jz.bit8.and_b.weight [2]
-  control.jz.bit8.not_sel.bias [1]
-  control.jz.bit8.not_sel.weight [1]
-  control.jz.bit8.or.bias [1]
-  control.jz.bit8.or.weight [2]
-  control.jz.bit9.and_a.bias [1]
-  control.jz.bit9.and_a.weight [2]
-  control.jz.bit9.and_b.bias [1]
-  control.jz.bit9.and_b.weight [2]
-  control.jz.bit9.not_sel.bias [1]
-  control.jz.bit9.not_sel.weight [1]
-  control.jz.bit9.or.bias [1]
-  control.jz.bit9.or.weight [2]
-  control.jz.bit10.and_a.bias [1]
-  control.jz.bit10.and_a.weight [2]
-  control.jz.bit10.and_b.bias [1]
-  control.jz.bit10.and_b.weight [2]
-  control.jz.bit10.not_sel.bias [1]
-  control.jz.bit10.not_sel.weight [1]
-  control.jz.bit10.or.bias [1]
-  control.jz.bit10.or.weight [2]
-  control.jz.bit11.and_a.bias [1]
-  control.jz.bit11.and_a.weight [2]
-  control.jz.bit11.and_b.bias [1]
-  control.jz.bit11.and_b.weight [2]
-  control.jz.bit11.not_sel.bias [1]
-  control.jz.bit11.not_sel.weight [1]
-  control.jz.bit11.or.bias [1]
-  control.jz.bit11.or.weight [2]
-  control.jz.bit12.and_a.bias [1]
-  control.jz.bit12.and_a.weight [2]
-  control.jz.bit12.and_b.bias [1]
-  control.jz.bit12.and_b.weight [2]
-  control.jz.bit12.not_sel.bias [1]
-  control.jz.bit12.not_sel.weight [1]
-  control.jz.bit12.or.bias [1]
-  control.jz.bit12.or.weight [2]
-  control.jz.bit13.and_a.bias [1]
-  control.jz.bit13.and_a.weight [2]
-  control.jz.bit13.and_b.bias [1]
-  control.jz.bit13.and_b.weight [2]
-  control.jz.bit13.not_sel.bias [1]
-  control.jz.bit13.not_sel.weight [1]
-  control.jz.bit13.or.bias [1]
-  control.jz.bit13.or.weight [2]
-  control.jz.bit14.and_a.bias [1]
-  control.jz.bit14.and_a.weight [2]
-  control.jz.bit14.and_b.bias [1]
-  control.jz.bit14.and_b.weight [2]
-  control.jz.bit14.not_sel.bias [1]
-  control.jz.bit14.not_sel.weight [1]
-  control.jz.bit14.or.bias [1]
-  control.jz.bit14.or.weight [2]
-  control.jz.bit15.and_a.bias [1]
-  control.jz.bit15.and_a.weight [2]
-  control.jz.bit15.and_b.bias [1]
-  control.jz.bit15.and_b.weight [2]
-  control.jz.bit15.not_sel.bias [1]
-  control.jz.bit15.not_sel.weight [1]
-  control.jz.bit15.or.bias [1]
-  control.jz.bit15.or.weight [2]
-  Repeat above for: jnz, jc, jnc, jn, jp, jv, jnv, conditionaljump (9 types including jz × 16 bits × 8 tensors = 1152 tensors)
-  Stack operations (unchanged)
-  control.call.jump [1]
-  control.call.push [1]
-  control.pop.load [1]
-  control.pop.sp_inc [1]
-  control.push.sp_dec [1]
-  control.push.store [1]
-  control.ret.jump [1]
-  control.ret.pop [1]
-  control.sp_dec.uses [1]
-  control.sp_inc.uses [1]
-  ---
-  ERROR DETECTION
-  Checksum 16-bit
-  error_detection.checksum16bit.sum.bias [1]
-  error_detection.checksum16bit.sum.weight [16]
-  Parity 16-bit
-  error_detection.evenparitychecker16bit.bias [1]
-  error_detection.evenparitychecker16bit.weight [16]
-  error_detection.oddparitychecker16bit.not.bias [1]
-  error_detection.oddparitychecker16bit.not.weight [1]
-  error_detection.oddparitychecker16bit.parity.bias [1]
-  error_detection.oddparitychecker16bit.parity.weight [16]
-  Parity checker/generator 16-bit (4 XOR stages instead of 3)
-  error_detection.paritychecker16bit.output.not.bias [1]
-  error_detection.paritychecker16bit.output.not.weight [1]
-  error_detection.paritychecker16bit.stage1.xor0.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor0.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor0.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor0.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor0.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor0.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor1.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor1.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor1.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor1.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor1.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor1.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor2.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor2.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor2.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor2.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor2.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor2.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor3.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor3.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor3.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor3.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor3.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor3.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor4.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor4.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor4.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor4.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor4.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor4.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor5.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor5.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor5.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor5.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor5.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor5.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor6.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor6.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor6.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor6.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor6.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor6.layer2.weight [2]
-  error_detection.paritychecker16bit.stage1.xor7.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage1.xor7.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage1.xor7.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage1.xor7.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage1.xor7.layer2.bias [1]
-  error_detection.paritychecker16bit.stage1.xor7.layer2.weight [2]
-  error_detection.paritychecker16bit.stage2.xor0.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage2.xor0.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage2.xor0.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage2.xor0.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage2.xor0.layer2.bias [1]
-  error_detection.paritychecker16bit.stage2.xor0.layer2.weight [2]
-  error_detection.paritychecker16bit.stage2.xor1.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage2.xor1.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage2.xor1.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage2.xor1.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage2.xor1.layer2.bias [1]
-  error_detection.paritychecker16bit.stage2.xor1.layer2.weight [2]
-  error_detection.paritychecker16bit.stage2.xor2.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage2.xor2.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage2.xor2.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage2.xor2.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage2.xor2.layer2.bias [1]
-  error_detection.paritychecker16bit.stage2.xor2.layer2.weight [2]
-  error_detection.paritychecker16bit.stage2.xor3.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage2.xor3.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage2.xor3.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage2.xor3.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage2.xor3.layer2.bias [1]
-  error_detection.paritychecker16bit.stage2.xor3.layer2.weight [2]
-  error_detection.paritychecker16bit.stage3.xor0.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage3.xor0.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage3.xor0.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage3.xor0.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage3.xor0.layer2.bias [1]
-  error_detection.paritychecker16bit.stage3.xor0.layer2.weight [2]
-  error_detection.paritychecker16bit.stage3.xor1.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage3.xor1.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage3.xor1.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage3.xor1.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage3.xor1.layer2.bias [1]
-  error_detection.paritychecker16bit.stage3.xor1.layer2.weight [2]
-  error_detection.paritychecker16bit.stage4.xor0.layer1.nand.bias [1]
-  error_detection.paritychecker16bit.stage4.xor0.layer1.nand.weight [2]
-  error_detection.paritychecker16bit.stage4.xor0.layer1.or.bias [1]
-  error_detection.paritychecker16bit.stage4.xor0.layer1.or.weight [2]
-  error_detection.paritychecker16bit.stage4.xor0.layer2.bias [1]
-  error_detection.paritychecker16bit.stage4.xor0.layer2.weight [2]
-  Identical structure for paritygenerator16bit.
-  CRC-16
-  error_detection.crc16.divisor [17]
-  Hamming (15,11) - 11 data bits, 4 parity bits
-  error_detection.hammingencode11bit.p0.weight [11]
-  error_detection.hammingencode11bit.p1.bias [1]
-  error_detection.hammingencode11bit.p1.weight [11]
-  error_detection.hammingencode11bit.p2.bias [1]
-  error_detection.hammingencode11bit.p2.weight [11]
-  error_detection.hammingencode11bit.p3.bias [1]
-  error_detection.hammingencode11bit.p3.weight [11]
-  error_detection.hammingencode11bit.p4.bias [1]
-  error_detection.hammingencode11bit.p4.weight [11]
-  error_detection.hammingdecode15bit.s1.bias [1]
-  error_detection.hammingdecode15bit.s1.weight [8]
-  error_detection.hammingdecode15bit.s2.bias [1]
-  error_detection.hammingdecode15bit.s2.weight [8]
-  error_detection.hammingdecode15bit.s3.bias [1]
-  error_detection.hammingdecode15bit.s3.weight [8]
-  error_detection.hammingdecode15bit.s4.bias [1]
-  error_detection.hammingdecode15bit.s4.weight [8]
-  error_detection.hammingsyndrome15bit.s1.weight [8]
-  error_detection.hammingsyndrome15bit.s2.weight [8]
-  error_detection.hammingsyndrome15bit.s3.weight [8]
-  error_detection.hammingsyndrome15bit.s4.weight [8]
-  Longitudinal parity 16-bit
-  error_detection.longitudinalparity16bit.col_parity [16]
-  error_detection.longitudinalparity16bit.row_parity [16]
-  ---
-  MODULAR
-  For 16-bit inputs, modular arithmetic requires detecting which of ceil(65536/N) ranges the input falls into. Structure per modulus:
-  mod2 (simple - just check LSB)
-  modular.mod2_16bit.bias [1]
-  modular.mod2_16bit.weight [16]
-  mod4 (check 2 LSBs)
-  modular.mod4_16bit.bias [1]
-  modular.mod4_16bit.weight [16]
-  mod8 (check 3 LSBs)
-  modular.mod8_16bit.bias [1]
-  modular.mod8_16bit.weight [16]
-  mod16 (check 4 LSBs)
-  modular.mod16_16bit.bias [1]
-  modular.mod16_16bit.weight [16]
-  For non-power-of-2 moduli (3, 5, 6, 7, 9, 10, 11, 12), use iterative subtraction circuit referencing the 16-bit subtractor and comparator, or expand the range-check approach:
-  mod3, mod5, mod6, mod7, mod9, mod10, mod11, mod12 (range-check approach, pattern):
-  modular.mod{N}_16bit.layer1.geq{K}.bias [1]
-  modular.mod{N}_16bit.layer1.geq{K}.weight [16]
-  modular.mod{N}_16bit.layer1.leq{K}.bias [1]
-  modular.mod{N}_16bit.layer1.leq{K}.weight [16]
-  modular.mod{N}_16bit.layer2.eq{K}.bias [1]
-  modular.mod{N}_16bit.layer2.eq{K}.weight [2]
-  modular.mod{N}_16bit.layer3.or.bias [1]
-  modular.mod{N}_16bit.layer3.or.weight [R]
-  Where R = number of ranges = ceil(65536/N).
-  ---
-  PATTERN RECOGNITION
-  pattern_recognition.popcount16bit.bias [1]
-  pattern_recognition.popcount16bit.weight [16]
-  pattern_recognition.allones16bit.bias [1]
-  pattern_recognition.allones16bit.weight [16]
-  pattern_recognition.allzeros16bit.bias [1]
-  pattern_recognition.allzeros16bit.weight [16]
-  pattern_recognition.alternating16bit.pattern1.weight [16]
-  pattern_recognition.alternating16bit.pattern2.weight [16]
-  pattern_recognition.hammingdistance16bit.popcount.weight [16]
-  pattern_recognition.hammingdistance16bit.xor.weight [32]
-  pattern_recognition.leadingones16bit.weight [16]
-  pattern_recognition.trailingones16bit.weight [16]
-  pattern_recognition.runlength16bit.weight [16]
-  pattern_recognition.onehotdetector16bit.and.bias [1]
-  pattern_recognition.onehotdetector16bit.and.weight [2]
-  pattern_recognition.onehotdetector16bit.atleast1.bias [1]
-  pattern_recognition.onehotdetector16bit.atleast1.weight [16]
-  pattern_recognition.onehotdetector16bit.atmost1.bias [1]
-  pattern_recognition.onehotdetector16bit.atmost1.weight [16]
-  pattern_recognition.symmetry16bit.and.bias [1]
-  pattern_recognition.symmetry16bit.and.weight [8]
-  pattern_recognition.symmetry16bit.xnor0.weight [2]
-  pattern_recognition.symmetry16bit.xnor1.weight [2]
-  pattern_recognition.symmetry16bit.xnor2.weight [2]
-  pattern_recognition.symmetry16bit.xnor3.weight [2]
-  pattern_recognition.symmetry16bit.xnor4.weight [2]
-  pattern_recognition.symmetry16bit.xnor5.weight [2]
-  pattern_recognition.symmetry16bit.xnor6.weight [2]
-  pattern_recognition.symmetry16bit.xnor7.weight [2]
-  ---
-  THRESHOLD
-  threshold.alloutof16.bias [1]
-  threshold.alloutof16.weight [16]
-  threshold.oneoutof16.bias [1]
-  threshold.oneoutof16.weight [16]
-  threshold.twooutof16.bias [1]
-  threshold.twooutof16.weight [16]
-  threshold.threeoutof16.bias [1]
-  threshold.threeoutof16.weight [16]
-  threshold.fouroutof16.bias [1]
-  threshold.fouroutof16.weight [16]
-  threshold.fiveoutof16.bias [1]
-  threshold.fiveoutof16.weight [16]
-  threshold.sixoutof16.bias [1]
-  threshold.sixoutof16.weight [16]
-  threshold.sevenoutof16.bias [1]
-  threshold.sevenoutof16.weight [16]
-  threshold.eightoutof16.bias [1]
-  threshold.eightoutof16.weight [16]
-  threshold.nineoutof16.bias [1]
-  threshold.nineoutof16.weight [16]
-  threshold.tenoutof16.bias [1]
-  threshold.tenoutof16.weight [16]
-  threshold.elevenoutof16.bias [1]
-  threshold.elevenoutof16.weight [16]
-  threshold.twelveoutof16.bias [1]
-  threshold.twelveoutof16.weight [16]
-  threshold.thirteenoutof16.bias [1]
-  threshold.thirteenoutof16.weight [16]
-  threshold.fourteenoutof16.bias [1]
-  threshold.fourteenoutof16.weight [16]
-  threshold.fifteenoutof16.bias [1]
-  threshold.fifteenoutof16.weight [16]
-  threshold.sixteenoutof16.bias [1]
-  threshold.sixteenoutof16.weight [16]
-  threshold.majority16.bias [1]
-  threshold.majority16.weight [16]
-  threshold.minority16.bias [1]
-  threshold.minority16.weight [16]
-  threshold.atleastk_8_16bit.bias [1]
-  threshold.atleastk_8_16bit.weight [16]
-  threshold.atmostk_8_16bit.bias [1]
-  threshold.atmostk_8_16bit.weight [16]
-  threshold.exactlyk_8_16bit.and.bias [1]
-  threshold.exactlyk_8_16bit.and.weight [2]
-  threshold.exactlyk_8_16bit.atleast.bias [1]
-  threshold.exactlyk_8_16bit.atleast.weight [16]
-  threshold.exactlyk_8_16bit.atmost.bias [1]
-  threshold.exactlyk_8_16bit.atmost.weight [16]
-  ---
-  MANIFEST
-  manifest.alu_operations [1]
-  manifest.flags [1]
-  manifest.instruction_width [1]
-  manifest.memory_bytes [1]
-  manifest.pc_width [1]
-  manifest.register_width [1]
-  manifest.registers [1]
-  manifest.turing_complete [1]
-  manifest.version [1]
-  Values change:
-  - register_width: 8 → 16
-  - pc_width: 8 → 16
-  - memory_bytes: 256 → 65536
-  ---
-  TOTAL NEW TENSOR COUNT
-  | Category                      | Count             |
-  |-------------------------------|-------------------|
-  | ripplecarry16bit              | 288               |
-  | 16-bit comparators            | 2                 |
-  | multiplier16x16               | ~6500             |
-  | combinational                 | 45                |
-  | control (jump + conditionals) | 1184              |
-  | error_detection               | ~200              |
-  | modular                       | ~600              |
-  | pattern_recognition           | 45                |
-  | threshold                     | 60                |
-  | manifest                      | 9                 |
-  | TOTAL                         | ~8900 new tensors |
   Depending on whether the existing 8-bit tensors are retained for backwards compatibility or removed, the final 16-bit model will contain ~9000-17000 tensors.

+16-BIT TENSOR MANIFEST
+  ---
+  ARITHMETIC
+  ripplecarry16bit
+  arithmetic.ripplecarry16bit.fa0.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa0.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa0.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa1.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa1.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa1.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa2.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa2.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa2.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa3.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa3.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa3.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa4.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa4.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa4.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa5.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa5.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa5.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa6.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa6.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa6.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa7.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa7.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa7.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa8.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa8.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa8.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa9.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa9.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa9.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa10.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa10.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa10.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa11.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa11.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa11.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa12.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa12.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa12.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa13.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa13.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa13.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa14.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa14.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa14.ha2.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa15.carry_or.bias [1]
+  arithmetic.ripplecarry16bit.fa15.carry_or.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha1.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha1.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha1.sum.layer2.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha2.carry.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha2.carry.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.nand.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.nand.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.or.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer1.or.weight [2]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer2.bias [1]
+  arithmetic.ripplecarry16bit.fa15.ha2.sum.layer2.weight [2]
+  16-bit comparators
+  arithmetic.greaterthan16bit.comparator [16]
+  arithmetic.lessthan16bit.comparator [16]
+  16x16 multiplier (14 stages; stage k covers bits 0 through 16+k, so up to bits 0-29)
+  Stage 0: bits 0-16
+  Stage 1: bits 0-17
+  Stage 2: bits 0-18
+  ...
+  Stage 13: bits 0-29
+  Each bit position has the same full adder structure. Total enumeration:
+  arithmetic.multiplier16x16.stage0.bit0.carry_or.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.carry_or.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.carry.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.carry.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.nand.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.nand.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.or.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer1.or.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer2.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha1.sum.layer2.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.carry.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.carry.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.nand.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.nand.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.or.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer1.or.weight [2]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer2.bias [1]
+  arithmetic.multiplier16x16.stage0.bit0.ha2.sum.layer2.weight [2]
+  Pattern repeats for:
+  - stage0: bit0-bit16 (17 bits)
+  - stage1: bit0-bit17 (18 bits)
+  - stage2: bit0-bit18 (19 bits)
+  - stage3: bit0-bit19 (20 bits)
+  - stage4: bit0-bit20 (21 bits)
+  - stage5: bit0-bit21 (22 bits)
+  - stage6: bit0-bit22 (23 bits)
+  - stage7: bit0-bit23 (24 bits)
+  - stage8: bit0-bit24 (25 bits)
+  - stage9: bit0-bit25 (26 bits)
+  - stage10: bit0-bit26 (27 bits)
+  - stage11: bit0-bit27 (28 bits)
+  - stage12: bit0-bit28 (29 bits)
+  - stage13: bit0-bit29 (30 bits)
+  18 tensors per bit × (17+18+19+20+21+22+23+24+25+26+27+28+29+30) = 18 × 329 = 5922 tensors for multiplier stages.
+  Plus 256 AND gates for partial products (16×16):
+  arithmetic.multiplier16x16.partial.r0c0.bias [1]
+  arithmetic.multiplier16x16.partial.r0c0.weight [2]
+  ...through...
+  arithmetic.multiplier16x16.partial.r15c15.bias [1]
+  arithmetic.multiplier16x16.partial.r15c15.weight [2]
+  256 × 2 = 512 tensors for partial products.
+  ---
+  COMBINATIONAL
+  Barrel shifter 16-bit
+  combinational.barrelshifter16bit.shift [20]
+  Decoder 4-to-16
+  combinational.decoder4to16.out0.bias [1]
+  combinational.decoder4to16.out0.weight [4]
+  combinational.decoder4to16.out1.bias [1]
+  combinational.decoder4to16.out1.weight [4]
+  combinational.decoder4to16.out2.bias [1]
+  combinational.decoder4to16.out2.weight [4]
+  combinational.decoder4to16.out3.bias [1]
+  combinational.decoder4to16.out3.weight [4]
+  combinational.decoder4to16.out4.bias [1]
+  combinational.decoder4to16.out4.weight [4]
+  combinational.decoder4to16.out5.bias [1]
+  combinational.decoder4to16.out5.weight [4]
+  combinational.decoder4to16.out6.bias [1]
+  combinational.decoder4to16.out6.weight [4]
+  combinational.decoder4to16.out7.bias [1]
+  combinational.decoder4to16.out7.weight [4]
+  combinational.decoder4to16.out8.bias [1]
+  combinational.decoder4to16.out8.weight [4]
+  combinational.decoder4to16.out9.bias [1]
+  combinational.decoder4to16.out9.weight [4]
+  combinational.decoder4to16.out10.bias [1]
+  combinational.decoder4to16.out10.weight [4]
+  combinational.decoder4to16.out11.bias [1]
+  combinational.decoder4to16.out11.weight [4]
+  combinational.decoder4to16.out12.bias [1]
+  combinational.decoder4to16.out12.weight [4]
+  combinational.decoder4to16.out13.bias [1]
+  combinational.decoder4to16.out13.weight [4]
+  combinational.decoder4to16.out14.bias [1]
+  combinational.decoder4to16.out14.weight [4]
+  combinational.decoder4to16.out15.bias [1]
+  combinational.decoder4to16.out15.weight [4]
+  Encoder 16-to-4
+  combinational.encoder16to4.bit0.bias [1]
+  combinational.encoder16to4.bit0.weight [16]
+  combinational.encoder16to4.bit1.bias [1]
+  combinational.encoder16to4.bit1.weight [16]
+  combinational.encoder16to4.bit2.bias [1]
+  combinational.encoder16to4.bit2.weight [16]
+  combinational.encoder16to4.bit3.bias [1]
+  combinational.encoder16to4.bit3.weight [16]
+  Multiplexer 16-to-1
+  combinational.multiplexer16to1.select [20]
+  Demultiplexer 1-to-16
+  combinational.demultiplexer1to16.decode [5]
+  Priority encoder 16-bit
+  combinational.priorityencoder16bit.priority [16]
+  ---
+  CONTROL
+  Unconditional jump 16-bit
+  control.jump.bit0.bias [1]
+  control.jump.bit0.weight [1]
+  control.jump.bit1.bias [1]
+  control.jump.bit1.weight [1]
+  control.jump.bit2.bias [1]
+  control.jump.bit2.weight [1]
+  control.jump.bit3.bias [1]
+  control.jump.bit3.weight [1]
+  control.jump.bit4.bias [1]
+  control.jump.bit4.weight [1]
+  control.jump.bit5.bias [1]
+  control.jump.bit5.weight [1]
+  control.jump.bit6.bias [1]
+  control.jump.bit6.weight [1]
+  control.jump.bit7.bias [1]
+  control.jump.bit7.weight [1]
+  control.jump.bit8.bias [1]
+  control.jump.bit8.weight [1]
+  control.jump.bit9.bias [1]
+  control.jump.bit9.weight [1]
+  control.jump.bit10.bias [1]
+  control.jump.bit10.weight [1]
+  control.jump.bit11.bias [1]
+  control.jump.bit11.weight [1]
+  control.jump.bit12.bias [1]
+  control.jump.bit12.weight [1]
+  control.jump.bit13.bias [1]
+  control.jump.bit13.weight [1]
+  control.jump.bit14.bias [1]
+  control.jump.bit14.weight [1]
+  control.jump.bit15.bias [1]
+  control.jump.bit15.weight [1]
+  Conditional jump 16-bit (template for JZ, JNZ, JC, JNC, JN, JP, JV, JNV, and generic conditionaljump)
+  Each conditional jump type follows this pattern for bits 0-15:
+  control.{jtype}.bit{N}.and_a.bias [1]
+  control.{jtype}.bit{N}.and_a.weight [2]
+  control.{jtype}.bit{N}.and_b.bias [1]
+  control.{jtype}.bit{N}.and_b.weight [2]
+  control.{jtype}.bit{N}.not_sel.bias [1]
+  control.{jtype}.bit{N}.not_sel.weight [1]
+  control.{jtype}.bit{N}.or.bias [1]
+  control.{jtype}.bit{N}.or.weight [2]
+  Where {jtype} ∈ {conditionaljump, jz, jnz, jc, jnc, jn, jp, jv, jnv} and N ∈ {0..15}
+  Full expansion for control.jz (others follow same pattern):
+  control.jz.bit0.and_a.bias [1]
+  control.jz.bit0.and_a.weight [2]
+  control.jz.bit0.and_b.bias [1]
+  control.jz.bit0.and_b.weight [2]
+  control.jz.bit0.not_sel.bias [1]
+  control.jz.bit0.not_sel.weight [1]
+  control.jz.bit0.or.bias [1]
+  control.jz.bit0.or.weight [2]
+  control.jz.bit1.and_a.bias [1]
+  control.jz.bit1.and_a.weight [2]
+  control.jz.bit1.and_b.bias [1]
+  control.jz.bit1.and_b.weight [2]
+  control.jz.bit1.not_sel.bias [1]
+  control.jz.bit1.not_sel.weight [1]
+  control.jz.bit1.or.bias [1]
+  control.jz.bit1.or.weight [2]
+  control.jz.bit2.and_a.bias [1]
+  control.jz.bit2.and_a.weight [2]
+  control.jz.bit2.and_b.bias [1]
+  control.jz.bit2.and_b.weight [2]
+  control.jz.bit2.not_sel.bias [1]
+  control.jz.bit2.not_sel.weight [1]
+  control.jz.bit2.or.bias [1]
+  control.jz.bit2.or.weight [2]
+  control.jz.bit3.and_a.bias [1]
+  control.jz.bit3.and_a.weight [2]
+  control.jz.bit3.and_b.bias [1]
+  control.jz.bit3.and_b.weight [2]
+  control.jz.bit3.not_sel.bias [1]
+  control.jz.bit3.not_sel.weight [1]
+  control.jz.bit3.or.bias [1]
+  control.jz.bit3.or.weight [2]
+  control.jz.bit4.and_a.bias [1]
+  control.jz.bit4.and_a.weight [2]
+  control.jz.bit4.and_b.bias [1]
+  control.jz.bit4.and_b.weight [2]
+  control.jz.bit4.not_sel.bias [1]
+  control.jz.bit4.not_sel.weight [1]
+  control.jz.bit4.or.bias [1]
+  control.jz.bit4.or.weight [2]
+  control.jz.bit5.and_a.bias [1]
+  control.jz.bit5.and_a.weight [2]
+  control.jz.bit5.and_b.bias [1]
+  control.jz.bit5.and_b.weight [2]
+  control.jz.bit5.not_sel.bias [1]
+  control.jz.bit5.not_sel.weight [1]
+  control.jz.bit5.or.bias [1]
+  control.jz.bit5.or.weight [2]
+  control.jz.bit6.and_a.bias [1]
+  control.jz.bit6.and_a.weight [2]
+  control.jz.bit6.and_b.bias [1]
+  control.jz.bit6.and_b.weight [2]
+  control.jz.bit6.not_sel.bias [1]
+  control.jz.bit6.not_sel.weight [1]
+  control.jz.bit6.or.bias [1]
+  control.jz.bit6.or.weight [2]
+  control.jz.bit7.and_a.bias [1]
+  control.jz.bit7.and_a.weight [2]
+  control.jz.bit7.and_b.bias [1]
+  control.jz.bit7.and_b.weight [2]
+  control.jz.bit7.not_sel.bias [1]
+  control.jz.bit7.not_sel.weight [1]
+  control.jz.bit7.or.bias [1]
+  control.jz.bit7.or.weight [2]
+  control.jz.bit8.and_a.bias [1]
+  control.jz.bit8.and_a.weight [2]
+  control.jz.bit8.and_b.bias [1]
+  control.jz.bit8.and_b.weight [2]
+  control.jz.bit8.not_sel.bias [1]
+  control.jz.bit8.not_sel.weight [1]
+  control.jz.bit8.or.bias [1]
+  control.jz.bit8.or.weight [2]
+  control.jz.bit9.and_a.bias [1]
+  control.jz.bit9.and_a.weight [2]
+  control.jz.bit9.and_b.bias [1]
+  control.jz.bit9.and_b.weight [2]
+  control.jz.bit9.not_sel.bias [1]
+  control.jz.bit9.not_sel.weight [1]
+  control.jz.bit9.or.bias [1]
+  control.jz.bit9.or.weight [2]
+  control.jz.bit10.and_a.bias [1]
+  control.jz.bit10.and_a.weight [2]
+  control.jz.bit10.and_b.bias [1]
+  control.jz.bit10.and_b.weight [2]
+  control.jz.bit10.not_sel.bias [1]
+  control.jz.bit10.not_sel.weight [1]
+  control.jz.bit10.or.bias [1]
+  control.jz.bit10.or.weight [2]
+  control.jz.bit11.and_a.bias [1]
+  control.jz.bit11.and_a.weight [2]
+  control.jz.bit11.and_b.bias [1]
+  control.jz.bit11.and_b.weight [2]
+  control.jz.bit11.not_sel.bias [1]
+  control.jz.bit11.not_sel.weight [1]
+  control.jz.bit11.or.bias [1]
+  control.jz.bit11.or.weight [2]
+  control.jz.bit12.and_a.bias [1]
+  control.jz.bit12.and_a.weight [2]
+  control.jz.bit12.and_b.bias [1]
+  control.jz.bit12.and_b.weight [2]
+  control.jz.bit12.not_sel.bias [1]
+  control.jz.bit12.not_sel.weight [1]
+  control.jz.bit12.or.bias [1]
+  control.jz.bit12.or.weight [2]
+  control.jz.bit13.and_a.bias [1]
+  control.jz.bit13.and_a.weight [2]
+  control.jz.bit13.and_b.bias [1]
+  control.jz.bit13.and_b.weight [2]
+  control.jz.bit13.not_sel.bias [1]
+  control.jz.bit13.not_sel.weight [1]
+  control.jz.bit13.or.bias [1]
+  control.jz.bit13.or.weight [2]
+  control.jz.bit14.and_a.bias [1]
+  control.jz.bit14.and_a.weight [2]
+  control.jz.bit14.and_b.bias [1]
+  control.jz.bit14.and_b.weight [2]
+  control.jz.bit14.not_sel.bias [1]
+  control.jz.bit14.not_sel.weight [1]
+  control.jz.bit14.or.bias [1]
+  control.jz.bit14.or.weight [2]
+  control.jz.bit15.and_a.bias [1]
+  control.jz.bit15.and_a.weight [2]
+  control.jz.bit15.and_b.bias [1]
+  control.jz.bit15.and_b.weight [2]
+  control.jz.bit15.not_sel.bias [1]
+  control.jz.bit15.not_sel.weight [1]
+  control.jz.bit15.or.bias [1]
+  control.jz.bit15.or.weight [2]
+  Repeat above for: jnz, jc, jnc, jn, jp, jv, jnv, conditionaljump (9 types including jz × 16 bits × 8 tensors = 1152 tensors)
+  Stack operations (unchanged)
+  control.call.jump [1]
+  control.call.push [1]
+  control.pop.load [1]
+  control.pop.sp_inc [1]
+  control.push.sp_dec [1]
+  control.push.store [1]
+  control.ret.jump [1]
+  control.ret.pop [1]
+  control.sp_dec.uses [1]
+  control.sp_inc.uses [1]
+  ---
+  ERROR DETECTION
+  Checksum 16-bit
+  error_detection.checksum16bit.sum.bias [1]
+  error_detection.checksum16bit.sum.weight [16]
+  Parity 16-bit
+  error_detection.evenparitychecker16bit.bias [1]
+  error_detection.evenparitychecker16bit.weight [16]
+  error_detection.oddparitychecker16bit.not.bias [1]
+  error_detection.oddparitychecker16bit.not.weight [1]
+  error_detection.oddparitychecker16bit.parity.bias [1]
+  error_detection.oddparitychecker16bit.parity.weight [16]
+  Parity checker/generator 16-bit (4 XOR stages instead of 3)
+  error_detection.paritychecker16bit.output.not.bias [1]
+  error_detection.paritychecker16bit.output.not.weight [1]
+  error_detection.paritychecker16bit.stage1.xor0.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor0.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor0.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor0.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor0.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor0.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor1.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor1.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor1.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor1.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor1.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor1.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor2.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor2.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor2.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor2.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor2.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor2.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor3.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor3.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor3.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor3.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor3.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor3.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor4.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor4.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor4.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor4.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor4.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor4.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor5.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor5.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor5.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor5.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor5.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor5.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor6.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor6.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor6.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor6.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor6.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor6.layer2.weight [2]
+  error_detection.paritychecker16bit.stage1.xor7.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage1.xor7.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage1.xor7.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage1.xor7.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage1.xor7.layer2.bias [1]
+  error_detection.paritychecker16bit.stage1.xor7.layer2.weight [2]
+  error_detection.paritychecker16bit.stage2.xor0.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage2.xor0.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage2.xor0.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage2.xor0.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage2.xor0.layer2.bias [1]
+  error_detection.paritychecker16bit.stage2.xor0.layer2.weight [2]
+  error_detection.paritychecker16bit.stage2.xor1.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage2.xor1.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage2.xor1.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage2.xor1.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage2.xor1.layer2.bias [1]
+  error_detection.paritychecker16bit.stage2.xor1.layer2.weight [2]
+  error_detection.paritychecker16bit.stage2.xor2.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage2.xor2.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage2.xor2.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage2.xor2.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage2.xor2.layer2.bias [1]
+  error_detection.paritychecker16bit.stage2.xor2.layer2.weight [2]
+  error_detection.paritychecker16bit.stage2.xor3.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage2.xor3.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage2.xor3.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage2.xor3.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage2.xor3.layer2.bias [1]
+  error_detection.paritychecker16bit.stage2.xor3.layer2.weight [2]
+  error_detection.paritychecker16bit.stage3.xor0.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage3.xor0.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage3.xor0.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage3.xor0.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage3.xor0.layer2.bias [1]
+  error_detection.paritychecker16bit.stage3.xor0.layer2.weight [2]
+  error_detection.paritychecker16bit.stage3.xor1.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage3.xor1.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage3.xor1.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage3.xor1.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage3.xor1.layer2.bias [1]
+  error_detection.paritychecker16bit.stage3.xor1.layer2.weight [2]
+  error_detection.paritychecker16bit.stage4.xor0.layer1.nand.bias [1]
+  error_detection.paritychecker16bit.stage4.xor0.layer1.nand.weight [2]
+  error_detection.paritychecker16bit.stage4.xor0.layer1.or.bias [1]
+  error_detection.paritychecker16bit.stage4.xor0.layer1.or.weight [2]
+  error_detection.paritychecker16bit.stage4.xor0.layer2.bias [1]
+  error_detection.paritychecker16bit.stage4.xor0.layer2.weight [2]
+  Identical structure for paritygenerator16bit.
+  CRC-16
+  error_detection.crc16.divisor [17]
+  Hamming (15,11) - 11 data bits, 4 parity bits
+  error_detection.hammingencode11bit.p0.weight [11]
+  error_detection.hammingencode11bit.p1.bias [1]
+  error_detection.hammingencode11bit.p1.weight [11]
+  error_detection.hammingencode11bit.p2.bias [1]
+  error_detection.hammingencode11bit.p2.weight [11]
+  error_detection.hammingencode11bit.p3.bias [1]
+  error_detection.hammingencode11bit.p3.weight [11]
+  error_detection.hammingencode11bit.p4.bias [1]
+  error_detection.hammingencode11bit.p4.weight [11]
+  error_detection.hammingdecode15bit.s1.bias [1]
+  error_detection.hammingdecode15bit.s1.weight [8]
+  error_detection.hammingdecode15bit.s2.bias [1]
+  error_detection.hammingdecode15bit.s2.weight [8]
+  error_detection.hammingdecode15bit.s3.bias [1]
+  error_detection.hammingdecode15bit.s3.weight [8]
+  error_detection.hammingdecode15bit.s4.bias [1]
+  error_detection.hammingdecode15bit.s4.weight [8]
+  error_detection.hammingsyndrome15bit.s1.weight [8]
+  error_detection.hammingsyndrome15bit.s2.weight [8]
+  error_detection.hammingsyndrome15bit.s3.weight [8]
+  error_detection.hammingsyndrome15bit.s4.weight [8]
+  Longitudinal parity 16-bit
+  error_detection.longitudinalparity16bit.col_parity [16]
+  error_detection.longitudinalparity16bit.row_parity [16]
+  ---
+  MODULAR
+  For 16-bit inputs, modular arithmetic requires detecting which of ceil(65536/N) ranges the input falls into. Structure per modulus:
+  mod2 (simple - just check LSB)
+  modular.mod2_16bit.bias [1]
+  modular.mod2_16bit.weight [16]
+  mod4 (check 2 LSBs)
+  modular.mod4_16bit.bias [1]
+  modular.mod4_16bit.weight [16]
+  mod8 (check 3 LSBs)
+  modular.mod8_16bit.bias [1]
+  modular.mod8_16bit.weight [16]
+  mod16 (check 4 LSBs)
+  modular.mod16_16bit.bias [1]
+  modular.mod16_16bit.weight [16]
+  For non-power-of-2 moduli (3, 5, 6, 7, 9, 10, 11, 12), use an iterative subtraction circuit that references the 16-bit subtractor and comparator, or expand the range-check approach:
+  mod3, mod5, mod6, mod7, mod9, mod10, mod11, mod12 (range-check approach, pattern):
+  modular.mod{N}_16bit.layer1.geq{K}.bias [1]
+  modular.mod{N}_16bit.layer1.geq{K}.weight [16]
+  modular.mod{N}_16bit.layer1.leq{K}.bias [1]
+  modular.mod{N}_16bit.layer1.leq{K}.weight [16]
+  modular.mod{N}_16bit.layer2.eq{K}.bias [1]
+  modular.mod{N}_16bit.layer2.eq{K}.weight [2]
+  modular.mod{N}_16bit.layer3.or.bias [1]
+  modular.mod{N}_16bit.layer3.or.weight [R]
+  Where R = number of ranges = ceil(65536/N).
+  ---
+  PATTERN RECOGNITION
+  pattern_recognition.popcount16bit.bias [1]
+  pattern_recognition.popcount16bit.weight [16]
+  pattern_recognition.allones16bit.bias [1]
+  pattern_recognition.allones16bit.weight [16]
+  pattern_recognition.allzeros16bit.bias [1]
+  pattern_recognition.allzeros16bit.weight [16]
+  pattern_recognition.alternating16bit.pattern1.weight [16]
+  pattern_recognition.alternating16bit.pattern2.weight [16]
+  pattern_recognition.hammingdistance16bit.popcount.weight [16]
+  pattern_recognition.hammingdistance16bit.xor.weight [32]
+  pattern_recognition.leadingones16bit.weight [16]
+  pattern_recognition.trailingones16bit.weight [16]
+  pattern_recognition.runlength16bit.weight [16]
+  pattern_recognition.onehotdetector16bit.and.bias [1]
+  pattern_recognition.onehotdetector16bit.and.weight [2]
+  pattern_recognition.onehotdetector16bit.atleast1.bias [1]
+  pattern_recognition.onehotdetector16bit.atleast1.weight [16]
+  pattern_recognition.onehotdetector16bit.atmost1.bias [1]
+  pattern_recognition.onehotdetector16bit.atmost1.weight [16]
+  pattern_recognition.symmetry16bit.and.bias [1]
+  pattern_recognition.symmetry16bit.and.weight [8]
+  pattern_recognition.symmetry16bit.xnor0.weight [2]
+  pattern_recognition.symmetry16bit.xnor1.weight [2]
+  pattern_recognition.symmetry16bit.xnor2.weight [2]
+  pattern_recognition.symmetry16bit.xnor3.weight [2]
+  pattern_recognition.symmetry16bit.xnor4.weight [2]
+  pattern_recognition.symmetry16bit.xnor5.weight [2]
+  pattern_recognition.symmetry16bit.xnor6.weight [2]
+  pattern_recognition.symmetry16bit.xnor7.weight [2]
+  ---
+  THRESHOLD
+  threshold.alloutof16.bias [1]
+  threshold.alloutof16.weight [16]
+  threshold.oneoutof16.bias [1]
+  threshold.oneoutof16.weight [16]
+  threshold.twooutof16.bias [1]
+  threshold.twooutof16.weight [16]
+  threshold.threeoutof16.bias [1]
+  threshold.threeoutof16.weight [16]
+  threshold.fouroutof16.bias [1]
+  threshold.fouroutof16.weight [16]
+  threshold.fiveoutof16.bias [1]
+  threshold.fiveoutof16.weight [16]
+  threshold.sixoutof16.bias [1]
+  threshold.sixoutof16.weight [16]
+  threshold.sevenoutof16.bias [1]
+  threshold.sevenoutof16.weight [16]
+  threshold.eightoutof16.bias [1]
+  threshold.eightoutof16.weight [16]
+  threshold.nineoutof16.bias [1]
+  threshold.nineoutof16.weight [16]
+  threshold.tenoutof16.bias [1]
+  threshold.tenoutof16.weight [16]
+  threshold.elevenoutof16.bias [1]
+  threshold.elevenoutof16.weight [16]
+  threshold.twelveoutof16.bias [1]
+  threshold.twelveoutof16.weight [16]
+  threshold.thirteenoutof16.bias [1]
+  threshold.thirteenoutof16.weight [16]
+  threshold.fourteenoutof16.bias [1]
+  threshold.fourteenoutof16.weight [16]
+  threshold.fifteenoutof16.bias [1]
+  threshold.fifteenoutof16.weight [16]
+  threshold.sixteenoutof16.bias [1]
+  threshold.sixteenoutof16.weight [16]
+  threshold.majority16.bias [1]
+  threshold.majority16.weight [16]
+  threshold.minority16.bias [1]
+  threshold.minority16.weight [16]
+  threshold.atleastk_8_16bit.bias [1]
+  threshold.atleastk_8_16bit.weight [16]
+  threshold.atmostk_8_16bit.bias [1]
+  threshold.atmostk_8_16bit.weight [16]
+  threshold.exactlyk_8_16bit.and.bias [1]
+  threshold.exactlyk_8_16bit.and.weight [2]
+  threshold.exactlyk_8_16bit.atleast.bias [1]
+  threshold.exactlyk_8_16bit.atleast.weight [16]
+  threshold.exactlyk_8_16bit.atmost.bias [1]
+  threshold.exactlyk_8_16bit.atmost.weight [16]
+  ---
+  MANIFEST
+  manifest.alu_operations [1]
+  manifest.flags [1]
+  manifest.instruction_width [1]
+  manifest.memory_bytes [1]
+  manifest.pc_width [1]
+  manifest.register_width [1]
+  manifest.registers [1]
+  manifest.turing_complete [1]
+  manifest.version [1]
+  Values change:
+  - register_width: 8 → 16
+  - pc_width: 8 → 16
+  - memory_bytes: 256 → 65536
+  ---
+  TOTAL NEW TENSOR COUNT
+  | Category                      | Count             |
+  |-------------------------------|-------------------|
+  | ripplecarry16bit              | 288               |
+  | 16-bit comparators            | 2                 |
+  | multiplier16x16               | ~6500             |
+  | combinational                 | 45                |
+  | control (jump + conditionals) | 1184              |
+  | error_detection               | ~200              |
+  | modular                       | ~600              |
+  | pattern_recognition           | 45                |
+  | threshold                     | 60                |
+  | manifest                      | 9                 |
+  | TOTAL                         | ~8900 new tensors |
   Combined with the existing 8-bit tensors (which may be retained for backwards compatibility or removed), the final 16-bit model has ~9000-17000 tensors, depending on whether the 8-bit components are kept.

llm/circuit_llm.py CHANGED Viewed

@@ -1,606 +1,606 @@
-"""
-Circuit-Augmented LLM: Embedding threshold logic circuits into SmolLM2
-======================================================================
-Replaces/augments MLP layers with frozen threshold circuits for exact arithmetic.
-"""
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Dict, Optional, Tuple
-from safetensors.torch import load_file
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import warnings
-warnings.filterwarnings('ignore')
-# =============================================================================
-# HEAVISIDE WITH STRAIGHT-THROUGH ESTIMATOR
-# =============================================================================
-class HeavisideSTE(torch.autograd.Function):
-    """Heaviside step function with straight-through estimator for backprop."""
-    @staticmethod
-    def forward(ctx, x):
-        return (x >= 0).float()
-    @staticmethod
-    def backward(ctx, grad_output):
-        # STE: pass gradient through unchanged
-        return grad_output
-def heaviside(x: torch.Tensor) -> torch.Tensor:
-    """Heaviside step: 1 if x >= 0, else 0. Uses STE for training."""
-    return HeavisideSTE.apply(x)
-# =============================================================================
-# CIRCUIT EXECUTOR - Runs the frozen threshold circuits
-# =============================================================================
-class CircuitExecutor(nn.Module):
-    """
-    Executes threshold logic circuits from the safetensors file.
-    All circuit weights are frozen - only interface layers train.
-    """
-    def __init__(self, circuit_path: str, device: str = 'cpu'):
-        super().__init__()
-        self.device = device
-        # Load all circuit tensors
-        raw_circuits = load_file(circuit_path)
-        # Store as frozen parameters (use underscores for valid param names)
-        self.circuits = {}
-        for k, v in raw_circuits.items():
-            safe_name = k.replace('.', '__')
-            self.register_buffer(safe_name, v.float().to(device))
-            self.circuits[k] = safe_name
-    def _get(self, name: str) -> torch.Tensor:
-        """Get circuit tensor by original dotted name."""
-        return getattr(self, self.circuits[name])
-    # -------------------------------------------------------------------------
-    # Boolean Gates
-    # -------------------------------------------------------------------------
-    def eval_and(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """AND gate: output 1 iff both inputs are 1."""
-        inp = torch.stack([a, b], dim=-1)
-        w = self._get('boolean.and.weight')
-        bias = self._get('boolean.and.bias')
-        return heaviside(inp @ w + bias)
-    def eval_or(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """OR gate: output 1 if either input is 1."""
-        inp = torch.stack([a, b], dim=-1)
-        w = self._get('boolean.or.weight')
-        bias = self._get('boolean.or.bias')
-        return heaviside(inp @ w + bias)
-    def eval_xor(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """XOR gate: two-layer network (not linearly separable)."""
-        inp = torch.stack([a, b], dim=-1)
-        # Layer 1: OR and NAND neurons
-        w1_n1 = self._get('boolean.xor.layer1.neuron1.weight')
-        b1_n1 = self._get('boolean.xor.layer1.neuron1.bias')
-        w1_n2 = self._get('boolean.xor.layer1.neuron2.weight')
-        b1_n2 = self._get('boolean.xor.layer1.neuron2.bias')
-        h1 = heaviside(inp @ w1_n1 + b1_n1)
-        h2 = heaviside(inp @ w1_n2 + b1_n2)
-        hidden = torch.stack([h1, h2], dim=-1)
-        # Layer 2: AND of hidden
-        w2 = self._get('boolean.xor.layer2.weight')
-        b2 = self._get('boolean.xor.layer2.bias')
-        return heaviside(hidden @ w2 + b2)
-    # -------------------------------------------------------------------------
-    # Arithmetic: Full Adder
-    # -------------------------------------------------------------------------
-    def eval_full_adder(self, a: torch.Tensor, b: torch.Tensor,
-                        cin: torch.Tensor, prefix: str) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Full adder: sum = a XOR b XOR cin, cout = (a AND b) OR (cin AND (a XOR b))
-        Returns (sum_bit, carry_out)
-        """
-        inp_ab = torch.stack([a, b], dim=-1)
-        # HA1: a XOR b
-        w1_or = self._get(f'{prefix}.ha1.sum.layer1.or.weight')
-        b1_or = self._get(f'{prefix}.ha1.sum.layer1.or.bias')
-        w1_nand = self._get(f'{prefix}.ha1.sum.layer1.nand.weight')
-        b1_nand = self._get(f'{prefix}.ha1.sum.layer1.nand.bias')
-        w2 = self._get(f'{prefix}.ha1.sum.layer2.weight')
-        b2 = self._get(f'{prefix}.ha1.sum.layer2.bias')
-        h_or = heaviside(inp_ab @ w1_or + b1_or)
-        h_nand = heaviside(inp_ab @ w1_nand + b1_nand)
-        hidden = torch.stack([h_or, h_nand], dim=-1)
-        ha1_sum = heaviside(hidden @ w2 + b2)
-        # HA1 carry
-        w_c1 = self._get(f'{prefix}.ha1.carry.weight')
-        b_c1 = self._get(f'{prefix}.ha1.carry.bias')
-        ha1_carry = heaviside(inp_ab @ w_c1 + b_c1)
-        # HA2: ha1_sum XOR cin
-        inp_ha2 = torch.stack([ha1_sum, cin], dim=-1)
-        w1_or = self._get(f'{prefix}.ha2.sum.layer1.or.weight')
-        b1_or = self._get(f'{prefix}.ha2.sum.layer1.or.bias')
-        w1_nand = self._get(f'{prefix}.ha2.sum.layer1.nand.weight')
-        b1_nand = self._get(f'{prefix}.ha2.sum.layer1.nand.bias')
-        w2 = self._get(f'{prefix}.ha2.sum.layer2.weight')
-        b2 = self._get(f'{prefix}.ha2.sum.layer2.bias')
-        h_or = heaviside(inp_ha2 @ w1_or + b1_or)
-        h_nand = heaviside(inp_ha2 @ w1_nand + b1_nand)
-        hidden = torch.stack([h_or, h_nand], dim=-1)
-        ha2_sum = heaviside(hidden @ w2 + b2)
-        # HA2 carry
-        w_c2 = self._get(f'{prefix}.ha2.carry.weight')
-        b_c2 = self._get(f'{prefix}.ha2.carry.bias')
-        ha2_carry = heaviside(inp_ha2 @ w_c2 + b_c2)
-        # Carry out = ha1_carry OR ha2_carry
-        inp_cout = torch.stack([ha1_carry, ha2_carry], dim=-1)
-        w_or = self._get(f'{prefix}.carry_or.weight')
-        b_or = self._get(f'{prefix}.carry_or.bias')
-        cout = heaviside(inp_cout @ w_or + b_or)
-        return ha2_sum, cout
-    # -------------------------------------------------------------------------
-    # Arithmetic: 8-bit Ripple Carry Adder
-    # -------------------------------------------------------------------------
-    def add_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        8-bit ripple carry addition.
-        a_bits, b_bits: [..., 8] tensors of bits (LSB first)
-        Returns: (result_bits [..., 8], carry_out [...])
-        """
-        batch_shape = a_bits.shape[:-1]
-        carry = torch.zeros(batch_shape, device=a_bits.device)
-        result_bits = []
-        for i in range(8):
-            a_i = a_bits[..., i]
-            b_i = b_bits[..., i]
-            sum_bit, carry = self.eval_full_adder(
-                a_i, b_i, carry,
-                f'arithmetic.ripplecarry8bit.fa{i}'
-            )
-            result_bits.append(sum_bit)
-        return torch.stack(result_bits, dim=-1), carry
-    # -------------------------------------------------------------------------
-    # Arithmetic: 8-bit Comparators
-    # -------------------------------------------------------------------------
-    def greater_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a > b, else 0. Bits are MSB first."""
-        diff = a_bits - b_bits  # [..., 8]
-        w = self._get('arithmetic.greaterthan8bit.comparator')
-        score = (diff * w).sum(dim=-1)
-        return (score > 0).float()
-    def less_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a < b, else 0. Bits are MSB first."""
-        diff = b_bits - a_bits  # [..., 8]
-        w = self._get('arithmetic.lessthan8bit.comparator')
-        score = (diff * w).sum(dim=-1)
-        return (score > 0).float()
-    def equal_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a == b, else 0."""
-        gt = self.greater_than_8bit(a_bits, b_bits)
-        lt = self.less_than_8bit(a_bits, b_bits)
-        return (1 - gt) * (1 - lt)
-# =============================================================================
-# BIT EXTRACTION / INJECTION INTERFACES
-# =============================================================================
-class BitExtractor(nn.Module):
-    """
-    Learns to extract 8-bit operands from token embeddings.
-    Maps embedding -> 16 bits (two 8-bit operands).
-    """
-    def __init__(self, d_model: int):
-        super().__init__()
-        self.d_model = d_model
-        # Project to logits, then binarize
-        self.proj = nn.Linear(d_model, 16)
-        # Learnable temperature for sigmoid approximation during training
-        self.temperature = nn.Parameter(torch.tensor(1.0))
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        x: [..., d_model]
-        Returns: a_bits [..., 8], b_bits [..., 8] (LSB first for arithmetic)
-        """
-        logits = self.proj(x)  # [..., 16]
-        # Binarize with STE
-        bits = heaviside(logits)
-        # Split into two operands
-        a_bits = bits[..., :8]
-        b_bits = bits[..., 8:]
-        return a_bits, b_bits
-class BitInjector(nn.Module):
-    """
-    Learns to inject circuit results back into embedding space.
-    Maps 16 bits (result + flags) -> embedding delta.
-    """
-    def __init__(self, d_model: int):
-        super().__init__()
-        self.d_model = d_model
-        # Project bits to embedding
-        self.proj = nn.Linear(16, d_model)
-        # Learnable scale
-        self.scale = nn.Parameter(torch.tensor(0.1))
-    def forward(self, result_bits: torch.Tensor, flags: torch.Tensor) -> torch.Tensor:
-        """
-        result_bits: [..., 8]
-        flags: [..., 8] (carry, overflow, zero, negative, etc.)
-        Returns: [..., d_model]
-        """
-        combined = torch.cat([result_bits, flags], dim=-1)  # [..., 16]
-        return self.proj(combined) * self.scale
-# =============================================================================
-# CIRCUIT-AUGMENTED MLP BLOCK
-# =============================================================================
-class CircuitAugmentedMLP(nn.Module):
-    """
-    MLP block augmented with frozen threshold circuits.
-    The original MLP path runs in parallel with the circuit path.
-    A learned router decides how much to use each.
-    """
-    def __init__(
-        self,
-        d_model: int,
-        intermediate_size: int,
-        circuit_path: str,
-        device: str = 'cpu'
-    ):
-        super().__init__()
-        self.d_model = d_model
-        # Original MLP components (will be loaded from pretrained)
-        self.gate_proj = nn.Linear(d_model, intermediate_size, bias=False)
-        self.up_proj = nn.Linear(d_model, intermediate_size, bias=False)
-        self.down_proj = nn.Linear(intermediate_size, d_model, bias=False)
-        self.act_fn = nn.SiLU()
-        # Circuit components
-        self.circuits = CircuitExecutor(circuit_path, device)
-        self.bit_extractor = BitExtractor(d_model)
-        self.bit_injector = BitInjector(d_model)
-        # Router: decides circuit vs MLP contribution
-        self.router = nn.Sequential(
-            nn.Linear(d_model, 64),
-            nn.ReLU(),
-            nn.Linear(64, 2),
-            nn.Softmax(dim=-1)
-        )
-        # Operation selector (which arithmetic op to perform)
-        self.op_selector = nn.Sequential(
-            nn.Linear(d_model, 32),
-            nn.ReLU(),
-            nn.Linear(32, 4),  # add, sub, compare, passthrough
-            nn.Softmax(dim=-1)
-        )
-    def _compute_flags(self, result_bits: torch.Tensor, carry: torch.Tensor) -> torch.Tensor:
-        """Compute status flags from result."""
-        batch_shape = result_bits.shape[:-1]
-        # Zero flag: all bits are 0
-        zero = (result_bits.sum(dim=-1) == 0).float()
-        # Negative flag: MSB is 1 (two's complement)
-        negative = result_bits[..., 7]
-        # Carry flag
-        carry_flag = carry
-        # Pad to 8 flags
-        flags = torch.zeros(*batch_shape, 8, device=result_bits.device)
-        flags[..., 0] = zero
-        flags[..., 1] = negative
-        flags[..., 2] = carry_flag
-        return flags
-    def _circuit_forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Run input through threshold circuits."""
-        # Extract operands
-        a_bits, b_bits = self.bit_extractor(x)
-        # Get operation weights
-        op_weights = self.op_selector(x)  # [..., 4]
-        # Compute addition
-        add_result, add_carry = self.circuits.add_8bit(a_bits, b_bits)
-        add_flags = self._compute_flags(add_result, add_carry)
-        # Compute subtraction (a + (~b) + 1, simplified: just use add for now)
-        # For MVP, we'll focus on addition
-        # Inject result back
-        circuit_delta = self.bit_injector(add_result, add_flags)
-        return circuit_delta
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        x: [batch, seq_len, d_model]
-        Returns: [batch, seq_len, d_model]
-        """
-        # Original MLP path
-        mlp_out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        # Circuit path
-        circuit_out = self._circuit_forward(x)
-        # Route between paths
-        route_weights = self.router(x)  # [..., 2]
-        mlp_weight = route_weights[..., 0:1]
-        circuit_weight = route_weights[..., 1:2]
-        # Combine: MLP output + weighted circuit contribution
-        output = mlp_out + circuit_weight * circuit_out
-        return output
-# =============================================================================
-# MODEL SURGERY: Insert circuits into SmolLM2
-# =============================================================================
-def augment_smollm2_with_circuits(
-    model: AutoModelForCausalLM,
-    circuit_path: str,
-    layer_indices: list = None,
-    device: str = 'cpu'
-) -> AutoModelForCausalLM:
-    """
-    Surgically insert circuit blocks into SmolLM2's MLP layers.
-    Args:
-        model: Pretrained SmolLM2 model
-        circuit_path: Path to neural_computer.safetensors
-        layer_indices: Which layers to augment (default: middle layers)
-        device: Device for circuit tensors
-    Returns:
-        Modified model with circuit-augmented MLPs
-    """
-    config = model.config
-    num_layers = config.num_hidden_layers
-    # Default: augment middle third of layers
-    if layer_indices is None:
-        start = num_layers // 3
-        end = 2 * num_layers // 3
-        layer_indices = list(range(start, end))
-    print(f"Augmenting layers {layer_indices} with threshold circuits...")
-    for idx in layer_indices:
-        layer = model.model.layers[idx]
-        old_mlp = layer.mlp
-        # Create augmented MLP
-        new_mlp = CircuitAugmentedMLP(
-            d_model=config.hidden_size,
-            intermediate_size=config.intermediate_size,
-            circuit_path=circuit_path,
-            device=device
-        )
-        # Copy pretrained weights
-        new_mlp.gate_proj.weight.data = old_mlp.gate_proj.weight.data.clone()
-        new_mlp.up_proj.weight.data = old_mlp.up_proj.weight.data.clone()
-        new_mlp.down_proj.weight.data = old_mlp.down_proj.weight.data.clone()
-        # Replace
-        layer.mlp = new_mlp
-    # Freeze circuit weights, keep interfaces trainable
-    for name, param in model.named_parameters():
-        if 'circuits' in name:
-            param.requires_grad = False
-    print(f"Done. Circuit weights frozen, interfaces trainable.")
-    return model
-# =============================================================================
-# TRAINING UTILITIES
-# =============================================================================
-def generate_arithmetic_batch(batch_size: int, max_val: int = 255) -> Tuple[list, list]:
-    """Generate batch of arithmetic problems and solutions."""
-    prompts = []
-    targets = []
-    for _ in range(batch_size):
-        a = torch.randint(0, max_val + 1, (1,)).item()
-        b = torch.randint(0, max_val + 1, (1,)).item()
-        result = (a + b) % 256
-        prompts.append(f"{a} + {b} =")
-        targets.append(f" {result}")
-    return prompts, targets
-def evaluate_arithmetic(
-    model: AutoModelForCausalLM,
-    tokenizer: AutoTokenizer,
-    n_problems: int = 100,
-    device: str = 'cpu'
-) -> dict:
-    """Evaluate model on random arithmetic problems."""
-    correct = 0
-    total = 0
-    errors = []
-    model.eval()
-    for _ in range(n_problems):
-        a = torch.randint(0, 256, (1,)).item()
-        b = torch.randint(0, 256, (1,)).item()
-        expected = (a + b) % 256
-        prompt = f"{a} + {b} ="
-        inputs = tokenizer(prompt, return_tensors='pt').to(device)
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=10,
-                do_sample=False,
-                pad_token_id=tokenizer.eos_token_id
-            )
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # Extract number from response
-        try:
-            # Find the part after "="
-            answer_part = response.split('=')[-1].strip()
-            # Extract first number
-            predicted = int(''.join(c for c in answer_part.split()[0] if c.isdigit()))
-            if predicted == expected:
-                correct += 1
-            else:
-                errors.append((a, b, expected, predicted))
-        except:
-            errors.append((a, b, expected, "parse_error"))
-        total += 1
-    return {
-        'accuracy': correct / total,
-        'correct': correct,
-        'total': total,
-        'errors': errors[:10]  # First 10 errors
-    }
-# =============================================================================
-# MAIN: Demo
-# =============================================================================
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='Circuit-Augmented LLM Demo')
-    parser.add_argument('--circuit-path', type=str,
-                        default='./neural_computer.safetensors',
-                        help='Path to circuit weights')
-    parser.add_argument('--device', type=str, default='cpu',
-                        help='Device (cpu or cuda)')
-    parser.add_argument('--eval-only', action='store_true',
-                        help='Only evaluate, do not augment')
-    args = parser.parse_args()
-    print("=" * 70)
-    print(" CIRCUIT-AUGMENTED LLM")
-    print("=" * 70)
-    # Load tokenizer and model
-    print("\n[1] Loading SmolLM2-360M...")
-    model_id = "HuggingFaceTB/SmolLM2-360M"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
-    print(f"    Parameters: {sum(p.numel() for p in model.parameters()):,}")
-    # Baseline evaluation
-    print("\n[2] Baseline arithmetic evaluation...")
-    baseline = evaluate_arithmetic(model, tokenizer, n_problems=50, device=args.device)
-    print(f"    Accuracy: {baseline['accuracy']*100:.1f}% ({baseline['correct']}/{baseline['total']})")
-    if baseline['errors']:
-        print(f"    Sample errors:")
-        for a, b, exp, got in baseline['errors'][:5]:
-            print(f"      {a} + {b} = {exp}, model said {got}")
-    if args.eval_only:
-        print("\nDone (eval only mode).")
-        exit(0)
-    # Augment with circuits
-    print(f"\n[3] Augmenting with threshold circuits...")
-    print(f"    Circuit path: {args.circuit_path}")
-    model = augment_smollm2_with_circuits(
-        model,
-        args.circuit_path,
-        device=args.device
-    )
-    new_params = sum(p.numel() for p in model.parameters())
-    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print(f"    Total parameters: {new_params:,}")
-    print(f"    Trainable parameters: {trainable:,}")
-    # Test circuit execution directly
-    print("\n[4] Testing circuit execution...")
-    circuit_exec = CircuitExecutor(args.circuit_path, args.device)
-    test_cases = [(127, 128), (255, 1), (0, 0), (100, 55)]
-    for a, b in test_cases:
-        # Convert to bits (LSB first)
-        a_bits = torch.tensor([(a >> i) & 1 for i in range(8)], dtype=torch.float32)
-        b_bits = torch.tensor([(b >> i) & 1 for i in range(8)], dtype=torch.float32)
-        result_bits, carry = circuit_exec.add_8bit(
-            a_bits.unsqueeze(0),
-            b_bits.unsqueeze(0)
-        )
-        # Convert result bits back to int
-        result = sum(int(result_bits[0, i].item()) * (2**i) for i in range(8))
-        expected = (a + b) % 256
-        status = "OK" if result == expected else "FAIL"
-        print(f"    {a} + {b} = {result} (expected {expected}) [{status}]")
-    print("\n[5] Model ready for fine-tuning.")
-    print("    Next: Train interface layers on arithmetic examples.")
-    print("=" * 70)

+"""
+Circuit-Augmented LLM: Embedding threshold logic circuits into SmolLM2
+======================================================================
+Replaces/augments MLP layers with frozen threshold circuits for exact arithmetic.
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Dict, Optional, Tuple
+from safetensors.torch import load_file
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import warnings
+warnings.filterwarnings('ignore')
+# =============================================================================
+# HEAVISIDE WITH STRAIGHT-THROUGH ESTIMATOR
+# =============================================================================
+class HeavisideSTE(torch.autograd.Function):
+    """
+    Hard step activation paired with an identity backward pass.
+    Forward maps every non-negative entry to 1.0 and every negative entry to
+    0.0; backward returns the incoming gradient as-is (straight-through
+    estimator) so layers feeding the step can still be trained.
+    """
+    @staticmethod
+    def forward(ctx, x):
+        # Threshold at zero (zero itself counts as "on"), then cast the
+        # boolean mask to float32.
+        non_negative = torch.ge(x, 0)
+        return non_negative.to(torch.float32)
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Straight-through: hand the upstream gradient back untouched.
+        upstream = grad_output
+        return upstream
+def heaviside(x: torch.Tensor) -> torch.Tensor:
+    """Unit step (x >= 0 -> 1, otherwise 0), routed through HeavisideSTE so gradients pass during training."""
+    step_with_ste = HeavisideSTE.apply
+    return step_with_ste(x)
+# =============================================================================
+# CIRCUIT EXECUTOR - Runs the frozen threshold circuits
+# =============================================================================
+class CircuitExecutor(nn.Module):
+    """
+    Executes threshold logic circuits from the safetensors file.
+    Every tensor in the file is registered as a buffer (not a parameter), so
+    the circuits stay frozen while the interface layers around them train.
+    Each gate is a threshold neuron: heaviside(inputs @ weight + bias).
+    """
+    def __init__(self, circuit_path: str, device: str = 'cpu'):
+        """
+        circuit_path: safetensors file containing the circuit tensors
+        device: device the circuit buffers are moved to
+        """
+        super().__init__()
+        self.device = device
+        # Load all circuit tensors
+        raw_circuits = load_file(circuit_path)
+        # Buffer names cannot contain '.', so each tensor is registered under a
+        # '__'-separated alias; self.circuits maps dotted name -> alias.
+        self.circuits = {}
+        for k, v in raw_circuits.items():
+            safe_name = k.replace('.', '__')
+            self.register_buffer(safe_name, v.float().to(device))
+            self.circuits[k] = safe_name
+    def _get(self, name: str) -> torch.Tensor:
+        """
+        Get circuit tensor by original dotted name.
+        Raises KeyError (naming the tensor) if the circuit file did not contain it.
+        """
+        try:
+            alias = self.circuits[name]
+        except KeyError:
+            raise KeyError(f"circuit tensor '{name}' not found in loaded circuit file") from None
+        return getattr(self, alias)
+    def _gate(self, inputs: torch.Tensor, prefix: str) -> torch.Tensor:
+        """Evaluate one threshold neuron stored as '{prefix}.weight' / '{prefix}.bias'."""
+        return heaviside(inputs @ self._get(f'{prefix}.weight') + self._get(f'{prefix}.bias'))
+    def _half_adder(self, x: torch.Tensor, y: torch.Tensor,
+                    prefix: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Evaluate the half adder stored under `prefix`.
+        The sum is the two-layer XOR (OR and NAND neurons feeding layer2);
+        the carry is a single neuron over the same two inputs.
+        Returns (sum_bit, carry_bit)
+        """
+        pair = torch.stack([x, y], dim=-1)
+        h_or = self._gate(pair, f'{prefix}.sum.layer1.or')
+        h_nand = self._gate(pair, f'{prefix}.sum.layer1.nand')
+        sum_bit = self._gate(torch.stack([h_or, h_nand], dim=-1), f'{prefix}.sum.layer2')
+        carry_bit = self._gate(pair, f'{prefix}.carry')
+        return sum_bit, carry_bit
+    # -------------------------------------------------------------------------
+    # Boolean Gates
+    # -------------------------------------------------------------------------
+    def eval_and(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        """AND gate: output 1 iff both inputs are 1."""
+        return self._gate(torch.stack([a, b], dim=-1), 'boolean.and')
+    def eval_or(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        """OR gate: output 1 if either input is 1."""
+        return self._gate(torch.stack([a, b], dim=-1), 'boolean.or')
+    def eval_xor(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+        """XOR gate: two-layer network (not linearly separable)."""
+        pair = torch.stack([a, b], dim=-1)
+        # Layer 1: OR and NAND neurons
+        h1 = self._gate(pair, 'boolean.xor.layer1.neuron1')
+        h2 = self._gate(pair, 'boolean.xor.layer1.neuron2')
+        # Layer 2: AND of hidden
+        return self._gate(torch.stack([h1, h2], dim=-1), 'boolean.xor.layer2')
+    # -------------------------------------------------------------------------
+    # Arithmetic: Full Adder
+    # -------------------------------------------------------------------------
+    def eval_full_adder(self, a: torch.Tensor, b: torch.Tensor,
+                        cin: torch.Tensor, prefix: str) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Full adder: sum = a XOR b XOR cin, cout = (a AND b) OR (cin AND (a XOR b))
+        Built from two half adders ('{prefix}.ha1', '{prefix}.ha2') whose
+        carries are merged by the '{prefix}.carry_or' neuron.
+        Returns (sum_bit, carry_out)
+        """
+        # HA1: a XOR b, plus its carry
+        ha1_sum, ha1_carry = self._half_adder(a, b, f'{prefix}.ha1')
+        # HA2: ha1_sum XOR cin, plus its carry
+        ha2_sum, ha2_carry = self._half_adder(ha1_sum, cin, f'{prefix}.ha2')
+        # Carry out = ha1_carry OR ha2_carry
+        cout = self._gate(torch.stack([ha1_carry, ha2_carry], dim=-1), f'{prefix}.carry_or')
+        return ha2_sum, cout
+    # -------------------------------------------------------------------------
+    # Arithmetic: 8-bit Ripple Carry Adder
+    # -------------------------------------------------------------------------
+    def add_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        8-bit ripple carry addition.
+        a_bits, b_bits: [..., 8] tensors of bits (LSB first)
+        Returns: (result_bits [..., 8], carry_out [...])
+        """
+        batch_shape = a_bits.shape[:-1]
+        # Carry-in of the lowest bit is 0; each stage feeds its carry to the next.
+        carry = torch.zeros(batch_shape, device=a_bits.device)
+        result_bits = []
+        for i in range(8):
+            sum_bit, carry = self.eval_full_adder(
+                a_bits[..., i], b_bits[..., i], carry,
+                f'arithmetic.ripplecarry8bit.fa{i}'
+            )
+            result_bits.append(sum_bit)
+        return torch.stack(result_bits, dim=-1), carry
+    # -------------------------------------------------------------------------
+    # Arithmetic: 8-bit Comparators
+    # -------------------------------------------------------------------------
+    # NOTE(review): the comparators are documented as MSB first while add_8bit
+    # is LSB first; confirm against how the comparator weights were generated
+    # before feeding both from the same bit tensor.
+    def greater_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
+        """Returns 1 if a > b, else 0. Bits are MSB first."""
+        diff = a_bits - b_bits  # [..., 8]
+        w = self._get('arithmetic.greaterthan8bit.comparator')
+        # Positive weighted difference => a > b. Plain comparison, no STE here.
+        score = (diff * w).sum(dim=-1)
+        return (score > 0).float()
+    def less_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
+        """Returns 1 if a < b, else 0. Bits are MSB first."""
+        diff = b_bits - a_bits  # [..., 8]
+        w = self._get('arithmetic.lessthan8bit.comparator')
+        score = (diff * w).sum(dim=-1)
+        return (score > 0).float()
+    def equal_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
+        """Returns 1 if a == b, else 0 (neither greater-than nor less-than fires)."""
+        gt = self.greater_than_8bit(a_bits, b_bits)
+        lt = self.less_than_8bit(a_bits, b_bits)
+        return (1 - gt) * (1 - lt)
+# =============================================================================
+# BIT EXTRACTION / INJECTION INTERFACES
+# =============================================================================
+class BitExtractor(nn.Module):
+    """
+    Trainable read-out that turns a token embedding into two 8-bit operands.
+    One linear layer produces 16 logits which are hard-thresholded to bits.
+    """
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        # One logit per operand bit (2 operands x 8 bits)
+        self.proj = nn.Linear(d_model, 16)
+        # Learnable temperature; registered here but not read by forward()
+        self.temperature = nn.Parameter(torch.tensor(1.0))
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        x: [..., d_model]
+        Returns: a_bits [..., 8], b_bits [..., 8] (LSB first for arithmetic)
+        """
+        # Threshold the logits (STE keeps the projection trainable)
+        thresholded = heaviside(self.proj(x))  # [..., 16]
+        # First eight bits form operand a, last eight operand b
+        return thresholded[..., :8], thresholded[..., 8:]
+class BitInjector(nn.Module):
+    """
+    Trainable write-back from circuit outputs into embedding space.
+    Concatenates 8 result bits with 8 flag bits and projects the 16 values
+    to a d_model-sized delta, multiplied by a learnable scalar.
+    """
+    def __init__(self, d_model: int):
+        super().__init__()
+        self.d_model = d_model
+        # 16 bits in, embedding-sized vector out
+        self.proj = nn.Linear(16, d_model)
+        # Learnable output scale, initialised to 0.1
+        self.scale = nn.Parameter(torch.tensor(0.1))
+    def forward(self, result_bits: torch.Tensor, flags: torch.Tensor) -> torch.Tensor:
+        """
+        result_bits: [..., 8]
+        flags: [..., 8] status flags accompanying the result
+        Returns: [..., d_model]
+        """
+        packed = torch.cat((result_bits, flags), dim=-1)  # [..., 16]
+        projected = self.proj(packed)
+        return projected * self.scale
+# =============================================================================
+# CIRCUIT-AUGMENTED MLP BLOCK
+# =============================================================================
+class CircuitAugmentedMLP(nn.Module):
+    """
+    MLP block augmented with frozen threshold circuits.
+    The original gated MLP path runs in parallel with the circuit path; the
+    circuit output is added on top of the MLP output, scaled per position by
+    a learned router weight.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        intermediate_size: int,
+        circuit_path: str,
+        device: str = 'cpu'
+    ):
+        """
+        d_model: embedding width of the host model
+        intermediate_size: hidden width of the gated MLP
+        circuit_path: safetensors file with the circuit tensors
+        device: device passed to CircuitExecutor for the circuit buffers
+        """
+        super().__init__()
+        self.d_model = d_model
+        # Original MLP components (will be loaded from pretrained)
+        self.gate_proj = nn.Linear(d_model, intermediate_size, bias=False)
+        self.up_proj = nn.Linear(d_model, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, d_model, bias=False)
+        self.act_fn = nn.SiLU()
+        # Circuit components
+        self.circuits = CircuitExecutor(circuit_path, device)
+        self.bit_extractor = BitExtractor(d_model)
+        self.bit_injector = BitInjector(d_model)
+        # Router: decides circuit vs MLP contribution
+        self.router = nn.Sequential(
+            nn.Linear(d_model, 64),
+            nn.ReLU(),
+            nn.Linear(64, 2),
+            nn.Softmax(dim=-1)
+        )
+        # Operation selector (which arithmetic op to perform)
+        self.op_selector = nn.Sequential(
+            nn.Linear(d_model, 32),
+            nn.ReLU(),
+            nn.Linear(32, 4),  # add, sub, compare, passthrough
+            nn.Softmax(dim=-1)
+        )
+    def _compute_flags(self, result_bits: torch.Tensor, carry: torch.Tensor) -> torch.Tensor:
+        """
+        Compute status flags from result.
+        result_bits: [..., 8], carry: [...]
+        Returns: [..., 8] with slot 0 = zero, 1 = negative, 2 = carry, rest 0
+        """
+        batch_shape = result_bits.shape[:-1]
+        # Zero flag: all bits are 0
+        zero = (result_bits.sum(dim=-1) == 0).float()
+        # Negative flag: MSB is 1 (two's complement); index 7 because bits are LSB first
+        negative = result_bits[..., 7]
+        # Pad to 8 flags
+        flags = torch.zeros(*batch_shape, 8, device=result_bits.device)
+        flags[..., 0] = zero
+        flags[..., 1] = negative
+        flags[..., 2] = carry
+        return flags
+    def _circuit_forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Run input through threshold circuits and return the embedding delta [..., d_model]."""
+        # Extract operands
+        a_bits, b_bits = self.bit_extractor(x)
+        # Only addition is wired up for now. self.op_selector stays registered
+        # (so checkpoints keep their keys) but is not evaluated: its output was
+        # never consumed, so running it on every forward only cost compute.
+        add_result, add_carry = self.circuits.add_8bit(a_bits, b_bits)
+        add_flags = self._compute_flags(add_result, add_carry)
+        # Inject result back
+        return self.bit_injector(add_result, add_flags)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        x: [batch, seq_len, d_model]
+        Returns: [batch, seq_len, d_model]
+        """
+        # Original MLP path
+        mlp_out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        # Circuit path
+        circuit_out = self._circuit_forward(x)
+        # Router emits softmax weights [mlp, circuit]; only the circuit share is
+        # applied, the MLP output is always added at full strength.
+        circuit_weight = self.router(x)[..., 1:2]
+        return mlp_out + circuit_weight * circuit_out
+# =============================================================================
+# MODEL SURGERY: Insert circuits into SmolLM2
+# =============================================================================
+def augment_smollm2_with_circuits(
+    model: AutoModelForCausalLM,
+    circuit_path: str,
+    layer_indices: Optional[list] = None,
+    device: str = 'cpu'
+) -> AutoModelForCausalLM:
+    """
+    Surgically insert circuit blocks into SmolLM2's MLP layers.
+    Args:
+        model: Pretrained SmolLM2 model
+        circuit_path: Path to neural_computer.safetensors
+        layer_indices: Which layers to augment (default: middle layers)
+        device: Device for circuit tensors
+    Returns:
+        Modified model with circuit-augmented MLPs (the same object, edited in place)
+    """
+    config = model.config
+    num_layers = config.num_hidden_layers
+    # Default: augment middle third of layers
+    if layer_indices is None:
+        start = num_layers // 3
+        end = 2 * num_layers // 3
+        layer_indices = list(range(start, end))
+    print(f"Augmenting layers {layer_indices} with threshold circuits...")
+    for idx in layer_indices:
+        layer = model.model.layers[idx]
+        old_mlp = layer.mlp
+        # Create augmented MLP
+        new_mlp = CircuitAugmentedMLP(
+            d_model=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            circuit_path=circuit_path,
+            device=device
+        )
+        # Copy pretrained weights
+        new_mlp.gate_proj.weight.data = old_mlp.gate_proj.weight.data.clone()
+        new_mlp.up_proj.weight.data = old_mlp.up_proj.weight.data.clone()
+        new_mlp.down_proj.weight.data = old_mlp.down_proj.weight.data.clone()
+        # The freshly built router / extractor / injector live on the default
+        # device; move the whole block to wherever the replaced MLP's weights
+        # are so forward() does not mix devices.
+        new_mlp.to(old_mlp.gate_proj.weight.device)
+        # Replace
+        layer.mlp = new_mlp
+    # Freeze circuit weights, keep interfaces trainable.
+    # CircuitExecutor registers its tensors as buffers, which named_parameters()
+    # does not yield, so this loop is a safeguard in case that ever changes.
+    for name, param in model.named_parameters():
+        if 'circuits' in name:
+            param.requires_grad = False
+    print(f"Done. Circuit weights frozen, interfaces trainable.")
+    return model
+# =============================================================================
+# TRAINING UTILITIES
+# =============================================================================
+def generate_arithmetic_batch(batch_size: int, max_val: int = 255) -> Tuple[list, list]:
+    """
+    Build `batch_size` random addition problems.
+    Both operands are drawn uniformly from 0..max_val via torch.randint and
+    the answer is their sum wrapped modulo 256.
+    Returns (prompts, targets): "a + b =" strings and " result" strings.
+    """
+    def draw_operand() -> int:
+        # One scalar draw per call, so the RNG is consumed operand by operand
+        return torch.randint(0, max_val + 1, (1,)).item()
+    prompts, targets = [], []
+    for _ in range(batch_size):
+        lhs = draw_operand()
+        rhs = draw_operand()
+        prompts.append(f"{lhs} + {rhs} =")
+        targets.append(f" {(lhs + rhs) % 256}")
+    return prompts, targets
+def evaluate_arithmetic(
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    n_problems: int = 100,
+    device: str = 'cpu'
+) -> dict:
+    """
+    Evaluate model on random arithmetic problems.
+    Args:
+        model: causal LM used for greedy generation (put into eval mode here)
+        tokenizer: tokenizer matching the model
+        n_problems: number of random "a + b =" prompts to try
+        device: device the tokenized prompt is moved to
+    Returns:
+        dict with 'accuracy', 'correct', 'total' and up to ten 'errors',
+        each error being (a, b, expected, predicted or "parse_error").
+        Accuracy is 0.0 when n_problems is 0.
+    """
+    import re  # local import: only the answer parsing below needs it
+    correct = 0
+    total = 0
+    errors = []
+    model.eval()
+    for _ in range(n_problems):
+        a = torch.randint(0, 256, (1,)).item()
+        b = torch.randint(0, 256, (1,)).item()
+        expected = (a + b) % 256
+        prompt = f"{a} + {b} ="
+        inputs = tokenizer(prompt, return_tensors='pt').to(device)
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=10,
+                do_sample=False,
+                pad_token_id=tokenizer.eos_token_id
+            )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Take everything after the prompt's '='. maxsplit=1 matters: the
+        # continuation may contain further '=' signs (e.g. a second equation),
+        # and splitting on the last one would throw the real answer away.
+        answer_part = response.split('=', 1)[-1]
+        # First integer in the continuation; the sign is kept so "-3" is not
+        # mistaken for 3.
+        match = re.search(r'-?\d+', answer_part)
+        if match is None:
+            errors.append((a, b, expected, "parse_error"))
+        else:
+            predicted = int(match.group())
+            if predicted == expected:
+                correct += 1
+            else:
+                errors.append((a, b, expected, predicted))
+        total += 1
+    return {
+        # Guard against n_problems == 0
+        'accuracy': correct / total if total else 0.0,
+        'correct': correct,
+        'total': total,
+        'errors': errors[:10]  # First 10 errors
+    }
+# =============================================================================
+# MAIN: Demo
+# =============================================================================
+# Demo script: baseline evaluation, circuit augmentation, then a direct
+# sanity check of the 8-bit adder circuit.
+if __name__ == "__main__":
+    import argparse
+    import sys
+    parser = argparse.ArgumentParser(description='Circuit-Augmented LLM Demo')
+    parser.add_argument('--circuit-path', type=str,
+                        default='./neural_computer.safetensors',
+                        help='Path to circuit weights')
+    parser.add_argument('--device', type=str, default='cpu',
+                        help='Device (cpu or cuda)')
+    parser.add_argument('--eval-only', action='store_true',
+                        help='Only evaluate, do not augment')
+    args = parser.parse_args()
+    print("=" * 70)
+    print(" CIRCUIT-AUGMENTED LLM")
+    print("=" * 70)
+    # Load tokenizer and model
+    print("\n[1] Loading SmolLM2-360M...")
+    model_id = "HuggingFaceTB/SmolLM2-360M"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
+    # evaluate_arithmetic() moves the tokenized prompt to args.device, so the
+    # model has to live there too or generate() fails on a device mismatch.
+    model = model.to(args.device)
+    print(f"    Parameters: {sum(p.numel() for p in model.parameters()):,}")
+    # Baseline evaluation
+    print("\n[2] Baseline arithmetic evaluation...")
+    baseline = evaluate_arithmetic(model, tokenizer, n_problems=50, device=args.device)
+    print(f"    Accuracy: {baseline['accuracy']*100:.1f}% ({baseline['correct']}/{baseline['total']})")
+    if baseline['errors']:
+        print(f"    Sample errors:")
+        for a, b, exp, got in baseline['errors'][:5]:
+            print(f"      {a} + {b} = {exp}, model said {got}")
+    if args.eval_only:
+        print("\nDone (eval only mode).")
+        # sys.exit rather than the site-provided exit(), which is meant for
+        # interactive use and is not guaranteed to exist in every runtime.
+        sys.exit(0)
+    # Augment with circuits
+    print(f"\n[3] Augmenting with threshold circuits...")
+    print(f"    Circuit path: {args.circuit_path}")
+    model = augment_smollm2_with_circuits(
+        model,
+        args.circuit_path,
+        device=args.device
+    )
+    new_params = sum(p.numel() for p in model.parameters())
+    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"    Total parameters: {new_params:,}")
+    print(f"    Trainable parameters: {trainable:,}")
+    # Test circuit execution directly
+    print("\n[4] Testing circuit execution...")
+    circuit_exec = CircuitExecutor(args.circuit_path, args.device)
+    test_cases = [(127, 128), (255, 1), (0, 0), (100, 55)]
+    for a, b in test_cases:
+        # Convert to bits (LSB first), on the same device as the circuit buffers
+        a_bits = torch.tensor([(a >> i) & 1 for i in range(8)],
+                              dtype=torch.float32, device=args.device)
+        b_bits = torch.tensor([(b >> i) & 1 for i in range(8)],
+                              dtype=torch.float32, device=args.device)
+        result_bits, carry = circuit_exec.add_8bit(
+            a_bits.unsqueeze(0),
+            b_bits.unsqueeze(0)
+        )
+        # Convert result bits back to int
+        result = sum(int(result_bits[0, i].item()) * (2**i) for i in range(8))
+        expected = (a + b) % 256
+        status = "OK" if result == expected else "FAIL"
+        print(f"    {a} + {b} = {result} (expected {expected}) [{status}]")
+    print("\n[5] Model ready for fine-tuning.")
+    print("    Next: Train interface layers on arithmetic examples.")
+    print("=" * 70)

llm/guide.md CHANGED Viewed

@@ -1,615 +1,615 @@
-# Embedding Threshold Logic Circuits into Transformer MLPs
-## Technical Implementation Guide
----
-## 1. Core Thesis
-Standard LLMs fail at arithmetic because they're interpolators—they approximate functions over training distributions rather than compute exact results. A 360M parameter model trained on internet text has seen "127 + 128 = 255" zero or few times, so it guesses "140" based on pattern matching.
-We solve this by embedding **frozen, proven-correct arithmetic circuits** directly into the transformer's MLP layers. The circuits use threshold logic (weighted sums + step activation), which is structurally compatible with neural network layers. We train only the **interface layers** that learn to:
-1. Extract operands from token embeddings
-2. Route computation through the circuits
-3. Inject results back into the residual stream
-The model learns **call dispatch**, not arithmetic. The arithmetic is already solved.
----
-## 2. Threshold Logic Fundamentals
-### 2.1 Single Threshold Gate
-A threshold gate computes:
-```
-output = 1  if  (Σ wᵢxᵢ + b) ≥ 0
-         0  otherwise
-```
-This is a neuron with Heaviside step activation. With integer weights `w` and bias `b`, it computes a Boolean function of binary inputs.
-**Example: AND gate**
-```
-w = [1, 1], b = -2
-AND(0,0) = H(0 + 0 - 2) = H(-2) = 0
-AND(0,1) = H(0 + 1 - 2) = H(-1) = 0
-AND(1,0) = H(1 + 0 - 2) = H(-1) = 0
-AND(1,1) = H(1 + 1 - 2) = H(0)  = 1
-```
-**Example: OR gate**
-```
-w = [1, 1], b = -1
-OR(0,0) = H(0 + 0 - 1) = H(-1) = 0
-OR(0,1) = H(0 + 1 - 1) = H(0)  = 1
-OR(1,0) = H(1 + 0 - 1) = H(0)  = 1
-OR(1,1) = H(1 + 1 - 1) = H(1)  = 1
-```
-### 2.2 Multi-Layer Circuits
-XOR is not linearly separable—it requires two layers:
-```
-Layer 1:
-  neuron1 (OR):   w=[1,1], b=-1   → fires if a OR b
-  neuron2 (NAND): w=[-1,-1], b=1  → fires if NOT(a AND b)
-Layer 2:
-  neuron3 (AND): w=[1,1], b=-2   → fires if both layer1 outputs are 1
-XOR(a,b) = AND(OR(a,b), NAND(a,b))
-```
-### 2.3 Full Adder
-A full adder computes `sum` and `carry_out` from inputs `a`, `b`, `carry_in`:
-```
-sum = a XOR b XOR cin
-cout = (a AND b) OR (cin AND (a XOR b))
-```
-Implementation uses two half-adders chained:
-```
-HA1: (a, b) → (sum1 = a XOR b, carry1 = a AND b)
-HA2: (sum1, cin) → (sum2 = sum1 XOR cin, carry2 = sum1 AND cin)
-cout = carry1 OR carry2
-final_sum = sum2
-```
-Each XOR is 2 layers, each AND/OR is 1 layer. Total depth: ~4 layers per full adder.
-### 2.4 8-bit Ripple Carry Adder
-Chain 8 full adders, propagating carry:
-```
-FA0: (a[0], b[0], 0)      → (sum[0], c0)
-FA1: (a[1], b[1], c0)     → (sum[1], c1)
-FA2: (a[2], b[2], c1)     → (sum[2], c2)
-...
-FA7: (a[7], b[7], c6)     → (sum[7], c7)
-```
-Total circuit depth: ~32 threshold layers (8 FAs × 4 layers each).
----
-## 3. Circuit Inventory
-The `neural_computer.safetensors` contains 3,122 tensors / 5,648 parameters implementing:
-| Category | Circuits | Tensors |
-|----------|----------|---------|
-| Boolean | AND, OR, NOT, NAND, NOR, XOR, XNOR, IMPLIES, BIIMPLIES | ~30 |
-| Arithmetic | Half adder, Full adder, Ripple carry 2/4/8-bit, 8×8 multiplier | ~800 |
-| Comparators | GT, LT, GEQ, LEQ, EQ (8-bit) | ~50 |
-| ALU | 16-operation ALU, opcode decoder, flag computation | ~400 |
-| Control | JMP, JZ, JNZ, JC, JNC, JN, JP, CALL, RET, PUSH, POP | ~200 |
-| Modular | Divisibility by 2-12 | ~600 |
-| Error Detection | Parity, CRC, Hamming, checksum | ~200 |
-| Pattern | Popcount, leading zeros, symmetry | ~150 |
-| Threshold | k-of-n gates, majority, minority | ~100 |
-All weights are integers. All activations are Heaviside. Verified with 6,590 exhaustive tests.
----
-## 4. Transformer Integration Architecture
-### 4.1 Target: SmolLM2-360M
-```
-Architecture: LlamaForCausalLM
-Hidden dim:   960
-Layers:       32
-Heads:        15
-MLP expansion: 4x (intermediate = 3840)
-Vocab:        49152
-Parameters:   361,821,120
-```
-Standard MLP block:
-```python
-def forward(x):  # x: [batch, seq, 960]
-    gate = self.gate_proj(x)      # [batch, seq, 3840]
-    up = self.up_proj(x)          # [batch, seq, 3840]
-    hidden = silu(gate) * up      # SwiGLU activation
-    return self.down_proj(hidden) # [batch, seq, 960]
-```
-### 4.2 Augmented MLP Block
-```python
-def forward(x):  # x: [batch, seq, 960]
-    # Original MLP path (unchanged)
-    mlp_out = self.down_proj(silu(self.gate_proj(x)) * self.up_proj(x))
-    # Circuit path (new)
-    a_bits, b_bits = self.bit_extractor(x)       # [batch, seq, 8] each
-    result_bits, carry = self.circuits.add_8bit(a_bits, b_bits)
-    flags = self.compute_flags(result_bits, carry)
-    circuit_delta = self.bit_injector(result_bits, flags)
-    # Routing
-    route_weights = self.router(x)  # [batch, seq, 2] softmax
-    # Combine
-    return mlp_out + route_weights[..., 1:2] * circuit_delta
-```
-### 4.3 Layer Selection
-We augment the **middle third** of layers (10-20 of 32):
-- Early layers (0-9): Token/position encoding, not arithmetic-relevant
-- Middle layers (10-20): Abstract reasoning, computation
-- Late layers (21-31): Output formatting, vocabulary projection
-Rationale: Arithmetic computation happens in middle layers where the model processes relationships between tokens. Early layers haven't built sufficient representations; late layers are committed to output tokens.
----
-## 5. Interface Layers (Trainable)
-### 5.1 BitExtractor
-Maps token embedding → two 8-bit operands.
-```python
-class BitExtractor(nn.Module):
-    def __init__(self, d_model=960):
-        self.proj = nn.Linear(d_model, 16)  # 960 → 16
-    def forward(self, x):
-        logits = self.proj(x)           # [batch, seq, 16]
-        bits = heaviside(logits)        # binarize with STE
-        a_bits = bits[..., :8]          # first operand
-        b_bits = bits[..., 8:]          # second operand
-        return a_bits, b_bits           # both [batch, seq, 8], LSB first
-```
-**What it learns**: Which embedding dimensions encode numeric magnitude. For token "127", it must learn that certain activation patterns correspond to bits `[1,1,1,1,1,1,1,0]`.
-**Parameters**: 960 × 16 + 16 = 15,376
-### 5.2 BitInjector
-Maps circuit outputs → embedding delta.
-```python
-class BitInjector(nn.Module):
-    def __init__(self, d_model=960):
-        self.proj = nn.Linear(16, d_model)  # 16 → 960
-        self.scale = nn.Parameter(torch.tensor(0.1))
-    def forward(self, result_bits, flags):
-        combined = torch.cat([result_bits, flags], dim=-1)  # [batch, seq, 16]
-        return self.proj(combined) * self.scale              # [batch, seq, 960]
-```
-**What it learns**: How to inject the result bits back into embedding space such that subsequent layers (and the final vocabulary projection) produce the correct output tokens.
-**Parameters**: 16 × 960 + 960 + 1 = 16,321
-### 5.3 Router
-Decides when to use circuit path.
-```python
-class Router(nn.Module):
-    def __init__(self, d_model=960):
-        self.net = nn.Sequential(
-            nn.Linear(d_model, 64),
-            nn.ReLU(),
-            nn.Linear(64, 2),
-            nn.Softmax(dim=-1)
-        )
-    def forward(self, x):
-        return self.net(x)  # [batch, seq, 2]: [mlp_weight, circuit_weight]
-```
-**What it learns**: "This position contains arithmetic" → route through circuits. "This is prose" → use normal MLP.
-**Parameters**: 960 × 64 + 64 + 64 × 2 + 2 = 61,698
-### 5.4 Total Trainable Parameters
-Per augmented layer:
-```
-BitExtractor:  15,376
-BitInjector:   16,321
-Router:        61,698
-OpSelector:    ~31,000
-───────────────────────
-Total:         ~124,395 per layer
-```
-For 11 augmented layers: **~1.37M trainable parameters**
-This is 0.38% of the model. The other 99.62% (including all circuit weights) is frozen.
----
-## 6. Gradient Flow Through Heaviside
-### 6.1 The Problem
-Heaviside has zero gradient almost everywhere:
-```
-H(x) = 1 if x ≥ 0 else 0
-dH/dx = 0 for x ≠ 0, undefined at x = 0
-```
-Standard backprop would give zero gradients to BitExtractor.
-### 6.2 Straight-Through Estimator (STE)
-We use STE: forward pass uses true Heaviside, backward pass pretends it's identity.
-```python
-class HeavisideSTE(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, x):
-        return (x >= 0).float()  # true step function
-    @staticmethod
-    def backward(ctx, grad_output):
-        return grad_output  # pass gradient through unchanged
-```
-**Intuition**: "If making the input larger would have helped the output, increase the input." The gradient tells us the direction even though the function is flat.
-### 6.3 Alternative: Sigmoid Annealing
-During training, use sigmoid with increasing temperature:
-```python
-def soft_heaviside(x, temperature):
-    return torch.sigmoid(x * temperature)
-# temperature: 1 → 10 → 100 over training
-# At high temperature, sigmoid ≈ step function
-```
-This provides smoother gradients early in training, then sharpens to true binary at inference.
----
-## 7. Training Strategy
-### 7.1 Data Generation
-Generate arithmetic problems exhaustively:
-```python
-def generate_batch(batch_size):
-    a = torch.randint(0, 256, (batch_size,))
-    b = torch.randint(0, 256, (batch_size,))
-    result = (a + b) % 256
-    prompts = [f"{a[i]} + {b[i]} =" for i in range(batch_size)]
-    targets = [f" {result[i]}" for i in range(batch_size)]
-    return prompts, targets
-```
-For 8-bit addition, there are 256 × 256 = 65,536 unique problems. We can cover the entire space.
-### 7.2 Loss Function
-Standard cross-entropy on next-token prediction:
-```python
-outputs = model(input_ids, attention_mask=mask, labels=labels)
-loss = outputs.loss  # CE loss, only on target tokens
-```
-Labels are masked for prompt tokens (`-100`), so loss only backprops through the answer.
-### 7.3 Optimizer Configuration
-```python
-# Only train interface layers
-interface_params = [p for n, p in model.named_parameters()
-                    if any(x in n for x in ['bit_extractor', 'bit_injector', 'router'])]
-optimizer = AdamW(interface_params, lr=1e-4, weight_decay=0.01)
-scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
-```
-### 7.4 Curriculum Learning
-Start simple, increase difficulty:
-```
-Phase 1 (epochs 1-2):   Single-digit addition (0-9 + 0-9)
-Phase 2 (epochs 3-4):   Two-digit addition (0-99 + 0-99)
-Phase 3 (epochs 5-7):   Full 8-bit addition (0-255 + 0-255)
-Phase 4 (epochs 8-10):  Adversarial cases (carry chains: 127+128, 255+1)
-```
-This helps the interface layers learn the basic extraction pattern before tackling hard cases.
-### 7.5 Training Hyperparameters
-```
-Model:          SmolLM2-360M
-Augmented:      Layers 10-20 (11 layers)
-Trainable:      1.37M parameters
-Frozen:         362M parameters (including 5.6K circuit params)
-Batch size:     32
-Learning rate:  1e-4
-Epochs:         10
-Samples:        10,000 per epoch
-Warmup:         500 steps
-Device:         RTX 6000 Ada (48GB)
-Expected time:  ~30 minutes total
-```
----
-## 8. Forward Pass Walkthrough
-Input: `"127 + 128 ="`
-### 8.1 Tokenization
-```
-Tokens: ["127", " +", " 128", " ="]
-IDs:    [12700, 489, 13824, 284]  # hypothetical
-```
-### 8.2 Embedding
-```
-embeddings = embed(input_ids)  # [1, 4, 960]
-```
-### 8.3 Layers 0-9 (Unchanged)
-Standard attention + MLP, building representations.
-### 8.4 Layer 10 (Augmented)
-```python
-# After attention
-x = layer_norm(attn_output + residual)  # [1, 4, 960]
-# MLP path
-mlp_out = down_proj(silu(gate_proj(x)) * up_proj(x))
-# Circuit path
-a_bits, b_bits = bit_extractor(x)
-# Position 0 ("127"): a_bits ≈ [1,1,1,1,1,1,1,0] if well-trained
-# Position 2 ("128"): b_bits ≈ [0,0,0,0,0,0,0,1]
-# (In practice, extraction happens per-position; aggregation is learned)
-result_bits, carry = circuits.add_8bit(a_bits, b_bits)
-# result_bits = [1,1,1,1,1,1,1,1] = 255
-flags = compute_flags(result_bits, carry)
-# zero=0, negative=1, carry=1
-circuit_delta = bit_injector(result_bits, flags)  # [1, 4, 960]
-# Routing
-route = router(x)  # [1, 4, 2]
-# Position 3 ("="): route ≈ [0.1, 0.9] → use circuits
-# Position 1 ("+"): route ≈ [0.8, 0.2] → mostly MLP
-# Combine
-output = mlp_out + route[..., 1:2] * circuit_delta
-```
-### 8.5 Layers 11-31
-Continue processing, eventually projecting to vocabulary.
-### 8.6 Output
-```
-logits = lm_head(final_hidden)  # [1, 4, 49152]
-next_token = argmax(logits[0, 3, :])  # token after "="
-# Should decode to "255" (possibly as " 255" or "255")
-```
----
-## 9. Inference Characteristics
-### 9.1 Exactness
-At inference, Heaviside is true step function—no approximation. If BitExtractor correctly maps "127" → bits and "128" → bits, the circuit **will** output 255. The only failure mode is incorrect extraction.
-### 9.2 Latency
-Circuit computation adds ~5-10% overhead:
-- BitExtractor: 1 linear layer (960→16)
-- Circuits: ~32 threshold layers, but sparse and tiny
-- BitInjector: 1 linear layer (16→960)
-- Router: 2 linear layers
-The circuits have only 5,648 parameters total—negligible versus the 361M in the base model.
-### 9.3 Generalization
-Once the interface learns the mapping, it generalizes to **all** 65,536 8-bit additions. There's no memorization—the circuits compute.
----
-## 10. Evaluation Metrics
-### 10.1 Arithmetic Accuracy
-```python
-def eval_accuracy(model, n_problems=1000):
-    correct = 0
-    for _ in range(n_problems):
-        a, b = random 8-bit values
-        expected = (a + b) % 256
-        predicted = model.generate(f"{a} + {b} =")
-        if parse_int(predicted) == expected:
-            correct += 1
-    return correct / n_problems
-```
-**Baseline SmolLM2**: ~5-10% (guessing based on patterns)
-**Target**: >95% (circuit-accurate)
-### 10.2 Edge Case Performance
-Specifically test:
-- Carry propagation: 127+128, 255+1, 128+128
-- Zeros: 0+0, 0+255
-- Identity: x+0 for various x
-- Commutativity: verify a+b == b+a
-### 10.3 Non-Arithmetic Preservation
-Verify general capability isn't degraded:
-- Perplexity on held-out text
-- Common benchmarks (HellaSwag, etc.)
-The augmentation should be **additive**—circuits help arithmetic, MLP handles everything else via routing.
----
-## 11. Extension Roadmap
-### 11.1 Additional Operations
-The circuit inventory includes:
-- Subtraction (via two's complement)
-- Multiplication (8×8 → 16-bit)
-- Division (iterative subtraction)
-- Bitwise ops (AND, OR, XOR, shifts)
-- Comparisons (GT, LT, EQ)
-Each needs its own extraction/injection interface, or a unified interface with operation selection.
-### 11.2 Multi-Operand Expressions
-For "15 + 27 + 33 =", need:
-- Operand count detection
-- Sequential circuit invocation
-- Accumulator pattern
-### 11.3 Larger Bit Widths
-16-bit and 32-bit arithmetic require:
-- Larger circuits (or chained 8-bit)
-- Wider BitExtractor (32 or 64 output dims)
-- More training data
-### 11.4 Symbolic Integration
-Ultimate goal: the model recognizes when it needs to compute, invokes circuits, and integrates results into coherent natural language output.
-```
-User: "If I have 127 apples and buy 128 more, how many do I have?"
-Model: [extracts 127, 128] [routes to circuit] [gets 255]
-       "You would have 255 apples."
-```
----
-## 12. File Structure
-```
-8bit-threshold-computer/
-├── neural_computer.safetensors    # Frozen circuits (3,122 tensors)
-├── circuit_llm.py                 # Integration architecture
-├── train_circuit_interface.py     # Training loop
-├── iron_eval.py                   # Circuit verification (6,590 tests)
-├── skeptic_test.py                # Algebraic identity tests (127 tests)
-├── prune_weights.py               # Weight optimization
-├── tensors.txt                    # Tensor manifest
-├── guide.md                       # This document
-└── README.md                      # Project overview
-```
----
-## 13. Key Equations
-### Heaviside Step
-```
-H(x) = 1 if x ≥ 0 else 0
-```
-### Threshold Gate
-```
-f(x₁,...,xₙ) = H(Σᵢ wᵢxᵢ + b)
-```
-### Full Adder
-```
-sum = a ⊕ b ⊕ cᵢₙ
-cₒᵤₜ = (a ∧ b) ∨ (cᵢₙ ∧ (a ⊕ b))
-```
-### STE Gradient
-```
-Forward:  y = H(x)
-Backward: ∂L/∂x = ∂L/∂y
-```
-### Router Combination
-```
-output = mlp_out + softmax(router(x))[1] × circuit_delta
-```
----
-## 14. References
-1. McCulloch & Pitts (1943). "A Logical Calculus of Ideas Immanent in Nervous Activity"
-2. Muroga (1971). "Threshold Logic and Its Applications"
-3. Siegelmann & Sontag (1995). "On the Computational Power of Neural Nets"
-4. Bengio et al. (2013). "Estimating or Propagating Gradients Through Stochastic Neurons"
-5. Ma et al. (2024). "The Era of 1-bit LLMs" (BitNet b1.58)
-6. HuggingFace (2024). "SmolLM2: Small Language Models"
----
-## 15. Summary
-We embed a proven-correct 8-bit threshold logic computer into SmolLM2's MLP layers. The circuits are frozen; we train only the interface layers that learn call dispatch. This gives the LLM exact arithmetic capability without training it to "do math"—the math is already done.
-The approach is:
-- **Sound**: Circuits verified with 6,590 tests
-- **Efficient**: 1.37M trainable params, 5.6K circuit params
-- **Exact**: Heaviside at inference means no approximation error
-- **Composable**: Add more circuits (multiply, compare, etc.) with same pattern
-The model learns when to call the calculator, not how to calculate.

+# Embedding Threshold Logic Circuits into Transformer MLPs
+## Technical Implementation Guide
+---
+## 1. Core Thesis
+Standard LLMs fail at arithmetic because they're interpolators—they approximate functions over training distributions rather than compute exact results. A 360M parameter model trained on internet text has seen "127 + 128 = 255" zero or few times, so it guesses "140" based on pattern matching.
+We solve this by embedding **frozen, proven-correct arithmetic circuits** directly into the transformer's MLP layers. The circuits use threshold logic (weighted sums + step activation), which is structurally compatible with neural network layers. We train only the **interface layers** that learn to:
+1. Extract operands from token embeddings
+2. Route computation through the circuits
+3. Inject results back into the residual stream
+The model learns **call dispatch**, not arithmetic. The arithmetic is already solved.
+---
+## 2. Threshold Logic Fundamentals
+### 2.1 Single Threshold Gate
+A threshold gate computes:
+```
+output = 1  if  (Σ wᵢxᵢ + b) ≥ 0
+         0  otherwise
+```
+This is a neuron with Heaviside step activation. With integer weights `w` and bias `b`, it computes a Boolean function of binary inputs.
+**Example: AND gate**
+```
+w = [1, 1], b = -2
+AND(0,0) = H(0 + 0 - 2) = H(-2) = 0
+AND(0,1) = H(0 + 1 - 2) = H(-1) = 0
+AND(1,0) = H(1 + 0 - 2) = H(-1) = 0
+AND(1,1) = H(1 + 1 - 2) = H(0)  = 1
+```
+**Example: OR gate**
+```
+w = [1, 1], b = -1
+OR(0,0) = H(0 + 0 - 1) = H(-1) = 0
+OR(0,1) = H(0 + 1 - 1) = H(0)  = 1
+OR(1,0) = H(1 + 0 - 1) = H(0)  = 1
+OR(1,1) = H(1 + 1 - 1) = H(1)  = 1
+```
+### 2.2 Multi-Layer Circuits
+XOR is not linearly separable—it requires two layers:
+```
+Layer 1:
+  neuron1 (OR):   w=[1,1], b=-1   → fires if a OR b
+  neuron2 (NAND): w=[-1,-1], b=1  → fires if NOT(a AND b)
+Layer 2:
+  neuron3 (AND): w=[1,1], b=-2   → fires if both layer1 outputs are 1
+XOR(a,b) = AND(OR(a,b), NAND(a,b))
+```
+### 2.3 Full Adder
+A full adder computes `sum` and `carry_out` from inputs `a`, `b`, `carry_in`:
+```
+sum = a XOR b XOR cin
+cout = (a AND b) OR (cin AND (a XOR b))
+```
+Implementation uses two half-adders chained:
+```
+HA1: (a, b) → (sum1 = a XOR b, carry1 = a AND b)
+HA2: (sum1, cin) → (sum2 = sum1 XOR cin, carry2 = sum1 AND cin)
+cout = carry1 OR carry2
+final_sum = sum2
+```
+Each XOR is 2 layers, each AND/OR is 1 layer. Total depth: ~4 layers per full adder.
+### 2.4 8-bit Ripple Carry Adder
+Chain 8 full adders, propagating carry:
+```
+FA0: (a[0], b[0], 0)      → (sum[0], c0)
+FA1: (a[1], b[1], c0)     → (sum[1], c1)
+FA2: (a[2], b[2], c1)     → (sum[2], c2)
+...
+FA7: (a[7], b[7], c6)     → (sum[7], c7)
+```
+Total circuit depth: ~32 threshold layers (8 FAs × 4 layers each).
+---
+## 3. Circuit Inventory
+The `neural_computer.safetensors` contains 3,122 tensors / 5,648 parameters implementing:
+| Category | Circuits | Tensors |
+|----------|----------|---------|
+| Boolean | AND, OR, NOT, NAND, NOR, XOR, XNOR, IMPLIES, BIIMPLIES | ~30 |
+| Arithmetic | Half adder, Full adder, Ripple carry 2/4/8-bit, 8×8 multiplier | ~800 |
+| Comparators | GT, LT, GEQ, LEQ, EQ (8-bit) | ~50 |
+| ALU | 16-operation ALU, opcode decoder, flag computation | ~400 |
+| Control | JMP, JZ, JNZ, JC, JNC, JN, JP, CALL, RET, PUSH, POP | ~200 |
+| Modular | Divisibility by 2-12 | ~600 |
+| Error Detection | Parity, CRC, Hamming, checksum | ~200 |
+| Pattern | Popcount, leading zeros, symmetry | ~150 |
+| Threshold | k-of-n gates, majority, minority | ~100 |
+All weights are integers. All activations are Heaviside. Verified with 6,590 exhaustive tests.
+---
+## 4. Transformer Integration Architecture
+### 4.1 Target: SmolLM2-360M
+```
+Architecture: LlamaForCausalLM
+Hidden dim:   960
+Layers:       32
+Heads:        15
+MLP expansion: ~2.67x (intermediate = 2560)
+Vocab:        49152
+Parameters:   361,821,120
+```
+Standard MLP block:
+```python
+def forward(x):  # x: [batch, seq, 960]
+    gate = self.gate_proj(x)      # [batch, seq, 2560]
+    up = self.up_proj(x)          # [batch, seq, 2560]
+    hidden = silu(gate) * up      # SwiGLU activation
+    return self.down_proj(hidden) # [batch, seq, 960]
+```
+### 4.2 Augmented MLP Block
+```python
+def forward(x):  # x: [batch, seq, 960]
+    # Original MLP path (unchanged)
+    mlp_out = self.down_proj(silu(self.gate_proj(x)) * self.up_proj(x))
+    # Circuit path (new)
+    a_bits, b_bits = self.bit_extractor(x)       # [batch, seq, 8] each
+    result_bits, carry = self.circuits.add_8bit(a_bits, b_bits)
+    flags = self.compute_flags(result_bits, carry)
+    circuit_delta = self.bit_injector(result_bits, flags)
+    # Routing
+    route_weights = self.router(x)  # [batch, seq, 2] softmax
+    # Combine
+    return mlp_out + route_weights[..., 1:2] * circuit_delta
+```
+### 4.3 Layer Selection
+We augment the **middle third** of layers (10-20 of 32):
+- Early layers (0-9): Token/position encoding, not arithmetic-relevant
+- Middle layers (10-20): Abstract reasoning, computation
+- Late layers (21-31): Output formatting, vocabulary projection
+Rationale: Arithmetic computation happens in middle layers where the model processes relationships between tokens. Early layers haven't built sufficient representations; late layers are committed to output tokens.
+---
+## 5. Interface Layers (Trainable)
+### 5.1 BitExtractor
+Maps token embedding → two 8-bit operands.
+```python
+class BitExtractor(nn.Module):
+    def __init__(self, d_model=960):
+        self.proj = nn.Linear(d_model, 16)  # 960 → 16
+    def forward(self, x):
+        logits = self.proj(x)           # [batch, seq, 16]
+        bits = heaviside(logits)        # binarize with STE
+        a_bits = bits[..., :8]          # first operand
+        b_bits = bits[..., 8:]          # second operand
+        return a_bits, b_bits           # both [batch, seq, 8], LSB first
+```
+**What it learns**: Which embedding dimensions encode numeric magnitude. For token "127", it must learn that certain activation patterns correspond to bits `[1,1,1,1,1,1,1,0]`.
+**Parameters**: 960 × 16 + 16 = 15,376
+### 5.2 BitInjector
+Maps circuit outputs → embedding delta.
+```python
+class BitInjector(nn.Module):
+    def __init__(self, d_model=960):
+        self.proj = nn.Linear(16, d_model)  # 16 → 960
+        self.scale = nn.Parameter(torch.tensor(0.1))
+    def forward(self, result_bits, flags):
+        combined = torch.cat([result_bits, flags], dim=-1)  # [batch, seq, 16]
+        return self.proj(combined) * self.scale              # [batch, seq, 960]
+```
+**What it learns**: How to inject the result bits back into embedding space such that subsequent layers (and the final vocabulary projection) produce the correct output tokens.
+**Parameters**: 16 × 960 + 960 + 1 = 16,321
+### 5.3 Router
+Decides when to use circuit path.
+```python
+class Router(nn.Module):
+    def __init__(self, d_model=960):
+        self.net = nn.Sequential(
+            nn.Linear(d_model, 64),
+            nn.ReLU(),
+            nn.Linear(64, 2),
+            nn.Softmax(dim=-1)
+        )
+    def forward(self, x):
+        return self.net(x)  # [batch, seq, 2]: [mlp_weight, circuit_weight]
+```
+**What it learns**: "This position contains arithmetic" → route through circuits. "This is prose" → use normal MLP.
+**Parameters**: 960 × 64 + 64 + 64 × 2 + 2 = 61,634
+### 5.4 Total Trainable Parameters
+Per augmented layer:
+```
+BitExtractor:  15,376
+BitInjector:   16,321
+Router:        61,634
+OpSelector:    ~31,000
+───────────────────────
+Total:         ~124,331 per layer
+```
+For 11 augmented layers: **~1.37M trainable parameters**
+This is 0.38% of the model. The other 99.62% (including all circuit weights) is frozen.
+---
+## 6. Gradient Flow Through Heaviside
+### 6.1 The Problem
+Heaviside has zero gradient almost everywhere:
+```
+H(x) = 1 if x ≥ 0 else 0
+dH/dx = 0 for x ≠ 0, undefined at x = 0
+```
+Standard backprop would give zero gradients to BitExtractor.
+### 6.2 Straight-Through Estimator (STE)
+We use STE: forward pass uses true Heaviside, backward pass pretends it's identity.
+```python
+class HeavisideSTE(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x):
+        return (x >= 0).float()  # true step function
+    @staticmethod
+    def backward(ctx, grad_output):
+        return grad_output  # pass gradient through unchanged
+```
+**Intuition**: "If making the input larger would have helped the output, increase the input." The gradient tells us the direction even though the function is flat.
+### 6.3 Alternative: Sigmoid Annealing
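+During training, replace the step with a sigmoid whose sharpness increases over time (here `temperature` multiplies the input, i.e. it acts as an inverse temperature, so larger values give a steeper curve):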
+```python
+def soft_heaviside(x, temperature):
+    return torch.sigmoid(x * temperature)
+# temperature: 1 → 10 → 100 over training
+# At high temperature, sigmoid ≈ step function
+```
+This provides smoother gradients early in training, then sharpens to true binary at inference.
+---
+## 7. Training Strategy
+### 7.1 Data Generation
+Generate arithmetic problems by uniform random sampling over the operand range:
+```python
+def generate_batch(batch_size):
+    a = torch.randint(0, 256, (batch_size,))
+    b = torch.randint(0, 256, (batch_size,))
+    result = (a + b) % 256
+    prompts = [f"{a[i]} + {b[i]} =" for i in range(batch_size)]
+    targets = [f" {result[i]}" for i in range(batch_size)]
+    return prompts, targets
+```
+For 8-bit addition, there are 256 × 256 = 65,536 unique problems. We can cover the entire space.
+### 7.2 Loss Function
+Standard cross-entropy on next-token prediction:
+```python
+outputs = model(input_ids, attention_mask=mask, labels=labels)
+loss = outputs.loss  # CE loss, only on target tokens
+```
+Labels are masked for prompt tokens (`-100`), so loss only backprops through the answer.
+### 7.3 Optimizer Configuration
+```python
+# Only train interface layers
+interface_params = [p for n, p in model.named_parameters()
+                    if any(x in n for x in ['bit_extractor', 'bit_injector', 'router'])]
+optimizer = AdamW(interface_params, lr=1e-4, weight_decay=0.01)
+scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
+```
+### 7.4 Curriculum Learning
+Start simple, increase difficulty:
+```
+Phase 1 (epochs 1-2):   Single-digit addition (0-9 + 0-9)
+Phase 2 (epochs 3-4):   Two-digit addition (0-99 + 0-99)
+Phase 3 (epochs 5-7):   Full 8-bit addition (0-255 + 0-255)
+Phase 4 (epochs 8-10):  Adversarial cases (boundary values and carry chains: 127+128, 255+1)
+```
+This helps the interface layers learn the basic extraction pattern before tackling hard cases.
+### 7.5 Training Hyperparameters
+```
+Model:          SmolLM2-360M
+Augmented:      Layers 10-20 (11 layers)
+Trainable:      1.37M parameters
+Frozen:         362M parameters (including 5.6K circuit params)
+Batch size:     32
+Learning rate:  1e-4
+Epochs:         10
+Samples:        10,000 per epoch
+Warmup:         500 steps
+Device:         RTX 6000 Ada (48GB)
+Expected time:  ~30 minutes total
+```
+---
+## 8. Forward Pass Walkthrough
+Input: `"127 + 128 ="`
+### 8.1 Tokenization
+```
+Tokens: ["127", " +", " 128", " ="]
+IDs:    [12700, 489, 13824, 284]  # hypothetical
+```
+### 8.2 Embedding
+```
+embeddings = embed(input_ids)  # [1, 4, 960]
+```
+### 8.3 Layers 0-9 (Unchanged)
+Standard attention + MLP, building representations.
+### 8.4 Layer 10 (Augmented)
+```python
+# After attention
+x = layer_norm(attn_output + residual)  # [1, 4, 960]
+# MLP path
+mlp_out = down_proj(silu(gate_proj(x)) * up_proj(x))
+# Circuit path
+a_bits, b_bits = bit_extractor(x)
+# Position 0 ("127"): a_bits ≈ [1,1,1,1,1,1,1,0] if well-trained
+# Position 2 ("128"): b_bits ≈ [0,0,0,0,0,0,0,1]
+# (In practice, extraction happens per-position; aggregation is learned)
+result_bits, carry = circuits.add_8bit(a_bits, b_bits)
+# result_bits = [1,1,1,1,1,1,1,1] = 255
+flags = compute_flags(result_bits, carry)
+# zero=0, negative=1, carry=0  (127 + 128 = 255 fits in 8 bits, so there is no carry-out)
+circuit_delta = bit_injector(result_bits, flags)  # [1, 4, 960]
+# Routing
+route = router(x)  # [1, 4, 2]
+# Position 3 ("="): route ≈ [0.1, 0.9] → use circuits
+# Position 1 ("+"): route ≈ [0.8, 0.2] → mostly MLP
+# Combine
+output = mlp_out + route[..., 1:2] * circuit_delta
+```
+### 8.5 Layers 11-31
+Continue processing, eventually projecting to vocabulary.
+### 8.6 Output
+```
+logits = lm_head(final_hidden)  # [1, 4, 49152]
+next_token = argmax(logits[0, 3, :])  # token after "="
+# Should decode to "255" (possibly as " 255" or "255")
+```
+---
+## 9. Inference Characteristics
+### 9.1 Exactness
+At inference, Heaviside is a true step function—no approximation. If BitExtractor correctly maps "127" → bits and "128" → bits, the circuit **will** output 255. The circuit itself cannot err; failures can only come from the learned interface layers, chiefly incorrect extraction.
+### 9.2 Latency
+Circuit computation adds ~5-10% overhead:
+- BitExtractor: 1 linear layer (960→16)
+- Circuits: ~32 threshold layers, but sparse and tiny
+- BitInjector: 1 linear layer (16→960)
+- Router: 2 linear layers
+The circuits have only 5,648 parameters total—negligible versus the 361M in the base model.
+### 9.3 Generalization
+Once the interface learns the mapping, it generalizes to **all** 65,536 8-bit additions. There's no memorization—the circuits compute.
+---
+## 10. Evaluation Metrics
+### 10.1 Arithmetic Accuracy
+```python
+def eval_accuracy(model, n_problems=1000):
+    correct = 0
+    for _ in range(n_problems):
+        a, b = random 8-bit values
+        expected = (a + b) % 256
+        predicted = model.generate(f"{a} + {b} =")
+        if parse_int(predicted) == expected:
+            correct += 1
+    return correct / n_problems
+```
+**Baseline SmolLM2**: ~5-10% (guessing based on patterns)
+**Target**: >95% (circuit-accurate)
+### 10.2 Edge Case Performance
+Specifically test:
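+- Boundaries and carry propagation: 127+128 (largest result without overflow), 255+1 (carry ripples through every bit, wraps to 0), 128+128 (carry out of the MSB, wraps to 0)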
+- Zeros: 0+0, 0+255
+- Identity: x+0 for various x
+- Commutativity: verify a+b == b+a
+### 10.3 Non-Arithmetic Preservation
+Verify general capability isn't degraded:
+- Perplexity on held-out text
+- Common benchmarks (HellaSwag, etc.)
+The augmentation should be **additive**—circuits help arithmetic, MLP handles everything else via routing.
+---
+## 11. Extension Roadmap
+### 11.1 Additional Operations
+The circuit inventory includes:
+- Subtraction (via two's complement)
+- Multiplication (8×8 → 16-bit)
+- Division (iterative subtraction)
+- Bitwise ops (AND, OR, XOR, shifts)
+- Comparisons (GT, LT, EQ)
+Each needs its own extraction/injection interface, or a unified interface with operation selection.
+### 11.2 Multi-Operand Expressions
+For "15 + 27 + 33 =", need:
+- Operand count detection
+- Sequential circuit invocation
+- Accumulator pattern
+### 11.3 Larger Bit Widths
+16-bit and 32-bit arithmetic require:
+- Larger circuits (or chained 8-bit)
+- Wider BitExtractor (32 or 64 output dims)
+- More training data
+### 11.4 Symbolic Integration
+Ultimate goal: the model recognizes when it needs to compute, invokes circuits, and integrates results into coherent natural language output.
+```
+User: "If I have 127 apples and buy 128 more, how many do I have?"
+Model: [extracts 127, 128] [routes to circuit] [gets 255]
+       "You would have 255 apples."
+```
+---
+## 12. File Structure
+```
+8bit-threshold-computer/
+├── neural_computer.safetensors    # Frozen circuits (3,122 tensors)
+├── circuit_llm.py                 # Integration architecture
+├── train_circuit_interface.py     # Training loop
+├── iron_eval.py                   # Circuit verification (6,590 tests)
+├── skeptic_test.py                # Algebraic identity tests (127 tests)
+├── prune_weights.py               # Weight optimization
+├── tensors.txt                    # Tensor manifest
+├── guide.md                       # This document
+└── README.md                      # Project overview
+```
+---
+## 13. Key Equations
+### Heaviside Step
+```
+H(x) = 1 if x ≥ 0 else 0
+```
+### Threshold Gate
+```
+f(x₁,...,xₙ) = H(Σᵢ wᵢxᵢ + b)
+```
+### Full Adder
+```
+sum = a ⊕ b ⊕ cᵢₙ
+cₒᵤₜ = (a ∧ b) ∨ (cᵢₙ ∧ (a ⊕ b))
+```
+### STE Gradient
+```
+Forward:  y = H(x)
+Backward: ∂L/∂x = ∂L/∂y
+```
+### Router Combination
+```
+output = mlp_out + softmax(router(x))[1] × circuit_delta
+```
+---
+## 14. References
+1. McCulloch & Pitts (1943). "A Logical Calculus of Ideas Immanent in Nervous Activity"
+2. Muroga (1971). "Threshold Logic and Its Applications"
+3. Siegelmann & Sontag (1995). "On the Computational Power of Neural Nets"
+4. Bengio et al. (2013). "Estimating or Propagating Gradients Through Stochastic Neurons"
+5. Ma et al. (2024). "The Era of 1-bit LLMs" (BitNet b1.58)
+6. HuggingFace (2024). "SmolLM2: Small Language Models"
+---
+## 15. Summary
+We embed a proven-correct 8-bit threshold logic computer into SmolLM2's MLP layers. The circuits are frozen; we train only the interface layers that learn call dispatch. This gives the LLM exact arithmetic capability without training it to "do math"—the math is already done.
+The approach is:
+- **Sound**: Circuits verified with 6,590 tests
+- **Efficient**: 1.37M trainable params, 5.6K circuit params
+- **Exact**: Heaviside at inference means no approximation error
+- **Composable**: Add more circuits (multiply, compare, etc.) with same pattern
+The model learns when to call the calculator, not how to calculate.

llm/train_circuit_interface.py CHANGED Viewed

@@ -1,306 +1,306 @@
-"""
-Train the circuit interface layers on arithmetic examples.
-============================================================
-The threshold circuits are frozen - we only train:
-- BitExtractor: embedding -> operand bits
-- BitInjector: result bits -> embedding
-- Router: when to use circuits vs MLP
-"""
-import torch
-import torch.nn as nn
-from torch.utils.data import Dataset, DataLoader
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from tqdm import tqdm
-import argparse
-import warnings
-warnings.filterwarnings('ignore')
-from circuit_llm import (
-    augment_smollm2_with_circuits,
-    evaluate_arithmetic,
-    CircuitExecutor
-)
-# =============================================================================
-# ARITHMETIC DATASET
-# =============================================================================
-class ArithmeticDataset(Dataset):
-    """Dataset of 8-bit addition problems."""
-    def __init__(self, tokenizer, n_samples: int = 10000, max_val: int = 255):
-        self.tokenizer = tokenizer
-        self.n_samples = n_samples
-        self.max_val = max_val
-        # Pre-generate all examples
-        self.examples = []
-        for _ in range(n_samples):
-            a = torch.randint(0, max_val + 1, (1,)).item()
-            b = torch.randint(0, max_val + 1, (1,)).item()
-            result = (a + b) % 256
-            prompt = f"{a} + {b} ="
-            target = f" {result}"
-            self.examples.append((prompt, target, a, b, result))
-    def __len__(self):
-        return len(self.examples)
-    def __getitem__(self, idx):
-        prompt, target, a, b, result = self.examples[idx]
-        # Tokenize
-        prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False)
-        target_ids = self.tokenizer.encode(target, add_special_tokens=False)
-        input_ids = prompt_ids + target_ids
-        labels = [-100] * len(prompt_ids) + target_ids  # Only predict target
-        return {
-            'input_ids': torch.tensor(input_ids),
-            'labels': torch.tensor(labels),
-            'a': a,
-            'b': b,
-            'result': result
-        }
-def collate_fn(batch):
-    """Collate with padding."""
-    max_len = max(len(item['input_ids']) for item in batch)
-    input_ids = []
-    labels = []
-    attention_mask = []
-    for item in batch:
-        pad_len = max_len - len(item['input_ids'])
-        input_ids.append(
-            torch.cat([item['input_ids'], torch.zeros(pad_len, dtype=torch.long)])
-        )
-        labels.append(
-            torch.cat([item['labels'], torch.full((pad_len,), -100, dtype=torch.long)])
-        )
-        attention_mask.append(
-            torch.cat([torch.ones(len(item['input_ids'])), torch.zeros(pad_len)])
-        )
-    return {
-        'input_ids': torch.stack(input_ids),
-        'labels': torch.stack(labels),
-        'attention_mask': torch.stack(attention_mask),
-    }
-# =============================================================================
-# TRAINING LOOP
-# =============================================================================
-def train_interface(
-    model: AutoModelForCausalLM,
-    tokenizer: AutoTokenizer,
-    n_epochs: int = 3,
-    batch_size: int = 16,
-    lr: float = 1e-4,
-    n_train_samples: int = 10000,
-    device: str = 'cpu',
-    eval_every: int = 500
-):
-    """
-    Train the circuit interface layers.
-    Only trains:
-    - bit_extractor (embedding -> bits)
-    - bit_injector (bits -> embedding)
-    - router (circuit vs MLP weighting)
-    - op_selector (which operation)
-    """
-    print("\n" + "=" * 70)
-    print(" TRAINING CIRCUIT INTERFACE")
-    print("=" * 70)
-    # Freeze everything except interface layers
-    interface_params = []
-    frozen_count = 0
-    trainable_count = 0
-    for name, param in model.named_parameters():
-        if any(x in name for x in ['bit_extractor', 'bit_injector', 'router', 'op_selector']):
-            param.requires_grad = True
-            interface_params.append(param)
-            trainable_count += param.numel()
-        else:
-            param.requires_grad = False
-            frozen_count += param.numel()
-    print(f"\n  Frozen parameters: {frozen_count:,}")
-    print(f"  Trainable parameters: {trainable_count:,}")
-    print(f"  Training {len(interface_params)} parameter groups")
-    # Create dataset
-    print(f"\n  Creating dataset ({n_train_samples} examples)...")
-    dataset = ArithmeticDataset(tokenizer, n_samples=n_train_samples)
-    dataloader = DataLoader(
-        dataset,
-        batch_size=batch_size,
-        shuffle=True,
-        collate_fn=collate_fn
-    )
-    # Optimizer
-    optimizer = torch.optim.AdamW(interface_params, lr=lr)
-    # Training
-    model.to(device)
-    model.train()
-    global_step = 0
-    total_loss = 0
-    for epoch in range(n_epochs):
-        print(f"\n  Epoch {epoch + 1}/{n_epochs}")
-        print("  " + "-" * 60)
-        epoch_loss = 0
-        epoch_steps = 0
-        pbar = tqdm(dataloader, desc=f"  Training", leave=False)
-        for batch in pbar:
-            input_ids = batch['input_ids'].to(device)
-            labels = batch['labels'].to(device)
-            attention_mask = batch['attention_mask'].to(device)
-            # Forward
-            outputs = model(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                labels=labels
-            )
-            loss = outputs.loss
-            # Backward
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            # Logging
-            epoch_loss += loss.item()
-            epoch_steps += 1
-            global_step += 1
-            total_loss += loss.item()
-            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
-            # Periodic evaluation
-            if global_step % eval_every == 0:
-                model.eval()
-                eval_results = evaluate_arithmetic(model, tokenizer, n_problems=50, device=device)
-                print(f"\n    Step {global_step}: Loss={total_loss/eval_every:.4f}, "
-                      f"Accuracy={eval_results['accuracy']*100:.1f}%")
-                total_loss = 0
-                model.train()
-        avg_loss = epoch_loss / epoch_steps
-        print(f"\n  Epoch {epoch + 1} complete. Avg loss: {avg_loss:.4f}")
-        # End of epoch evaluation
-        model.eval()
-        eval_results = evaluate_arithmetic(model, tokenizer, n_problems=100, device=device)
-        print(f"  Evaluation: {eval_results['accuracy']*100:.1f}% "
-              f"({eval_results['correct']}/{eval_results['total']})")
-        if eval_results['errors']:
-            print(f"  Sample errors:")
-            for a, b, exp, got in eval_results['errors'][:3]:
-                print(f"    {a} + {b} = {exp}, model said {got}")
-        model.train()
-    print("\n" + "=" * 70)
-    print(" TRAINING COMPLETE")
-    print("=" * 70)
-    return model
-# =============================================================================
-# MAIN
-# =============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Train Circuit Interface')
-    parser.add_argument('--circuit-path', type=str,
-                        default='./neural_computer.safetensors',
-                        help='Path to circuit weights')
-    parser.add_argument('--device', type=str, default='cpu',
-                        help='Device (cpu or cuda)')
-    parser.add_argument('--epochs', type=int, default=3,
-                        help='Number of epochs')
-    parser.add_argument('--batch-size', type=int, default=8,
-                        help='Batch size')
-    parser.add_argument('--lr', type=float, default=1e-4,
-                        help='Learning rate')
-    parser.add_argument('--n-samples', type=int, default=5000,
-                        help='Number of training samples')
-    args = parser.parse_args()
-    print("=" * 70)
-    print(" CIRCUIT-AUGMENTED LLM TRAINING")
-    print("=" * 70)
-    # Load model
-    print("\n[1] Loading SmolLM2-360M...")
-    model_id = "HuggingFaceTB/SmolLM2-360M"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
-    # Baseline
-    print("\n[2] Baseline evaluation...")
-    baseline = evaluate_arithmetic(model, tokenizer, n_problems=50, device=args.device)
-    print(f"    Baseline accuracy: {baseline['accuracy']*100:.1f}%")
-    # Augment
-    print("\n[3] Augmenting with circuits...")
-    model = augment_smollm2_with_circuits(
-        model,
-        args.circuit_path,
-        device=args.device
-    )
-    # Train
-    print("\n[4] Training interface layers...")
-    model = train_interface(
-        model,
-        tokenizer,
-        n_epochs=args.epochs,
-        batch_size=args.batch_size,
-        lr=args.lr,
-        n_train_samples=args.n_samples,
-        device=args.device
-    )
-    # Final evaluation
-    print("\n[5] Final evaluation...")
-    final = evaluate_arithmetic(model, tokenizer, n_problems=100, device=args.device)
-    print(f"    Final accuracy: {final['accuracy']*100:.1f}%")
-    print(f"    Improvement: {baseline['accuracy']*100:.1f}% -> {final['accuracy']*100:.1f}%")
-    # Save
-    save_path = './circuit_augmented_smollm2.pt'
-    print(f"\n[6] Saving to {save_path}...")
-    torch.save({
-        'model_state_dict': model.state_dict(),
-        'baseline_accuracy': baseline['accuracy'],
-        'final_accuracy': final['accuracy']
-    }, save_path)
-    print("\nDone!")

+"""
+Train the circuit interface layers on arithmetic examples.
+============================================================
+The threshold circuits are frozen - we only train:
+- BitExtractor: embedding -> operand bits
+- BitInjector: result bits -> embedding
+- Router: when to use circuits vs MLP
+- op_selector: which operation to use
+"""
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset, DataLoader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from tqdm import tqdm
+import argparse
+import warnings
+warnings.filterwarnings('ignore')
+from circuit_llm import (
+    augment_smollm2_with_circuits,
+    evaluate_arithmetic,
+    CircuitExecutor
+)
+# =============================================================================
+# ARITHMETIC DATASET
+# =============================================================================
+class ArithmeticDataset(Dataset):  # Pre-generated "a + b =" prompts, each paired with its sum modulo 256.
+    """Dataset of 8-bit addition problems."""
+    def __init__(self, tokenizer, n_samples: int = 10000, max_val: int = 255):
+        self.tokenizer = tokenizer
+        self.n_samples = n_samples
+        self.max_val = max_val
+        # Pre-generate all examples
+        self.examples = []
+        for _ in range(n_samples):
+            a = torch.randint(0, max_val + 1, (1,)).item()  # randint's upper bound is exclusive, so max_val itself can be drawn
+            b = torch.randint(0, max_val + 1, (1,)).item()
+            result = (a + b) % 256  # wraps at 256 regardless of max_val
+            prompt = f"{a} + {b} ="
+            target = f" {result}"  # answer text starts with a space so it follows "=" as "= N"
+            self.examples.append((prompt, target, a, b, result))
+    def __len__(self):
+        return len(self.examples)
+    def __getitem__(self, idx):
+        prompt, target, a, b, result = self.examples[idx]
+        # Tokenize
+        prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False)  # prompt and target are tokenized separately, then concatenated
+        target_ids = self.tokenizer.encode(target, add_special_tokens=False)
+        input_ids = prompt_ids + target_ids
+        labels = [-100] * len(prompt_ids) + target_ids  # Only predict target
+        return {
+            'input_ids': torch.tensor(input_ids),
+            'labels': torch.tensor(labels),
+            'a': a,  # raw operands and expected sum are returned alongside the tensors
+            'b': b,
+            'result': result
+        }
+def collate_fn(batch):  # Right-pads every item to the longest sequence in the batch and stacks the results.
+    """Collate with padding."""
+    max_len = max(len(item['input_ids']) for item in batch)
+    input_ids = []
+    labels = []
+    attention_mask = []
+    for item in batch:
+        pad_len = max_len - len(item['input_ids'])
+        input_ids.append(
+            torch.cat([item['input_ids'], torch.zeros(pad_len, dtype=torch.long)])  # pads with token id 0, not the tokenizer's pad id
+        )
+        labels.append(
+            torch.cat([item['labels'], torch.full((pad_len,), -100, dtype=torch.long)])  # padded label positions get -100, same as the prompt
+        )
+        attention_mask.append(
+            torch.cat([torch.ones(len(item['input_ids'])), torch.zeros(pad_len)])  # 1 for real tokens, 0 for padding; default (float) dtype
+        )
+    return {  # only these three keys are returned; an item's 'a', 'b', 'result' are not forwarded
+        'input_ids': torch.stack(input_ids),
+        'labels': torch.stack(labels),
+        'attention_mask': torch.stack(attention_mask),
+    }
+# =============================================================================
+# TRAINING LOOP
+# =============================================================================
+def train_interface(  # Freezes all non-interface parameters, trains the rest with AdamW, and returns the same model object.
+    model: AutoModelForCausalLM,
+    tokenizer: AutoTokenizer,
+    n_epochs: int = 3,
+    batch_size: int = 16,
+    lr: float = 1e-4,
+    n_train_samples: int = 10000,
+    device: str = 'cpu',
+    eval_every: int = 500  # run a 50-problem evaluation every this many optimizer steps
+):
+    """
+    Train the circuit interface layers.
+    Only trains:
+    - bit_extractor (embedding -> bits)
+    - bit_injector (bits -> embedding)
+    - router (circuit vs MLP weighting)
+    - op_selector (which operation)
+    """
+    print("\n" + "=" * 70)
+    print(" TRAINING CIRCUIT INTERFACE")
+    print("=" * 70)
+    # Freeze everything except interface layers
+    interface_params = []
+    frozen_count = 0
+    trainable_count = 0
+    for name, param in model.named_parameters():
+        if any(x in name for x in ['bit_extractor', 'bit_injector', 'router', 'op_selector']):  # substring match on the parameter's qualified name
+            param.requires_grad = True
+            interface_params.append(param)
+            trainable_count += param.numel()
+        else:
+            param.requires_grad = False
+            frozen_count += param.numel()
+    print(f"\n  Frozen parameters: {frozen_count:,}")
+    print(f"  Trainable parameters: {trainable_count:,}")
+    print(f"  Training {len(interface_params)} parameter groups")  # this is the number of parameter tensors collected above
+    # Create dataset
+    print(f"\n  Creating dataset ({n_train_samples} examples)...")
+    dataset = ArithmeticDataset(tokenizer, n_samples=n_train_samples)  # max_val is left at the dataset's default
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        collate_fn=collate_fn
+    )
+    # Optimizer
+    optimizer = torch.optim.AdamW(interface_params, lr=lr)  # only the interface parameters are handed to the optimizer
+    # Training
+    model.to(device)
+    model.train()
+    global_step = 0  # counts optimizer steps across all epochs
+    total_loss = 0  # loss summed since the last periodic evaluation
+    for epoch in range(n_epochs):
+        print(f"\n  Epoch {epoch + 1}/{n_epochs}")
+        print("  " + "-" * 60)
+        epoch_loss = 0
+        epoch_steps = 0
+        pbar = tqdm(dataloader, desc=f"  Training", leave=False)
+        for batch in pbar:
+            input_ids = batch['input_ids'].to(device)
+            labels = batch['labels'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            # Forward
+            outputs = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                labels=labels  # passing labels makes the model return .loss
+            )
+            loss = outputs.loss
+            # Backward
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            # Logging
+            epoch_loss += loss.item()
+            epoch_steps += 1
+            global_step += 1
+            total_loss += loss.item()
+            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
+            # Periodic evaluation
+            if global_step % eval_every == 0:
+                model.eval()
+                eval_results = evaluate_arithmetic(model, tokenizer, n_problems=50, device=device)
+                print(f"\n    Step {global_step}: Loss={total_loss/eval_every:.4f}, "  # mean loss over the eval_every steps since the last reset
+                      f"Accuracy={eval_results['accuracy']*100:.1f}%")
+                total_loss = 0
+                model.train()  # switch back to training mode after evaluating
+        avg_loss = epoch_loss / epoch_steps  # epoch_steps is 0 (ZeroDivisionError) if the dataloader produced no batches
+        print(f"\n  Epoch {epoch + 1} complete. Avg loss: {avg_loss:.4f}")
+        # End of epoch evaluation
+        model.eval()
+        eval_results = evaluate_arithmetic(model, tokenizer, n_problems=100, device=device)  # read below via keys 'accuracy', 'correct', 'total', 'errors'
+        print(f"  Evaluation: {eval_results['accuracy']*100:.1f}% "
+              f"({eval_results['correct']}/{eval_results['total']})")
+        if eval_results['errors']:
+            print(f"  Sample errors:")
+            for a, b, exp, got in eval_results['errors'][:3]:  # show at most three failures; each entry unpacks as (a, b, expected, got)
+                print(f"    {a} + {b} = {exp}, model said {got}")
+        model.train()
+    print("\n" + "=" * 70)
+    print(" TRAINING COMPLETE")
+    print("=" * 70)
+    return model
+# =============================================================================
+# MAIN
+# =============================================================================
+if __name__ == "__main__":  # Script entry: baseline eval -> augment with circuits -> train interface -> final eval -> save.
+    parser = argparse.ArgumentParser(description='Train Circuit Interface')
+    parser.add_argument('--circuit-path', type=str,
+                        default='./neural_computer.safetensors',
+                        help='Path to circuit weights')
+    parser.add_argument('--device', type=str, default='cpu',
+                        help='Device (cpu or cuda)')
+    parser.add_argument('--epochs', type=int, default=3,
+                        help='Number of epochs')
+    parser.add_argument('--batch-size', type=int, default=8,  # CLI default (8) differs from train_interface's own default (16)
+                        help='Batch size')
+    parser.add_argument('--lr', type=float, default=1e-4,
+                        help='Learning rate')
+    parser.add_argument('--n-samples', type=int, default=5000,  # CLI default (5000) differs from train_interface's own default (10000)
+                        help='Number of training samples')
+    args = parser.parse_args()
+    print("=" * 70)
+    print(" CIRCUIT-AUGMENTED LLM TRAINING")
+    print("=" * 70)
+    # Load model
+    print("\n[1] Loading SmolLM2-360M...")
+    model_id = "HuggingFaceTB/SmolLM2-360M"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as the padding token
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
+    # Baseline
+    print("\n[2] Baseline evaluation...")
+    baseline = evaluate_arithmetic(model, tokenizer, n_problems=50, device=args.device)  # measured before the circuits are attached
+    print(f"    Baseline accuracy: {baseline['accuracy']*100:.1f}%")
+    # Augment
+    print("\n[3] Augmenting with circuits...")
+    model = augment_smollm2_with_circuits(
+        model,
+        args.circuit_path,
+        device=args.device
+    )
+    # Train
+    print("\n[4] Training interface layers...")
+    model = train_interface(  # eval_every is not passed, so train_interface's default applies
+        model,
+        tokenizer,
+        n_epochs=args.epochs,
+        batch_size=args.batch_size,
+        lr=args.lr,
+        n_train_samples=args.n_samples,
+        device=args.device
+    )
+    # Final evaluation
+    print("\n[5] Final evaluation...")
+    final = evaluate_arithmetic(model, tokenizer, n_problems=100, device=args.device)  # 100 problems here vs 50 for the baseline
+    print(f"    Final accuracy: {final['accuracy']*100:.1f}%")
+    print(f"    Improvement: {baseline['accuracy']*100:.1f}% -> {final['accuracy']*100:.1f}%")
+    # Save
+    save_path = './circuit_augmented_smollm2.pt'
+    print(f"\n[6] Saving to {save_path}...")
+    torch.save({
+        'model_state_dict': model.state_dict(),  # full model state dict, not just the trained interface parameters
+        'baseline_accuracy': baseline['accuracy'],
+        'final_accuracy': final['accuracy']
+    }, save_path)
+    print("\nDone!")

neural_computer.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:51d4e2725c0d24bce807a5b7dc58319e9eed0f95c17fc39e662272ed0cbe8f1f
-size 351104

 version https://git-lfs.github.com/spec/v1
+oid sha256:ec37339654639ab0a16a32fe5324f6bc1ed02d457d0936383ea9993c19edc92a
+size 358696

tensors.txt CHANGED Viewed

The diff for this file is too large to render. See raw diff

todo.md CHANGED Viewed

@@ -6,7 +6,7 @@
 |-------------------------|-----------------------------|--------------------------------|
 | SUB                     | Subtraction circuit         | Missing - need NOT(B)+1+A path |
 | DIV                     | Division circuit            | Missing                        |
-| NEG                     | Two's complement negate     | Missing                        |
 | Program Counter         | PC register + increment     | Missing                        |
 | PC Load                 | Load PC from jump target    | Missing                        |
 | Register File MUX       | Select 1-of-4 GPRs          | Missing                        |

 |-------------------------|-----------------------------|--------------------------------|
 | SUB                     | Subtraction circuit         | Missing - need NOT(B)+1+A path |
 | DIV                     | Division circuit            | Missing                        |
+| NEG                     | Two's complement negate     | DONE - 76 tensors, 256/256 tests pass |
 | Program Counter         | PC register + increment     | Missing                        |
 | PC Load                 | Load PC from jump target    | Missing                        |
 | Register File MUX       | Select 1-of-4 GPRs          | Missing                        |