Commit 240c04c (parent 438de6a), committed by CharlesCNorton

Add IEEE 754 float16/float32 arithmetic circuits


Float16 (half-precision):
- Core: unpack/pack/classify/normalize with 4-stage barrel shifter
- ADD: exponent compare, mantissa alignment, add/sub with mux select
- MUL: sign XOR, exponent add, bias subtract, 11x11 mantissa multiply
- DIV: 11-stage restoring division with mux per stage
- CMP: NaN/zero detection, magnitude compare (EQ/LT/GT/LE/GE)
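For reference, the float16 field layout and special-value classification that these circuits implement can be sketched in plain Python (an illustrative sketch of the bit fields only; `classify_f16` is a hypothetical helper, not part of build.py — the actual circuits express the same tests as neural gates):

```python
def classify_f16(bits: int) -> str:
    """Classify a raw 16-bit IEEE 754 half-precision pattern."""
    exp = (bits >> 10) & 0x1F   # bits 14-10, 5-bit exponent, bias 15
    frac = bits & 0x3FF         # bits 9-0, 10-bit fraction
    if exp == 0:
        return "zero" if frac == 0 else "subnormal"
    if exp == 0x1F:             # all-ones exponent
        return "inf" if frac == 0 else "nan"
    return "normal"             # implicit leading 1 on the mantissa
```

The `exp_zero`/`exp_max`/`frac_zero`/`frac_nonzero` gates in the core circuit compute exactly these predicates, and the `is_*` AND gates combine them.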

Float32 (single-precision):
- Core: 5-stage normalize, 8-bit exponent, 23-bit mantissa
- ADD: 5-stage alignment, 25-bit mantissa operations
- MUL: 24x24 mantissa multiply (576 partial products)
- DIV: 24-stage restoring division
- CMP: same structure with larger weight vectors
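The restoring-division scheme used by both DIV circuits (11 stages for float16, 24 for float32, one quotient bit per stage) can be sketched as integer code (an illustrative sketch under the stated stage counts, not the gate-level netlist; `restoring_divide` is a hypothetical name):

```python
def restoring_divide(n: int, d: int, stages: int) -> int:
    """Binary restoring division of a stages-bit numerator by d >= 1."""
    rem, quot = 0, 0
    for i in range(stages):
        # Shift the next numerator bit (MSB first) into the partial remainder.
        rem = (rem << 1) | ((n >> (stages - 1 - i)) & 1)
        if rem >= d:             # trial subtraction succeeds: quotient bit = 1
            rem -= d
            quot = (quot << 1) | 1
        else:                    # "restore": keep rem unchanged, quotient bit = 0
            quot <<= 1
    return quot
```

Each hardware stage realizes the `rem >= d` test as a weighted comparison gate and the keep/restore choice as a per-bit mux, which is why the circuit carries one `cmp` gate and one mux per stage.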

Integration:
- Wire float circuits into cmd_alu build command
- Add float prefixes to drop_list for clean rebuilds
- Add comprehensive eval tests for all float circuit shapes

Test results:
- 8-bit CPU: 6772/6772 (100%)
- 32-bit ALU: 7239/7256 (99.8%, only pre-existing priority encoder issue)

Files changed (4)
  1. build.py +729 -0
  2. eval.py +624 -1
  3. neural_alu32.safetensors +2 -2
  4. neural_computer.safetensors +2 -2
build.py CHANGED
@@ -998,6 +998,642 @@ def add_neg_nbits(tensors: Dict[str, torch.Tensor], bits: int) -> None:
      add_gate(tensors, f"alu.alu{bits}bit.neg.inc.bit{bit}.carry", [1.0, 1.0], [-2.0])
 
 
  def update_manifest(tensors: Dict[str, torch.Tensor], data_bits: int, addr_bits: int, mem_bytes: int) -> None:
      """Update manifest metadata tensors.
 
@@ -2243,6 +2879,7 @@ def cmd_alu(args) -> None:
          "arithmetic.greaterorequal8bit.", "arithmetic.lessorequal8bit.",
          "arithmetic.equality8bit.", "arithmetic.add3_8bit.", "arithmetic.expr_add_mul.", "arithmetic.expr_paren.",
          "combinational.barrelshifter.", "combinational.priorityencoder.",
+
      ]
 
      if bits in [16, 32]:
@@ -2397,6 +3034,98 @@ def cmd_alu(args) -> None:
          except ValueError as e:
              print(f" {bits}-bit NEG already exists: {e}")
 
      if args.apply:
          print(f"\nSaving: {args.model}")
          save_file(tensors, str(args.model))
998
  add_gate(tensors, f"alu.alu{bits}bit.neg.inc.bit{bit}.carry", [1.0, 1.0], [-2.0])
999
 
1000
 
1001
+ def add_float16_core(tensors: Dict[str, torch.Tensor]) -> None:
1002
+ """Add float16 core circuits (unpack, pack, classify, normalize).
1003
+
1004
+ IEEE 754 half-precision format (16 bits):
1005
+ - Bit 15: Sign (0=positive, 1=negative)
1006
+ - Bits 14-10: Exponent (5 bits, bias=15)
1007
+ - Bits 9-0: Mantissa/fraction (10 bits, implicit leading 1 for normalized)
1008
+
1009
+ Special values:
1010
+ - Zero: exp=0, frac=0
1011
+ - Subnormal: exp=0, frac≠0
1012
+ - Infinity: exp=31, frac=0
1013
+ - NaN: exp=31, frac≠0
1014
+ """
1015
+ prefix = "float16"
1016
+
1017
+ for i in range(16):
1018
+ add_gate(tensors, f"{prefix}.unpack.bit{i}", [1.0], [0.0])
1019
+
1020
+ add_gate(tensors, f"{prefix}.classify.exp_zero", [-1.0] * 5, [0.0])
1021
+ add_gate(tensors, f"{prefix}.classify.exp_max", [1.0] * 5, [-5.0])
1022
+ add_gate(tensors, f"{prefix}.classify.frac_zero", [-1.0] * 10, [0.0])
1023
+ add_gate(tensors, f"{prefix}.classify.frac_nonzero", [1.0] * 10, [-1.0])
1024
+
1025
+ add_gate(tensors, f"{prefix}.classify.is_zero.and", [1.0, 1.0], [-2.0])
1026
+ add_gate(tensors, f"{prefix}.classify.is_subnormal.and", [1.0, 1.0], [-2.0])
1027
+ add_gate(tensors, f"{prefix}.classify.is_inf.and", [1.0, 1.0], [-2.0])
1028
+ add_gate(tensors, f"{prefix}.classify.is_nan.and", [1.0, 1.0], [-2.0])
1029
+
1030
+ for stage in range(4):
1031
+ shift = 1 << (3 - stage)
1032
+ for bit in range(11):
1033
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.not_sel", [-1.0], [0.0])
1034
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.and_a", [1.0, 1.0], [-2.0])
1035
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.and_b", [1.0, 1.0], [-2.0])
1036
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.or", [1.0, 1.0], [-1.0])
1037
+
1038
+ for stage in range(4):
1039
+ shift = 1 << (3 - stage)
1040
+ for bit in range(5):
1041
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1042
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1043
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1044
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1045
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1046
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1047
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1048
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1049
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1050
+
1051
+ for i in range(16):
1052
+ add_gate(tensors, f"{prefix}.pack.bit{i}", [1.0], [0.0])
1053
+
1054
+
1055
+ def add_float16_add(tensors: Dict[str, torch.Tensor]) -> None:
1056
+ """Add float16 addition circuit.
1057
+
1058
+ Algorithm:
1059
+ 1. Unpack both operands
1060
+ 2. Compare exponents, align mantissas
1061
+ 3. Add/subtract mantissas based on signs
1062
+ 4. Normalize result
1063
+ 5. Handle special cases (inf, nan, zero)
1064
+ """
1065
+ prefix = "float16.add"
1066
+
1067
+ pos_weights = [float(1 << (4 - i)) for i in range(5)]
1068
+ neg_weights = [-w for w in pos_weights]
1069
+ add_gate(tensors, f"{prefix}.exp_cmp.a_gt_b", pos_weights + neg_weights, [-1.0])
1070
+ add_gate(tensors, f"{prefix}.exp_cmp.a_lt_b", neg_weights + pos_weights, [-1.0])
1071
+
1072
+ for bit in range(5):
1073
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1074
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1075
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1076
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1077
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1078
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1079
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1080
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1081
+ add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1082
+ add_gate(tensors, f"{prefix}.exp_diff.not_b.bit{bit}", [-1.0], [0.0])
1083
+
1084
+ for stage in range(4):
1085
+ shift = 1 << (3 - stage)
1086
+ for bit in range(11):
1087
+ add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.not_sel", [-1.0], [0.0])
1088
+ add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.and_a", [1.0, 1.0], [-2.0])
1089
+ add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.and_b", [1.0, 1.0], [-2.0])
1090
+ add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.or", [1.0, 1.0], [-1.0])
1091
+
1092
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
1093
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
1094
+ add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
1095
+
1096
+ for bit in range(12):
1097
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1098
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1099
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1100
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1101
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1102
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1103
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1104
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1105
+ add_gate(tensors, f"{prefix}.mant_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1106
+
1107
+ for bit in range(11):
1108
+ add_gate(tensors, f"{prefix}.mant_sub.not_b.bit{bit}", [-1.0], [0.0])
1109
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1110
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1111
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1112
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1113
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1114
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1115
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1116
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1117
+ add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1118
+
1119
+ for bit in range(11):
1120
+ add_gate(tensors, f"{prefix}.mant_select.bit{bit}.not_sel", [-1.0], [0.0])
1121
+ add_gate(tensors, f"{prefix}.mant_select.bit{bit}.and_add", [1.0, 1.0], [-2.0])
1122
+ add_gate(tensors, f"{prefix}.mant_select.bit{bit}.and_sub", [1.0, 1.0], [-2.0])
1123
+ add_gate(tensors, f"{prefix}.mant_select.bit{bit}.or", [1.0, 1.0], [-1.0])
1124
+
1125
+
1126
+ def add_float16_mul(tensors: Dict[str, torch.Tensor]) -> None:
1127
+ """Add float16 multiplication circuit.
1128
+
1129
+ Algorithm:
1130
+ 1. Unpack both operands
1131
+ 2. XOR signs for result sign
1132
+ 3. Add exponents (subtract bias)
1133
+ 4. Multiply mantissas (11x11 -> 22 bits)
1134
+ 5. Normalize result
1135
+ 6. Handle special cases
1136
+ """
1137
+ prefix = "float16.mul"
1138
+
1139
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
1140
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
1141
+ add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
1142
+
1143
+ for bit in range(6):
1144
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1145
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1146
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1147
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1148
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1149
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1150
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1151
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1152
+ add_gate(tensors, f"{prefix}.exp_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1153
+
1154
+ for bit in range(5):
1155
+ add_gate(tensors, f"{prefix}.bias_sub.not_bias.bit{bit}", [-1.0], [0.0])
1156
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1157
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1158
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1159
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1160
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1161
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1162
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1163
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1164
+ add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1165
+
1166
+ for i in range(11):
1167
+ for j in range(11):
1168
+ add_gate(tensors, f"{prefix}.mant_mul.pp.a{i}b{j}", [1.0, 1.0], [-2.0])
1169
+
1170
+ for stage in range(10):
1171
+ for bit in range(22):
1172
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1173
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1174
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1175
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1176
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1177
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1178
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1179
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1180
+ add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1181
+
1182
+
1183
+ def add_float16_div(tensors: Dict[str, torch.Tensor]) -> None:
1184
+ """Add float16 division circuit.
1185
+
1186
+ Algorithm:
1187
+ 1. Unpack both operands
1188
+ 2. XOR signs for result sign
1189
+ 3. Subtract exponents (add bias)
1190
+ 4. Divide mantissas (restoring division)
1191
+ 5. Normalize result
1192
+ 6. Handle special cases (div by zero -> inf)
1193
+ """
1194
+ prefix = "float16.div"
1195
+
1196
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
1197
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
1198
+ add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
1199
+
1200
+ for bit in range(5):
1201
+ add_gate(tensors, f"{prefix}.exp_sub.not_b.bit{bit}", [-1.0], [0.0])
1202
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1203
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1204
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1205
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1206
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1207
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1208
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1209
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1210
+ add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1211
+
1212
+ for bit in range(5):
1213
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1214
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1215
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1216
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1217
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1218
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1219
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1220
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1221
+ add_gate(tensors, f"{prefix}.bias_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1222
+
1223
+ for stage in range(11):
1224
+ pos_weights = [float(1 << (10 - i)) for i in range(11)]
1225
+ neg_weights = [-w for w in pos_weights]
1226
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.cmp", pos_weights + neg_weights, [0.0])
1227
+
1228
+ for bit in range(11):
1229
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.not_d.bit{bit}", [-1.0], [0.0])
1230
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1231
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1232
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1233
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1234
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1235
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1236
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1237
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1238
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1239
+
1240
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.not_sel", [-1.0], [0.0])
1241
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.and_old", [1.0, 1.0], [-2.0])
1242
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.and_new", [1.0, 1.0], [-2.0])
1243
+ add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.or", [1.0, 1.0], [-1.0])
1244
+
1245
+
1246
+ def add_float16_cmp(tensors: Dict[str, torch.Tensor]) -> None:
1247
+ """Add float16 comparison circuits (EQ, LT, LE, GT, GE).
1248
+
1249
+ Float comparison:
1250
+ 1. Handle NaN (any comparison with NaN is false except NaN != NaN)
1251
+ 2. Handle signed zeros (+0 == -0)
1252
+ 3. For same signs: compare as integers (exponent then mantissa)
1253
+ 4. For different signs: negative < positive (unless both zero)
1254
+ """
1255
+ prefix = "float16.cmp"
1256
+
1257
+ add_gate(tensors, f"{prefix}.a.exp_max", [1.0] * 5, [-5.0])
1258
+ add_gate(tensors, f"{prefix}.a.frac_nz", [1.0] * 10, [-1.0])
1259
+ add_gate(tensors, f"{prefix}.a.is_nan", [1.0, 1.0], [-2.0])
1260
+
1261
+ add_gate(tensors, f"{prefix}.b.exp_max", [1.0] * 5, [-5.0])
1262
+ add_gate(tensors, f"{prefix}.b.frac_nz", [1.0] * 10, [-1.0])
1263
+ add_gate(tensors, f"{prefix}.b.is_nan", [1.0, 1.0], [-2.0])
1264
+
1265
+ add_gate(tensors, f"{prefix}.either_nan", [1.0, 1.0], [-1.0])
1266
+
1267
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
1268
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
1269
+ add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
1270
+
1271
+ add_gate(tensors, f"{prefix}.a.is_zero.exp_zero", [-1.0] * 5, [0.0])
1272
+ add_gate(tensors, f"{prefix}.a.is_zero.frac_zero", [-1.0] * 10, [0.0])
1273
+ add_gate(tensors, f"{prefix}.a.is_zero.and", [1.0, 1.0], [-2.0])
1274
+
1275
+ add_gate(tensors, f"{prefix}.b.is_zero.exp_zero", [-1.0] * 5, [0.0])
1276
+ add_gate(tensors, f"{prefix}.b.is_zero.frac_zero", [-1.0] * 10, [0.0])
1277
+ add_gate(tensors, f"{prefix}.b.is_zero.and", [1.0, 1.0], [-2.0])
1278
+
1279
+ add_gate(tensors, f"{prefix}.both_zero", [1.0, 1.0], [-2.0])
1280
+
1281
+ pos_weights = [float(1 << (14 - i)) for i in range(15)]
1282
+ neg_weights = [-w for w in pos_weights]
1283
+ add_gate(tensors, f"{prefix}.mag_a_gt_b", pos_weights + neg_weights, [-1.0])
1284
+ add_gate(tensors, f"{prefix}.mag_a_ge_b", pos_weights + neg_weights, [0.0])
1285
+ add_gate(tensors, f"{prefix}.mag_a_lt_b", neg_weights + pos_weights, [-1.0])
1286
+ add_gate(tensors, f"{prefix}.mag_a_le_b", neg_weights + pos_weights, [0.0])
1287
+
1288
+ add_gate(tensors, f"{prefix}.mag_eq.geq", pos_weights + neg_weights, [0.0])
1289
+ add_gate(tensors, f"{prefix}.mag_eq.leq", neg_weights + pos_weights, [0.0])
1290
+ add_gate(tensors, f"{prefix}.mag_eq.and", [1.0, 1.0], [-2.0])
1291
+
1292
+ add_gate(tensors, f"{prefix}.eq.not_nan", [-1.0], [0.0])
1293
+ add_gate(tensors, f"{prefix}.eq.mag_or_zero", [1.0, 1.0], [-1.0])
1294
+ add_gate(tensors, f"{prefix}.eq.same_sign_or_zero", [1.0, 1.0], [-1.0])
1295
+ add_gate(tensors, f"{prefix}.eq.result", [1.0, 1.0], [-2.0])
1296
+
1297
+ add_gate(tensors, f"{prefix}.lt.not_nan", [-1.0], [0.0])
1298
+ add_gate(tensors, f"{prefix}.lt.diff_sign.not_a_sign", [-1.0], [0.0])
1299
+ add_gate(tensors, f"{prefix}.lt.diff_sign.a_neg", [1.0, 1.0], [-2.0])
1300
+ add_gate(tensors, f"{prefix}.lt.same_sign.pos_lt", [1.0, 1.0], [-2.0])
1301
+ add_gate(tensors, f"{prefix}.lt.same_sign.neg_gt", [1.0, 1.0], [-2.0])
1302
+ add_gate(tensors, f"{prefix}.lt.same_sign.or", [1.0, 1.0], [-1.0])
1303
+ add_gate(tensors, f"{prefix}.lt.case_or", [1.0, 1.0], [-1.0])
1304
+ add_gate(tensors, f"{prefix}.lt.not_both_zero", [-1.0], [0.0])
1305
+ add_gate(tensors, f"{prefix}.lt.result", [1.0, 1.0, 1.0], [-3.0])
1306
+
1307
+ add_gate(tensors, f"{prefix}.gt.not_nan", [-1.0], [0.0])
1308
+ add_gate(tensors, f"{prefix}.gt.diff_sign.not_b_sign", [-1.0], [0.0])
1309
+ add_gate(tensors, f"{prefix}.gt.diff_sign.b_neg", [1.0, 1.0], [-2.0])
1310
+ add_gate(tensors, f"{prefix}.gt.same_sign.pos_gt", [1.0, 1.0], [-2.0])
1311
+ add_gate(tensors, f"{prefix}.gt.same_sign.neg_lt", [1.0, 1.0], [-2.0])
1312
+ add_gate(tensors, f"{prefix}.gt.same_sign.or", [1.0, 1.0], [-1.0])
1313
+ add_gate(tensors, f"{prefix}.gt.case_or", [1.0, 1.0], [-1.0])
1314
+ add_gate(tensors, f"{prefix}.gt.not_both_zero", [-1.0], [0.0])
1315
+ add_gate(tensors, f"{prefix}.gt.result", [1.0, 1.0, 1.0], [-3.0])
1316
+
1317
+ add_gate(tensors, f"{prefix}.le.eq_or_lt", [1.0, 1.0], [-1.0])
1318
+ add_gate(tensors, f"{prefix}.le.not_nan", [-1.0], [0.0])
1319
+ add_gate(tensors, f"{prefix}.le.result", [1.0, 1.0], [-2.0])
1320
+
1321
+ add_gate(tensors, f"{prefix}.ge.eq_or_gt", [1.0, 1.0], [-1.0])
1322
+ add_gate(tensors, f"{prefix}.ge.not_nan", [-1.0], [0.0])
1323
+ add_gate(tensors, f"{prefix}.ge.result", [1.0, 1.0], [-2.0])
1324
+
1325
+
1326
+ def add_float32_core(tensors: Dict[str, torch.Tensor]) -> None:
1327
+ """Add float32 core circuits (unpack, pack, classify, normalize).
1328
+
1329
+ IEEE 754 single-precision format (32 bits):
1330
+ - Bit 31: Sign
1331
+ - Bits 30-23: Exponent (8 bits, bias=127)
1332
+ - Bits 22-0: Mantissa (23 bits, implicit leading 1)
1333
+ """
1334
+ prefix = "float32"
1335
+
1336
+ for i in range(32):
1337
+ add_gate(tensors, f"{prefix}.unpack.bit{i}", [1.0], [0.0])
1338
+
1339
+ add_gate(tensors, f"{prefix}.classify.exp_zero", [-1.0] * 8, [0.0])
1340
+ add_gate(tensors, f"{prefix}.classify.exp_max", [1.0] * 8, [-8.0])
1341
+ add_gate(tensors, f"{prefix}.classify.frac_zero", [-1.0] * 23, [0.0])
1342
+ add_gate(tensors, f"{prefix}.classify.frac_nonzero", [1.0] * 23, [-1.0])
1343
+
1344
+ add_gate(tensors, f"{prefix}.classify.is_zero.and", [1.0, 1.0], [-2.0])
1345
+ add_gate(tensors, f"{prefix}.classify.is_subnormal.and", [1.0, 1.0], [-2.0])
1346
+ add_gate(tensors, f"{prefix}.classify.is_inf.and", [1.0, 1.0], [-2.0])
1347
+ add_gate(tensors, f"{prefix}.classify.is_nan.and", [1.0, 1.0], [-2.0])
1348
+
1349
+ for stage in range(5):
1350
+ shift = 1 << (4 - stage)
1351
+ for bit in range(24):
1352
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.not_sel", [-1.0], [0.0])
1353
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.and_a", [1.0, 1.0], [-2.0])
1354
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.and_b", [1.0, 1.0], [-2.0])
1355
+ add_gate(tensors, f"{prefix}.normalize.stage{stage}.bit{bit}.or", [1.0, 1.0], [-1.0])
1356
+
1357
+ for stage in range(5):
1358
+ for bit in range(8):
1359
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
1360
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
1361
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
1362
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
1363
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
1364
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
1365
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
1366
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
1367
+ add_gate(tensors, f"{prefix}.normalize.exp_adj.stage{stage}.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
1368
+
1369
+ for i in range(32):
1370
+ add_gate(tensors, f"{prefix}.pack.bit{i}", [1.0], [0.0])
1371
+
1372
+
1373
+ def add_float32_cmp(tensors: Dict[str, torch.Tensor]) -> None:
1374
+ """Add float32 comparison circuits (EQ, LT, LE, GT, GE)."""
1375
+ prefix = "float32.cmp"
1376
+
1377
+ add_gate(tensors, f"{prefix}.a.exp_max", [1.0] * 8, [-8.0])
1378
+ add_gate(tensors, f"{prefix}.a.frac_nz", [1.0] * 23, [-1.0])
1379
+ add_gate(tensors, f"{prefix}.a.is_nan", [1.0, 1.0], [-2.0])
1380
+
1381
+ add_gate(tensors, f"{prefix}.b.exp_max", [1.0] * 8, [-8.0])
1382
+ add_gate(tensors, f"{prefix}.b.frac_nz", [1.0] * 23, [-1.0])
1383
+ add_gate(tensors, f"{prefix}.b.is_nan", [1.0, 1.0], [-2.0])
1384
+
1385
+ add_gate(tensors, f"{prefix}.either_nan", [1.0, 1.0], [-1.0])
1386
+
1387
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
1388
+ add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
1389
+ add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
1390
+
1391
+ add_gate(tensors, f"{prefix}.a.is_zero.exp_zero", [-1.0] * 8, [0.0])
1392
+ add_gate(tensors, f"{prefix}.a.is_zero.frac_zero", [-1.0] * 23, [0.0])
1393
+ add_gate(tensors, f"{prefix}.a.is_zero.and", [1.0, 1.0], [-2.0])
1394
+
1395
+ add_gate(tensors, f"{prefix}.b.is_zero.exp_zero", [-1.0] * 8, [0.0])
1396
+ add_gate(tensors, f"{prefix}.b.is_zero.frac_zero", [-1.0] * 23, [0.0])
1397
+ add_gate(tensors, f"{prefix}.b.is_zero.and", [1.0, 1.0], [-2.0])
1398
+
1399
+ add_gate(tensors, f"{prefix}.both_zero", [1.0, 1.0], [-2.0])
1400
+
1401
+ pos_weights = [float(1 << (30 - i)) for i in range(31)]
1402
+ neg_weights = [-w for w in pos_weights]
1403
+ add_gate(tensors, f"{prefix}.mag_a_gt_b", pos_weights + neg_weights, [-1.0])
1404
+ add_gate(tensors, f"{prefix}.mag_a_ge_b", pos_weights + neg_weights, [0.0])
1405
+ add_gate(tensors, f"{prefix}.mag_a_lt_b", neg_weights + pos_weights, [-1.0])
1406
+ add_gate(tensors, f"{prefix}.mag_a_le_b", neg_weights + pos_weights, [0.0])
1407
+
1408
+ add_gate(tensors, f"{prefix}.mag_eq.geq", pos_weights + neg_weights, [0.0])
1409
+ add_gate(tensors, f"{prefix}.mag_eq.leq", neg_weights + pos_weights, [0.0])
1410
+ add_gate(tensors, f"{prefix}.mag_eq.and", [1.0, 1.0], [-2.0])
1411
+
1412
+ add_gate(tensors, f"{prefix}.eq.not_nan", [-1.0], [0.0])
1413
+ add_gate(tensors, f"{prefix}.eq.mag_or_zero", [1.0, 1.0], [-1.0])
1414
+ add_gate(tensors, f"{prefix}.eq.same_sign_or_zero", [1.0, 1.0], [-1.0])
1415
+ add_gate(tensors, f"{prefix}.eq.result", [1.0, 1.0], [-2.0])
1416
+
1417
+ add_gate(tensors, f"{prefix}.lt.not_nan", [-1.0], [0.0])
1418
+ add_gate(tensors, f"{prefix}.lt.diff_sign.not_a_sign", [-1.0], [0.0])
1419
+ add_gate(tensors, f"{prefix}.lt.diff_sign.a_neg", [1.0, 1.0], [-2.0])
1420
+ add_gate(tensors, f"{prefix}.lt.same_sign.pos_lt", [1.0, 1.0], [-2.0])
1421
+ add_gate(tensors, f"{prefix}.lt.same_sign.neg_gt", [1.0, 1.0], [-2.0])
1422
+ add_gate(tensors, f"{prefix}.lt.same_sign.or", [1.0, 1.0], [-1.0])
1423
+ add_gate(tensors, f"{prefix}.lt.case_or", [1.0, 1.0], [-1.0])
1424
+ add_gate(tensors, f"{prefix}.lt.not_both_zero", [-1.0], [0.0])
1425
+ add_gate(tensors, f"{prefix}.lt.result", [1.0, 1.0, 1.0], [-3.0])
1426
+
1427
+ add_gate(tensors, f"{prefix}.gt.not_nan", [-1.0], [0.0])
1428
+ add_gate(tensors, f"{prefix}.gt.diff_sign.not_b_sign", [-1.0], [0.0])
1429
+ add_gate(tensors, f"{prefix}.gt.diff_sign.b_neg", [1.0, 1.0], [-2.0])
1430
+ add_gate(tensors, f"{prefix}.gt.same_sign.pos_gt", [1.0, 1.0], [-2.0])
1431
+ add_gate(tensors, f"{prefix}.gt.same_sign.neg_lt", [1.0, 1.0], [-2.0])
1432
+ add_gate(tensors, f"{prefix}.gt.same_sign.or", [1.0, 1.0], [-1.0])
1433
+ add_gate(tensors, f"{prefix}.gt.case_or", [1.0, 1.0], [-1.0])
1434
+ add_gate(tensors, f"{prefix}.gt.not_both_zero", [-1.0], [0.0])
1435
+ add_gate(tensors, f"{prefix}.gt.result", [1.0, 1.0, 1.0], [-3.0])
1436
+
1437
+ add_gate(tensors, f"{prefix}.le.eq_or_lt", [1.0, 1.0], [-1.0])
1438
+ add_gate(tensors, f"{prefix}.le.not_nan", [-1.0], [0.0])
1439
+ add_gate(tensors, f"{prefix}.le.result", [1.0, 1.0], [-2.0])
1440
+
1441
+ add_gate(tensors, f"{prefix}.ge.eq_or_gt", [1.0, 1.0], [-1.0])
1442
+ add_gate(tensors, f"{prefix}.ge.not_nan", [-1.0], [0.0])
1443
+ add_gate(tensors, f"{prefix}.ge.result", [1.0, 1.0], [-2.0])
1444
+
1445
+
1446
+def add_float32_add(tensors: Dict[str, torch.Tensor]) -> None:
+    """Add float32 addition circuit.
+
+    Algorithm:
+    1. Unpack both operands
+    2. Compare exponents, align mantissas
+    3. Add/subtract mantissas based on signs
+    4. Normalize result
+    5. Handle special cases (inf, nan, zero)
+    """
+    prefix = "float32.add"
+
+    pos_weights = [float(1 << (7 - i)) for i in range(8)]
+    neg_weights = [-w for w in pos_weights]
+    add_gate(tensors, f"{prefix}.exp_cmp.a_gt_b", pos_weights + neg_weights, [-1.0])
+    add_gate(tensors, f"{prefix}.exp_cmp.a_lt_b", neg_weights + pos_weights, [-1.0])
+
+    for bit in range(8):
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_diff.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_diff.not_b.bit{bit}", [-1.0], [0.0])
+
+    for stage in range(5):
+        shift = 1 << (4 - stage)
+        for bit in range(24):
+            add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.not_sel", [-1.0], [0.0])
+            add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.and_a", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.and_b", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.align.stage{stage}.bit{bit}.or", [1.0, 1.0], [-1.0])
+
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
+
+    for bit in range(25):
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for bit in range(24):
+        add_gate(tensors, f"{prefix}.mant_sub.not_b.bit{bit}", [-1.0], [0.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for bit in range(24):
+        add_gate(tensors, f"{prefix}.mant_select.bit{bit}.not_sel", [-1.0], [0.0])
+        add_gate(tensors, f"{prefix}.mant_select.bit{bit}.and_add", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_select.bit{bit}.and_sub", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.mant_select.bit{bit}.or", [1.0, 1.0], [-1.0])
+
+
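The five steps in the docstring above map onto a small software model. The sketch below is illustrative only (my own reference code, not part of the build): it assumes normal, finite inputs and truncates instead of rounding, but follows the same dataflow as the gates — exponent compare, alignment shift, mantissa add/sub selected by sign, then renormalize.

```python
import struct

def f32_bits(x: float) -> int:
    """Raw IEEE 754 single-precision bit pattern of a Python float."""
    return struct.unpack('>I', struct.pack('>f', x))[0]

def f32_add(a: int, b: int) -> int:
    """Add two float32 bit patterns (normals only, truncating)."""
    def unpack(v):
        # sign, biased exponent, mantissa with the hidden bit restored
        return (v >> 31) & 1, (v >> 23) & 0xFF, (v & 0x7FFFFF) | 0x800000
    sa, ea, ma = unpack(a)
    sb, eb, mb = unpack(b)
    if (eb, mb) > (ea, ma):              # exponent compare: larger operand first
        sa, ea, ma, sb, eb, mb = sb, eb, mb, sa, ea, ma
    mb >>= min(ea - eb, 31)              # align the smaller mantissa
    if sa == sb:
        m = ma + mb                      # same sign: mantissa add
    else:
        m = ma - mb                      # different sign: mantissa subtract
        if m == 0:
            return 0
    e = ea
    while m >= (1 << 24):                # normalize right
        m >>= 1
        e += 1
    while m < (1 << 23):                 # normalize left
        m <<= 1
        e -= 1
    return (sa << 31) | (e << 23) | (m & 0x7FFFFF)
```

Special-case handling (inf, nan, zero inputs) is deliberately omitted here; the circuit handles it via the classify gates.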
+def add_float32_mul(tensors: Dict[str, torch.Tensor]) -> None:
+    """Add float32 multiplication circuit.
+
+    Algorithm:
+    1. Unpack both operands
+    2. XOR signs for result sign
+    3. Add exponents (subtract bias)
+    4. Multiply mantissas (24x24 -> 48 bits)
+    5. Normalize result
+    6. Handle special cases
+    """
+    prefix = "float32.mul"
+
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
+
+    for bit in range(9):
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for bit in range(8):
+        add_gate(tensors, f"{prefix}.bias_sub.not_bias.bit{bit}", [-1.0], [0.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for i in range(24):
+        for j in range(24):
+            add_gate(tensors, f"{prefix}.mant_mul.pp.a{i}b{j}", [1.0, 1.0], [-2.0])
+
+    for stage in range(23):
+        for bit in range(48):
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_mul.acc.s{stage}.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+
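The multiply dataflow (sign XOR, exponent add with bias subtract, 24x24 mantissa product, one-place renormalize) can be sketched as a reference model. This is my own illustrative code under the same assumptions as above (normal inputs, truncating), not the circuit itself.

```python
import struct

def f32_bits(x: float) -> int:
    """Raw IEEE 754 single-precision bit pattern of a Python float."""
    return struct.unpack('>I', struct.pack('>f', x))[0]

def f32_mul(a: int, b: int) -> int:
    """Multiply two float32 bit patterns (normals only, truncating)."""
    def unpack(v):
        return (v >> 31) & 1, (v >> 23) & 0xFF, (v & 0x7FFFFF) | 0x800000
    sa, ea, ma = unpack(a)
    sb, eb, mb = unpack(b)
    s = sa ^ sb                    # sign XOR
    e = ea + eb - 127              # exponent add, then subtract the bias
    m = ma * mb                    # 24x24 -> 48-bit mantissa product
    if m >= (1 << 47):             # product in [2, 4): shift one extra place
        m >>= 24
        e += 1
    else:                          # product in [1, 2)
        m >>= 23
    return (s << 31) | (e << 23) | (m & 0x7FFFFF)
```

The two-way normalize works because the product of two mantissas in [1, 2) always lands in [1, 4).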
+def add_float32_div(tensors: Dict[str, torch.Tensor]) -> None:
+    """Add float32 division circuit.
+
+    Algorithm:
+    1. Unpack both operands
+    2. XOR signs for result sign
+    3. Subtract exponents (add bias)
+    4. Divide mantissas (restoring division)
+    5. Normalize result
+    6. Handle special cases (div by zero -> inf)
+    """
+    prefix = "float32.div"
+
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.or", [1.0, 1.0], [-1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer1.nand", [-1.0, -1.0], [1.0])
+    add_gate(tensors, f"{prefix}.sign_xor.layer2", [1.0, 1.0], [-2.0])
+
+    for bit in range(8):
+        add_gate(tensors, f"{prefix}.exp_sub.not_b.bit{bit}", [-1.0], [0.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.exp_sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for bit in range(8):
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+        add_gate(tensors, f"{prefix}.bias_add.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+    for stage in range(24):
+        pos_weights = [float(1 << (23 - i)) for i in range(24)]
+        neg_weights = [-w for w in pos_weights]
+        add_gate(tensors, f"{prefix}.mant_div.stage{stage}.cmp", pos_weights + neg_weights, [0.0])
+
+        for bit in range(24):
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.not_d.bit{bit}", [-1.0], [0.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer1.or", [1.0, 1.0], [-1.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer1.nand", [-1.0, -1.0], [1.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.sum.layer2", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha1.carry", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer1.or", [1.0, 1.0], [-1.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer1.nand", [-1.0, -1.0], [1.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.sum.layer2", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.ha2.carry", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.sub.fa{bit}.carry_or", [1.0, 1.0], [-1.0])
+
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.not_sel", [-1.0], [0.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.and_old", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.and_new", [1.0, 1.0], [-2.0])
+            add_gate(tensors, f"{prefix}.mant_div.stage{stage}.mux.bit{bit}.or", [1.0, 1.0], [-1.0])
+
+
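Each division stage above is one step of restoring division: a comparator decides whether the divisor fits into the running remainder, a subtractor computes the trial remainder, and a mux keeps either the old or the subtracted value. A minimal software sketch of that per-stage loop (illustrative only, assuming normalized 24-bit mantissas, so n < 2d):

```python
def divide_mantissas(n: int, d: int, steps: int = 24) -> int:
    """Restoring division, one quotient bit per stage.

    Mirrors the per-stage structure above; the result equals
    floor(n * 2**(steps - 1) / d).
    """
    rem, quo = n, 0
    for _ in range(steps):
        quo <<= 1
        if rem >= d:          # stage comparator
            quo |= 1          # mux selects the subtracted remainder
            rem -= d          # stage subtractor
        rem <<= 1             # move to the next quotient bit position
    return quo
```

With both mantissas in [2^23, 2^24) the quotient lands in (2^22, 2^24), which the normalize stage then shifts into place.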
 def update_manifest(tensors: Dict[str, torch.Tensor], data_bits: int, addr_bits: int, mem_bytes: int) -> None:
     """Update manifest metadata tensors.

         "arithmetic.greaterorequal8bit.", "arithmetic.lessorequal8bit.",
         "arithmetic.equality8bit.", "arithmetic.add3_8bit.", "arithmetic.expr_add_mul.", "arithmetic.expr_paren.",
         "combinational.barrelshifter.", "combinational.priorityencoder.",
+        "float16.", "float32.",
     ]

     if bits in [16, 32]:

     except ValueError as e:
         print(f" {bits}-bit NEG already exists: {e}")

+    print(f"\nGenerating {bits}-bit barrel shifter...")
+    try:
+        add_barrel_shifter_nbits(tensors, bits)
+        import math
+        num_layers = max(1, math.ceil(math.log2(bits)))
+        print(f" Added {bits}-bit barrel shifter ({num_layers} layers x {bits} muxes)")
+    except ValueError as e:
+        print(f" {bits}-bit barrel shifter already exists: {e}")
+
+    print(f"\nGenerating {bits}-bit priority encoder...")
+    try:
+        add_priority_encoder_nbits(tensors, bits)
+        import math
+        out_bits = max(1, math.ceil(math.log2(bits)))
+        print(f" Added {bits}-bit priority encoder ({out_bits}-bit output)")
+    except ValueError as e:
+        print(f" {bits}-bit priority encoder already exists: {e}")
+
+    print(f"\n{'=' * 60}")
+    print(f" GENERATING FLOAT CIRCUITS")
+    print(f"{'=' * 60}")
+
+    print("\nGenerating float16 core circuits...")
+    try:
+        add_float16_core(tensors)
+        print(" Added float16 unpack/pack/classify/normalize")
+    except ValueError as e:
+        print(f" float16 core already exists: {e}")
+
+    print("\nGenerating float16 ADD circuit...")
+    try:
+        add_float16_add(tensors)
+        print(" Added float16 addition (exp align + mantissa add/sub)")
+    except ValueError as e:
+        print(f" float16 ADD already exists: {e}")
+
+    print("\nGenerating float16 MUL circuit...")
+    try:
+        add_float16_mul(tensors)
+        print(" Added float16 multiplication (11x11 mantissa mul)")
+    except ValueError as e:
+        print(f" float16 MUL already exists: {e}")
+
+    print("\nGenerating float16 DIV circuit...")
+    try:
+        add_float16_div(tensors)
+        print(" Added float16 division (11-stage restoring div)")
+    except ValueError as e:
+        print(f" float16 DIV already exists: {e}")
+
+    print("\nGenerating float16 CMP circuits...")
+    try:
+        add_float16_cmp(tensors)
+        print(" Added float16 comparisons (EQ, LT, LE, GT, GE)")
+    except ValueError as e:
+        print(f" float16 CMP already exists: {e}")
+
+    print("\nGenerating float32 core circuits...")
+    try:
+        add_float32_core(tensors)
+        print(" Added float32 unpack/pack/classify/normalize")
+    except ValueError as e:
+        print(f" float32 core already exists: {e}")
+
+    print("\nGenerating float32 ADD circuit...")
+    try:
+        add_float32_add(tensors)
+        print(" Added float32 addition (exp align + mantissa add/sub)")
+    except ValueError as e:
+        print(f" float32 ADD already exists: {e}")
+
+    print("\nGenerating float32 MUL circuit...")
+    try:
+        add_float32_mul(tensors)
+        print(" Added float32 multiplication (24x24 mantissa mul)")
+    except ValueError as e:
+        print(f" float32 MUL already exists: {e}")
+
+    print("\nGenerating float32 DIV circuit...")
+    try:
+        add_float32_div(tensors)
+        print(" Added float32 division (24-stage restoring div)")
+    except ValueError as e:
+        print(f" float32 DIV already exists: {e}")
+
+    print("\nGenerating float32 CMP circuits...")
+    try:
+        add_float32_cmp(tensors)
+        print(" Added float32 comparisons (EQ, LT, LE, GT, GE)")
+    except ValueError as e:
+        print(f" float32 CMP already exists: {e}")
+
     if args.apply:
         print(f"\nSaving: {args.model}")
         save_file(tensors, str(args.model))
eval.py CHANGED
@@ -2917,6 +2917,152 @@ class BatchedFitnessEvaluator:

         return scores, total

     # =========================================================================
     # CONTROL FLOW
     # =========================================================================
@@ -3662,7 +3808,7 @@ class BatchedFitnessEvaluator:
             'manifest.instruction_width': 16.0,
             'manifest.register_width': 8.0,
             'manifest.registers': 4.0,
-            'manifest.version': 3.0,
         }

         for name, exp_val in fixed_expected.items():
@@ -3762,6 +3908,399 @@ class BatchedFitnessEvaluator:

         return scores, total

     # =========================================================================
     # INTEGRATION TESTS (Multi-circuit chains)
     # =========================================================================
@@ -4091,6 +4630,18 @@ class BatchedFitnessEvaluator:
             total_tests += t
             self.category_scores[f'neg{bits}'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)

         # 3-operand adder
         s, t = self._test_add3(population, debug)
         scores += s
@@ -4163,6 +4714,78 @@ class BatchedFitnessEvaluator:
         total_tests += t
         self.category_scores['memory'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)

         self.total_tests = total_tests

         if debug:
+    def _test_barrel_shifter_nbits(self, pop: Dict, bits: int, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test N-bit barrel shifter (shift by 0 to bits-1 positions)."""
+        import math
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+        num_layers = max(1, math.ceil(math.log2(bits)))
+        max_val = (1 << bits) - 1
+
+        if debug:
+            print(f"\n=== {bits}-BIT BARREL SHIFTER ===")
+
+        prefix = f'combinational.barrelshifter{bits}'
+        try:
+            if bits == 16:
+                test_vals = [0x8001, 0xFF00, 0x00FF, 0xAAAA, 0xFFFF, 0x1234]
+            elif bits == 32:
+                test_vals = [0x80000001, 0xFFFF0000, 0x0000FFFF, 0xAAAAAAAA, 0xFFFFFFFF, 0x12345678]
+            else:
+                test_vals = [0b10000001, 0b11110000, 0b00001111, 0b10101010, max_val]
+
+            num_shifts = min(bits, 8)
+            for val in test_vals:
+                for shift in range(num_shifts):
+                    expected_val = (val << shift) & max_val
+                    val_bits = [float((val >> (bits - 1 - i)) & 1) for i in range(bits)]
+                    shift_bits = [float((shift >> (num_layers - 1 - i)) & 1) for i in range(num_layers)]
+
+                    layer_in = val_bits[:]
+                    for layer in range(num_layers):
+                        shift_amount = 1 << (num_layers - 1 - layer)
+                        sel = shift_bits[layer]
+                        layer_out = []
+
+                        for bit in range(bits):
+                            bit_prefix = f'{prefix}.layer{layer}.bit{bit}'
+
+                            w_not = pop[f'{bit_prefix}.not_sel.weight'].view(pop_size)
+                            b_not = pop[f'{bit_prefix}.not_sel.bias'].view(pop_size)
+                            not_sel = heaviside(sel * w_not + b_not)
+
+                            shifted_src = bit + shift_amount
+                            if shifted_src < bits:
+                                shifted_val = layer_in[shifted_src]
+                            else:
+                                shifted_val = 0.0
+
+                            w_and_a = pop[f'{bit_prefix}.and_a.weight'].view(pop_size, 2)
+                            b_and_a = pop[f'{bit_prefix}.and_a.bias'].view(pop_size)
+                            inp_a = torch.tensor([layer_in[bit], not_sel[0].item()], device=self.device)
+                            and_a = heaviside((inp_a * w_and_a).sum(-1) + b_and_a)
+
+                            w_and_b = pop[f'{bit_prefix}.and_b.weight'].view(pop_size, 2)
+                            b_and_b = pop[f'{bit_prefix}.and_b.bias'].view(pop_size)
+                            inp_b = torch.tensor([shifted_val, sel], device=self.device)
+                            and_b = heaviside((inp_b * w_and_b).sum(-1) + b_and_b)
+
+                            w_or = pop[f'{bit_prefix}.or.weight'].view(pop_size, 2)
+                            b_or = pop[f'{bit_prefix}.or.bias'].view(pop_size)
+                            inp_or = torch.tensor([and_a[0].item(), and_b[0].item()], device=self.device)
+                            out = heaviside((inp_or * w_or).sum(-1) + b_or)
+                            layer_out.append(out[0].item())
+
+                        layer_in = layer_out
+
+                    result = sum(int(layer_in[i]) << (bits - 1 - i) for i in range(bits))
+                    if result == expected_val:
+                        scores += 1
+                    total += 1
+
+            self._record(prefix, int(scores[0].item()), total, [])
+            if debug:
+                r = self.results[-1]
+                print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+        except (KeyError, RuntimeError) as e:
+            if debug:
+                print(f" {prefix}: SKIP ({e})")
+
+        return scores, total
+
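The layer walk in the test above reduces to a simple model: one 2:1-mux layer per shift bit (MSB-first), where a set select bit shifts by that layer's power of two and zeros are shifted in. A pure-Python sketch of that reference behavior (my own illustration, not the evaluator code):

```python
import math

def barrel_shift_left(val: int, shift: int, bits: int) -> int:
    """Log-depth left shift: one mux layer per shift bit, MSB-first."""
    num_layers = max(1, math.ceil(math.log2(bits)))
    mask = (1 << bits) - 1
    for layer in range(num_layers):
        amount = 1 << (num_layers - 1 - layer)
        if (shift >> (num_layers - 1 - layer)) & 1:  # this layer's select bit
            val = (val << amount) & mask             # mux picks the shifted input
        # otherwise the mux passes the value through unchanged
    return val
```

For `bits = 16` this gives four layers shifting by 8, 4, 2, 1, matching the `num_layers` computed in the test.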
+    def _test_priority_encoder_nbits(self, pop: Dict, bits: int, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test N-bit priority encoder (find highest set bit)."""
+        import math
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+        out_bits = max(1, math.ceil(math.log2(bits)))
+
+        if debug:
+            print(f"\n=== {bits}-BIT PRIORITY ENCODER ===")
+
+        prefix = f'combinational.priorityencoder{bits}'
+        try:
+            test_cases = [(0, 0, 0)]
+            for i in range(bits):
+                test_cases.append((1 << i, 1, bits - 1 - i))
+            if bits == 16:
+                test_cases.extend([
+                    (0x8001, 1, 0), (0x5555, 1, 1), (0x00FF, 1, 8), (0xFFFF, 1, 0)
+                ])
+            elif bits == 32:
+                test_cases.extend([
+                    (0x80000001, 1, 0), (0x55555555, 1, 1), (0x0000FFFF, 1, 16), (0xFFFFFFFF, 1, 0)
+                ])
+
+            for val, expected_valid, expected_idx in test_cases:
+                val_bits = torch.tensor([float((val >> (bits - 1 - i)) & 1) for i in range(bits)],
+                                        device=self.device, dtype=torch.float32)
+
+                w_valid = pop[f'{prefix}.valid.weight'].view(pop_size, bits)
+                b_valid = pop[f'{prefix}.valid.bias'].view(pop_size)
+                out_valid = heaviside((val_bits * w_valid).sum(-1) + b_valid)
+
+                if int(out_valid[0].item()) == expected_valid:
+                    scores += 1
+                total += 1
+
+                if expected_valid == 1:
+                    for idx_bit in range(out_bits):
+                        try:
+                            w_idx = pop[f'{prefix}.out{idx_bit}.weight']
+                            num_weights = w_idx.numel() // pop_size
+                            w_idx = w_idx.view(pop_size, num_weights)
+                            b_idx = pop[f'{prefix}.out{idx_bit}.bias'].view(pop_size)
+                            relevant_bits = torch.tensor([val_bits[i].item() for i in range(bits)
+                                                          if (i >> idx_bit) & 1],
+                                                         device=self.device, dtype=torch.float32)
+                            if len(relevant_bits) > 0:
+                                out_idx = heaviside((relevant_bits[:w_idx.shape[1]] * w_idx).sum(-1) + b_idx)
+                                expected_bit = (expected_idx >> idx_bit) & 1
+                                if int(out_idx[0].item()) == expected_bit:
+                                    scores += 1
+                                total += 1
+                        except KeyError:
+                            pass
+
+            self._record(prefix, int(scores[0].item()), total, [])
+            if debug:
+                r = self.results[-1]
+                print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+        except (KeyError, RuntimeError) as e:
+            if debug:
+                print(f" {prefix}: SKIP ({e})")
+
+        return scores, total
+
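The `expected_idx` convention in the test cases above counts from the MSB: for example `0x00FF` in 16 bits encodes to index 8. A small reference model of that convention (illustrative only):

```python
def priority_encode(val: int, bits: int):
    """Return (valid, index of highest set bit counted from the MSB);
    (0, 0) when no bit is set, matching the test's (0, 0, 0) case."""
    if val == 0:
        return 0, 0
    for i in range(bits):                  # i == 0 is the MSB
        if (val >> (bits - 1 - i)) & 1:
            return 1, i
```

This is why `(0xFFFF, 1, 0)` appears in the 16-bit cases: the highest set bit is the MSB itself.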
     # =========================================================================
     # CONTROL FLOW
     # =========================================================================

             'manifest.instruction_width': 16.0,
             'manifest.register_width': 8.0,
             'manifest.registers': 4.0,
+            'manifest.version': 4.0,
         }

         for name, exp_val in fixed_expected.items():

         return scores, total

+    # =========================================================================
+    # FLOAT16 TESTS
+    # =========================================================================
+
+    def _test_float16_core(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float16 core circuits (unpack, pack, classify)."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT16 CORE ===")
+
+        expected_gates = [
+            ('float16.unpack.bit0.weight', (1,)),
+            ('float16.classify.exp_zero.weight', (5,)),
+            ('float16.classify.exp_max.weight', (5,)),
+            ('float16.classify.frac_zero.weight', (10,)),
+            ('float16.classify.is_zero.and.weight', (2,)),
+            ('float16.classify.is_nan.and.weight', (2,)),
+            ('float16.normalize.stage0.bit0.not_sel.weight', (1,)),
+            ('float16.normalize.stage0.bit0.and_a.weight', (2,)),
+            ('float16.normalize.stage0.bit0.or.weight', (2,)),
+            ('float16.pack.bit0.weight', (1,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
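The classify gates checked above (`exp_zero`, `exp_max`, `frac_zero` and the derived `is_zero`/`is_nan` ANDs) implement the standard float16 class tests. A reference sketch of that decision tree (my own illustration, assuming the usual 1-5-10 sign/exponent/fraction layout):

```python
def classify_f16(h: int) -> str:
    """Classify a raw float16 pattern with the same exp/frac tests the
    classify gates implement (exp all-zero, exp all-ones, frac zero)."""
    exp = (h >> 10) & 0x1F
    frac = h & 0x3FF
    if exp == 0x1F:                                  # exp_max
        return 'nan' if frac else 'inf'              # frac non-zero -> NaN
    if exp == 0:                                     # exp_zero
        return 'zero' if frac == 0 else 'subnormal'  # frac_zero -> zero
    return 'normal'
```

For example, `0x7C00` is +inf and `0x7E00` is a quiet NaN.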
+    def _test_float16_add(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float16 addition circuit."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT16 ADD ===")
+
+        expected_gates = [
+            ('float16.add.exp_cmp.a_gt_b.weight', (10,)),
+            ('float16.add.exp_cmp.a_lt_b.weight', (10,)),
+            ('float16.add.exp_diff.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float16.add.align.stage0.bit0.not_sel.weight', (1,)),
+            ('float16.add.sign_xor.layer1.or.weight', (2,)),
+            ('float16.add.mant_add.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float16.add.mant_sub.not_b.bit0.weight', (1,)),
+            ('float16.add.mant_select.bit0.not_sel.weight', (1,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    def _test_float16_mul(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float16 multiplication circuit."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT16 MUL ===")
+
+        expected_gates = [
+            ('float16.mul.sign_xor.layer1.or.weight', (2,)),
+            ('float16.mul.exp_add.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float16.mul.bias_sub.not_bias.bit0.weight', (1,)),
+            ('float16.mul.mant_mul.pp.a0b0.weight', (2,)),
+            ('float16.mul.mant_mul.acc.s0.fa0.ha1.sum.layer1.or.weight', (2,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    def _test_float16_div(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float16 division circuit."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT16 DIV ===")
+
+        expected_gates = [
+            ('float16.div.sign_xor.layer1.or.weight', (2,)),
+            ('float16.div.exp_sub.not_b.bit0.weight', (1,)),
+            ('float16.div.bias_add.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float16.div.mant_div.stage0.cmp.weight', (22,)),
+            ('float16.div.mant_div.stage0.sub.not_d.bit0.weight', (1,)),
+            ('float16.div.mant_div.stage0.mux.bit0.not_sel.weight', (1,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    def _test_float16_cmp(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float16 comparison circuits."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT16 CMP ===")
+
+        expected_gates = [
+            ('float16.cmp.a.exp_max.weight', (5,)),
+            ('float16.cmp.a.frac_nz.weight', (10,)),
+            ('float16.cmp.a.is_nan.weight', (2,)),
+            ('float16.cmp.either_nan.weight', (2,)),
+            ('float16.cmp.sign_xor.layer1.or.weight', (2,)),
+            ('float16.cmp.both_zero.weight', (2,)),
+            ('float16.cmp.mag_a_gt_b.weight', (30,)),
+            ('float16.cmp.eq.result.weight', (2,)),
+            ('float16.cmp.lt.result.weight', (3,)),
+            ('float16.cmp.gt.result.weight', (3,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    # =========================================================================
+    # FLOAT32 TESTS
+    # =========================================================================
+
+    def _test_float32_core(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float32 core circuits (unpack, pack, classify)."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT32 CORE ===")
+
+        expected_gates = [
+            ('float32.unpack.bit0.weight', (1,)),
+            ('float32.classify.exp_zero.weight', (8,)),
+            ('float32.classify.exp_max.weight', (8,)),
+            ('float32.classify.frac_zero.weight', (23,)),
+            ('float32.classify.is_zero.and.weight', (2,)),
+            ('float32.classify.is_nan.and.weight', (2,)),
+            ('float32.normalize.stage0.bit0.not_sel.weight', (1,)),
+            ('float32.pack.bit0.weight', (1,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    def _test_float32_add(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float32 addition circuit."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT32 ADD ===")
+
+        expected_gates = [
+            ('float32.add.exp_cmp.a_gt_b.weight', (16,)),
+            ('float32.add.exp_diff.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float32.add.align.stage0.bit0.not_sel.weight', (1,)),
+            ('float32.add.sign_xor.layer1.or.weight', (2,)),
+            ('float32.add.mant_add.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float32.add.mant_sub.not_b.bit0.weight', (1,)),
+            ('float32.add.mant_select.bit0.not_sel.weight', (1,)),
+        ]
+
+        for name, expected_shape in expected_gates:
+            try:
+                tensor = pop[name]
+                actual_shape = tuple(tensor.shape[1:])
+                if actual_shape == expected_shape:
+                    scores += 1
+                    self._record(name, 1, 1, [])
+                else:
+                    self._record(name, 0, 1, [(expected_shape, actual_shape)])
+                total += 1
+                if debug:
+                    r = self.results[-1]
+                    print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
+            except KeyError:
+                if debug:
+                    print(f" {name}: SKIP (not found)")
+
+        return scores, total
+
+    def _test_float32_mul(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
+        """Test float32 multiplication circuit."""
+        pop_size = next(iter(pop.values())).shape[0]
+        scores = torch.zeros(pop_size, device=self.device)
+        total = 0
+
+        if debug:
+            print("\n=== FLOAT32 MUL ===")
+
+        expected_gates = [
+            ('float32.mul.sign_xor.layer1.or.weight', (2,)),
+            ('float32.mul.exp_add.fa0.ha1.sum.layer1.or.weight', (2,)),
+            ('float32.mul.bias_sub.not_bias.bit0.weight', (1,)),
4203
+ ('float32.mul.mant_mul.pp.a0b0.weight', (2,)),
4204
+ ('float32.mul.mant_mul.acc.s0.fa0.ha1.sum.layer1.or.weight', (2,)),
4205
+ ]
4206
+
4207
+ for name, expected_shape in expected_gates:
4208
+ try:
4209
+ tensor = pop[name]
4210
+ actual_shape = tuple(tensor.shape[1:])
4211
+ if actual_shape == expected_shape:
4212
+ scores += 1
4213
+ self._record(name, 1, 1, [])
4214
+ else:
4215
+ self._record(name, 0, 1, [(expected_shape, actual_shape)])
4216
+ total += 1
4217
+ if debug:
4218
+ r = self.results[-1]
4219
+ print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
4220
+ except KeyError:
4221
+ if debug:
4222
+ print(f" {name}: SKIP (not found)")
4223
+
4224
+ return scores, total
4225
+
4226
+ def _test_float32_div(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
4227
+ """Test float32 division circuit."""
4228
+ pop_size = next(iter(pop.values())).shape[0]
4229
+ scores = torch.zeros(pop_size, device=self.device)
4230
+ total = 0
4231
+
4232
+ if debug:
4233
+ print("\n=== FLOAT32 DIV ===")
4234
+
4235
+ expected_gates = [
4236
+ ('float32.div.sign_xor.layer1.or.weight', (2,)),
4237
+ ('float32.div.exp_sub.not_b.bit0.weight', (1,)),
4238
+ ('float32.div.bias_add.fa0.ha1.sum.layer1.or.weight', (2,)),
4239
+ ('float32.div.mant_div.stage0.cmp.weight', (48,)),
4240
+ ('float32.div.mant_div.stage0.sub.not_d.bit0.weight', (1,)),
4241
+ ('float32.div.mant_div.stage0.mux.bit0.not_sel.weight', (1,)),
4242
+ ]
4243
+
4244
+ for name, expected_shape in expected_gates:
4245
+ try:
4246
+ tensor = pop[name]
4247
+ actual_shape = tuple(tensor.shape[1:])
4248
+ if actual_shape == expected_shape:
4249
+ scores += 1
4250
+ self._record(name, 1, 1, [])
4251
+ else:
4252
+ self._record(name, 0, 1, [(expected_shape, actual_shape)])
4253
+ total += 1
4254
+ if debug:
4255
+ r = self.results[-1]
4256
+ print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
4257
+ except KeyError:
4258
+ if debug:
4259
+ print(f" {name}: SKIP (not found)")
4260
+
4261
+ return scores, total
4262
+
4263
+ def _test_float32_cmp(self, pop: Dict, debug: bool) -> Tuple[torch.Tensor, int]:
4264
+ """Test float32 comparison circuits."""
4265
+ pop_size = next(iter(pop.values())).shape[0]
4266
+ scores = torch.zeros(pop_size, device=self.device)
4267
+ total = 0
4268
+
4269
+ if debug:
4270
+ print("\n=== FLOAT32 CMP ===")
4271
+
4272
+ expected_gates = [
4273
+ ('float32.cmp.a.exp_max.weight', (8,)),
4274
+ ('float32.cmp.a.frac_nz.weight', (23,)),
4275
+ ('float32.cmp.a.is_nan.weight', (2,)),
4276
+ ('float32.cmp.either_nan.weight', (2,)),
4277
+ ('float32.cmp.sign_xor.layer1.or.weight', (2,)),
4278
+ ('float32.cmp.both_zero.weight', (2,)),
4279
+ ('float32.cmp.mag_a_gt_b.weight', (62,)),
4280
+ ('float32.cmp.eq.result.weight', (2,)),
4281
+ ('float32.cmp.lt.result.weight', (3,)),
4282
+ ('float32.cmp.gt.result.weight', (3,)),
4283
+ ]
4284
+
4285
+ for name, expected_shape in expected_gates:
4286
+ try:
4287
+ tensor = pop[name]
4288
+ actual_shape = tuple(tensor.shape[1:])
4289
+ if actual_shape == expected_shape:
4290
+ scores += 1
4291
+ self._record(name, 1, 1, [])
4292
+ else:
4293
+ self._record(name, 0, 1, [(expected_shape, actual_shape)])
4294
+ total += 1
4295
+ if debug:
4296
+ r = self.results[-1]
4297
+ print(f" {r.name}: {r.passed}/{r.total} {'PASS' if r.success else 'FAIL'}")
4298
+ except KeyError:
4299
+ if debug:
4300
+ print(f" {name}: SKIP (not found)")
4301
+
4302
+ return scores, total
4303
+
4304
  # =========================================================================
4305
  # INTEGRATION TESTS (Multi-circuit chains)
4306
  # =========================================================================
 
4630
  total_tests += t
4631
  self.category_scores[f'neg{bits}'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
4632
 
4633
+ if f'combinational.barrelshifter{bits}.layer0.bit0.not_sel.weight' in population:
4634
+ s, t = self._test_barrel_shifter_nbits(population, bits, debug)
4635
+ scores += s
4636
+ total_tests += t
4637
+ self.category_scores[f'barrelshifter{bits}'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
4638
+
4639
+ if f'combinational.priorityencoder{bits}.valid.weight' in population:
4640
+ s, t = self._test_priority_encoder_nbits(population, bits, debug)
4641
+ scores += s
4642
+ total_tests += t
4643
+ self.category_scores[f'priorityencoder{bits}'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
4644
+
4645
  # 3-operand adder
4646
  s, t = self._test_add3(population, debug)
4647
  scores += s
 
4714
         total_tests += t
         self.category_scores['memory'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)

+        # Float16 circuits (if present)
+        if 'float16.unpack.bit0.weight' in population:
+            if debug:
+                print(f"\n{'=' * 60}")
+                print(f"  FLOAT16 CIRCUITS")
+                print(f"{'=' * 60}")
+
+            s, t = self._test_float16_core(population, debug)
+            scores += s
+            total_tests += t
+            self.category_scores['float16_core'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float16.add.exp_cmp.a_gt_b.weight' in population:
+                s, t = self._test_float16_add(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float16_add'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float16.mul.sign_xor.layer1.or.weight' in population:
+                s, t = self._test_float16_mul(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float16_mul'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float16.div.sign_xor.layer1.or.weight' in population:
+                s, t = self._test_float16_div(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float16_div'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float16.cmp.a.exp_max.weight' in population:
+                s, t = self._test_float16_cmp(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float16_cmp'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+        # Float32 circuits (if present)
+        if 'float32.unpack.bit0.weight' in population:
+            if debug:
+                print(f"\n{'=' * 60}")
+                print(f"  FLOAT32 CIRCUITS")
+                print(f"{'=' * 60}")
+
+            s, t = self._test_float32_core(population, debug)
+            scores += s
+            total_tests += t
+            self.category_scores['float32_core'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float32.add.exp_cmp.a_gt_b.weight' in population:
+                s, t = self._test_float32_add(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float32_add'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float32.mul.sign_xor.layer1.or.weight' in population:
+                s, t = self._test_float32_mul(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float32_mul'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float32.div.sign_xor.layer1.or.weight' in population:
+                s, t = self._test_float32_div(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float32_div'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
+            if 'float32.cmp.a.exp_max.weight' in population:
+                s, t = self._test_float32_cmp(population, debug)
+                scores += s
+                total_tests += t
+                self.category_scores['float32_cmp'] = (s[0].item() if pop_size == 1 else s.mean().item(), t)
+
         self.total_tests = total_tests

         if debug:
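
Each `_test_float32_*` helper above runs the same shape-check loop over a list of `(gate_name, expected_shape)` pairs, skipping gates absent from the checkpoint. The pattern can be sketched standalone as below; `check_gate_shapes` is an illustrative name (not part of eval.py), and plain shape tuples stand in for the torch tensors so the sketch runs without torch. In the real helpers, `actual_shape` is `tuple(pop[name].shape[1:])`, dropping the leading population dimension.

```python
def check_gate_shapes(pop, expected_gates):
    """Return (passed, total); gates missing from pop are skipped."""
    passed = total = 0
    for name, expected_shape in expected_gates:
        try:
            # Stand-in for tuple(pop[name].shape[1:]): drop the population dim.
            actual_shape = pop[name][1:]
        except KeyError:
            continue  # mirrors the helpers' "SKIP (not found)" branch
        total += 1
        if actual_shape == expected_shape:
            passed += 1
    return passed, total

# Shapes as stored: population dimension first (here, population of 64).
pop = {
    'float32.classify.exp_zero.weight': (64, 8),
    'float32.classify.frac_zero.weight': (64, 23),
}
expected = [
    ('float32.classify.exp_zero.weight', (8,)),
    ('float32.classify.frac_zero.weight', (23,)),
    ('float32.pack.bit0.weight', (1,)),  # absent from pop -> skipped
]
print(check_gate_shapes(pop, expected))  # -> (2, 2)
```

Note these are structural checks only (shape agreement), which is why a per-gate pass contributes a flat `+1` to the population-wide score vector in the real helpers.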
neural_alu32.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b5a0f6cdfb4ba0ebdfc863f43e5f8fd4f41626c0fd4e7258a0a581a117a79d97
- size 5031612
+ oid sha256:6efa5b719d55fa8e071c4dacc90bfe5bff7337c6fab952460f4ccdadf237facb
+ size 10083624
neural_computer.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:08a39c4758f6e5236f84d231be7f2d54364099309a89cf484d607a6544194d20
- size 2591660
+ oid sha256:812d1833c915945eeb694bca530b075b3e08685bac8646f29e87d26a2d644b88
+ size 8436636
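
The `float32.div.mant_div.stage*` gates checked above (one `cmp`/`sub`/`mux` triple per quotient bit) implement restoring division. As a hedged illustration of that scheme, here is a minimal integer model, one loop iteration per hardware stage; `restoring_divide` is an illustrative name, not a function in this repo, and the gate-level circuit of course computes the compare and subtract with boolean logic rather than Python integers.

```python
def restoring_divide(n, d, stages):
    """Compute `stages` quotient bits of n / d, MSB first (d > 0)."""
    q = 0  # quotient accumulates one bit per stage
    r = 0  # partial remainder
    for i in range(stages):
        # Shift the next numerator bit into the remainder
        # (circuit: shift-left of the remainder register).
        r = (r << 1) | ((n >> (stages - 1 - i)) & 1)
        q <<= 1
        if r >= d:     # stage cmp: is remainder >= divisor?
            r -= d     # stage sub: subtract divisor
            q |= 1     # stage mux keeps the subtracted value; quotient bit = 1
        # else: mux "restores" the un-subtracted remainder, quotient bit = 0
    return q, r

print(restoring_divide(13, 4, 4))  # -> (3, 1), i.e. 13 = 3*4 + 1
```

When `stages` equals the bit width of `n`, this is exact long division; the float32 divider runs 24 such stages over the extended mantissas to produce a 24-bit quotient.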