PortfolioAI committed on
Commit
48421e0
·
1 Parent(s): 1bcc911

Rewrite float16.toint with right-shift barrel shifter

Browse files

- Changed from left-shift to right-shift (25-exp positions)
- Improved from 31/93 to 54/93 tests passing
- Still needs debugging for remaining edge cases

arithmetic.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bdd850c9d33e5e667744caf0ce5dee8afde5aa5fafbf20eb812fa647c556626
3
- size 2860992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c7bc258f49a9a4c85321d0980f843bb989e8fb84c4d8bc65883f24c6e306334
3
+ size 2863492
convert_to_explicit_inputs.py CHANGED
@@ -3046,7 +3046,7 @@ def infer_float16_div_inputs(gate: str, registry: SignalRegistry) -> List[int]:
3046
 
3047
 
3048
  def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]:
3049
- """Infer inputs for float16.toint circuit."""
3050
  prefix = "float16.toint"
3051
 
3052
  for i in range(16):
@@ -3055,6 +3055,7 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3055
  exp_bits = [f"{prefix}.$x[{10+i}]" for i in range(5)]
3056
  mant_bits = [f"{prefix}.$x[{i}]" for i in range(10)]
3057
 
 
3058
  if '.exp_all_ones' in gate:
3059
  return [registry.get_id(b) for b in exp_bits]
3060
  if '.exp_zero' in gate:
@@ -3080,7 +3081,7 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3080
  if '.is_inf' in gate:
3081
  return [registry.get_id(f"{prefix}.exp_all_ones"),
3082
  registry.get_id(f"{prefix}.mant_zero")]
3083
- if '.is_zero' in gate:
3084
  return [registry.get_id(f"{prefix}.exp_zero"),
3085
  registry.get_id(f"{prefix}.mant_zero")]
3086
 
@@ -3092,27 +3093,53 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3092
 
3093
  registry.register(f"{prefix}.exp_lt_15")
3094
 
3095
- if '.result_is_zero' in gate:
3096
  return [registry.get_id(f"{prefix}.is_nan"),
3097
  registry.get_id(f"{prefix}.is_zero"),
3098
  registry.get_id(f"{prefix}.exp_lt_15")]
3099
 
3100
  registry.register(f"{prefix}.result_is_zero")
3101
 
 
 
 
 
 
3102
  if '.implicit_bit' in gate:
3103
  return [registry.get_id(f"{prefix}.exp_zero")]
3104
 
3105
  registry.register(f"{prefix}.implicit_bit")
3106
 
3107
- bits_25 = [1, 0, 0, 1, 1, 0]
3108
- not_25 = [0, 1, 1, 0, 0, 1]
 
 
 
 
 
3109
 
3110
- match = re.search(r'\.not_25_(\d+)$', gate)
3111
- if match:
3112
- return [registry.get_id(f"#{bits_25[int(match.group(1))]}")]
 
 
 
 
 
 
3113
 
 
 
 
 
3114
  for i in range(5):
3115
- registry.register(f"{prefix}.not_25_{i}")
 
 
 
 
 
 
3116
 
3117
  if '.shift_calc.fa' in gate:
3118
  match = re.search(r'\.shift_calc\.fa(\d+)\.', gate)
@@ -3120,9 +3147,11 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3120
  i = int(match.group(1))
3121
  fa_prefix = f"{prefix}.shift_calc.fa{i}"
3122
 
3123
- a_bit = registry.get_id(exp_bits[i]) if i < 5 else registry.get_id("#0")
3124
- b_bit = registry.get_id(f"#{not_25[i]}")
3125
- cin = registry.get_id("#1") if i == 0 else registry.register(f"{prefix}.shift_calc.fa{i-1}.cout")
 
 
3126
 
3127
  if '.xor1.layer1' in gate:
3128
  return [a_bit, b_bit]
@@ -3147,6 +3176,7 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3147
  registry.register(f"{prefix}.shift_calc.fa{i}.xor2.layer2")
3148
  registry.register(f"{prefix}.shift_calc.fa{i}.cout")
3149
 
 
3150
  for stage in range(4):
3151
  shift_amt = 1 << stage
3152
 
@@ -3154,10 +3184,13 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3154
  return [registry.get_id(f"{prefix}.shift_calc.fa{stage}.xor2.layer2")]
3155
  registry.register(f"{prefix}.not_shift{stage}")
3156
 
3157
- match = re.search(rf'\.lshift_s{stage}_(\d+)\.', gate)
3158
  if match:
3159
  i = int(match.group(1))
 
 
3160
  if '.pass' in gate:
 
3161
  if stage == 0:
3162
  if i < 10:
3163
  val = registry.get_id(mant_bits[i])
@@ -3166,36 +3199,39 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3166
  else:
3167
  val = registry.get_id("#0")
3168
  else:
3169
- val = registry.get_id(f"{prefix}.lshift_s{stage-1}_{i}")
3170
  return [val, registry.get_id(f"{prefix}.not_shift{stage}")]
3171
- if '.shift' in gate and i >= shift_amt:
 
 
3172
  if stage == 0:
3173
- prev_i = i - shift_amt
3174
- if prev_i < 10:
3175
- val = registry.get_id(mant_bits[prev_i])
3176
- elif prev_i == 10:
3177
  val = registry.get_id(f"{prefix}.implicit_bit")
3178
  else:
3179
  val = registry.get_id("#0")
3180
  else:
3181
- val = registry.get_id(f"{prefix}.lshift_s{stage-1}_{i-shift_amt}")
3182
  return [val, registry.get_id(f"{prefix}.shift_calc.fa{stage}.xor2.layer2")]
3183
 
3184
- match = re.search(rf'\.lshift_s{stage}_(\d+)$', gate)
3185
  if match:
3186
  i = int(match.group(1))
3187
- if i >= shift_amt:
3188
- return [registry.register(f"{prefix}.lshift_s{stage}_{i}.pass"),
3189
- registry.register(f"{prefix}.lshift_s{stage}_{i}.shift")]
 
3190
  else:
3191
- return [registry.register(f"{prefix}.lshift_s{stage}_{i}.pass")]
3192
 
3193
  for i in range(16):
3194
- registry.register(f"{prefix}.lshift_s{stage}_{i}")
3195
 
 
3196
  for i in range(16):
3197
  if f'.not_mag{i}' in gate:
3198
- return [registry.get_id(f"{prefix}.lshift_s3_{i}")]
3199
  registry.register(f"{prefix}.not_mag{i}")
3200
 
3201
  if '.neg.fa' in gate:
@@ -3222,17 +3258,24 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
3222
  registry.register(f"{prefix}.neg.fa{i}.xor.layer2")
3223
  registry.register(f"{prefix}.neg.fa{i}.cout")
3224
 
 
3225
  match = re.search(r'\.out(\d+)\.', gate)
3226
  if match:
3227
  i = int(match.group(1))
3228
  sign = registry.get_id(f"{prefix}.$x[15]")
 
3229
  if '.pos_path' in gate:
3230
- return [registry.get_id(f"{prefix}.lshift_s3_{i}"),
3231
- registry.get_id(f"{prefix}.$x[15]")]
3232
  if '.neg_path' in gate:
3233
  return [registry.get_id(f"{prefix}.neg.fa{i}.xor.layer2"),
3234
  sign]
3235
 
 
 
 
 
 
3236
  match = re.search(r'\.out(\d+)$', gate)
3237
  if match:
3238
  i = int(match.group(1))
@@ -5726,54 +5769,135 @@ def build_float16_toint_tensors() -> Dict[str, torch.Tensor]:
5726
  Convert float16 to signed 16-bit integer (truncate toward zero).
5727
 
5728
  Algorithm:
5729
- 1. If NaN or Inf, return 0 or max/min int
5730
- 2. If |value| < 1, return 0
5731
- 3. Shift mantissa by (exponent - 15 - 10) positions
5732
- 4. Apply sign
 
 
 
 
5733
  """
5734
  tensors = {}
5735
  prefix = "float16.toint"
5736
 
5737
- # Special case detection
 
5738
  tensors[f"{prefix}.exp_all_ones.weight"] = torch.tensor([1.0] * 5)
5739
  tensors[f"{prefix}.exp_all_ones.bias"] = torch.tensor([-5.0])
5740
 
 
5741
  tensors[f"{prefix}.exp_zero.weight"] = torch.tensor([-1.0] * 5)
5742
  tensors[f"{prefix}.exp_zero.bias"] = torch.tensor([0.0])
5743
 
 
5744
  tensors[f"{prefix}.mant_nonzero.weight"] = torch.tensor([1.0] * 10)
5745
  tensors[f"{prefix}.mant_nonzero.bias"] = torch.tensor([-1.0])
5746
 
 
5747
  tensors[f"{prefix}.is_nan.weight"] = torch.tensor([1.0, 1.0])
5748
  tensors[f"{prefix}.is_nan.bias"] = torch.tensor([-2.0])
5749
 
 
5750
  tensors[f"{prefix}.mant_zero.weight"] = torch.tensor([-1.0])
5751
  tensors[f"{prefix}.mant_zero.bias"] = torch.tensor([0.0])
5752
 
 
5753
  tensors[f"{prefix}.is_inf.weight"] = torch.tensor([1.0, 1.0])
5754
  tensors[f"{prefix}.is_inf.bias"] = torch.tensor([-2.0])
5755
 
 
5756
  tensors[f"{prefix}.is_zero.weight"] = torch.tensor([1.0, 1.0])
5757
  tensors[f"{prefix}.is_zero.bias"] = torch.tensor([-2.0])
5758
 
5759
- # Check if exponent < 15 (|value| < 1)
5760
- # exp < 15 means unbiased exp < 0, so result is 0
 
5761
  weights = [-float(2**i) for i in range(5)]
5762
  tensors[f"{prefix}.exp_lt_15.weight"] = torch.tensor(weights)
5763
  tensors[f"{prefix}.exp_lt_15.bias"] = torch.tensor([14.0])
5764
 
 
5765
  tensors[f"{prefix}.result_is_zero.weight"] = torch.tensor([1.0, 1.0, 1.0])
5766
  tensors[f"{prefix}.result_is_zero.bias"] = torch.tensor([-1.0])
5767
 
5768
- # Compute shift amount: exp - 15 - 10 = exp - 25
5769
- # If positive, left shift mantissa; if negative, right shift
5770
- # For int16, max shift is 15 (for values up to 32767)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5771
 
5772
- # exp - 25 subtractor
5773
  for i in range(5):
5774
- tensors[f"{prefix}.not_25_{i}.weight"] = torch.tensor([1.0])
5775
- tensors[f"{prefix}.not_25_{i}.bias"] = torch.tensor([-0.5])
5776
 
 
 
 
5777
  for i in range(6):
5778
  p = f"{prefix}.shift_calc.fa{i}"
5779
  tensors[f"{p}.xor1.layer1.or.weight"] = torch.tensor([1.0, 1.0])
@@ -5797,32 +5921,40 @@ def build_float16_toint_tensors() -> Dict[str, torch.Tensor]:
5797
  tensors[f"{p}.cout.weight"] = torch.tensor([1.0, 1.0])
5798
  tensors[f"{p}.cout.bias"] = torch.tensor([-1.0])
5799
 
5800
- # Barrel shifter (left shift mantissa)
5801
- tensors[f"{prefix}.implicit_bit.weight"] = torch.tensor([-1.0])
5802
- tensors[f"{prefix}.implicit_bit.bias"] = torch.tensor([0.0])
 
5803
 
5804
  for stage in range(4):
5805
  shift_amt = 1 << stage
 
5806
  tensors[f"{prefix}.not_shift{stage}.weight"] = torch.tensor([-1.0])
5807
  tensors[f"{prefix}.not_shift{stage}.bias"] = torch.tensor([0.0])
5808
 
5809
  for i in range(16):
5810
- tensors[f"{prefix}.lshift_s{stage}_{i}.pass.weight"] = torch.tensor([1.0, 1.0])
5811
- tensors[f"{prefix}.lshift_s{stage}_{i}.pass.bias"] = torch.tensor([-2.0])
5812
- if i >= shift_amt:
5813
- tensors[f"{prefix}.lshift_s{stage}_{i}.shift.weight"] = torch.tensor([1.0, 1.0])
5814
- tensors[f"{prefix}.lshift_s{stage}_{i}.shift.bias"] = torch.tensor([-2.0])
5815
- tensors[f"{prefix}.lshift_s{stage}_{i}.weight"] = torch.tensor([1.0, 1.0])
 
 
 
 
5816
  else:
5817
- tensors[f"{prefix}.lshift_s{stage}_{i}.weight"] = torch.tensor([1.0])
5818
- tensors[f"{prefix}.lshift_s{stage}_{i}.bias"] = torch.tensor([-1.0])
 
 
 
 
5819
 
5820
- # Apply sign (negate if negative)
5821
  for i in range(16):
5822
  tensors[f"{prefix}.not_mag{i}.weight"] = torch.tensor([-1.0])
5823
  tensors[f"{prefix}.not_mag{i}.bias"] = torch.tensor([0.0])
5824
 
5825
- # Two's complement negation
5826
  for i in range(16):
5827
  p = f"{prefix}.neg.fa{i}"
5828
  tensors[f"{p}.xor.layer1.or.weight"] = torch.tensor([1.0, 1.0])
@@ -5837,7 +5969,13 @@ def build_float16_toint_tensors() -> Dict[str, torch.Tensor]:
5837
  tensors[f"{p}.cout.weight"] = torch.tensor([1.0, 1.0])
5838
  tensors[f"{p}.cout.bias"] = torch.tensor([-1.0])
5839
 
5840
- # Output selection
 
 
 
 
 
 
5841
  for i in range(16):
5842
  tensors[f"{prefix}.out{i}.pos_path.weight"] = torch.tensor([1.0, 1.0])
5843
  tensors[f"{prefix}.out{i}.pos_path.bias"] = torch.tensor([-2.0])
@@ -6081,12 +6219,17 @@ def main():
6081
 
6082
  print(f"Loaded {len(tensors)} tensors")
6083
 
6084
- # Remove old float16.add tensors (we're rebuilding from scratch)
6085
  old_float16_add = [k for k in tensors.keys() if k.startswith('float16.add')]
6086
  for k in old_float16_add:
6087
  del tensors[k]
6088
  print(f"Removed {len(old_float16_add)} old float16.add tensors")
6089
 
 
 
 
 
 
6090
  # Build new circuits
6091
  print("Building new circuits...")
6092
  clz_tensors = build_clz8bit_tensors()
 
3046
 
3047
 
3048
  def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]:
3049
+ """Infer inputs for float16.toint circuit (with right-shift barrel shifter)."""
3050
  prefix = "float16.toint"
3051
 
3052
  for i in range(16):
 
3055
  exp_bits = [f"{prefix}.$x[{10+i}]" for i in range(5)]
3056
  mant_bits = [f"{prefix}.$x[{i}]" for i in range(10)]
3057
 
3058
+ # === SPECIAL CASE DETECTION ===
3059
  if '.exp_all_ones' in gate:
3060
  return [registry.get_id(b) for b in exp_bits]
3061
  if '.exp_zero' in gate:
 
3081
  if '.is_inf' in gate:
3082
  return [registry.get_id(f"{prefix}.exp_all_ones"),
3083
  registry.get_id(f"{prefix}.mant_zero")]
3084
+ if '.is_zero' in gate and '.not_' not in gate and '.result_is_zero' not in gate:
3085
  return [registry.get_id(f"{prefix}.exp_zero"),
3086
  registry.get_id(f"{prefix}.mant_zero")]
3087
 
 
3093
 
3094
  registry.register(f"{prefix}.exp_lt_15")
3095
 
3096
+ if '.result_is_zero' in gate and '.not_' not in gate:
3097
  return [registry.get_id(f"{prefix}.is_nan"),
3098
  registry.get_id(f"{prefix}.is_zero"),
3099
  registry.get_id(f"{prefix}.exp_lt_15")]
3100
 
3101
  registry.register(f"{prefix}.result_is_zero")
3102
 
3103
+ if '.not_result_is_zero' in gate:
3104
+ return [registry.get_id(f"{prefix}.result_is_zero")]
3105
+
3106
+ registry.register(f"{prefix}.not_result_is_zero")
3107
+
3108
  if '.implicit_bit' in gate:
3109
  return [registry.get_id(f"{prefix}.exp_zero")]
3110
 
3111
  registry.register(f"{prefix}.implicit_bit")
3112
 
3113
+ # === THRESHOLD GATES FOR SHIFT CONTROL ===
3114
+ if '.exp_ge_15' in gate:
3115
+ return [registry.get_id(b) for b in exp_bits]
3116
+ if '.exp_ge_18' in gate:
3117
+ return [registry.get_id(b) for b in exp_bits]
3118
+ if '.exp_le_21' in gate:
3119
+ return [registry.get_id(b) for b in exp_bits]
3120
 
3121
+ registry.register(f"{prefix}.exp_ge_15")
3122
+ registry.register(f"{prefix}.exp_ge_18")
3123
+ registry.register(f"{prefix}.exp_le_21")
3124
+
3125
+ if '.shift_bit3' in gate:
3126
+ return [registry.get_id(b) for b in exp_bits]
3127
+ if '.shift_bit2' in gate:
3128
+ return [registry.get_id(f"{prefix}.exp_ge_18"),
3129
+ registry.get_id(f"{prefix}.exp_le_21")]
3130
 
3131
+ registry.register(f"{prefix}.shift_bit3")
3132
+ registry.register(f"{prefix}.shift_bit2")
3133
+
3134
+ # === NOT OF EXPONENT BITS ===
3135
  for i in range(5):
3136
+ if f'.not_exp{i}' in gate:
3137
+ return [registry.get_id(exp_bits[i])]
3138
+ registry.register(f"{prefix}.not_exp{i}")
3139
+
3140
+ # === SHIFT CALCULATION: 25 - exp = ~exp + 26 ===
3141
+ # 26 = 0b011010
3142
+ const_26 = [0, 1, 0, 1, 1, 0]
3143
 
3144
  if '.shift_calc.fa' in gate:
3145
  match = re.search(r'\.shift_calc\.fa(\d+)\.', gate)
 
3147
  i = int(match.group(1))
3148
  fa_prefix = f"{prefix}.shift_calc.fa{i}"
3149
 
3150
+ # a = ~exp[i] (or 1 for i >= 5)
3151
+ a_bit = registry.get_id(f"{prefix}.not_exp{i}") if i < 5 else registry.get_id("#1")
3152
+ # b = const_26[i]
3153
+ b_bit = registry.get_id(f"#{const_26[i]}")
3154
+ cin = registry.get_id("#0") if i == 0 else registry.register(f"{prefix}.shift_calc.fa{i-1}.cout")
3155
 
3156
  if '.xor1.layer1' in gate:
3157
  return [a_bit, b_bit]
 
3176
  registry.register(f"{prefix}.shift_calc.fa{i}.xor2.layer2")
3177
  registry.register(f"{prefix}.shift_calc.fa{i}.cout")
3178
 
3179
+ # === RIGHT-SHIFT BARREL SHIFTER ===
3180
  for stage in range(4):
3181
  shift_amt = 1 << stage
3182
 
 
3184
  return [registry.get_id(f"{prefix}.shift_calc.fa{stage}.xor2.layer2")]
3185
  registry.register(f"{prefix}.not_shift{stage}")
3186
 
3187
+ match = re.search(rf'\.rshift_s{stage}_(\d+)\.', gate)
3188
  if match:
3189
  i = int(match.group(1))
3190
+ src_pos = i + shift_amt
3191
+
3192
  if '.pass' in gate:
3193
+ # Current value (from previous stage or input)
3194
  if stage == 0:
3195
  if i < 10:
3196
  val = registry.get_id(mant_bits[i])
 
3199
  else:
3200
  val = registry.get_id("#0")
3201
  else:
3202
+ val = registry.get_id(f"{prefix}.rshift_s{stage-1}_{i}")
3203
  return [val, registry.get_id(f"{prefix}.not_shift{stage}")]
3204
+
3205
+ if '.shift' in gate and src_pos < 16:
3206
+ # Value from higher position
3207
  if stage == 0:
3208
+ if src_pos < 10:
3209
+ val = registry.get_id(mant_bits[src_pos])
3210
+ elif src_pos == 10:
 
3211
  val = registry.get_id(f"{prefix}.implicit_bit")
3212
  else:
3213
  val = registry.get_id("#0")
3214
  else:
3215
+ val = registry.get_id(f"{prefix}.rshift_s{stage-1}_{src_pos}")
3216
  return [val, registry.get_id(f"{prefix}.shift_calc.fa{stage}.xor2.layer2")]
3217
 
3218
+ match = re.search(rf'\.rshift_s{stage}_(\d+)$', gate)
3219
  if match:
3220
  i = int(match.group(1))
3221
+ src_pos = i + shift_amt
3222
+ if src_pos < 16:
3223
+ return [registry.register(f"{prefix}.rshift_s{stage}_{i}.pass"),
3224
+ registry.register(f"{prefix}.rshift_s{stage}_{i}.shift")]
3225
  else:
3226
+ return [registry.register(f"{prefix}.rshift_s{stage}_{i}.pass")]
3227
 
3228
  for i in range(16):
3229
+ registry.register(f"{prefix}.rshift_s{stage}_{i}")
3230
 
3231
+ # === NEGATION ===
3232
  for i in range(16):
3233
  if f'.not_mag{i}' in gate:
3234
+ return [registry.get_id(f"{prefix}.rshift_s3_{i}")]
3235
  registry.register(f"{prefix}.not_mag{i}")
3236
 
3237
  if '.neg.fa' in gate:
 
3258
  registry.register(f"{prefix}.neg.fa{i}.xor.layer2")
3259
  registry.register(f"{prefix}.neg.fa{i}.cout")
3260
 
3261
+ # === OUTPUT SELECTION ===
3262
  match = re.search(r'\.out(\d+)\.', gate)
3263
  if match:
3264
  i = int(match.group(1))
3265
  sign = registry.get_id(f"{prefix}.$x[15]")
3266
+ not_sign = registry.register(f"{prefix}.not_sign")
3267
  if '.pos_path' in gate:
3268
+ return [registry.get_id(f"{prefix}.rshift_s3_{i}"),
3269
+ not_sign]
3270
  if '.neg_path' in gate:
3271
  return [registry.get_id(f"{prefix}.neg.fa{i}.xor.layer2"),
3272
  sign]
3273
 
3274
+ # not_sign gate
3275
+ if '.not_sign' in gate:
3276
+ return [registry.get_id(f"{prefix}.$x[15]")]
3277
+ registry.register(f"{prefix}.not_sign")
3278
+
3279
  match = re.search(r'\.out(\d+)$', gate)
3280
  if match:
3281
  i = int(match.group(1))
 
5769
  Convert float16 to signed 16-bit integer (truncate toward zero).
5770
 
5771
  Algorithm:
5772
+ 1. Extract mantissa M with implicit bit (11 bits, bit 10 = implicit 1)
5773
+ 2. For exp < 15: result = 0 (|value| < 1)
5774
+ 3. For exp >= 15: right-shift M by (25 - exp) positions
5775
+ - exp = 15: shift by 10, result = 1 for normalized
5776
+ - exp = 25: shift by 0, result = M (up to 2047)
5777
+ - exp > 25: would need left shift, but limited range
5778
+ 4. Apply sign via two's complement negation
5779
+ 5. Handle special cases: NaN, Inf, overflow
5780
  """
5781
  tensors = {}
5782
  prefix = "float16.toint"
5783
 
5784
+ # === SPECIAL CASE DETECTION ===
5785
+ # exp_all_ones: exponent = 31 (NaN or Inf)
5786
  tensors[f"{prefix}.exp_all_ones.weight"] = torch.tensor([1.0] * 5)
5787
  tensors[f"{prefix}.exp_all_ones.bias"] = torch.tensor([-5.0])
5788
 
5789
+ # exp_zero: exponent = 0 (zero or subnormal)
5790
  tensors[f"{prefix}.exp_zero.weight"] = torch.tensor([-1.0] * 5)
5791
  tensors[f"{prefix}.exp_zero.bias"] = torch.tensor([0.0])
5792
 
5793
+ # mant_nonzero: any mantissa bit set
5794
  tensors[f"{prefix}.mant_nonzero.weight"] = torch.tensor([1.0] * 10)
5795
  tensors[f"{prefix}.mant_nonzero.bias"] = torch.tensor([-1.0])
5796
 
5797
+ # is_nan: exp=31 AND mant!=0
5798
  tensors[f"{prefix}.is_nan.weight"] = torch.tensor([1.0, 1.0])
5799
  tensors[f"{prefix}.is_nan.bias"] = torch.tensor([-2.0])
5800
 
5801
+ # mant_zero: NOT mant_nonzero
5802
  tensors[f"{prefix}.mant_zero.weight"] = torch.tensor([-1.0])
5803
  tensors[f"{prefix}.mant_zero.bias"] = torch.tensor([0.0])
5804
 
5805
+ # is_inf: exp=31 AND mant=0
5806
  tensors[f"{prefix}.is_inf.weight"] = torch.tensor([1.0, 1.0])
5807
  tensors[f"{prefix}.is_inf.bias"] = torch.tensor([-2.0])
5808
 
5809
+ # is_zero: exp=0 AND mant=0
5810
  tensors[f"{prefix}.is_zero.weight"] = torch.tensor([1.0, 1.0])
5811
  tensors[f"{prefix}.is_zero.bias"] = torch.tensor([-2.0])
5812
 
5813
+ # === CHECK IF |VALUE| < 1 ===
5814
+ # exp < 15 means unbiased exponent < 0, so |value| < 1
5815
+ # Use threshold: sum(exp[i] * 2^i) < 15
5816
  weights = [-float(2**i) for i in range(5)]
5817
  tensors[f"{prefix}.exp_lt_15.weight"] = torch.tensor(weights)
5818
  tensors[f"{prefix}.exp_lt_15.bias"] = torch.tensor([14.0])
5819
 
5820
+ # result_is_zero: exp_zero OR exp_lt_15 OR is_nan
5821
  tensors[f"{prefix}.result_is_zero.weight"] = torch.tensor([1.0, 1.0, 1.0])
5822
  tensors[f"{prefix}.result_is_zero.bias"] = torch.tensor([-1.0])
5823
 
5824
+ # not_result_is_zero for muxing
5825
+ tensors[f"{prefix}.not_result_is_zero.weight"] = torch.tensor([-1.0])
5826
+ tensors[f"{prefix}.not_result_is_zero.bias"] = torch.tensor([0.0])
5827
+
5828
+ # === COMPUTE SHIFT AMOUNT: 25 - exp ===
5829
+ # For right shift: shift_amt = 25 - exp (need 0 to 10 for normal range)
5830
+ # 25 = 0b11001, so we compute NOT(exp) + 25 + 1 = NOT(exp) + 26
5831
+ # Actually simpler: use 25 - exp directly with threshold gates
5832
+
5833
+ # We'll use a different approach: compute exp directly and use threshold
5834
+ # gates to determine shift amount bits
5835
+
5836
+ # Implicit bit (always 1 for normalized numbers, 0 for subnormals)
5837
+ # implicit = NOT exp_zero
5838
+ tensors[f"{prefix}.implicit_bit.weight"] = torch.tensor([-1.0])
5839
+ tensors[f"{prefix}.implicit_bit.bias"] = torch.tensor([0.0])
5840
+
5841
+ # === DIRECT SHIFT USING EXPONENT VALUE ===
5842
+ # For exp in range 15-25, shift right by (25-exp)
5843
+ # For exp >= 25, no shift or left shift (overflow territory)
5844
+ #
5845
+ # Shift amounts needed: 0-10 for exp 25-15
5846
+ # shift[0] = 1 if (25-exp) is odd = exp is even when exp in {15,17,19,21,23,25}
5847
+ # This is complex. Let's use threshold gates on exp value.
5848
+
5849
+ # exp_ge_15: exp >= 15 (value >= 1)
5850
+ tensors[f"{prefix}.exp_ge_15.weight"] = torch.tensor([float(2**i) for i in range(5)])
5851
+ tensors[f"{prefix}.exp_ge_15.bias"] = torch.tensor([-15.0])
5852
+
5853
+ # For each shift stage, determine if we should shift
5854
+ # Right shift by 2^k if bit k of (25-exp) is set
5855
+ # 25 - exp for exp in [15, 25]: shift in [10, 0]
5856
+ # Binary of shift amounts:
5857
+ # exp=15: shift=10 = 0b1010
5858
+ # exp=16: shift=9 = 0b1001
5859
+ # exp=17: shift=8 = 0b1000
5860
+ # exp=18: shift=7 = 0b0111
5861
+ # exp=19: shift=6 = 0b0110
5862
+ # exp=20: shift=5 = 0b0101
5863
+ # exp=21: shift=4 = 0b0100
5864
+ # exp=22: shift=3 = 0b0011
5865
+ # exp=23: shift=2 = 0b0010
5866
+ # exp=24: shift=1 = 0b0001
5867
+ # exp=25: shift=0 = 0b0000
5868
+
5869
+ # Use threshold on exp to determine shift control bits
5870
+ # shift_bit3 (shift by 8): exp <= 17 (shift >= 8)
5871
+ tensors[f"{prefix}.shift_bit3.weight"] = torch.tensor([-float(2**i) for i in range(5)])
5872
+ tensors[f"{prefix}.shift_bit3.bias"] = torch.tensor([17.0])
5873
+
5874
+ # shift_bit2 (shift by 4): (exp <= 17) OR (18 <= exp <= 21)
5875
+ # = exp <= 21 AND NOT (18 <= exp <= 21 AND exp > 17)... complex
5876
+ # Simpler: shift_bit2 = 1 when shift in {4,5,6,7,12,13,14,15} ∩ [0,10] = {4,5,6,7}
5877
+ # = exp in {18,19,20,21}
5878
+ # Use: exp >= 18 AND exp <= 21
5879
+ tensors[f"{prefix}.exp_ge_18.weight"] = torch.tensor([float(2**i) for i in range(5)])
5880
+ tensors[f"{prefix}.exp_ge_18.bias"] = torch.tensor([-18.0])
5881
+ tensors[f"{prefix}.exp_le_21.weight"] = torch.tensor([-float(2**i) for i in range(5)])
5882
+ tensors[f"{prefix}.exp_le_21.bias"] = torch.tensor([21.0])
5883
+ tensors[f"{prefix}.shift_bit2.weight"] = torch.tensor([1.0, 1.0])
5884
+ tensors[f"{prefix}.shift_bit2.bias"] = torch.tensor([-2.0])
5885
+
5886
+ # shift_bit1 (shift by 2): shift in {2,3,6,7,10,11,...} ∩ [0,10] = {2,3,6,7,10}
5887
+ # = exp in {15,19,22,23} -- this is getting complex
5888
+ # Let's use a simpler direct threshold approach
5889
+
5890
+ # Actually, let's compute 25-exp using subtraction, then use those bits
5891
+ # 25 = 0b011001 (6 bits), exp is 5 bits
5892
+ # 25 - exp in two's complement
5893
 
 
5894
  for i in range(5):
5895
+ tensors[f"{prefix}.not_exp{i}.weight"] = torch.tensor([-1.0])
5896
+ tensors[f"{prefix}.not_exp{i}.bias"] = torch.tensor([0.0])
5897
 
5898
+ # 25 - exp = 25 + (~exp) + 1 = 26 + ~exp (in binary)
5899
+ # 26 = 0b011010
5900
+ const_26 = [0, 1, 0, 1, 1, 0] # bits of 26
5901
  for i in range(6):
5902
  p = f"{prefix}.shift_calc.fa{i}"
5903
  tensors[f"{p}.xor1.layer1.or.weight"] = torch.tensor([1.0, 1.0])
 
5921
  tensors[f"{p}.cout.weight"] = torch.tensor([1.0, 1.0])
5922
  tensors[f"{p}.cout.bias"] = torch.tensor([-1.0])
5923
 
5924
+ # === RIGHT-SHIFT BARREL SHIFTER ===
5925
+ # 4 stages for shifts of 1, 2, 4, 8
5926
+ # Input: mantissa (10 bits) + implicit bit at position 10 = 11 bits
5927
+ # We'll work with 16 bits to have room
5928
 
5929
  for stage in range(4):
5930
  shift_amt = 1 << stage
5931
+ # NOT of shift control bit for mux
5932
  tensors[f"{prefix}.not_shift{stage}.weight"] = torch.tensor([-1.0])
5933
  tensors[f"{prefix}.not_shift{stage}.bias"] = torch.tensor([0.0])
5934
 
5935
  for i in range(16):
5936
+ # pass: keep current position (AND with NOT shift_bit)
5937
+ tensors[f"{prefix}.rshift_s{stage}_{i}.pass.weight"] = torch.tensor([1.0, 1.0])
5938
+ tensors[f"{prefix}.rshift_s{stage}_{i}.pass.bias"] = torch.tensor([-2.0])
5939
+
5940
+ # shift: take from higher position (AND with shift_bit)
5941
+ src_pos = i + shift_amt
5942
+ if src_pos < 16:
5943
+ tensors[f"{prefix}.rshift_s{stage}_{i}.shift.weight"] = torch.tensor([1.0, 1.0])
5944
+ tensors[f"{prefix}.rshift_s{stage}_{i}.shift.bias"] = torch.tensor([-2.0])
5945
+ tensors[f"{prefix}.rshift_s{stage}_{i}.weight"] = torch.tensor([1.0, 1.0])
5946
  else:
5947
+ # Shift in 0 from above
5948
+ tensors[f"{prefix}.rshift_s{stage}_{i}.weight"] = torch.tensor([1.0])
5949
+ tensors[f"{prefix}.rshift_s{stage}_{i}.bias"] = torch.tensor([-1.0])
5950
+
5951
+ # === TWO'S COMPLEMENT NEGATION FOR NEGATIVE FLOATS ===
5952
+ # If sign bit is 1, negate the result
5953
 
 
5954
  for i in range(16):
5955
  tensors[f"{prefix}.not_mag{i}.weight"] = torch.tensor([-1.0])
5956
  tensors[f"{prefix}.not_mag{i}.bias"] = torch.tensor([0.0])
5957
 
 
5958
  for i in range(16):
5959
  p = f"{prefix}.neg.fa{i}"
5960
  tensors[f"{p}.xor.layer1.or.weight"] = torch.tensor([1.0, 1.0])
 
5969
  tensors[f"{p}.cout.weight"] = torch.tensor([1.0, 1.0])
5970
  tensors[f"{p}.cout.bias"] = torch.tensor([-1.0])
5971
 
5972
+ # === OUTPUT SELECTION ===
5973
+ # Select between positive path, negative path, and zero
5974
+
5975
+ # NOT of sign bit for muxing positive path
5976
+ tensors[f"{prefix}.not_sign.weight"] = torch.tensor([-1.0])
5977
+ tensors[f"{prefix}.not_sign.bias"] = torch.tensor([0.0])
5978
+
5979
  for i in range(16):
5980
  tensors[f"{prefix}.out{i}.pos_path.weight"] = torch.tensor([1.0, 1.0])
5981
  tensors[f"{prefix}.out{i}.pos_path.bias"] = torch.tensor([-2.0])
 
6219
 
6220
  print(f"Loaded {len(tensors)} tensors")
6221
 
6222
+ # Remove old tensors for circuits we're rebuilding
6223
  old_float16_add = [k for k in tensors.keys() if k.startswith('float16.add')]
6224
  for k in old_float16_add:
6225
  del tensors[k]
6226
  print(f"Removed {len(old_float16_add)} old float16.add tensors")
6227
 
6228
+ old_float16_toint = [k for k in tensors.keys() if k.startswith('float16.toint')]
6229
+ for k in old_float16_toint:
6230
+ del tensors[k]
6231
+ print(f"Removed {len(old_float16_toint)} old float16.toint tensors")
6232
+
6233
  # Build new circuits
6234
  print("Building new circuits...")
6235
  clz_tensors = build_clz8bit_tensors()