Fix toint, fromint; improve mul/div inference

- float16.toint: 93/93 (replaced not_mag substring match with an anchored regex, gated pos_path/neg_path outputs with not_result_is_zero)
- float16.fromint: 53/53 (replaced clz_and/not_in substring matches with anchored regexes, added not_is_zero gate for the output bits)
- float16.mul: 3/84 -> 13/84 (fixed not_15_bits bit order to little-endian, column carry now uses ge2 instead of ge1, canonical NaN 0x7E00 now sets bit 9)
- float16.div: 2/53 -> 5/53 (canonical NaN 0x7E00 now sets bit 9)

Remaining: mul and div still fail most tests because col_sum is imprecise; fixing this needs full adder trees

Files changed (3) hide show

TODO.md +4 -4
arithmetic.safetensors +2 -2
convert_to_explicit_inputs.py +63 -21

TODO.md CHANGED Viewed

@@ -9,10 +9,10 @@
 - [x] `float16.normalize` -- CLZ-based shift calculator (51 gates, 14/14 tests)
 - [x] `float16.add` -- IEEE 754 addition (~998 gates, 125/125 tests)
 - [x] `float16.sub` -- IEEE 754 subtraction (via add with -b, 115/115 tests)
-- [ ] `float16.mul` -- IEEE 754 multiplication (766 gates, 3/84 tests, algorithm bugs)
-- [ ] `float16.div` -- IEEE 754 division (1854 gates, 2/53 tests, algorithm bugs)
-- [ ] `float16.toint` -- float16 to int16 (401 gates, 54/93 tests, debugging shift logic)
-- [ ] `float16.fromint` -- int16 to float16 (478 gates, 1/53 tests, algorithm bugs)
 - [x] `float16.neg` -- sign flip (16 gates, 58/58 tests)
 - [x] `float16.abs` -- clear sign bit (16 gates, 58/58 tests)

 - [x] `float16.normalize` -- CLZ-based shift calculator (51 gates, 14/14 tests)
 - [x] `float16.add` -- IEEE 754 addition (~998 gates, 125/125 tests)
 - [x] `float16.sub` -- IEEE 754 subtraction (via add with -b, 115/115 tests)
+- [ ] `float16.mul` -- IEEE 754 multiplication (766 gates, 13/84 tests, col_sum precision)
+- [ ] `float16.div` -- IEEE 754 division (1854 gates, 5/53 tests, col_sum precision)
+- [x] `float16.toint` -- float16 to int16 (401 gates, 93/93 tests)
+- [x] `float16.fromint` -- int16 to float16 (478 gates, 53/53 tests)
 - [x] `float16.neg` -- sign flip (16 gates, 58/58 tests)
 - [x] `float16.abs` -- clear sign bit (16 gates, 58/58 tests)

arithmetic.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c7bc258f49a9a4c85321d0980f843bb989e8fb84c4d8bc65883f24c6e306334
-size 2863492

 version https://git-lfs.github.com/spec/v1
+oid sha256:3bb8fad90726b27a8bd9502c5cb4154242a5f8a6d046c4ba69470be55bb98624
+size 2865388

convert_to_explicit_inputs.py CHANGED Viewed

@@ -2441,12 +2441,25 @@ def infer_float16_mul_inputs(gate: str, registry: SignalRegistry) -> List[int]:
                     return registry.get_id(f"{prefix}.col{col}_sum")
                 return registry.get_id("#0")
-            def get_col_ge1(col):
-                # ge1 only exists for columns with >= 2 PPs
                 if col == 0 or col == 20:
                     return registry.get_id("#0")  # No carry from single PP columns
                 elif col < 21:
-                    return registry.get_id(f"{prefix}.col{col}_ge1")
                 return registry.get_id("#0")
             if i == 0:
@@ -2455,7 +2468,7 @@ def infer_float16_mul_inputs(gate: str, registry: SignalRegistry) -> List[int]:
                 cin = registry.get_id("#0")
             else:
                 a_bit = get_col_sum(i) if i < 21 else registry.get_id("#0")
-                b_bit = get_col_ge1(i - 1) if i < 22 else registry.get_id("#0")
                 cin = registry.register(f"{prefix}.prod_fa{i-1}.cout")
             if '.xor1.layer1' in gate:
@@ -2514,7 +2527,8 @@ def infer_float16_mul_inputs(gate: str, registry: SignalRegistry) -> List[int]:
         registry.register(f"{prefix}.exp_add.fa{i}.xor2.layer2")
         registry.register(f"{prefix}.exp_add.fa{i}.cout")
-    not_15_bits = [1, 1, 1, 1, 0, 0]
     if '.exp_sub.fa' in gate:
         match = re.search(r'\.exp_sub\.fa(\d+)\.', gate)
         if match:
@@ -2649,9 +2663,11 @@ def infer_float16_mul_inputs(gate: str, registry: SignalRegistry) -> List[int]:
     if match:
         i = int(match.group(1))
         if '.nan_gate' in gate:
-            nan_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [nan_bit, registry.get_id(f"{prefix}.result_is_nan")]
         if '.inf_gate' in gate:
             inf_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [inf_bit, registry.get_id(f"{prefix}.result_is_inf")]
         if '.zero_gate' in gate:
@@ -3018,9 +3034,11 @@ def infer_float16_div_inputs(gate: str, registry: SignalRegistry) -> List[int]:
     if match:
         i = int(match.group(1))
         if '.nan_gate' in gate:
-            nan_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [nan_bit, registry.get_id(f"{prefix}.result_is_nan")]
         if '.inf_gate' in gate:
             inf_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [inf_bit, registry.get_id(f"{prefix}.result_is_inf")]
         if '.zero_gate' in gate:
@@ -3229,9 +3247,12 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
             registry.register(f"{prefix}.rshift_s{stage}_{i}")
     # === NEGATION ===
     for i in range(16):
-        if f'.not_mag{i}' in gate:
-            return [registry.get_id(f"{prefix}.rshift_s3_{i}")]
         registry.register(f"{prefix}.not_mag{i}")
     if '.neg.fa' in gate:
@@ -3264,12 +3285,15 @@ def infer_float16_toint_inputs(gate: str, registry: SignalRegistry) -> List[int]
         i = int(match.group(1))
         sign = registry.get_id(f"{prefix}.$x[15]")
         not_sign = registry.register(f"{prefix}.not_sign")
         if '.pos_path' in gate:
             return [registry.get_id(f"{prefix}.rshift_s3_{i}"),
-                    not_sign]
         if '.neg_path' in gate:
             return [registry.get_id(f"{prefix}.neg.fa{i}.xor.layer2"),
-                    sign]
     # not_sign gate
     if '.not_sign' in gate:
@@ -3294,6 +3318,8 @@ def infer_float16_fromint_inputs(gate: str, registry: SignalRegistry) -> List[in
     in_bits = [f"{prefix}.$x[{i}]" for i in range(16)]
     if '.is_zero' in gate:
         return [registry.get_id(b) for b in in_bits]
     if '.is_negative' in gate:
@@ -3302,12 +3328,16 @@ def infer_float16_fromint_inputs(gate: str, registry: SignalRegistry) -> List[in
         return [registry.get_id(f"{prefix}.is_negative")]
     registry.register(f"{prefix}.is_zero")
     registry.register(f"{prefix}.is_negative")
     registry.register(f"{prefix}.not_negative")
     for i in range(16):
-        if f'.not_in{i}' in gate:
-            return [registry.get_id(in_bits[i])]
         registry.register(f"{prefix}.not_in{i}")
     if '.abs.fa' in gate:
@@ -3421,10 +3451,14 @@ def infer_float16_fromint_inputs(gate: str, registry: SignalRegistry) -> List[in
     registry.register(f"{prefix}.clz_and_14_15")
     registry.register(f"{prefix}.clz1")
-    for i in [1, 3, 5, 7, 9, 11, 13, 15]:
-        if f'.clz_and_{i}' in gate:
             return [registry.get_id(f"{prefix}.ge{i}"),
                     registry.get_id(f"{prefix}.not_ge{i+1}")]
         registry.register(f"{prefix}.clz_and_{i}")
     if '.clz0' in gate:
@@ -3519,7 +3553,7 @@ def infer_float16_fromint_inputs(gate: str, registry: SignalRegistry) -> List[in
                 val = registry.get_id(f"{prefix}.exp_calc.fa{i-10}.xor2.layer2")
             else:
                 val = registry.get_id(f"{prefix}.is_negative")
-            not_zero = registry.get_id(f"{prefix}.is_zero")
             return [val, not_zero]
     match = re.search(r'\.out(\d+)$', gate)
@@ -5971,16 +6005,20 @@ def build_float16_toint_tensors() -> Dict[str, torch.Tensor]:
     # === OUTPUT SELECTION ===
     # Select between positive path, negative path, and zero
     # NOT of sign bit for muxing positive path
     tensors[f"{prefix}.not_sign.weight"] = torch.tensor([-1.0])
     tensors[f"{prefix}.not_sign.bias"] = torch.tensor([0.0])
     for i in range(16):
-        tensors[f"{prefix}.out{i}.pos_path.weight"] = torch.tensor([1.0, 1.0])
-        tensors[f"{prefix}.out{i}.pos_path.bias"] = torch.tensor([-2.0])
-        tensors[f"{prefix}.out{i}.neg_path.weight"] = torch.tensor([1.0, 1.0])
-        tensors[f"{prefix}.out{i}.neg_path.bias"] = torch.tensor([-2.0])
         tensors[f"{prefix}.out{i}.weight"] = torch.tensor([1.0, 1.0])
         tensors[f"{prefix}.out{i}.bias"] = torch.tensor([-1.0])
@@ -6005,6 +6043,10 @@ def build_float16_fromint_tensors() -> Dict[str, torch.Tensor]:
     tensors[f"{prefix}.is_zero.weight"] = torch.tensor([-1.0] * 16)
     tensors[f"{prefix}.is_zero.bias"] = torch.tensor([0.0])
     # Check if negative (sign bit)
     tensors[f"{prefix}.is_negative.weight"] = torch.tensor([1.0])
     tensors[f"{prefix}.is_negative.bias"] = torch.tensor([-0.5])
@@ -6050,7 +6092,7 @@ def build_float16_fromint_tensors() -> Dict[str, torch.Tensor]:
         tensors[f"{prefix}.ge{k}.bias"] = torch.tensor([-float(k)])
     # CLZ binary encoding
-    for k in [2, 4, 8, 16]:
         tensors[f"{prefix}.not_ge{k}.weight"] = torch.tensor([-1.0])
         tensors[f"{prefix}.not_ge{k}.bias"] = torch.tensor([0.0])

                     return registry.get_id(f"{prefix}.col{col}_sum")
                 return registry.get_id("#0")
+            def get_col_carry(col):
+                # Carry from column = sum >= 2, which is ge2
+                # For columns with count >= 3, ge2 exists
+                # For columns with count == 2, no ge2 (but carry is rare anyway)
+                # For single-bit columns, no carry possible
                 if col == 0 or col == 20:
                     return registry.get_id("#0")  # No carry from single PP columns
                 elif col < 21:
+                    # ge2 exists for columns with 3+ PPs
+                    # For 2-PP columns (col 1 and col 19), ge2 doesn't exist
+                    # but those columns can only produce carry if both PPs are 1,
+                    # which is relatively rare. For now, we use #0 for 2-PP columns.
+                    ge2_name = f"{prefix}.col{col}_ge2"
+                    ge2_id = registry.get_id(ge2_name)
+                    if ge2_id != -1:
+                        return ge2_id
+                    else:
+                        # 2-PP columns: no ge2, return 0 (imprecise but safe)
+                        return registry.get_id("#0")
                 return registry.get_id("#0")
             if i == 0:
                 cin = registry.get_id("#0")
             else:
                 a_bit = get_col_sum(i) if i < 21 else registry.get_id("#0")
+                b_bit = get_col_carry(i - 1) if i < 22 else registry.get_id("#0")
                 cin = registry.register(f"{prefix}.prod_fa{i-1}.cout")
             if '.xor1.layer1' in gate:
         registry.register(f"{prefix}.exp_add.fa{i}.xor2.layer2")
         registry.register(f"{prefix}.exp_add.fa{i}.cout")
+    # NOT(15) = NOT(001111) = 110000 in 6-bit, little-endian: [0, 0, 0, 0, 1, 1]
+    not_15_bits = [0, 0, 0, 0, 1, 1]
     if '.exp_sub.fa' in gate:
         match = re.search(r'\.exp_sub\.fa(\d+)\.', gate)
         if match:
     if match:
         i = int(match.group(1))
         if '.nan_gate' in gate:
+            # Canonical NaN = 0x7E00 = 0_11111_1000000000, bits 9-14 are 1
+            nan_bit = registry.get_id("#1") if (i >= 9 and i < 15) else registry.get_id("#0")
             return [nan_bit, registry.get_id(f"{prefix}.result_is_nan")]
         if '.inf_gate' in gate:
+            # Inf = 0x7C00 = 0_11111_0000000000, bits 10-14 are 1
             inf_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [inf_bit, registry.get_id(f"{prefix}.result_is_inf")]
         if '.zero_gate' in gate:
     if match:
         i = int(match.group(1))
         if '.nan_gate' in gate:
+            # Canonical NaN = 0x7E00 = 0_11111_1000000000, bits 9-14 are 1
+            nan_bit = registry.get_id("#1") if (i >= 9 and i < 15) else registry.get_id("#0")
             return [nan_bit, registry.get_id(f"{prefix}.result_is_nan")]
         if '.inf_gate' in gate:
+            # Inf = 0x7C00 = 0_11111_0000000000, bits 10-14 are 1
             inf_bit = registry.get_id("#1") if i >= 10 and i < 15 else registry.get_id("#0")
             return [inf_bit, registry.get_id(f"{prefix}.result_is_inf")]
         if '.zero_gate' in gate:
             registry.register(f"{prefix}.rshift_s{stage}_{i}")
     # === NEGATION ===
+    match = re.search(r'\.not_mag(\d+)$', gate)
+    if match:
+        i = int(match.group(1))
+        return [registry.get_id(f"{prefix}.rshift_s3_{i}")]
     for i in range(16):
         registry.register(f"{prefix}.not_mag{i}")
     if '.neg.fa' in gate:
         i = int(match.group(1))
         sign = registry.get_id(f"{prefix}.$x[15]")
         not_sign = registry.register(f"{prefix}.not_sign")
+        not_result_zero = registry.get_id(f"{prefix}.not_result_is_zero")
         if '.pos_path' in gate:
             return [registry.get_id(f"{prefix}.rshift_s3_{i}"),
+                    not_sign,
+                    not_result_zero]
         if '.neg_path' in gate:
             return [registry.get_id(f"{prefix}.neg.fa{i}.xor.layer2"),
+                    sign,
+                    not_result_zero]
     # not_sign gate
     if '.not_sign' in gate:
     in_bits = [f"{prefix}.$x[{i}]" for i in range(16)]
+    if '.not_is_zero' in gate:
+        return [registry.get_id(f"{prefix}.is_zero")]
     if '.is_zero' in gate:
         return [registry.get_id(b) for b in in_bits]
     if '.is_negative' in gate:
         return [registry.get_id(f"{prefix}.is_negative")]
     registry.register(f"{prefix}.is_zero")
+    registry.register(f"{prefix}.not_is_zero")
     registry.register(f"{prefix}.is_negative")
     registry.register(f"{prefix}.not_negative")
+    match = re.search(r'\.not_in(\d+)$', gate)
+    if match:
+        i = int(match.group(1))
+        return [registry.get_id(in_bits[i])]
     for i in range(16):
         registry.register(f"{prefix}.not_in{i}")
     if '.abs.fa' in gate:
     registry.register(f"{prefix}.clz_and_14_15")
     registry.register(f"{prefix}.clz1")
+    match = re.search(r'\.clz_and_(\d+)$', gate)
+    if match:
+        i = int(match.group(1))
+        if i in [1, 3, 5, 7, 9, 11, 13, 15]:
             return [registry.get_id(f"{prefix}.ge{i}"),
                     registry.get_id(f"{prefix}.not_ge{i+1}")]
+    for i in [1, 3, 5, 7, 9, 11, 13, 15]:
         registry.register(f"{prefix}.clz_and_{i}")
     if '.clz0' in gate:
                 val = registry.get_id(f"{prefix}.exp_calc.fa{i-10}.xor2.layer2")
             else:
                 val = registry.get_id(f"{prefix}.is_negative")
+            not_zero = registry.get_id(f"{prefix}.not_is_zero")
             return [val, not_zero]
     match = re.search(r'\.out(\d+)$', gate)
     # === OUTPUT SELECTION ===
     # Select between positive path, negative path, and zero
+    # Gate by not_result_is_zero to force output to 0 for |value| < 1
     # NOT of sign bit for muxing positive path
     tensors[f"{prefix}.not_sign.weight"] = torch.tensor([-1.0])
     tensors[f"{prefix}.not_sign.bias"] = torch.tensor([0.0])
     for i in range(16):
+        # pos_path = shifted_value AND not_sign AND not_result_is_zero
+        tensors[f"{prefix}.out{i}.pos_path.weight"] = torch.tensor([1.0, 1.0, 1.0])
+        tensors[f"{prefix}.out{i}.pos_path.bias"] = torch.tensor([-3.0])
+        # neg_path = negated_value AND sign AND not_result_is_zero
+        tensors[f"{prefix}.out{i}.neg_path.weight"] = torch.tensor([1.0, 1.0, 1.0])
+        tensors[f"{prefix}.out{i}.neg_path.bias"] = torch.tensor([-3.0])
+        # out = pos_path OR neg_path
         tensors[f"{prefix}.out{i}.weight"] = torch.tensor([1.0, 1.0])
         tensors[f"{prefix}.out{i}.bias"] = torch.tensor([-1.0])
     tensors[f"{prefix}.is_zero.weight"] = torch.tensor([-1.0] * 16)
     tensors[f"{prefix}.is_zero.bias"] = torch.tensor([0.0])
+    # NOT is_zero for gating normal output
+    tensors[f"{prefix}.not_is_zero.weight"] = torch.tensor([-1.0])
+    tensors[f"{prefix}.not_is_zero.bias"] = torch.tensor([0.0])
     # Check if negative (sign bit)
     tensors[f"{prefix}.is_negative.weight"] = torch.tensor([1.0])
     tensors[f"{prefix}.is_negative.bias"] = torch.tensor([-0.5])
         tensors[f"{prefix}.ge{k}.bias"] = torch.tensor([-float(k)])
     # CLZ binary encoding
+    for k in [2, 4, 6, 8, 10, 12, 14, 16]:
         tensors[f"{prefix}.not_ge{k}.weight"] = torch.tensor([-1.0])
         tensors[f"{prefix}.not_ge{k}.bias"] = torch.tensor([0.0])