phanerozoic
/

8bit-threshold-computer

@@ -1,606 +0,0 @@
-"""
-Circuit-Augmented LLM: Embedding threshold logic circuits into SmolLM2
-======================================================================
-Replaces/augments MLP layers with frozen threshold circuits for exact arithmetic.
-"""
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Dict, Optional, Tuple
-from safetensors.torch import load_file
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import warnings
-warnings.filterwarnings('ignore')
-# =============================================================================
-# HEAVISIDE WITH STRAIGHT-THROUGH ESTIMATOR
-# =============================================================================
-class HeavisideSTE(torch.autograd.Function):
-    """Binary step activation whose backward pass is the identity (straight-through estimator)."""
-    @staticmethod
-    def forward(ctx, x):
-        # 1.0 wherever x is non-negative, 0.0 elsewhere; the result is float32.
-        fired = torch.ge(x, 0)
-        return fired.to(torch.float32)
-    @staticmethod
-    def backward(ctx, grad_output):
-        # Straight-through: hand the upstream gradient back untouched instead
-        # of differentiating the step itself.
-        return grad_output
-def heaviside(x: torch.Tensor) -> torch.Tensor:
-    """Threshold x at zero (1 where x >= 0, otherwise 0) through HeavisideSTE, so gradients pass straight through."""
-    step = HeavisideSTE.apply
-    return step(x)
-# =============================================================================
-# CIRCUIT EXECUTOR - Runs the frozen threshold circuits
-# =============================================================================
-class CircuitExecutor(nn.Module):
-    """
-    Evaluates threshold-logic gates whose weights are read from a safetensors file.
-    Every tensor is registered as a buffer (not a parameter), so none of them is
-    exposed through `parameters()`; gate outputs are produced by `heaviside`.
-    """
-    def __init__(self, circuit_path: str, device: str = 'cpu'):
-        super().__init__()
-        self.device = device
-        loaded = load_file(circuit_path)
-        # Maps the dotted name used in the file to the buffer attribute name.
-        self.circuits = {}
-        for dotted_name, tensor in loaded.items():
-            # Dots are replaced with '__' to obtain a usable buffer name.
-            attr_name = dotted_name.replace('.', '__')
-            self.register_buffer(attr_name, tensor.float().to(device))
-            self.circuits[dotted_name] = attr_name
-    def _get(self, name: str) -> torch.Tensor:
-        """Look up a circuit tensor by its original dotted name (KeyError if unknown)."""
-        attr_name = self.circuits[name]
-        return getattr(self, attr_name)
-    @staticmethod
-    def _fire(inp: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
-        """Single threshold neuron: heaviside(inp @ weight + bias)."""
-        return heaviside(inp @ weight + bias)
-    # -------------------------------------------------------------------------
-    # Boolean Gates
-    # -------------------------------------------------------------------------
-    def eval_and(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """AND gate: stacks a and b on a new last axis and applies the 'boolean.and' neuron."""
-        pair = torch.stack([a, b], dim=-1)
-        weight = self._get('boolean.and.weight')
-        offset = self._get('boolean.and.bias')
-        return self._fire(pair, weight, offset)
-    def eval_or(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """OR gate: stacks a and b on a new last axis and applies the 'boolean.or' neuron."""
-        pair = torch.stack([a, b], dim=-1)
-        weight = self._get('boolean.or.weight')
-        offset = self._get('boolean.or.bias')
-        return self._fire(pair, weight, offset)
-    def eval_xor(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
-        """XOR gate: two hidden threshold neurons followed by one output neuron."""
-        pair = torch.stack([a, b], dim=-1)
-        # Layer 1: OR and NAND neurons
-        n1_weight = self._get('boolean.xor.layer1.neuron1.weight')
-        n1_bias = self._get('boolean.xor.layer1.neuron1.bias')
-        n2_weight = self._get('boolean.xor.layer1.neuron2.weight')
-        n2_bias = self._get('boolean.xor.layer1.neuron2.bias')
-        first = self._fire(pair, n1_weight, n1_bias)
-        second = self._fire(pair, n2_weight, n2_bias)
-        # Layer 2: AND of hidden
-        out_weight = self._get('boolean.xor.layer2.weight')
-        out_bias = self._get('boolean.xor.layer2.bias')
-        return self._fire(torch.stack([first, second], dim=-1), out_weight, out_bias)
-    # -------------------------------------------------------------------------
-    # Arithmetic: Full Adder
-    # -------------------------------------------------------------------------
-    def eval_full_adder(self, a: torch.Tensor, b: torch.Tensor,
-                        cin: torch.Tensor, prefix: str) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Full adder assembled from two half adders and a carry OR.
-        sum = a XOR b XOR cin, cout = (a AND b) OR (cin AND (a XOR b))
-        `prefix` is the dotted name under which this adder's tensors are stored.
-        Returns (sum_bit, carry_out)
-        """
-        fetch = self._get
-        ab_pair = torch.stack([a, b], dim=-1)
-        # Half adder 1, sum: a XOR b
-        or_w = fetch(f'{prefix}.ha1.sum.layer1.or.weight')
-        or_b = fetch(f'{prefix}.ha1.sum.layer1.or.bias')
-        nand_w = fetch(f'{prefix}.ha1.sum.layer1.nand.weight')
-        nand_b = fetch(f'{prefix}.ha1.sum.layer1.nand.bias')
-        out_w = fetch(f'{prefix}.ha1.sum.layer2.weight')
-        out_b = fetch(f'{prefix}.ha1.sum.layer2.bias')
-        or_out = self._fire(ab_pair, or_w, or_b)
-        nand_out = self._fire(ab_pair, nand_w, nand_b)
-        first_sum = self._fire(torch.stack([or_out, nand_out], dim=-1), out_w, out_b)
-        # Half adder 1, carry
-        carry1_w = fetch(f'{prefix}.ha1.carry.weight')
-        carry1_b = fetch(f'{prefix}.ha1.carry.bias')
-        first_carry = self._fire(ab_pair, carry1_w, carry1_b)
-        # Half adder 2, sum: first_sum XOR cin
-        sum_cin_pair = torch.stack([first_sum, cin], dim=-1)
-        or_w = fetch(f'{prefix}.ha2.sum.layer1.or.weight')
-        or_b = fetch(f'{prefix}.ha2.sum.layer1.or.bias')
-        nand_w = fetch(f'{prefix}.ha2.sum.layer1.nand.weight')
-        nand_b = fetch(f'{prefix}.ha2.sum.layer1.nand.bias')
-        out_w = fetch(f'{prefix}.ha2.sum.layer2.weight')
-        out_b = fetch(f'{prefix}.ha2.sum.layer2.bias')
-        or_out = self._fire(sum_cin_pair, or_w, or_b)
-        nand_out = self._fire(sum_cin_pair, nand_w, nand_b)
-        second_sum = self._fire(torch.stack([or_out, nand_out], dim=-1), out_w, out_b)
-        # Half adder 2, carry
-        carry2_w = fetch(f'{prefix}.ha2.carry.weight')
-        carry2_b = fetch(f'{prefix}.ha2.carry.bias')
-        second_carry = self._fire(sum_cin_pair, carry2_w, carry2_b)
-        # Carry out = first_carry OR second_carry
-        carry_pair = torch.stack([first_carry, second_carry], dim=-1)
-        merge_w = fetch(f'{prefix}.carry_or.weight')
-        merge_b = fetch(f'{prefix}.carry_or.bias')
-        cout = self._fire(carry_pair, merge_w, merge_b)
-        return second_sum, cout
-    # -------------------------------------------------------------------------
-    # Arithmetic: 8-bit Ripple Carry Adder
-    # -------------------------------------------------------------------------
-    def add_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        8-bit ripple carry addition using full adders fa0..fa7.
-        a_bits, b_bits: [..., 8] tensors of bits (LSB first)
-        Returns: (result_bits [..., 8], carry_out [...])
-        """
-        # The carry into bit 0 is zero; each stage feeds its carry to the next.
-        carry = torch.zeros(a_bits.shape[:-1], device=a_bits.device)
-        sum_bits = []
-        for i in range(8):
-            stage_sum, carry = self.eval_full_adder(
-                a_bits.select(-1, i),
-                b_bits.select(-1, i),
-                carry,
-                f'arithmetic.ripplecarry8bit.fa{i}'
-            )
-            sum_bits.append(stage_sum)
-        return torch.stack(sum_bits, dim=-1), carry
-    # -------------------------------------------------------------------------
-    # Arithmetic: 8-bit Comparators
-    # -------------------------------------------------------------------------
-    def greater_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a > b, else 0. Bits are MSB first."""
-        # Weighted sum of the bitwise difference; a strictly positive score means a > b.
-        # Uses a plain comparison rather than `heaviside`.
-        weights = self._get('arithmetic.greaterthan8bit.comparator')
-        score = ((a_bits - b_bits) * weights).sum(dim=-1)
-        return torch.gt(score, 0).float()
-    def less_than_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a < b, else 0. Bits are MSB first."""
-        # Same scheme as greater_than_8bit with the difference taken as b - a.
-        weights = self._get('arithmetic.lessthan8bit.comparator')
-        score = ((b_bits - a_bits) * weights).sum(dim=-1)
-        return torch.gt(score, 0).float()
-    def equal_8bit(self, a_bits: torch.Tensor, b_bits: torch.Tensor) -> torch.Tensor:
-        """Returns 1 if a == b, else 0 (neither greater-than nor less-than fires)."""
-        not_greater = 1 - self.greater_than_8bit(a_bits, b_bits)
-        not_less = 1 - self.less_than_8bit(a_bits, b_bits)
-        return not_greater * not_less
-# =============================================================================
-# BIT EXTRACTION / INJECTION INTERFACES
-# =============================================================================
-class BitExtractor(nn.Module):
-    """
-    Trainable read-out that turns a hidden vector into two 8-bit operands.
-    One linear layer produces 16 logits; each logit is thresholded at zero.
-    """
-    def __init__(self, d_model: int):
-        super().__init__()
-        self.d_model = d_model
-        # One logit per operand bit (2 operands x 8 bits).
-        self.proj = nn.Linear(d_model, 16)
-        # Scalar parameter initialised to 1.0; forward() below does not read it.
-        self.temperature = nn.Parameter(torch.tensor(1.0))
-    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        x: [..., d_model]
-        Returns: a_bits [..., 8], b_bits [..., 8] (LSB first for arithmetic)
-        """
-        # Hard 0/1 bits; `heaviside` keeps a straight-through gradient to proj.
-        hard_bits = heaviside(self.proj(x))  # [..., 16]
-        # First half is operand a, second half is operand b.
-        first_operand, second_operand = hard_bits[..., :8], hard_bits[..., 8:]
-        return first_operand, second_operand
-class BitInjector(nn.Module):
-    """
-    Trainable write-back from circuit outputs to the embedding space.
-    Concatenates 8 result bits with 8 flag bits and maps the 16 values to a
-    d_model-sized delta, multiplied by a learnable scalar.
-    """
-    def __init__(self, d_model: int):
-        super().__init__()
-        self.d_model = d_model
-        # 16 inputs = 8 result bits + 8 flag slots.
-        self.proj = nn.Linear(16, d_model)
-        # Output gain, initialised to 0.1.
-        self.scale = nn.Parameter(torch.tensor(0.1))
-    def forward(self, result_bits: torch.Tensor, flags: torch.Tensor) -> torch.Tensor:
-        """
-        result_bits: [..., 8]
-        flags: [..., 8] (carry, overflow, zero, negative, etc.)
-        Returns: [..., d_model]
-        """
-        joined = torch.cat([result_bits, flags], dim=-1)  # [..., 16]
-        delta = self.proj(joined)
-        return delta * self.scale
-# =============================================================================
-# CIRCUIT-AUGMENTED MLP BLOCK
-# =============================================================================
-class CircuitAugmentedMLP(nn.Module):
-    """
-    Gated MLP (gate/up/down projections with SiLU) plus a threshold-circuit side path.
-    forward() returns the MLP output with the circuit path's embedding delta added,
-    scaled by the second component of a learned softmax router. Only 8-bit addition
-    is wired in: `op_selector` and the router's first component are created (so they
-    appear in the state dict) but do not influence the output yet.
-    """
-    def __init__(
-        self,
-        d_model: int,
-        intermediate_size: int,
-        circuit_path: str,
-        device: str = 'cpu'
-    ):
-        """
-        Args:
-            d_model: Width of the hidden states entering and leaving the block
-            intermediate_size: Width of the gated MLP's inner projection
-            circuit_path: safetensors file passed to CircuitExecutor
-            device: Device passed to CircuitExecutor for the circuit tensors
-        """
-        super().__init__()
-        self.d_model = d_model
-        # Original MLP components (will be loaded from pretrained)
-        self.gate_proj = nn.Linear(d_model, intermediate_size, bias=False)
-        self.up_proj = nn.Linear(d_model, intermediate_size, bias=False)
-        self.down_proj = nn.Linear(intermediate_size, d_model, bias=False)
-        self.act_fn = nn.SiLU()
-        # Circuit components
-        self.circuits = CircuitExecutor(circuit_path, device)
-        self.bit_extractor = BitExtractor(d_model)
-        self.bit_injector = BitInjector(d_model)
-        # Router: softmax over (MLP, circuit); forward() uses the circuit weight only
-        self.router = nn.Sequential(
-            nn.Linear(d_model, 64),
-            nn.ReLU(),
-            nn.Linear(64, 2),
-            nn.Softmax(dim=-1)
-        )
-        # Operation selector (which arithmetic op to perform); not consulted by
-        # _circuit_forward() while addition is the only implemented operation
-        self.op_selector = nn.Sequential(
-            nn.Linear(d_model, 32),
-            nn.ReLU(),
-            nn.Linear(32, 4),  # add, sub, compare, passthrough
-            nn.Softmax(dim=-1)
-        )
-    def _compute_flags(self, result_bits: torch.Tensor, carry: torch.Tensor) -> torch.Tensor:
-        """
-        Compute status flags from result.
-        result_bits: [..., 8], carry: [...]
-        Returns: [..., 8] with slot 0 = zero, slot 1 = negative, slot 2 = carry,
-        and slots 3-7 left at 0.
-        """
-        batch_shape = result_bits.shape[:-1]
-        # Zero flag: all bits are 0
-        zero = (result_bits.sum(dim=-1) == 0).float()
-        # Negative flag: bit 7 is 1 (two's complement sign bit for LSB-first bits)
-        negative = result_bits[..., 7]
-        # Pad to 8 flags so the result matches the width BitInjector expects
-        flags = torch.zeros(*batch_shape, 8, device=result_bits.device)
-        flags[..., 0] = zero
-        flags[..., 1] = negative
-        flags[..., 2] = carry
-        return flags
-    def _circuit_forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Extract two operands from x, add them with the frozen circuits, and project the result back to [..., d_model]."""
-        a_bits, b_bits = self.bit_extractor(x)
-        # Addition is the only operation implemented so far. The op_selector
-        # network is deliberately not evaluated here: its output was never
-        # used, so running it only cost compute on every forward pass.
-        add_result, add_carry = self.circuits.add_8bit(a_bits, b_bits)
-        add_flags = self._compute_flags(add_result, add_carry)
-        return self.bit_injector(add_result, add_flags)
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
-        x: [batch, seq_len, d_model]
-        Returns: [batch, seq_len, d_model]
-        """
-        # Original MLP path
-        mlp_out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        # Circuit path
-        circuit_out = self._circuit_forward(x)
-        # The MLP output is kept at full strength; only the circuit delta is
-        # scaled, by the router's second softmax component.
-        circuit_weight = self.router(x)[..., 1:2]
-        return mlp_out + circuit_weight * circuit_out
-# =============================================================================
-# MODEL SURGERY: Insert circuits into SmolLM2
-# =============================================================================
-def augment_smollm2_with_circuits(
-    model: AutoModelForCausalLM,
-    circuit_path: str,
-    layer_indices: Optional[list] = None,
-    device: str = 'cpu'
-) -> AutoModelForCausalLM:
-    """
-    Surgically insert circuit blocks into SmolLM2's MLP layers.
-    The model is modified in place: each selected layer's `mlp` is replaced by a
-    CircuitAugmentedMLP that carries copies of the pretrained gate/up/down weights.
-    Args:
-        model: Pretrained SmolLM2 model
-        circuit_path: Path to neural_computer.safetensors
-        layer_indices: Which layers to augment (default: middle third of layers)
-        device: Device on which the circuit tensors are first loaded; each new
-            block is then moved to the device of the layer it replaces
-    Returns:
-        Modified model with circuit-augmented MLPs (the same object as `model`)
-    """
-    config = model.config
-    # Default: augment middle third of layers
-    if layer_indices is None:
-        num_layers = config.num_hidden_layers
-        layer_indices = list(range(num_layers // 3, 2 * num_layers // 3))
-    print(f"Augmenting layers {layer_indices} with threshold circuits...")
-    for idx in layer_indices:
-        layer = model.model.layers[idx]
-        old_mlp = layer.mlp
-        # Create augmented MLP
-        new_mlp = CircuitAugmentedMLP(
-            d_model=config.hidden_size,
-            intermediate_size=config.intermediate_size,
-            circuit_path=circuit_path,
-            device=device
-        )
-        # Copy pretrained weights (these keep the pretrained device and dtype)
-        new_mlp.gate_proj.weight.data = old_mlp.gate_proj.weight.data.clone()
-        new_mlp.up_proj.weight.data = old_mlp.up_proj.weight.data.clone()
-        new_mlp.down_proj.weight.data = old_mlp.down_proj.weight.data.clone()
-        # The extractor, injector, router and selector were just created on the
-        # default device. Co-locate the whole block with the pretrained weights;
-        # otherwise forward() fails with a device mismatch whenever the model
-        # does not live on that default device.
-        new_mlp.to(old_mlp.gate_proj.weight.device)
-        # Replace
-        layer.mlp = new_mlp
-    # Safeguard: freeze any parameter that lives under a `circuits` submodule.
-    # CircuitExecutor registers its tensors as buffers, so this normally matches
-    # nothing and the circuits are already excluded from training.
-    for name, param in model.named_parameters():
-        if 'circuits' in name:
-            param.requires_grad = False
-    print("Done. Circuit weights frozen, interfaces trainable.")
-    return model
-# =============================================================================
-# TRAINING UTILITIES
-# =============================================================================
-def generate_arithmetic_batch(batch_size: int, max_val: int = 255) -> Tuple[list, list]:
-    """
-    Build `batch_size` random addition prompts with their expected completions.
-    Operands are drawn uniformly from 0..max_val with torch's global RNG; the
-    target is the sum modulo 256. Returns (prompts, targets) as parallel lists.
-    """
-    def draw_operand() -> int:
-        # The upper bound of randint is exclusive, hence the +1.
-        return torch.randint(0, max_val + 1, (1,)).item()
-    prompts, targets = [], []
-    for _ in range(batch_size):
-        a, b = draw_operand(), draw_operand()
-        result = (a + b) % 256
-        prompts.append(f"{a} + {b} =")
-        targets.append(f" {result}")
-    return prompts, targets
-def evaluate_arithmetic(
-    model: AutoModelForCausalLM,
-    tokenizer: AutoTokenizer,
-    n_problems: int = 100,
-    device: str = 'cpu'
-) -> dict:
-    """
-    Evaluate model on random arithmetic problems of the form "a + b =".
-    Operands are drawn from 0..255 and the expected answer is (a + b) % 256.
-    The model is switched to eval mode and decoded greedily; the answer is the
-    first run of digits in the first whitespace-separated token the model
-    generates after the prompt.
-    Args:
-        model: Causal LM exposing `eval()` and `generate()`
-        tokenizer: Tokenizer matching the model
-        n_problems: Number of random problems to score
-        device: Device the tokenized inputs are moved to (the model must already be there)
-    Returns:
-        Dict with 'accuracy' (0.0 when n_problems is 0), 'correct', 'total' and
-        'errors' (up to 10 tuples of (a, b, expected, predicted or "parse_error"))
-    """
-    import re
-    correct = 0
-    total = 0
-    errors = []
-    model.eval()
-    for _ in range(n_problems):
-        a = torch.randint(0, 256, (1,)).item()
-        b = torch.randint(0, 256, (1,)).item()
-        expected = (a + b) % 256
-        prompt = f"{a} + {b} ="
-        inputs = tokenizer(prompt, return_tensors='pt').to(device)
-        prompt_len = inputs['input_ids'].shape[-1]
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=10,
-                do_sample=False,
-                pad_token_id=tokenizer.eos_token_id
-            )
-        # Decode only the newly generated tokens. Splitting the full text on the
-        # last '=' would pick up a later equation if the model keeps writing.
-        continuation = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
-        pieces = continuation.split()
-        # First digit run of the first token, so "5." or "5.0" reads as 5.
-        found = re.search(r'\d+', pieces[0]) if pieces else None
-        if found is None:
-            errors.append((a, b, expected, "parse_error"))
-        else:
-            predicted = int(found.group())
-            if predicted == expected:
-                correct += 1
-            else:
-                errors.append((a, b, expected, predicted))
-        total += 1
-    return {
-        # Guard the empty run instead of raising ZeroDivisionError.
-        'accuracy': correct / total if total else 0.0,
-        'correct': correct,
-        'total': total,
-        'errors': errors[:10]  # First 10 errors
-    }
-# =============================================================================
-# MAIN: Demo
-# =============================================================================
-if __name__ == "__main__":
-    # Demo script: baseline arithmetic accuracy of SmolLM2-360M, circuit
-    # augmentation, then a direct check of the 8-bit adder circuit.
-    import argparse
-    parser = argparse.ArgumentParser(description='Circuit-Augmented LLM Demo')
-    parser.add_argument('--circuit-path', type=str,
-                        default='./neural_computer.safetensors',
-                        help='Path to circuit weights')
-    parser.add_argument('--device', type=str, default='cpu',
-                        help='Device (cpu or cuda)')
-    parser.add_argument('--eval-only', action='store_true',
-                        help='Only evaluate, do not augment')
-    args = parser.parse_args()
-    print("=" * 70)
-    print(" CIRCUIT-AUGMENTED LLM")
-    print("=" * 70)
-    # Load tokenizer and model
-    print("\n[1] Loading SmolLM2-360M...")
-    model_id = "HuggingFaceTB/SmolLM2-360M"
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
-    # evaluate_arithmetic() moves its inputs to args.device, so the model has to
-    # live there too; otherwise --device cuda fails with a device mismatch.
-    model.to(args.device)
-    print(f"    Parameters: {sum(p.numel() for p in model.parameters()):,}")
-    # Baseline evaluation
-    print("\n[2] Baseline arithmetic evaluation...")
-    baseline = evaluate_arithmetic(model, tokenizer, n_problems=50, device=args.device)
-    print(f"    Accuracy: {baseline['accuracy']*100:.1f}% ({baseline['correct']}/{baseline['total']})")
-    if baseline['errors']:
-        print("    Sample errors:")
-        for a, b, exp, got in baseline['errors'][:5]:
-            print(f"      {a} + {b} = {exp}, model said {got}")
-    if args.eval_only:
-        print("\nDone (eval only mode).")
-        # SystemExit works without the interactive `exit` helper installed by `site`.
-        raise SystemExit(0)
-    # Augment with circuits
-    print("\n[3] Augmenting with threshold circuits...")
-    print(f"    Circuit path: {args.circuit_path}")
-    model = augment_smollm2_with_circuits(
-        model,
-        args.circuit_path,
-        device=args.device
-    )
-    # Newly created interface layers may still be on the default device.
-    model.to(args.device)
-    new_params = sum(p.numel() for p in model.parameters())
-    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print(f"    Total parameters: {new_params:,}")
-    print(f"    Trainable parameters: {trainable:,}")
-    # Test circuit execution directly
-    print("\n[4] Testing circuit execution...")
-    circuit_exec = CircuitExecutor(args.circuit_path, args.device)
-    test_cases = [(127, 128), (255, 1), (0, 0), (100, 55)]
-    for a, b in test_cases:
-        # Convert to bits (LSB first), on the same device as the circuit tensors
-        a_bits = torch.tensor([(a >> i) & 1 for i in range(8)],
-                              dtype=torch.float32, device=args.device)
-        b_bits = torch.tensor([(b >> i) & 1 for i in range(8)],
-                              dtype=torch.float32, device=args.device)
-        result_bits, carry = circuit_exec.add_8bit(
-            a_bits.unsqueeze(0),
-            b_bits.unsqueeze(0)
-        )
-        # Convert result bits back to int
-        result = sum(int(result_bits[0, i].item()) * (2**i) for i in range(8))
-        expected = (a + b) % 256
-        status = "OK" if result == expected else "FAIL"
-        print(f"    {a} + {b} = {result} (expected {expected}) [{status}]")
-    print("\n[5] Model ready for fine-tuning.")
-    print("    Next: Train interface layers on arithmetic examples.")
-    print("=" * 70)