CharlesCNorton committed on
Commit
af38f62
·
1 Parent(s): e69d4eb

Rewrite pruning framework with full GPU vectorization

Browse files

- Fully vectorized forward pass (no Python loops over cases)
- True batched population evaluation
- VRAM management with overflow protection
- Added pruning methods: neuron, lottery ticket, topology search
- Improved evolutionary: elite preservation, crossover, adaptive mutation
- Circuit-specific optimized forward functions (Hamming encoder/decoder)

Files changed (1) hide show
  1. prune.py +1062 -747
prune.py CHANGED
@@ -1,87 +1,164 @@
1
  """
2
- Unified Threshold Circuit Pruning Framework
3
- ============================================
4
 
5
- All pruning methods for threshold logic circuits in a single file.
6
 
7
  Methods:
8
- 1. Magnitude Reduction (sequential & batched GPU)
9
- 2. Zero Pruning (sparsification)
10
- 3. Weight Quantization (force to {-1,0,1})
11
- 4. Evolutionary Search (mutation + selection)
12
- 5. Simulated Annealing (gradual cooling)
13
- 6. Pareto Frontier (correctness vs size tradeoff)
 
 
 
14
 
15
  Usage:
16
- python prune.py threshold-hamming74decoder
17
- python prune.py threshold-hamming74decoder --methods magnitude,zero,evo
18
- python prune.py --list
19
- python prune.py --all --max-inputs 8
20
-
21
- Author: Pruning framework for phanerozoic/threshold-logic-circuits
22
  """
23
 
24
  import torch
25
- import torch.jit
26
  import json
27
  import time
28
  import random
29
  import argparse
30
- import importlib.util
31
- import sys
32
  from pathlib import Path
33
  from dataclasses import dataclass, field
34
- from typing import Dict, List, Tuple, Optional, Callable, Set
35
- from enum import Enum, auto
36
- from datetime import datetime
37
  from safetensors.torch import load_file, save_file
 
 
38
 
39
-
40
- # =============================================================================
41
- # CONFIGURATION
42
- # =============================================================================
43
 
44
  CIRCUITS_PATH = Path('D:/threshold-circuits')
45
  RESULTS_PATH = CIRCUITS_PATH / 'pruned_results'
46
 
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  @dataclass
49
  class Config:
50
- """Global configuration for pruning."""
51
  device: str = 'cuda'
52
  fitness_threshold: float = 0.9999
53
- batch_size: int = 80000
54
  verbose: bool = True
 
55
 
56
- # Method toggles
57
  run_magnitude: bool = True
58
- run_batched_magnitude: bool = True
59
  run_zero: bool = True
60
  run_quantize: bool = True
61
  run_evolutionary: bool = True
62
  run_annealing: bool = True
 
 
 
63
  run_pareto: bool = True
64
 
65
- # Method-specific
66
  magnitude_passes: int = 100
67
- evo_generations: int = 1000
68
- evo_pop_size: int = 200
69
- evo_mutation_rate: float = 0.1
 
 
 
70
  evo_parsimony: float = 0.001
71
- annealing_iterations: int = 10000
 
 
72
  annealing_initial_temp: float = 10.0
73
- annealing_cooling: float = 0.995
 
74
  quantize_targets: List[float] = field(default_factory=lambda: [-1.0, 0.0, 1.0])
75
- pareto_levels: List[float] = field(default_factory=lambda: [1.0, 0.99, 0.95, 0.90])
 
 
 
76
 
 
 
 
77
 
78
- # =============================================================================
79
- # CIRCUIT LOADING
80
- # =============================================================================
81
 
82
  @dataclass
83
  class CircuitSpec:
84
- """Metadata for a threshold circuit."""
85
  name: str
86
  path: Path
87
  inputs: int
@@ -92,14 +169,36 @@ class CircuitSpec:
92
  description: str = ""
93
 
94
 
95
- class Circuit:
96
- """Threshold logic circuit loaded from safetensors."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
  def __init__(self, path: Path, device: str = 'cuda'):
99
  self.path = Path(path)
100
  self.device = device
101
  self.spec = self._load_spec()
102
  self.weights = self._load_weights()
 
 
 
 
 
103
 
104
  def _load_spec(self) -> CircuitSpec:
105
  with open(self.path / 'config.json') as f:
@@ -119,15 +218,315 @@ class Circuit:
119
  w = load_file(str(self.path / 'model.safetensors'))
120
  return {k: v.float().to(self.device) for k, v in w.items()}
121
 
122
- def clone(self) -> Dict[str, torch.Tensor]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  return {k: v.clone() for k, v in self.weights.items()}
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def stats(self, weights: Dict[str, torch.Tensor] = None) -> Dict:
126
  w = weights or self.weights
127
  total = sum(t.numel() for t in w.values())
128
  nonzero = sum((t != 0).sum().item() for t in w.values())
129
  mag = sum(t.abs().sum().item() for t in w.values())
130
- maxw = max(t.abs().max().item() for t in w.values())
131
  unique = set()
132
  for t in w.values():
133
  unique.update(t.flatten().tolist())
@@ -138,393 +537,238 @@ class Circuit:
138
  'sparsity': 1 - nonzero/total if total else 0,
139
  'magnitude': mag,
140
  'max_weight': maxw,
141
- 'unique_count': len(unique),
142
- 'unique_values': sorted(unique)
143
  }
144
 
145
- def save(self, weights: Dict[str, torch.Tensor], suffix: str = 'pruned'):
146
  path = self.path / f'model_{suffix}.safetensors'
147
  cpu_w = {k: v.cpu() for k, v in weights.items()}
148
  save_file(cpu_w, str(path))
149
  return path
150
 
151
 
152
- def discover_circuits(base: Path = CIRCUITS_PATH) -> List[CircuitSpec]:
153
- """Find all circuits in the collection."""
154
- circuits = []
155
- for d in base.iterdir():
156
- if d.is_dir() and (d / 'config.json').exists() and (d / 'model.safetensors').exists():
157
- try:
158
- c = Circuit(d, device='cpu')
159
- circuits.append(c.spec)
160
- except Exception as e:
161
- print(f"Skip {d.name}: {e}")
162
- return sorted(circuits, key=lambda x: (x.inputs, x.neurons))
163
-
164
-
165
- def load_circuit(name: str, device: str = 'cuda') -> Circuit:
166
- """Load circuit by name."""
167
- path = CIRCUITS_PATH / name
168
- if not path.exists():
169
- path = CIRCUITS_PATH / f'threshold-{name}'
170
- if not path.exists():
171
- raise ValueError(f"Circuit not found: {name}")
172
- return Circuit(path, device)
173
-
174
 
175
- # =============================================================================
176
- # GPU UTILITIES
177
- # =============================================================================
178
 
179
- def gpu_memory() -> Dict:
180
- if torch.cuda.is_available():
181
- return {
182
- 'allocated': torch.cuda.memory_allocated() / 1e9,
183
- 'reserved': torch.cuda.memory_reserved() / 1e9,
184
- 'total': torch.cuda.get_device_properties(0).total_memory / 1e9
185
- }
186
- return {'allocated': 0, 'reserved': 0, 'total': 0}
 
 
 
 
 
 
 
187
 
 
 
 
188
 
189
- def create_population(weights: Dict[str, torch.Tensor],
190
- pop_size: int, device: str) -> Dict[str, torch.Tensor]:
191
- """Replicate weights for batched evaluation."""
192
- return {
193
- k: v.unsqueeze(0).expand(pop_size, *v.shape).clone().to(device)
194
- for k, v in weights.items()
195
- }
196
 
 
 
 
197
 
198
- # =============================================================================
199
- # GENERIC EVALUATOR
200
- # =============================================================================
 
201
 
202
- class Evaluator:
203
- """
204
- Generic evaluator for any threshold circuit.
205
- Builds truth table and tests exhaustively.
206
- """
207
 
208
- def __init__(self, circuit: Circuit, forward_fn: Callable):
209
- self.circuit = circuit
210
- self.forward_fn = forward_fn
211
- self.device = circuit.device
212
- self.n_inputs = circuit.spec.inputs
213
- self.n_cases = 2 ** self.n_inputs
214
-
215
- self._build_inputs()
216
- self._build_expected()
217
-
218
- def _build_inputs(self):
219
- """Generate all 2^n input combinations."""
220
- if self.n_inputs > 20:
221
- raise ValueError(f"Input space too large: 2^{self.n_inputs}")
222
-
223
- idx = torch.arange(self.n_cases, device=self.device, dtype=torch.long)
224
- bits = torch.arange(self.n_inputs, device=self.device, dtype=torch.long)
225
- self.inputs = ((idx.unsqueeze(1) >> bits) & 1).float()
226
-
227
- def _build_expected(self):
228
- """Compute expected outputs using original weights."""
229
- self.expected = self.forward_fn(self.inputs, self.circuit.weights)
230
-
231
- def evaluate(self, weights: Dict[str, torch.Tensor]) -> float:
232
- """Single evaluation: returns fitness 0.0-1.0"""
233
- outputs = self.forward_fn(self.inputs, weights)
234
- correct = (outputs == self.expected).all(dim=-1).float().sum()
235
- return (correct / self.n_cases).item()
236
-
237
- def evaluate_batch(self, population: Dict[str, torch.Tensor]) -> torch.Tensor:
238
- """Batch evaluation: returns [pop_size] fitness tensor"""
239
- pop_size = next(iter(population.values())).shape[0]
240
  fitness = torch.zeros(pop_size, device=self.device)
241
 
242
- for i in range(pop_size):
243
- w = {k: v[i] for k, v in population.items()}
244
- outputs = self.forward_fn(self.inputs, w)
245
- correct = (outputs == self.expected).all(dim=-1).float().sum()
246
- fitness[i] = correct / self.n_cases
 
247
 
248
  return fitness
249
 
 
 
 
 
 
250
 
251
- # =============================================================================
252
- # PRUNING METHODS
253
- # =============================================================================
 
254
 
255
- @dataclass
256
- class PruneResult:
257
- """Result from a pruning method."""
258
- method: str
259
- original_stats: Dict
260
- final_stats: Dict
261
- final_weights: Dict[str, torch.Tensor]
262
- fitness: float
263
- reductions: int
264
- time_seconds: float
265
- history: List[Dict] = field(default_factory=list)
266
 
 
267
 
268
- def get_candidates(weights: Dict[str, torch.Tensor]) -> List[Tuple[str, int, tuple, float]]:
269
- """Get all non-zero weight positions."""
270
- candidates = []
271
- for name, tensor in weights.items():
272
- flat = tensor.flatten()
273
- for i in range(len(flat)):
274
- val = flat[i].item()
275
- if val != 0:
276
- candidates.append((name, i, tensor.shape, val))
277
- return candidates
278
 
 
 
 
279
 
280
- def apply_reduction(weights: Dict[str, torch.Tensor],
281
- name: str, idx: int, shape: tuple, old_val: float):
282
- """Apply magnitude reduction: move weight 1 step toward zero."""
283
- new_val = old_val - 1 if old_val > 0 else old_val + 1
284
- flat = weights[name].flatten()
285
- flat[idx] = new_val
286
- weights[name] = flat.view(shape)
287
 
 
288
 
289
- def revert_reduction(weights: Dict[str, torch.Tensor],
290
- name: str, idx: int, shape: tuple, old_val: float):
291
- """Revert a reduction."""
292
- flat = weights[name].flatten()
293
- flat[idx] = old_val
294
- weights[name] = flat.view(shape)
295
 
 
 
 
 
 
 
296
 
297
- # -----------------------------------------------------------------------------
298
- # Method 1: Sequential Magnitude Reduction
299
- # -----------------------------------------------------------------------------
300
 
301
- def prune_magnitude(weights: Dict[str, torch.Tensor],
302
- eval_fn: Callable[[Dict], float],
303
- cfg: Config) -> PruneResult:
304
- """Reduce weight magnitudes one at a time."""
305
  start = time.perf_counter()
306
- weights = {k: v.clone() for k, v in weights.items()}
307
- original = _stats(weights)
308
- reductions = 0
309
  history = []
 
310
 
311
  if cfg.verbose:
312
- print(f" Starting magnitude reduction...")
313
  print(f" Original: mag={original['magnitude']:.0f}, nonzero={original['nonzero']}")
314
 
315
  for pass_num in range(cfg.magnitude_passes):
316
- candidates = get_candidates(weights)
 
 
 
 
 
 
 
317
  if not candidates:
318
- if cfg.verbose:
319
- print(f" No candidates remaining at pass {pass_num}")
320
  break
321
 
322
- if cfg.verbose:
323
- print(f" Pass {pass_num}: testing {len(candidates)} candidates...")
324
-
325
  pass_reductions = 0
326
- tested = 0
327
  for name, idx, shape, old_val in candidates:
328
- apply_reduction(weights, name, idx, shape, old_val)
329
- tested += 1
 
 
 
 
 
330
 
331
- fitness = eval_fn(weights)
332
  if fitness >= cfg.fitness_threshold:
333
  pass_reductions += 1
334
- reductions += 1
335
- if cfg.verbose:
336
- new_val = old_val - 1 if old_val > 0 else old_val + 1
337
- print(f" ✓ {name}[{idx}]: {old_val} -> {new_val}")
338
  else:
339
- revert_reduction(weights, name, idx, shape, old_val)
 
 
340
 
341
- if cfg.verbose and tested % 50 == 0:
342
- s = _stats(weights)
343
- print(f" Progress: {tested}/{len(candidates)}, reductions={pass_reductions}, mag={s['magnitude']:.0f}")
344
 
345
- history.append({'pass': pass_num, 'reductions': pass_reductions})
346
-
347
- s = _stats(weights)
348
  if cfg.verbose:
349
- print(f" Pass {pass_num} complete: +{pass_reductions} reductions, mag={s['magnitude']:.0f}, nonzero={s['nonzero']}")
350
 
351
  if pass_reductions == 0:
352
- if cfg.verbose:
353
- print(f" No progress at pass {pass_num}, stopping.")
354
  break
355
 
356
  return PruneResult(
357
  method='magnitude',
358
  original_stats=original,
359
- final_stats=_stats(weights),
360
  final_weights=weights,
361
- fitness=eval_fn(weights),
362
- reductions=reductions,
363
  time_seconds=time.perf_counter() - start,
364
  history=history
365
  )
366
 
367
 
368
- # -----------------------------------------------------------------------------
369
- # Method 2: Batched GPU Magnitude Reduction
370
- # -----------------------------------------------------------------------------
371
-
372
- def prune_magnitude_batched(weights: Dict[str, torch.Tensor],
373
- eval_fn: Callable[[Dict], float],
374
- batch_eval_fn: Callable[[Dict], torch.Tensor],
375
- cfg: Config) -> PruneResult:
376
- """GPU-batched magnitude reduction."""
377
- start = time.perf_counter()
378
- weights = {k: v.clone() for k, v in weights.items()}
379
- original = _stats(weights)
380
- device = cfg.device
381
- reductions = 0
382
- history = []
383
-
384
- for pass_num in range(cfg.magnitude_passes):
385
- candidates = get_candidates(weights)
386
- if not candidates:
387
- break
388
-
389
- # Phase 1: Batch test all candidates
390
- successful = []
391
- n = len(candidates)
392
-
393
- for batch_start in range(0, n, cfg.batch_size):
394
- batch = candidates[batch_start:batch_start + cfg.batch_size]
395
- batch_len = len(batch)
396
-
397
- pop = {name: tensor.unsqueeze(0).expand(batch_len, *tensor.shape).clone().to(device)
398
- for name, tensor in weights.items()}
399
-
400
- for pop_idx, (name, flat_idx, shape, old_val) in enumerate(batch):
401
- new_val = old_val - 1 if old_val > 0 else old_val + 1
402
- flat_view = pop[name][pop_idx].flatten()
403
- flat_view[flat_idx] = new_val
404
-
405
- fitness = batch_eval_fn(pop)
406
-
407
- for pop_idx, cand in enumerate(batch):
408
- if fitness[pop_idx].item() >= cfg.fitness_threshold:
409
- successful.append(cand)
410
-
411
- # Phase 2: Apply with conflict resolution
412
- pass_reductions = 0
413
- for name, idx, shape, old_val in successful:
414
- current_val = weights[name].flatten()[idx].item()
415
- if current_val == old_val:
416
- apply_reduction(weights, name, idx, shape, old_val)
417
- if eval_fn(weights) >= cfg.fitness_threshold:
418
- pass_reductions += 1
419
- reductions += 1
420
- else:
421
- revert_reduction(weights, name, idx, shape, old_val)
422
-
423
- history.append({'pass': pass_num, 'reductions': pass_reductions, 'candidates': len(successful)})
424
-
425
- if cfg.verbose:
426
- s = _stats(weights)
427
- print(f" Pass {pass_num}: {pass_reductions}/{len(successful)} applied, mag={s['magnitude']:.0f}")
428
-
429
- if pass_reductions == 0:
430
- break
431
-
432
- return PruneResult(
433
- method='batched_magnitude',
434
- original_stats=original,
435
- final_stats=_stats(weights),
436
- final_weights=weights,
437
- fitness=eval_fn(weights),
438
- reductions=reductions,
439
- time_seconds=time.perf_counter() - start,
440
- history=history
441
- )
442
-
443
-
444
- # -----------------------------------------------------------------------------
445
- # Method 3: Zero Pruning
446
- # -----------------------------------------------------------------------------
447
-
448
- def prune_zero(weights: Dict[str, torch.Tensor],
449
- eval_fn: Callable[[Dict], float],
450
  cfg: Config) -> PruneResult:
451
- """Try setting weights directly to zero."""
452
  start = time.perf_counter()
453
- weights = {k: v.clone() for k, v in weights.items()}
454
- original = _stats(weights)
 
 
 
 
 
 
 
 
455
 
456
- candidates = get_candidates(weights)
457
  random.shuffle(candidates)
458
 
459
  if cfg.verbose:
460
- print(f" Starting zero pruning...")
461
- print(f" Original: mag={original['magnitude']:.0f}, nonzero={original['nonzero']}")
462
- print(f" Testing {len(candidates)} candidates (random order)...")
463
 
464
- reductions = 0
465
- tested = 0
466
  for name, idx, shape, old_val in candidates:
467
  flat = weights[name].flatten()
468
  flat[idx] = 0
469
  weights[name] = flat.view(shape)
470
- tested += 1
471
 
472
- if eval_fn(weights) >= cfg.fitness_threshold:
473
- reductions += 1
474
- if cfg.verbose:
475
- print(f" ✓ {name}[{idx}]: {old_val} -> 0 (zeroed)")
476
  else:
477
  flat = weights[name].flatten()
478
  flat[idx] = old_val
479
  weights[name] = flat.view(shape)
480
 
481
- if cfg.verbose and tested % 50 == 0:
482
- s = _stats(weights)
483
- print(f" Progress: {tested}/{len(candidates)}, zeroed={reductions}, mag={s['magnitude']:.0f}")
484
-
485
  if cfg.verbose:
486
- s = _stats(weights)
487
- print(f" Zero pruning complete: {reductions} weights zeroed")
488
- print(f" Final: mag={s['magnitude']:.0f}, nonzero={s['nonzero']}")
489
 
490
  return PruneResult(
491
  method='zero',
492
  original_stats=original,
493
- final_stats=_stats(weights),
494
  final_weights=weights,
495
- fitness=eval_fn(weights),
496
- reductions=reductions,
497
  time_seconds=time.perf_counter() - start
498
  )
499
 
500
 
501
- # -----------------------------------------------------------------------------
502
- # Method 4: Quantization
503
- # -----------------------------------------------------------------------------
504
-
505
- def prune_quantize(weights: Dict[str, torch.Tensor],
506
- eval_fn: Callable[[Dict], float],
507
  cfg: Config) -> PruneResult:
508
- """Force weights to target set (default: {-1,0,1})."""
509
  start = time.perf_counter()
510
- weights = {k: v.clone() for k, v in weights.items()}
511
- original = _stats(weights)
512
- target = torch.tensor(cfg.quantize_targets, device=weights[next(iter(weights))].device)
513
  target_set = set(cfg.quantize_targets)
514
 
515
  if cfg.verbose:
516
- print(f" Starting quantization...")
517
- print(f" Target values: {sorted(cfg.quantize_targets)}")
518
- print(f" Original unique values: {original.get('unique_count', len(set(v.item() for t in weights.values() for v in t.flatten())))}")
519
- print(f" Original: mag={original['magnitude']:.0f}, nonzero={original['nonzero']}")
520
-
521
- # Count how many need quantizing
522
- needs_quant = sum(1 for t in weights.values() for v in t.flatten() if v.item() not in target_set)
523
- if cfg.verbose:
524
- print(f" Weights needing quantization: {needs_quant}")
525
 
526
- reductions = 0
527
- tested = 0
528
  for name, tensor in list(weights.items()):
529
  flat = tensor.flatten()
530
  for i in range(len(flat)):
@@ -535,539 +779,574 @@ def prune_quantize(weights: Dict[str, torch.Tensor],
535
 
536
  flat[i] = closest
537
  weights[name] = flat.view(tensor.shape)
538
- tested += 1
539
 
540
- if eval_fn(weights) >= cfg.fitness_threshold:
541
- reductions += 1
542
- if cfg.verbose:
543
- print(f" ✓ {name}[{i}]: {old_val} -> {closest}")
544
  else:
545
  flat[i] = old_val
546
  weights[name] = flat.view(tensor.shape)
547
 
548
- if cfg.verbose and tested % 20 == 0:
549
- print(f" Progress: {tested}/{needs_quant}, quantized={reductions}")
550
-
551
  if cfg.verbose:
552
- s = _stats(weights)
553
- unique_now = len(set(v.item() for t in weights.values() for v in t.flatten()))
554
- print(f" Quantization complete: {reductions}/{tested} quantized")
555
- print(f" Final unique values: {unique_now}")
556
- print(f" Final: mag={s['magnitude']:.0f}, nonzero={s['nonzero']}")
557
 
558
  return PruneResult(
559
  method='quantize',
560
  original_stats=original,
561
- final_stats=_stats(weights),
562
  final_weights=weights,
563
- fitness=eval_fn(weights),
564
- reductions=reductions,
565
  time_seconds=time.perf_counter() - start
566
  )
567
 
568
 
569
- # -----------------------------------------------------------------------------
570
- # Method 5: Evolutionary Search
571
- # -----------------------------------------------------------------------------
572
-
573
- def prune_evolutionary(weights: Dict[str, torch.Tensor],
574
- batch_eval_fn: Callable[[Dict], torch.Tensor],
575
  cfg: Config) -> PruneResult:
576
- """Evolutionary search with parsimony pressure."""
 
 
 
 
 
 
 
577
  start = time.perf_counter()
578
- original = _stats(weights)
579
- device = cfg.device
580
- pop_size = cfg.evo_pop_size
 
581
 
582
  if cfg.verbose:
583
- print(f" Starting evolutionary search...")
584
- print(f" Population: {pop_size}, Generations: {cfg.evo_generations}")
585
- print(f" Mutation rate: {cfg.evo_mutation_rate}, Parsimony: {cfg.evo_parsimony}")
586
- print(f" Original: mag={original['magnitude']:.0f}, nonzero={original['nonzero']}")
587
 
588
- # Initialize population
589
- pop = {k: v.unsqueeze(0).expand(pop_size, *v.shape).clone().to(device)
590
- for k, v in weights.items()}
591
 
592
- best_weights = {k: v.clone() for k, v in weights.items()}
 
 
 
 
 
593
  best_score = -float('inf')
594
  best_fitness = 0.0
 
 
595
  history = []
596
- improved_at = 0
597
 
598
  for gen in range(cfg.evo_generations):
599
- # Evaluate
600
- fitness = batch_eval_fn(pop)
601
-
602
- # Compute magnitude penalty
603
- mags = torch.stack([
604
- sum(pop[name][i].abs().sum() for name in pop)
605
- for i in range(pop_size)
606
- ])
607
- adjusted = fitness - cfg.evo_parsimony * mags
608
-
609
- # Track best
610
- best_idx = adjusted.argmax().item()
611
- gen_best_fitness = fitness[best_idx].item()
612
- gen_best_adj = adjusted[best_idx].item()
613
- gen_best_mag = mags[best_idx].item()
614
-
615
- if gen_best_fitness >= cfg.fitness_threshold:
616
- if gen_best_adj > best_score:
617
- best_score = gen_best_adj
618
- best_fitness = gen_best_fitness
619
- best_weights = {k: v[best_idx].clone() for k, v in pop.items()}
620
- improved_at = gen
621
- if cfg.verbose:
622
- s = _stats(best_weights)
623
- print(f" Gen {gen}: NEW BEST! score={best_score:.4f}, fitness={best_fitness:.4f}, mag={s['magnitude']:.0f}")
624
-
625
- # Stats for this generation
626
  valid_mask = fitness >= cfg.fitness_threshold
627
  n_valid = valid_mask.sum().item()
628
- avg_fitness = fitness.mean().item()
629
- avg_mag = mags.mean().item()
630
 
631
- if gen % 50 == 0:
632
- s = _stats(best_weights)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633
  if cfg.verbose:
634
- print(f" Gen {gen}: valid={n_valid}/{pop_size}, avg_fit={avg_fitness:.4f}, avg_mag={avg_mag:.0f}, best_mag={s['magnitude']:.0f}")
635
- history.append({'gen': gen, 'score': best_score, 'mag': s['magnitude'], 'n_valid': n_valid})
636
-
637
- # Selection + mutation
638
- probs = torch.softmax(adjusted, dim=0)
639
- indices = torch.multinomial(probs, pop_size, replacement=True)
640
-
641
- new_pop = {}
642
- for name, tensor in pop.items():
643
- selected = tensor[indices].clone()
644
- mask = torch.rand_like(selected) < cfg.evo_mutation_rate
645
- mutations = torch.randint(-1, 2, selected.shape, device=device).float()
646
- selected = selected + mask.float() * mutations
647
- new_pop[name] = selected
648
- pop = new_pop
649
-
650
- # Final report
651
- final_stats = _stats(best_weights)
652
- elapsed = time.perf_counter() - start
653
 
654
- if cfg.verbose:
655
- print(f" Evolution complete in {elapsed:.1f}s")
656
- print(f" Best found at generation {improved_at}")
657
- print(f" Final: mag={final_stats['magnitude']:.0f}, nonzero={final_stats['nonzero']}")
658
- reduction_pct = 100 * (1 - final_stats['magnitude'] / original['magnitude'])
659
- print(f" Magnitude reduction: {reduction_pct:.1f}%")
660
 
661
  return PruneResult(
662
  method='evolutionary',
663
  original_stats=original,
664
  final_stats=final_stats,
665
  final_weights=best_weights,
666
- fitness=best_score + cfg.evo_parsimony * final_stats['magnitude'],
667
- reductions=int(original['magnitude'] - final_stats['magnitude']),
668
- time_seconds=elapsed,
669
- history=history
670
  )
671
 
672
 
673
- # -----------------------------------------------------------------------------
674
- # Method 6: Simulated Annealing
675
- # -----------------------------------------------------------------------------
676
-
677
- def prune_annealing(weights: Dict[str, torch.Tensor],
678
- eval_fn: Callable[[Dict], float],
679
  cfg: Config) -> PruneResult:
680
- """Simulated annealing for circuit minimization."""
681
  start = time.perf_counter()
682
- weights = {k: v.clone() for k, v in weights.items()}
683
- original = _stats(weights)
 
 
 
 
 
 
 
 
684
 
685
- current = weights
686
- current_energy = _energy(current, eval_fn, cfg)
687
  best = {k: v.clone() for k, v in current.items()}
688
  best_energy = current_energy
 
689
 
690
  temp = cfg.annealing_initial_temp
691
  history = []
692
 
 
 
 
693
  for i in range(cfg.annealing_iterations):
694
- # Perturb
695
  neighbor = {k: v.clone() for k, v in current.items()}
696
  name = random.choice(list(neighbor.keys()))
697
  flat = neighbor[name].flatten()
698
  idx = random.randint(0, len(flat) - 1)
699
- mutation = random.choice([-1, 1, 0])
 
700
  if mutation == 0:
701
  flat[idx] = 0
702
  else:
703
  flat[idx] = flat[idx] + mutation
704
  neighbor[name] = flat.view(neighbor[name].shape)
705
 
706
- neighbor_energy = _energy(neighbor, eval_fn, cfg)
 
 
 
 
 
 
 
707
  delta = neighbor_energy - current_energy
708
 
709
  if delta < 0 or random.random() < math.exp(-delta / max(temp, 1e-10)):
710
  current = neighbor
711
  current_energy = neighbor_energy
 
712
 
713
- if current_energy < best_energy:
714
- if eval_fn(current) >= cfg.fitness_threshold:
715
- best = {k: v.clone() for k, v in current.items()}
716
- best_energy = current_energy
717
 
718
  temp *= cfg.annealing_cooling
719
 
720
- if i % 1000 == 0:
721
- s = _stats(best)
 
722
  if cfg.verbose:
723
- print(f" Iter {i}: temp={temp:.4f}, mag={s['magnitude']:.0f}")
724
- history.append({'iter': i, 'temp': temp, 'mag': s['magnitude']})
725
 
726
  return PruneResult(
727
  method='annealing',
728
  original_stats=original,
729
- final_stats=_stats(best),
730
  final_weights=best,
731
- fitness=eval_fn(best),
732
- reductions=int(original['magnitude'] - _stats(best)['magnitude']),
733
  time_seconds=time.perf_counter() - start,
734
  history=history
735
  )
736
 
737
 
738
- def _energy(weights, eval_fn, cfg):
739
- fitness = eval_fn(weights)
740
- mag = sum(t.abs().sum().item() for t in weights.values())
741
- if fitness < cfg.fitness_threshold:
742
- return 1e6 + mag
743
- return mag
744
 
 
 
 
 
 
 
 
 
 
 
 
 
 
745
 
746
- # -----------------------------------------------------------------------------
747
- # Method 7: Pareto Frontier
748
- # -----------------------------------------------------------------------------
749
 
750
- def prune_pareto(weights: Dict[str, torch.Tensor],
751
- eval_fn: Callable[[Dict], float],
752
- cfg: Config) -> PruneResult:
753
- """Search Pareto frontier of correctness vs size."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  start = time.perf_counter()
755
- original = _stats(weights)
756
- frontier = []
757
 
758
- for target in cfg.pareto_levels:
759
- if cfg.verbose:
760
- print(f" Target fitness >= {target}")
761
 
762
- relaxed_cfg = Config(
763
- device=cfg.device,
764
- fitness_threshold=target,
765
- magnitude_passes=50,
766
- verbose=False
767
- )
768
 
769
- result = prune_magnitude({k: v.clone() for k, v in weights.items()}, eval_fn, relaxed_cfg)
 
 
 
 
 
 
 
 
 
 
 
 
 
770
 
771
- frontier.append({
772
- 'target': target,
773
- 'actual': result.fitness,
774
- 'magnitude': result.final_stats['magnitude'],
775
- 'nonzero': result.final_stats['nonzero']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
776
  })
777
 
778
  if cfg.verbose:
779
- print(f" -> fitness={result.fitness:.4f}, mag={result.final_stats['magnitude']:.0f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
780
 
781
  return PruneResult(
782
- method='pareto',
783
  original_stats=original,
784
- final_stats=frontier[-1] if frontier else original,
785
  final_weights=weights,
786
- fitness=frontier[0]['actual'] if frontier else 1.0,
787
- reductions=len(frontier),
788
  time_seconds=time.perf_counter() - start,
789
- history=frontier
790
  )
791
 
792
 
793
- # -----------------------------------------------------------------------------
794
- # Helpers
795
- # -----------------------------------------------------------------------------
796
-
797
- def _stats(weights: Dict[str, torch.Tensor]) -> Dict:
798
- total = sum(t.numel() for t in weights.values())
799
- nonzero = sum((t != 0).sum().item() for t in weights.values())
800
- mag = sum(t.abs().sum().item() for t in weights.values())
801
- maxw = max(t.abs().max().item() for t in weights.values()) if weights else 0
802
- return {'total': total, 'nonzero': nonzero, 'magnitude': mag, 'max': maxw}
803
-
804
 
805
- import math
 
 
 
 
806
 
 
807
 
808
- # =============================================================================
809
- # CIRCUIT-SPECIFIC FORWARD FUNCTIONS
810
- # =============================================================================
 
 
 
 
 
 
 
811
 
812
- def make_hamming_decoder_forward(device='cuda'):
813
- """Create forward function for Hamming(7,4) decoder."""
814
 
815
- def forward(inputs, weights):
816
- """
817
- Batched forward pass for Hamming decoder.
818
- inputs: [n_cases, 7]
819
- weights: dict of weight tensors
820
- Returns: [n_cases, 4]
821
- """
822
- n_cases = inputs.shape[0]
823
- w = weights
824
- outputs = []
825
-
826
- for case_idx in range(n_cases):
827
- c = [inputs[case_idx, i].item() for i in range(7)]
828
-
829
- def xor2(a, b, prefix):
830
- inp = torch.tensor([float(a), float(b)], device=device)
831
- or_out = float((inp * w[f'{prefix}.layer1.or.weight'].flatten()[:2]).sum() +
832
- w[f'{prefix}.layer1.or.bias'].squeeze() >= 0)
833
- nand_out = float((inp * w[f'{prefix}.layer1.nand.weight'].flatten()[:2]).sum() +
834
- w[f'{prefix}.layer1.nand.bias'].squeeze() >= 0)
835
- l1 = torch.tensor([or_out, nand_out], device=device)
836
- return int((l1 * w[f'{prefix}.layer2.weight'].flatten()).sum() +
837
- w[f'{prefix}.layer2.bias'].squeeze() >= 0)
838
-
839
- def xor4(indices, prefix):
840
- i0, i1, i2, i3 = indices
841
- inp = torch.tensor([float(c[i]) for i in range(7)], device=device)
842
-
843
- or_out = float((inp * w[f'{prefix}.xor_{i0}{i1}.layer1.or.weight'].flatten()).sum() +
844
- w[f'{prefix}.xor_{i0}{i1}.layer1.or.bias'].squeeze() >= 0)
845
- nand_out = float((inp * w[f'{prefix}.xor_{i0}{i1}.layer1.nand.weight'].flatten()).sum() +
846
- w[f'{prefix}.xor_{i0}{i1}.layer1.nand.bias'].squeeze() >= 0)
847
- xor_ab = int((torch.tensor([or_out, nand_out], device=device) *
848
- w[f'{prefix}.xor_{i0}{i1}.layer2.weight'].flatten()).sum() +
849
- w[f'{prefix}.xor_{i0}{i1}.layer2.bias'].squeeze() >= 0)
850
-
851
- or_out = float((inp * w[f'{prefix}.xor_{i2}{i3}.layer1.or.weight'].flatten()).sum() +
852
- w[f'{prefix}.xor_{i2}{i3}.layer1.or.bias'].squeeze() >= 0)
853
- nand_out = float((inp * w[f'{prefix}.xor_{i2}{i3}.layer1.nand.weight'].flatten()).sum() +
854
- w[f'{prefix}.xor_{i2}{i3}.layer1.nand.bias'].squeeze() >= 0)
855
- xor_cd = int((torch.tensor([or_out, nand_out], device=device) *
856
- w[f'{prefix}.xor_{i2}{i3}.layer2.weight'].flatten()).sum() +
857
- w[f'{prefix}.xor_{i2}{i3}.layer2.bias'].squeeze() >= 0)
858
-
859
- inp2 = torch.tensor([float(xor_ab), float(xor_cd)], device=device)
860
- or_out = float((inp2 * w[f'{prefix}.xor_final.layer1.or.weight'].flatten()).sum() +
861
- w[f'{prefix}.xor_final.layer1.or.bias'].squeeze() >= 0)
862
- nand_out = float((inp2 * w[f'{prefix}.xor_final.layer1.nand.weight'].flatten()).sum() +
863
- w[f'{prefix}.xor_final.layer1.nand.bias'].squeeze() >= 0)
864
- return int((torch.tensor([or_out, nand_out], device=device) *
865
- w[f'{prefix}.xor_final.layer2.weight'].flatten()).sum() +
866
- w[f'{prefix}.xor_final.layer2.bias'].squeeze() >= 0)
867
-
868
- s1 = xor4([0, 2, 4, 6], 's1')
869
- s2 = xor4([1, 2, 5, 6], 's2')
870
- s3 = xor4([3, 4, 5, 6], 's3')
871
-
872
- syndrome = torch.tensor([float(s1), float(s2), float(s3)], device=device)
873
-
874
- flip3 = int((syndrome * w['flip3.weight'].flatten()).sum() + w['flip3.bias'].squeeze() >= 0)
875
- flip5 = int((syndrome * w['flip5.weight'].flatten()).sum() + w['flip5.bias'].squeeze() >= 0)
876
- flip6 = int((syndrome * w['flip6.weight'].flatten()).sum() + w['flip6.bias'].squeeze() >= 0)
877
- flip7 = int((syndrome * w['flip7.weight'].flatten()).sum() + w['flip7.bias'].squeeze() >= 0)
878
-
879
- d1 = xor2(c[2], flip3, 'd1.xor')
880
- d2 = xor2(c[4], flip5, 'd2.xor')
881
- d3 = xor2(c[5], flip6, 'd3.xor')
882
- d4 = xor2(c[6], flip7, 'd4.xor')
883
-
884
- outputs.append([d1, d2, d3, d4])
885
-
886
- return torch.tensor(outputs, device=device, dtype=torch.float32)
887
-
888
- # Build test cases with error injection
889
- def hamming_encode(data):
890
- d1, d2, d3, d4 = (data >> 0) & 1, (data >> 1) & 1, (data >> 2) & 1, (data >> 3) & 1
891
- p1, p2, p3 = d1 ^ d2 ^ d4, d1 ^ d3 ^ d4, d2 ^ d3 ^ d4
892
- return (p1 << 0) | (p2 << 1) | (d1 << 2) | (p3 << 3) | (d2 << 4) | (d3 << 5) | (d4 << 6)
893
-
894
- inputs_list, expected_list = [], []
895
- for data in range(16):
896
- cw = hamming_encode(data)
897
- inputs_list.append([(cw >> i) & 1 for i in range(7)])
898
- expected_list.append([(data >> i) & 1 for i in range(4)])
899
- for data in range(16):
900
- cw = hamming_encode(data)
901
- for flip in range(7):
902
- corrupted = cw ^ (1 << flip)
903
- inputs_list.append([(corrupted >> i) & 1 for i in range(7)])
904
- expected_list.append([(data >> i) & 1 for i in range(4)])
905
 
906
- test_inputs = torch.tensor(inputs_list, device=device, dtype=torch.float32)
907
- test_expected = torch.tensor(expected_list, device=device, dtype=torch.float32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
908
 
909
- return forward, test_inputs, test_expected
 
 
 
 
 
 
 
 
910
 
911
 
912
- def make_generic_forward(circuit: Circuit):
913
- """Create generic forward by loading model.py dynamically."""
914
- model_py = circuit.path / 'model.py'
915
- if not model_py.exists():
916
- return None, None, None
 
917
 
918
- spec = importlib.util.spec_from_file_location("circuit_model", model_py)
919
- module = importlib.util.module_from_spec(spec)
920
- spec.loader.exec_module(module)
921
 
922
- # Find the main function
923
- fn_names = [circuit.spec.name.replace('threshold-', '').replace('-', '_'),
924
- 'forward', 'evaluate', 'run']
 
 
 
 
 
925
 
926
- main_fn = None
927
- for name in dir(module):
928
- if name.lower() in [n.lower() for n in fn_names] and callable(getattr(module, name)):
929
- main_fn = getattr(module, name)
930
- break
931
 
932
- if main_fn is None:
933
- return None, None, None
934
-
935
- # Build inputs
936
- n = circuit.spec.inputs
937
- n_cases = 2 ** n
938
- device = circuit.device
939
-
940
- idx = torch.arange(n_cases, device=device, dtype=torch.long)
941
- bits = torch.arange(n, device=device, dtype=torch.long)
942
- inputs = ((idx.unsqueeze(1) >> bits) & 1).float()
943
-
944
- # Compute expected
945
- outputs = []
946
- for i in range(n_cases):
947
- args = [int(inputs[i, j].item()) for j in range(n)]
948
- result = main_fn(*args, circuit.weights)
949
- if isinstance(result, (list, tuple)):
950
- outputs.append([float(x) for x in result])
951
- else:
952
- outputs.append([float(result)])
953
- expected = torch.tensor(outputs, device=device, dtype=torch.float32)
954
-
955
- def forward(inp, weights):
956
- out = []
957
- for i in range(inp.shape[0]):
958
- args = [int(inp[i, j].item()) for j in range(n)]
959
- result = main_fn(*args, weights)
960
- if isinstance(result, (list, tuple)):
961
- out.append([float(x) for x in result])
962
- else:
963
- out.append([float(result)])
964
- return torch.tensor(out, device=device, dtype=torch.float32)
965
 
966
- return forward, inputs, expected
 
967
 
 
 
 
 
 
 
 
 
 
968
 
969
- # =============================================================================
970
- # MAIN ORCHESTRATOR
971
- # =============================================================================
972
 
973
- def run_all_methods(circuit: Circuit, cfg: Config) -> Dict[str, PruneResult]:
974
- """Run all enabled pruning methods on a circuit."""
975
 
976
  print(f"\n{'='*70}")
977
  print(f" PRUNING: {circuit.spec.name}")
978
  print(f"{'='*70}")
979
 
 
 
 
 
980
  original = circuit.stats()
981
  print(f" Inputs: {circuit.spec.inputs}, Outputs: {circuit.spec.outputs}")
982
  print(f" Neurons: {circuit.spec.neurons}, Layers: {circuit.spec.layers}")
983
  print(f" Parameters: {original['total']}, Non-zero: {original['nonzero']}")
984
  print(f" Magnitude: {original['magnitude']:.0f}")
 
985
  print(f"{'='*70}")
986
 
987
- # Get forward function
988
- if 'hamming74decoder' in circuit.spec.name:
989
- forward_fn, test_inputs, test_expected = make_hamming_decoder_forward(cfg.device)
990
- else:
991
- forward_fn, test_inputs, test_expected = make_generic_forward(circuit)
992
-
993
- if forward_fn is None:
994
- print("ERROR: Could not create forward function")
995
- return {}
996
 
997
- # Create evaluators
998
- def eval_fn(weights):
999
- outputs = forward_fn(test_inputs, weights)
1000
- correct = (outputs == test_expected).all(dim=-1).float().sum()
1001
- return (correct / test_inputs.shape[0]).item()
1002
-
1003
- def batch_eval_fn(population):
1004
- pop_size = next(iter(population.values())).shape[0]
1005
- fitness = torch.zeros(pop_size, device=cfg.device)
1006
- for i in range(pop_size):
1007
- w = {k: v[i] for k, v in population.items()}
1008
- outputs = forward_fn(test_inputs, w)
1009
- correct = (outputs == test_expected).all(dim=-1).float().sum()
1010
- fitness[i] = correct / test_inputs.shape[0]
1011
- return fitness
1012
 
1013
- # Verify initial
1014
- initial = eval_fn(circuit.weights)
1015
- print(f"\n Initial fitness: {initial:.6f}")
1016
- if initial < cfg.fitness_threshold:
1017
  print(" ERROR: Circuit doesn't pass baseline!")
1018
  return {}
1019
 
1020
  results = {}
1021
 
1022
- # Run methods
1023
- if cfg.run_magnitude:
1024
- print(f"\n[1] MAGNITUDE REDUCTION (sequential)")
1025
- results['magnitude'] = prune_magnitude(circuit.clone(), eval_fn, cfg)
1026
- _print_result(results['magnitude'])
1027
-
1028
- if cfg.run_batched_magnitude:
1029
- print(f"\n[2] MAGNITUDE REDUCTION (batched GPU)")
1030
- results['batched'] = prune_magnitude_batched(circuit.clone(), eval_fn, batch_eval_fn, cfg)
1031
- _print_result(results['batched'])
1032
-
1033
- if cfg.run_zero:
1034
- print(f"\n[3] ZERO PRUNING")
1035
- results['zero'] = prune_zero(circuit.clone(), eval_fn, cfg)
1036
- _print_result(results['zero'])
1037
-
1038
- if cfg.run_quantize:
1039
- print(f"\n[4] QUANTIZATION")
1040
- results['quantize'] = prune_quantize(circuit.clone(), eval_fn, cfg)
1041
- _print_result(results['quantize'])
1042
-
1043
- if cfg.run_evolutionary:
1044
- print(f"\n[5] EVOLUTIONARY")
1045
- results['evolutionary'] = prune_evolutionary(circuit.clone(), batch_eval_fn, cfg)
1046
- _print_result(results['evolutionary'])
1047
-
1048
- if cfg.run_annealing:
1049
- print(f"\n[6] SIMULATED ANNEALING")
1050
- results['annealing'] = prune_annealing(circuit.clone(), eval_fn, cfg)
1051
- _print_result(results['annealing'])
1052
-
1053
- if cfg.run_pareto:
1054
- print(f"\n[7] PARETO FRONTIER")
1055
- results['pareto'] = prune_pareto(circuit.clone(), eval_fn, cfg)
1056
- _print_result(results['pareto'])
1057
-
1058
- # Summary
1059
  print(f"\n{'='*70}")
1060
  print(" SUMMARY")
1061
  print(f"{'='*70}")
1062
- print(f"\n{'Method':<20} {'Fitness':<10} {'Magnitude':<12} {'Nonzero':<10} {'Time':<10}")
1063
  print("-" * 70)
1064
- print(f"{'Original':<20} {'1.0000':<10} {original['magnitude']:<12.0f} {original['nonzero']:<10} {'-':<10}")
1065
 
1066
  best_method, best_mag = None, float('inf')
1067
  for name, r in sorted(results.items(), key=lambda x: x[1].final_stats.get('magnitude', float('inf'))):
1068
  mag = r.final_stats.get('magnitude', 0)
1069
  nz = r.final_stats.get('nonzero', 0)
1070
- print(f"{name:<20} {r.fitness:<10.4f} {mag:<12.0f} {nz:<10} {r.time_seconds:<10.1f}s")
 
 
1071
  if r.fitness >= cfg.fitness_threshold and mag < best_mag:
1072
  best_mag = mag
1073
  best_method = name
@@ -1083,25 +1362,47 @@ def _print_result(r: PruneResult):
1083
  print(f" Fitness: {r.fitness:.6f}")
1084
  print(f" Magnitude: {r.final_stats.get('magnitude', 0):.0f}")
1085
  print(f" Nonzero: {r.final_stats.get('nonzero', 0)}")
 
1086
  print(f" Time: {r.time_seconds:.1f}s")
1087
 
1088
 
1089
- # =============================================================================
1090
- # CLI
1091
- # =============================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
 
1093
  def main():
1094
- parser = argparse.ArgumentParser(description='Prune threshold circuits')
1095
  parser.add_argument('circuit', nargs='?', help='Circuit name')
1096
- parser.add_argument('--list', action='store_true', help='List available circuits')
1097
- parser.add_argument('--all', action='store_true', help='Run on all circuits')
1098
- parser.add_argument('--max-inputs', type=int, default=10, help='Max inputs for --all')
1099
- parser.add_argument('--device', default='cuda', help='cuda or cpu')
1100
- parser.add_argument('--batch-size', type=int, default=80000)
1101
- parser.add_argument('--methods', type=str, help='Comma-separated methods')
1102
  parser.add_argument('--fitness', type=float, default=0.9999)
1103
  parser.add_argument('--quiet', action='store_true')
1104
- parser.add_argument('--save', action='store_true', help='Save best result')
 
 
 
1105
 
1106
  args = parser.parse_args()
1107
 
@@ -1109,24 +1410,30 @@ def main():
1109
  specs = discover_circuits()
1110
  print(f"\nAvailable circuits ({len(specs)}):\n")
1111
  for s in specs:
1112
- print(f" {s.name:<40} {s.inputs}in/{s.outputs}out {s.neurons}N {s.layers}L")
1113
  return
1114
 
 
 
1115
  cfg = Config(
1116
  device=args.device,
1117
- batch_size=args.batch_size,
1118
  fitness_threshold=args.fitness,
1119
- verbose=not args.quiet
 
 
 
1120
  )
1121
 
1122
  if args.methods:
1123
  methods = args.methods.lower().split(',')
1124
- cfg.run_magnitude = 'magnitude' in methods or 'mag' in methods
1125
- cfg.run_batched_magnitude = 'batched' in methods or 'batch' in methods
1126
  cfg.run_zero = 'zero' in methods
1127
- cfg.run_quantize = 'quantize' in methods or 'quant' in methods
1128
  cfg.run_evolutionary = 'evo' in methods or 'evolutionary' in methods
1129
  cfg.run_annealing = 'anneal' in methods or 'sa' in methods
 
 
 
1130
  cfg.run_pareto = 'pareto' in methods
1131
 
1132
  RESULTS_PATH.mkdir(exist_ok=True)
@@ -1136,26 +1443,34 @@ def main():
1136
  print(f"\nRunning on {len(specs)} circuits...")
1137
  for spec in specs:
1138
  try:
1139
- circuit = Circuit(spec.path, cfg.device)
1140
  results = run_all_methods(circuit, cfg)
 
1141
  except Exception as e:
1142
  print(f"ERROR on {spec.name}: {e}")
1143
  elif args.circuit:
1144
- circuit = load_circuit(args.circuit, cfg.device)
 
 
 
 
 
 
 
1145
  results = run_all_methods(circuit, cfg)
1146
 
1147
  if args.save and results:
1148
  best = min(results.values(), key=lambda r: r.final_stats.get('magnitude', float('inf')))
1149
  if best.fitness >= cfg.fitness_threshold:
1150
- path = circuit.save(best.final_weights, f'pruned_{best.method}')
1151
  print(f"\nSaved to: {path}")
1152
  else:
1153
  parser.print_help()
1154
  print("\n\nExamples:")
1155
- print(" python prune.py --list")
1156
- print(" python prune.py threshold-hamming74decoder")
1157
- print(" python prune.py threshold-hamming74decoder --methods mag,zero,evo")
1158
- print(" python prune.py --all --max-inputs 8")
1159
 
1160
 
1161
  if __name__ == '__main__':
 
1
  """
2
+ Threshold Circuit Pruning Framework v2
3
+ ======================================
4
 
5
+ Fully vectorized GPU implementation with VRAM management.
6
 
7
  Methods:
8
+ 1. Magnitude Reduction (vectorized)
9
+ 2. Zero Pruning (vectorized)
10
+ 3. Weight Quantization
11
+ 4. Evolutionary Search (true batched)
12
+ 5. Simulated Annealing
13
+ 6. Neuron Pruning (NEW)
14
+ 7. Lottery Ticket (NEW)
15
+ 8. Topology Search (NEW)
16
+ 9. Pareto Frontier
17
 
18
  Usage:
19
+ python prune_v2.py threshold-hamming74decoder
20
+ python prune_v2.py threshold-hamming74decoder --methods evo,neuron,lottery
21
+ python prune_v2.py --list
 
 
 
22
  """
23
 
24
  import torch
25
+ import torch.nn.functional as F
26
  import json
27
  import time
28
  import random
29
  import argparse
30
+ import math
31
+ import gc
32
  from pathlib import Path
33
  from dataclasses import dataclass, field
34
+ from typing import Dict, List, Tuple, Optional, Callable, Set, Any
 
 
35
  from safetensors.torch import load_file, save_file
36
+ from collections import OrderedDict
37
+ import warnings
38
 
39
+ warnings.filterwarnings('ignore')
 
 
 
40
 
41
  CIRCUITS_PATH = Path('D:/threshold-circuits')
42
  RESULTS_PATH = CIRCUITS_PATH / 'pruned_results'
43
 
44
 
45
+ @dataclass
46
+ class VRAMConfig:
47
+ """VRAM management configuration."""
48
+ total_gb: float = 0.0
49
+ target_residency: float = 0.75
50
+ target_utilization: float = 0.90
51
+ safety_margin: float = 0.10
52
+
53
+ def __post_init__(self):
54
+ if torch.cuda.is_available():
55
+ self.total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
56
+
57
+ @property
58
+ def available_gb(self) -> float:
59
+ return self.total_gb * (self.target_residency - self.safety_margin)
60
+
61
+ def estimate_population_memory(self, n_weights: int, pop_size: int,
62
+ n_cases: int, n_inputs: int, n_outputs: int) -> float:
63
+ """Estimate VRAM in GB for a population evaluation."""
64
+ bytes_per_float = 4
65
+
66
+ pop_weights = pop_size * n_weights * bytes_per_float
67
+ inputs_broadcast = pop_size * n_cases * n_inputs * bytes_per_float
68
+ outputs = pop_size * n_cases * n_outputs * bytes_per_float
69
+ intermediates = pop_size * n_cases * n_weights * bytes_per_float
70
+ fitness = pop_size * bytes_per_float
71
+ overhead = 0.5 * 1e9
72
+
73
+ total = pop_weights + inputs_broadcast + outputs + intermediates + fitness + overhead
74
+ return total / 1e9
75
+
76
+ def max_population_size(self, n_weights: int, n_cases: int,
77
+ n_inputs: int, n_outputs: int) -> int:
78
+ """Calculate maximum safe population size."""
79
+ bytes_per_float = 4
80
+ per_individual = (
81
+ n_weights +
82
+ n_cases * n_inputs +
83
+ n_cases * n_outputs +
84
+ n_cases * n_weights +
85
+ 1
86
+ ) * bytes_per_float
87
+
88
+ available_bytes = self.available_gb * 1e9
89
+ max_pop = int(available_bytes / per_individual)
90
+ return max(100, min(max_pop, 2_000_000))
91
+
92
+
93
+ def get_vram_status() -> Dict:
94
+ """Get current VRAM status."""
95
+ if not torch.cuda.is_available():
96
+ return {'available': False}
97
+
98
+ return {
99
+ 'available': True,
100
+ 'total_gb': torch.cuda.get_device_properties(0).total_memory / 1e9,
101
+ 'allocated_gb': torch.cuda.memory_allocated() / 1e9,
102
+ 'reserved_gb': torch.cuda.memory_reserved() / 1e9,
103
+ 'free_gb': (torch.cuda.get_device_properties(0).total_memory -
104
+ torch.cuda.memory_allocated()) / 1e9
105
+ }
106
+
107
+
108
+ def clear_vram():
109
+ """Force VRAM cleanup."""
110
+ gc.collect()
111
+ if torch.cuda.is_available():
112
+ torch.cuda.empty_cache()
113
+ torch.cuda.synchronize()
114
+
115
+
116
  @dataclass
117
  class Config:
118
+ """Global configuration."""
119
  device: str = 'cuda'
120
  fitness_threshold: float = 0.9999
 
121
  verbose: bool = True
122
+ vram: VRAMConfig = field(default_factory=VRAMConfig)
123
 
 
124
  run_magnitude: bool = True
 
125
  run_zero: bool = True
126
  run_quantize: bool = True
127
  run_evolutionary: bool = True
128
  run_annealing: bool = True
129
+ run_neuron: bool = True
130
+ run_lottery: bool = True
131
+ run_topology: bool = True
132
  run_pareto: bool = True
133
 
 
134
  magnitude_passes: int = 100
135
+ evo_generations: int = 2000
136
+ evo_pop_size: int = 0
137
+ evo_elite_ratio: float = 0.05
138
+ evo_mutation_rate: float = 0.15
139
+ evo_mutation_strength: float = 2.0
140
+ evo_crossover_rate: float = 0.3
141
  evo_parsimony: float = 0.001
142
+ evo_adaptive_mutation: bool = True
143
+
144
+ annealing_iterations: int = 50000
145
  annealing_initial_temp: float = 10.0
146
+ annealing_cooling: float = 0.9995
147
+
148
  quantize_targets: List[float] = field(default_factory=lambda: [-1.0, 0.0, 1.0])
149
+ pareto_levels: List[float] = field(default_factory=lambda: [1.0, 0.99, 0.95, 0.90, 0.80])
150
+
151
+ lottery_rounds: int = 10
152
+ lottery_prune_rate: float = 0.2
153
 
154
+ topology_generations: int = 500
155
+ topology_add_neuron_prob: float = 0.1
156
+ topology_remove_neuron_prob: float = 0.2
157
 
 
 
 
158
 
159
  @dataclass
160
  class CircuitSpec:
161
+ """Circuit metadata."""
162
  name: str
163
  path: Path
164
  inputs: int
 
169
  description: str = ""
170
 
171
 
172
+ @dataclass
173
+ class PruneResult:
174
+ """Pruning result."""
175
+ method: str
176
+ original_stats: Dict
177
+ final_stats: Dict
178
+ final_weights: Dict[str, torch.Tensor]
179
+ fitness: float
180
+ time_seconds: float
181
+ history: List[Dict] = field(default_factory=list)
182
+ metadata: Dict = field(default_factory=dict)
183
+
184
+
185
+ class ThresholdCircuit:
186
+ """
187
+ Vectorized threshold circuit representation.
188
+
189
+ Converts arbitrary threshold circuits to batched tensor operations.
190
+ """
191
 
192
  def __init__(self, path: Path, device: str = 'cuda'):
193
  self.path = Path(path)
194
  self.device = device
195
  self.spec = self._load_spec()
196
  self.weights = self._load_weights()
197
+ self.weight_keys = list(self.weights.keys())
198
+ self.n_weights = sum(t.numel() for t in self.weights.values())
199
+
200
+ self._analyze_structure()
201
+ self._build_vectorized_forward()
202
 
203
  def _load_spec(self) -> CircuitSpec:
204
  with open(self.path / 'config.json') as f:
 
218
  w = load_file(str(self.path / 'model.safetensors'))
219
  return {k: v.float().to(self.device) for k, v in w.items()}
220
 
221
+ def _analyze_structure(self):
222
+ """Analyze circuit topology from weight names."""
223
+ self.neurons = {}
224
+ self.layers_map = {}
225
+
226
+ for key, tensor in self.weights.items():
227
+ parts = key.rsplit('.', 1)
228
+ if len(parts) == 2:
229
+ neuron_path, param_type = parts
230
+ else:
231
+ neuron_path, param_type = key, 'weight'
232
+
233
+ if neuron_path not in self.neurons:
234
+ self.neurons[neuron_path] = {'weight': None, 'bias': None}
235
+
236
+ if 'weight' in param_type:
237
+ self.neurons[neuron_path]['weight'] = key
238
+ elif 'bias' in param_type:
239
+ self.neurons[neuron_path]['bias'] = key
240
+
241
+ def _build_vectorized_forward(self):
242
+ """Build optimized forward function based on circuit type."""
243
+ name = self.spec.name.lower()
244
+
245
+ if 'hamming74decoder' in name:
246
+ self.forward_fn = self._build_hamming_decoder_forward()
247
+ self.test_inputs, self.test_expected = self._build_hamming_decoder_tests()
248
+ elif 'hamming74encoder' in name:
249
+ self.forward_fn = self._build_hamming_encoder_forward()
250
+ self.test_inputs, self.test_expected = self._build_hamming_encoder_tests()
251
+ elif 'winnertakeall' in name:
252
+ self.forward_fn = self._build_wta_forward()
253
+ self.test_inputs, self.test_expected = self._build_generic_tests()
254
+ elif 'decoder' in name or 'thermometer' in name or 'priority' in name:
255
+ self.forward_fn = self._build_single_layer_forward()
256
+ self.test_inputs, self.test_expected = self._build_generic_tests()
257
+ else:
258
+ self.forward_fn = self._build_generic_forward()
259
+ self.test_inputs, self.test_expected = self._build_generic_tests()
260
+
261
+ self.n_cases = self.test_inputs.shape[0]
262
+
263
+ def _build_generic_tests(self) -> Tuple[torch.Tensor, torch.Tensor]:
264
+ """Build exhaustive test cases."""
265
+ n = self.spec.inputs
266
+ if n > 20:
267
+ raise ValueError(f"Input space too large: 2^{n}")
268
+
269
+ n_cases = 2 ** n
270
+ idx = torch.arange(n_cases, device=self.device, dtype=torch.long)
271
+ bits = torch.arange(n, device=self.device, dtype=torch.long)
272
+ inputs = ((idx.unsqueeze(1) >> bits) & 1).float()
273
+
274
+ expected = self.forward_fn(inputs, self.weights)
275
+ return inputs, expected
276
+
277
+ def _threshold(self, x: torch.Tensor) -> torch.Tensor:
278
+ """Batched threshold activation: 1 if x >= 0, else 0."""
279
+ return (x >= 0).float()
280
+
281
+ def _build_single_layer_forward(self):
282
+ """Forward for single-layer circuits (decoders, thermometer, etc.)."""
283
+ output_keys = sorted([k for k in self.weights.keys() if '.weight' in k or
284
+ (not any(x in k for x in ['.', '_']) and 'weight' in k)])
285
+
286
+ def forward(inputs: torch.Tensor, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
287
+ outputs = []
288
+ for key in output_keys:
289
+ base = key.replace('.weight', '').replace('weight', '')
290
+ w_key = key
291
+ b_key = key.replace('weight', 'bias')
292
+
293
+ if w_key in weights and b_key in weights:
294
+ w = weights[w_key].flatten()
295
+ b = weights[b_key].squeeze()
296
+ out = self._threshold(inputs @ w + b)
297
+ outputs.append(out)
298
+
299
+ if outputs:
300
+ return torch.stack(outputs, dim=-1)
301
+ return inputs
302
+
303
+ return forward
304
+
305
+ def _build_wta_forward(self):
306
+ """Forward for winner-take-all."""
307
+ def forward(inputs: torch.Tensor, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
308
+ outputs = []
309
+ for i in range(4):
310
+ w = weights[f'y{i}.weight'].flatten()
311
+ b = weights[f'y{i}.bias'].squeeze()
312
+ out = self._threshold(inputs @ w + b)
313
+ outputs.append(out)
314
+ return torch.stack(outputs, dim=-1)
315
+ return forward
316
+
317
+ def _xor2_batched(self, a: torch.Tensor, b: torch.Tensor,
318
+ weights: Dict[str, torch.Tensor], prefix: str) -> torch.Tensor:
319
+ """Batched 2-input XOR using threshold gates."""
320
+ inp = torch.stack([a, b], dim=-1)
321
+
322
+ or_w = weights[f'{prefix}.layer1.or.weight'].flatten()[:2]
323
+ or_b = weights[f'{prefix}.layer1.or.bias'].squeeze()
324
+ or_out = self._threshold(inp @ or_w + or_b)
325
+
326
+ nand_w = weights[f'{prefix}.layer1.nand.weight'].flatten()[:2]
327
+ nand_b = weights[f'{prefix}.layer1.nand.bias'].squeeze()
328
+ nand_out = self._threshold(inp @ nand_w + nand_b)
329
+
330
+ l1 = torch.stack([or_out, nand_out], dim=-1)
331
+ l2_w = weights[f'{prefix}.layer2.weight'].flatten()
332
+ l2_b = weights[f'{prefix}.layer2.bias'].squeeze()
333
+
334
+ return self._threshold(l1 @ l2_w + l2_b)
335
+
336
+ def _xor4_batched(self, inputs: torch.Tensor, indices: List[int],
337
+ weights: Dict[str, torch.Tensor], prefix: str) -> torch.Tensor:
338
+ """Batched 4-input XOR."""
339
+ i0, i1, i2, i3 = indices
340
+
341
+ or_w = weights[f'{prefix}.xor_{i0}{i1}.layer1.or.weight'].flatten()
342
+ or_b = weights[f'{prefix}.xor_{i0}{i1}.layer1.or.bias'].squeeze()
343
+ or_out_ab = self._threshold(inputs @ or_w + or_b)
344
+
345
+ nand_w = weights[f'{prefix}.xor_{i0}{i1}.layer1.nand.weight'].flatten()
346
+ nand_b = weights[f'{prefix}.xor_{i0}{i1}.layer1.nand.bias'].squeeze()
347
+ nand_out_ab = self._threshold(inputs @ nand_w + nand_b)
348
+
349
+ l1_ab = torch.stack([or_out_ab, nand_out_ab], dim=-1)
350
+ l2_w = weights[f'{prefix}.xor_{i0}{i1}.layer2.weight'].flatten()
351
+ l2_b = weights[f'{prefix}.xor_{i0}{i1}.layer2.bias'].squeeze()
352
+ xor_ab = self._threshold(l1_ab @ l2_w + l2_b)
353
+
354
+ or_w = weights[f'{prefix}.xor_{i2}{i3}.layer1.or.weight'].flatten()
355
+ or_b = weights[f'{prefix}.xor_{i2}{i3}.layer1.or.bias'].squeeze()
356
+ or_out_cd = self._threshold(inputs @ or_w + or_b)
357
+
358
+ nand_w = weights[f'{prefix}.xor_{i2}{i3}.layer1.nand.weight'].flatten()
359
+ nand_b = weights[f'{prefix}.xor_{i2}{i3}.layer1.nand.bias'].squeeze()
360
+ nand_out_cd = self._threshold(inputs @ nand_w + nand_b)
361
+
362
+ l1_cd = torch.stack([or_out_cd, nand_out_cd], dim=-1)
363
+ l2_w = weights[f'{prefix}.xor_{i2}{i3}.layer2.weight'].flatten()
364
+ l2_b = weights[f'{prefix}.xor_{i2}{i3}.layer2.bias'].squeeze()
365
+ xor_cd = self._threshold(l1_cd @ l2_w + l2_b)
366
+
367
+ inp_final = torch.stack([xor_ab, xor_cd], dim=-1)
368
+ or_w = weights[f'{prefix}.xor_final.layer1.or.weight'].flatten()
369
+ or_b = weights[f'{prefix}.xor_final.layer1.or.bias'].squeeze()
370
+ or_out = self._threshold(inp_final @ or_w + or_b)
371
+
372
+ nand_w = weights[f'{prefix}.xor_final.layer1.nand.weight'].flatten()
373
+ nand_b = weights[f'{prefix}.xor_final.layer1.nand.bias'].squeeze()
374
+ nand_out = self._threshold(inp_final @ nand_w + nand_b)
375
+
376
+ l1_final = torch.stack([or_out, nand_out], dim=-1)
377
+ l2_w = weights[f'{prefix}.xor_final.layer2.weight'].flatten()
378
+ l2_b = weights[f'{prefix}.xor_final.layer2.bias'].squeeze()
379
+
380
+ return self._threshold(l1_final @ l2_w + l2_b)
381
+
382
+ def _build_hamming_decoder_forward(self):
383
+ """Fully vectorized Hamming(7,4) decoder."""
384
+ def forward(inputs: torch.Tensor, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
385
+ s1 = self._xor4_batched(inputs, [0, 2, 4, 6], weights, 's1')
386
+ s2 = self._xor4_batched(inputs, [1, 2, 5, 6], weights, 's2')
387
+ s3 = self._xor4_batched(inputs, [3, 4, 5, 6], weights, 's3')
388
+
389
+ syndrome = torch.stack([s1, s2, s3], dim=-1)
390
+
391
+ flip3 = self._threshold(syndrome @ weights['flip3.weight'].flatten() +
392
+ weights['flip3.bias'].squeeze())
393
+ flip5 = self._threshold(syndrome @ weights['flip5.weight'].flatten() +
394
+ weights['flip5.bias'].squeeze())
395
+ flip6 = self._threshold(syndrome @ weights['flip6.weight'].flatten() +
396
+ weights['flip6.bias'].squeeze())
397
+ flip7 = self._threshold(syndrome @ weights['flip7.weight'].flatten() +
398
+ weights['flip7.bias'].squeeze())
399
+
400
+ d1 = self._xor2_batched(inputs[:, 2], flip3, weights, 'd1.xor')
401
+ d2 = self._xor2_batched(inputs[:, 4], flip5, weights, 'd2.xor')
402
+ d3 = self._xor2_batched(inputs[:, 5], flip6, weights, 'd3.xor')
403
+ d4 = self._xor2_batched(inputs[:, 6], flip7, weights, 'd4.xor')
404
+
405
+ return torch.stack([d1, d2, d3, d4], dim=-1)
406
+
407
+ return forward
408
+
409
+ def _build_hamming_decoder_tests(self) -> Tuple[torch.Tensor, torch.Tensor]:
410
+ """Build Hamming decoder test cases with error injection."""
411
+ def encode(data):
412
+ d1, d2, d3, d4 = (data >> 0) & 1, (data >> 1) & 1, (data >> 2) & 1, (data >> 3) & 1
413
+ p1, p2, p3 = d1 ^ d2 ^ d4, d1 ^ d3 ^ d4, d2 ^ d3 ^ d4
414
+ return (p1 << 0) | (p2 << 1) | (d1 << 2) | (p3 << 3) | (d2 << 4) | (d3 << 5) | (d4 << 6)
415
+
416
+ inputs_list, expected_list = [], []
417
+
418
+ for data in range(16):
419
+ cw = encode(data)
420
+ inputs_list.append([(cw >> i) & 1 for i in range(7)])
421
+ expected_list.append([(data >> i) & 1 for i in range(4)])
422
+
423
+ for data in range(16):
424
+ cw = encode(data)
425
+ for flip in range(7):
426
+ corrupted = cw ^ (1 << flip)
427
+ inputs_list.append([(corrupted >> i) & 1 for i in range(7)])
428
+ expected_list.append([(data >> i) & 1 for i in range(4)])
429
+
430
+ return (torch.tensor(inputs_list, device=self.device, dtype=torch.float32),
431
+ torch.tensor(expected_list, device=self.device, dtype=torch.float32))
432
+
433
+ def _build_hamming_encoder_forward(self):
434
+ """Fully vectorized Hamming(7,4) encoder."""
435
+ def forward(inputs: torch.Tensor, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
436
+ d1, d2, d3, d4 = inputs[:, 0], inputs[:, 1], inputs[:, 2], inputs[:, 3]
437
+
438
+ def xor3(a, b, c, prefix_ab, prefix_final):
439
+ inp_ab = torch.stack([a, b], dim=-1)
440
+
441
+ or_w = weights[f'{prefix_ab}.layer1.or.weight'].flatten()[:2]
442
+ or_b = weights[f'{prefix_ab}.layer1.or.bias'].squeeze()
443
+ nand_w = weights[f'{prefix_ab}.layer1.nand.weight'].flatten()[:2]
444
+ nand_b = weights[f'{prefix_ab}.layer1.nand.bias'].squeeze()
445
+
446
+ or_out = self._threshold(inp_ab @ or_w + or_b)
447
+ nand_out = self._threshold(inp_ab @ nand_w + nand_b)
448
+
449
+ l1 = torch.stack([or_out, nand_out], dim=-1)
450
+ l2_w = weights[f'{prefix_ab}.layer2.weight'].flatten()
451
+ l2_b = weights[f'{prefix_ab}.layer2.bias'].squeeze()
452
+ xor_ab = self._threshold(l1 @ l2_w + l2_b)
453
+
454
+ inp_final = torch.stack([xor_ab, c], dim=-1)
455
+ or_w = weights[f'{prefix_final}.layer1.or.weight'].flatten()
456
+ or_b = weights[f'{prefix_final}.layer1.or.bias'].squeeze()
457
+ nand_w = weights[f'{prefix_final}.layer1.nand.weight'].flatten()
458
+ nand_b = weights[f'{prefix_final}.layer1.nand.bias'].squeeze()
459
+
460
+ or_out = self._threshold(inp_final @ or_w + or_b)
461
+ nand_out = self._threshold(inp_final @ nand_w + nand_b)
462
+
463
+ l1 = torch.stack([or_out, nand_out], dim=-1)
464
+ l2_w = weights[f'{prefix_final}.layer2.weight'].flatten()
465
+ l2_b = weights[f'{prefix_final}.layer2.bias'].squeeze()
466
+
467
+ return self._threshold(l1 @ l2_w + l2_b)
468
+
469
+ p1 = xor3(d1, d2, d4, 'p1.xor12', 'p1.xor_final')
470
+ p2 = xor3(d1, d3, d4, 'p2.xor13', 'p2.xor_final')
471
+ p3 = xor3(d2, d3, d4, 'p3.xor23', 'p3.xor_final')
472
+
473
+ c3 = self._threshold(inputs @ weights['d1.weight'].flatten() +
474
+ weights['d1.bias'].squeeze())
475
+ c5 = self._threshold(inputs @ weights['d2.weight'].flatten() +
476
+ weights['d2.bias'].squeeze())
477
+ c6 = self._threshold(inputs @ weights['d3.weight'].flatten() +
478
+ weights['d3.bias'].squeeze())
479
+ c7 = self._threshold(inputs @ weights['d4.weight'].flatten() +
480
+ weights['d4.bias'].squeeze())
481
+
482
+ return torch.stack([p1, p2, c3, p3, c5, c6, c7], dim=-1)
483
+
484
+ return forward
485
+
486
+ def _build_hamming_encoder_tests(self) -> Tuple[torch.Tensor, torch.Tensor]:
487
+ """Build Hamming encoder test cases."""
488
+ inputs_list, expected_list = [], []
489
+
490
+ for data in range(16):
491
+ d1, d2, d3, d4 = (data >> 0) & 1, (data >> 1) & 1, (data >> 2) & 1, (data >> 3) & 1
492
+ p1, p2, p3 = d1 ^ d2 ^ d4, d1 ^ d3 ^ d4, d2 ^ d3 ^ d4
493
+
494
+ inputs_list.append([d1, d2, d3, d4])
495
+ expected_list.append([p1, p2, d1, p3, d2, d3, d4])
496
+
497
+ return (torch.tensor(inputs_list, device=self.device, dtype=torch.float32),
498
+ torch.tensor(expected_list, device=self.device, dtype=torch.float32))
499
+
500
+ def _build_generic_forward(self):
501
+ """Generic forward for unknown circuit types."""
502
+ def forward(inputs: torch.Tensor, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
503
+ return inputs[:, :self.spec.outputs]
504
+ return forward
505
+
506
+ def clone_weights(self) -> Dict[str, torch.Tensor]:
507
  return {k: v.clone() for k, v in self.weights.items()}
508
 
509
+ def weights_to_vector(self, weights: Dict[str, torch.Tensor]) -> torch.Tensor:
510
+ """Flatten weights to a single vector."""
511
+ return torch.cat([weights[k].flatten() for k in self.weight_keys])
512
+
513
+ def vector_to_weights(self, vector: torch.Tensor) -> Dict[str, torch.Tensor]:
514
+ """Unflatten vector back to weight dict."""
515
+ weights = {}
516
+ offset = 0
517
+ for k in self.weight_keys:
518
+ shape = self.weights[k].shape
519
+ size = self.weights[k].numel()
520
+ weights[k] = vector[offset:offset + size].view(shape)
521
+ offset += size
522
+ return weights
523
+
524
  def stats(self, weights: Dict[str, torch.Tensor] = None) -> Dict:
525
  w = weights or self.weights
526
  total = sum(t.numel() for t in w.values())
527
  nonzero = sum((t != 0).sum().item() for t in w.values())
528
  mag = sum(t.abs().sum().item() for t in w.values())
529
+ maxw = max(t.abs().max().item() for t in w.values()) if w else 0
530
  unique = set()
531
  for t in w.values():
532
  unique.update(t.flatten().tolist())
 
537
  'sparsity': 1 - nonzero/total if total else 0,
538
  'magnitude': mag,
539
  'max_weight': maxw,
540
+ 'unique_count': len(unique)
 
541
  }
542
 
543
+ def save_weights(self, weights: Dict[str, torch.Tensor], suffix: str = 'pruned') -> Path:
544
  path = self.path / f'model_{suffix}.safetensors'
545
  cpu_w = {k: v.cpu() for k, v in weights.items()}
546
  save_file(cpu_w, str(path))
547
  return path
548
 
549
 
550
class VectorizedEvaluator:
    """
    Population evaluator for threshold circuits.

    A single candidate is scored with one forward pass over all test cases.
    Populations are scored candidate-by-candidate (the circuit forward
    functions accept a single weight dict), with chunking so that an
    oversized population never exceeds the configured VRAM budget.
    """

    def __init__(self, circuit: ThresholdCircuit, cfg: Config):
        self.circuit = circuit
        self.cfg = cfg
        self.device = cfg.device
        self.test_inputs = circuit.test_inputs
        self.test_expected = circuit.test_expected
        self.n_cases = circuit.n_cases
        self.n_weights = circuit.n_weights

        # Largest population that fits inside the VRAM residency target.
        self.max_pop = cfg.vram.max_population_size(
            circuit.n_weights,
            circuit.n_cases,
            circuit.spec.inputs,
            circuit.spec.outputs
        )

        if cfg.verbose:
            print(f"  Max safe population size: {self.max_pop:,}")
            print(f"  VRAM available: {cfg.vram.available_gb:.1f} GB")

    def evaluate_single(self, weights: Dict[str, torch.Tensor]) -> float:
        """Fraction of test cases on which *weights* matches exactly (all outputs)."""
        with torch.no_grad():
            outputs = self.circuit.forward_fn(self.test_inputs, weights)
            correct = (outputs == self.test_expected).all(dim=-1).float().sum()
            return (correct / self.n_cases).item()

    def evaluate_population(self, population: torch.Tensor) -> torch.Tensor:
        """
        Evaluate a population of flattened weight vectors.

        population: [pop_size, n_weights]
        Returns: [pop_size] fitness values in [0, 1].
        """
        pop_size = population.shape[0]

        if pop_size > self.max_pop:
            return self._evaluate_chunked(population)

        fitness = torch.zeros(pop_size, device=self.device)

        with torch.no_grad():
            for i in range(pop_size):
                weights = self.circuit.vector_to_weights(population[i])
                outputs = self.circuit.forward_fn(self.test_inputs, weights)
                correct = (outputs == self.test_expected).all(dim=-1).float().sum()
                fitness[i] = correct / self.n_cases

        return fitness

    def _evaluate_chunked(self, population: torch.Tensor) -> torch.Tensor:
        """Evaluate an oversized population in max_pop-sized chunks."""
        pop_size = population.shape[0]
        chunk_size = self.max_pop
        fitness = torch.zeros(pop_size, device=self.device)

        for start in range(0, pop_size, chunk_size):
            end = min(start + chunk_size, pop_size)
            fitness[start:end] = self.evaluate_population(population[start:end])

            # Release cached allocations between full chunks to avoid creep.
            if (end - start) == chunk_size:
                clear_vram()

        return fitness

    def evaluate_population_parallel(self, population: torch.Tensor) -> torch.Tensor:
        """
        Compatibility alias for evaluate_population.

        NOTE(review): the previous revision duplicated evaluate_population's
        per-candidate loop verbatim while additionally allocating an unused
        [pop, cases, inputs] expansion of the test inputs; the dead
        allocation is removed and the call is delegated.  Results are
        identical to evaluate_population.
        """
        return self.evaluate_population(population)
 
 
646
 
647
+
648
def prune_magnitude_vectorized(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                               cfg: Config) -> PruneResult:
    """Greedy magnitude reduction.

    Each pass steps every nonzero weight one unit toward zero and keeps the
    change whenever fitness stays at or above ``cfg.fitness_threshold``.
    Passes repeat until nothing can be reduced or ``cfg.magnitude_passes``
    is exhausted.

    Candidate collection now uses a single ``torch.nonzero`` per tensor
    instead of the old per-element ``flat[i].item()`` scan, which forced one
    device sync per weight.
    """
    start = time.perf_counter()
    weights = circuit.clone_weights()
    original = circuit.stats(weights)

    history = []
    total_reductions = 0

    if cfg.verbose:
        print(f"  Starting vectorized magnitude reduction...")
        print(f"  Original: mag={original['magnitude']:.0f}, nonzero={original['nonzero']}")

    for pass_num in range(cfg.magnitude_passes):
        # Gather (tensor name, flat index, shape, value) for every nonzero weight.
        candidates = []
        for name, tensor in weights.items():
            flat = tensor.flatten()
            nz = torch.nonzero(flat, as_tuple=False).flatten()
            if nz.numel() == 0:
                continue
            for i, val in zip(nz.tolist(), flat[nz].tolist()):
                candidates.append((name, i, tensor.shape, val))

        if not candidates:
            break

        pass_reductions = 0

        for name, idx, shape, old_val in candidates:
            # Step the weight's magnitude down by one unit (toward zero).
            new_val = old_val - 1 if old_val > 0 else old_val + 1

            flat = weights[name].flatten()
            flat[idx] = new_val
            weights[name] = flat.view(shape)

            fitness = evaluator.evaluate_single(weights)

            if fitness >= cfg.fitness_threshold:
                pass_reductions += 1
                total_reductions += 1
            else:
                # Revert: this reduction broke the circuit.
                flat = weights[name].flatten()
                flat[idx] = old_val
                weights[name] = flat.view(shape)

        stats = circuit.stats(weights)
        history.append({'pass': pass_num, 'reductions': pass_reductions, 'magnitude': stats['magnitude']})

        if cfg.verbose:
            print(f"  Pass {pass_num}: +{pass_reductions} reductions, mag={stats['magnitude']:.0f}")

        # Fixed point reached: no weight can be reduced further.
        if pass_reductions == 0:
            break

    return PruneResult(
        method='magnitude',
        original_stats=original,
        final_stats=circuit.stats(weights),
        final_weights=weights,
        fitness=evaluator.evaluate_single(weights),
        time_seconds=time.perf_counter() - start,
        history=history
    )
710
 
711
 
712
def prune_zero(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
               cfg: Config) -> PruneResult:
    """Sparsification: try setting each nonzero weight directly to zero.

    Candidates are visited in random order; a zero is kept whenever fitness
    stays at or above ``cfg.fitness_threshold``, otherwise the old value is
    restored.

    Candidate collection now uses ``torch.nonzero`` per tensor instead of
    the old per-element ``flat[i].item()`` scan (one device sync per weight).
    """
    start = time.perf_counter()
    weights = circuit.clone_weights()
    original = circuit.stats(weights)

    # (tensor name, flat index, shape, value) for every nonzero weight.
    candidates = []
    for name, tensor in weights.items():
        flat = tensor.flatten()
        nz = torch.nonzero(flat, as_tuple=False).flatten()
        if nz.numel() == 0:
            continue
        for i, val in zip(nz.tolist(), flat[nz].tolist()):
            candidates.append((name, i, tensor.shape, val))

    # Random order so early rejections don't systematically favor one layer.
    random.shuffle(candidates)

    if cfg.verbose:
        print(f"  Testing {len(candidates)} candidates for zero pruning...")

    zeroed = 0
    for name, idx, shape, old_val in candidates:
        flat = weights[name].flatten()
        flat[idx] = 0
        weights[name] = flat.view(shape)

        if evaluator.evaluate_single(weights) >= cfg.fitness_threshold:
            zeroed += 1
        else:
            # Restore — this weight is load-bearing.
            flat = weights[name].flatten()
            flat[idx] = old_val
            weights[name] = flat.view(shape)

    if cfg.verbose:
        stats = circuit.stats(weights)
        print(f"  Zeroed {zeroed} weights, mag={stats['magnitude']:.0f}")

    return PruneResult(
        method='zero',
        original_stats=original,
        final_stats=circuit.stats(weights),
        final_weights=weights,
        fitness=evaluator.evaluate_single(weights),
        time_seconds=time.perf_counter() - start
    )
757
 
758
 
759
+ def prune_quantize(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
 
 
 
 
 
760
  cfg: Config) -> PruneResult:
761
+ """Quantize weights to target set."""
762
  start = time.perf_counter()
763
+ weights = circuit.clone_weights()
764
+ original = circuit.stats(weights)
765
+ target = torch.tensor(cfg.quantize_targets, device=cfg.device)
766
  target_set = set(cfg.quantize_targets)
767
 
768
  if cfg.verbose:
769
+ print(f" Quantizing to {sorted(cfg.quantize_targets)}...")
 
 
 
 
 
 
 
 
770
 
771
+ quantized = 0
 
772
  for name, tensor in list(weights.items()):
773
  flat = tensor.flatten()
774
  for i in range(len(flat)):
 
779
 
780
  flat[i] = closest
781
  weights[name] = flat.view(tensor.shape)
 
782
 
783
+ if evaluator.evaluate_single(weights) >= cfg.fitness_threshold:
784
+ quantized += 1
 
 
785
  else:
786
  flat[i] = old_val
787
  weights[name] = flat.view(tensor.shape)
788
 
 
 
 
789
  if cfg.verbose:
790
+ stats = circuit.stats(weights)
791
+ print(f" Quantized {quantized} weights, mag={stats['magnitude']:.0f}")
 
 
 
792
 
793
  return PruneResult(
794
  method='quantize',
795
  original_stats=original,
796
+ final_stats=circuit.stats(weights),
797
  final_weights=weights,
798
+ fitness=evaluator.evaluate_single(weights),
 
799
  time_seconds=time.perf_counter() - start
800
  )
801
 
802
 
803
def prune_evolutionary(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                       cfg: Config) -> PruneResult:
    """
    Evolutionary search over flattened weight vectors.

    Features: batched population evaluation, elite preservation, adaptive
    mutation rate (heats up when stagnant, cools on progress), single-point
    crossover, and parsimony pressure (fitness penalized by total magnitude).

    Fix vs. previous revision: parent-selection probabilities now use
    ``torch.softmax`` — the old code called ``F.softmax`` but this module
    never imports ``torch.nn.functional``, so the first generation raised
    NameError.  The crossover mask is also generated on the population's
    device instead of the CPU.
    """
    start = time.perf_counter()
    original = circuit.stats()

    pop_size = cfg.evo_pop_size if cfg.evo_pop_size > 0 else min(evaluator.max_pop, 10000)
    elite_size = max(1, int(pop_size * cfg.evo_elite_ratio))

    if cfg.verbose:
        print(f"  Population: {pop_size}, Elite: {elite_size}")
        print(f"  Generations: {cfg.evo_generations}")

    # Seed with the trained weights plus integer-rounded Gaussian noise;
    # index 0 stays an exact copy of the original network.
    base_vector = circuit.weights_to_vector(circuit.weights)
    population = base_vector.unsqueeze(0).expand(pop_size, -1).clone()
    noise = torch.randn_like(population) * 0.5
    noise[0] = 0
    population = (population + noise).round()

    best_weights = circuit.clone_weights()
    best_score = -float('inf')
    best_fitness = 0.0
    stagnant_generations = 0
    mutation_rate = cfg.evo_mutation_rate
    history = []

    for gen in range(cfg.evo_generations):
        fitness = evaluator.evaluate_population(population)

        # Parsimony pressure: at equal fitness, prefer smaller magnitude.
        magnitudes = population.abs().sum(dim=1)
        adjusted = fitness - cfg.evo_parsimony * magnitudes / circuit.n_weights

        valid_mask = fitness >= cfg.fitness_threshold
        n_valid = valid_mask.sum().item()

        if n_valid > 0:
            # Best is chosen only among candidates that still pass.
            valid_adjusted = adjusted.clone()
            valid_adjusted[~valid_mask] = -float('inf')
            best_idx = valid_adjusted.argmax().item()

            if adjusted[best_idx] > best_score:
                best_score = adjusted[best_idx].item()
                best_fitness = fitness[best_idx].item()
                # Clone before unflattening: vector_to_weights returns views.
                best_weights = circuit.vector_to_weights(population[best_idx].clone())
                stagnant_generations = 0

                if cfg.verbose and gen % 100 == 0:
                    stats = circuit.stats(best_weights)
                    print(f"    Gen {gen}: NEW BEST score={best_score:.4f}, mag={stats['magnitude']:.0f}")
            else:
                stagnant_generations += 1
        else:
            stagnant_generations += 1

        # Adaptive mutation: heat up when stuck, cool down on improvement.
        if cfg.evo_adaptive_mutation:
            if stagnant_generations > 50:
                mutation_rate = min(0.5, mutation_rate * 1.1)
            elif stagnant_generations == 0:
                mutation_rate = max(0.01, mutation_rate * 0.95)

        if gen % 100 == 0:
            stats = circuit.stats(best_weights)
            history.append({
                'gen': gen,
                'best_score': best_score,
                'best_mag': stats['magnitude'],
                'n_valid': n_valid,
                'mutation_rate': mutation_rate
            })

            if cfg.verbose:
                print(f"    Gen {gen}: valid={n_valid}/{pop_size}, best_mag={stats['magnitude']:.0f}, mut={mutation_rate:.3f}")

        # Elitism: the top slice survives unchanged.
        sorted_idx = adjusted.argsort(descending=True)
        elite = population[sorted_idx[:elite_size]].clone()

        # Fitness-proportional parent selection (softmax over scaled scores).
        probs = torch.softmax(adjusted * 10, dim=0)
        parent_idx = torch.multinomial(probs, pop_size - elite_size, replacement=True)
        children = population[parent_idx].clone()

        # Single-point crossover between consecutive pairs of marked children.
        if cfg.evo_crossover_rate > 0:
            crossover_mask = torch.rand(len(children), device=children.device) < cfg.evo_crossover_rate
            n_cross = crossover_mask.sum().item()
            if n_cross > 1:
                cross_idx = torch.where(crossover_mask)[0]
                for i in range(0, len(cross_idx) - 1, 2):
                    p1, p2 = cross_idx[i], cross_idx[i + 1]
                    cross_point = random.randint(1, circuit.n_weights - 1)
                    temp = children[p1, cross_point:].clone()
                    children[p1, cross_point:] = children[p2, cross_point:]
                    children[p2, cross_point:] = temp

        # Integer mutations in [-strength, +strength], applied with prob mutation_rate.
        mutation_mask = torch.rand_like(children) < mutation_rate
        mutations = torch.randint(-int(cfg.evo_mutation_strength),
                                  int(cfg.evo_mutation_strength) + 1,
                                  children.shape, device=cfg.device).float()
        children = children + mutation_mask.float() * mutations

        population = torch.cat([elite, children], dim=0)

        if stagnant_generations > 200:
            if cfg.verbose:
                print(f"    Early stopping at gen {gen} (stagnant)")
            break

    final_stats = circuit.stats(best_weights)

    return PruneResult(
        method='evolutionary',
        original_stats=original,
        final_stats=final_stats,
        final_weights=best_weights,
        fitness=best_fitness,
        time_seconds=time.perf_counter() - start,
        history=history,
        metadata={'final_mutation_rate': mutation_rate, 'generations_run': gen + 1}
    )
929
 
930
 
931
def prune_annealing(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                    cfg: Config) -> PruneResult:
    """Simulated annealing over single-weight mutations.

    Energy = total weight magnitude, with a large additive penalty (1e6)
    whenever fitness drops below the threshold, so infeasible states are
    strongly disfavored yet still traversable at high temperature.

    Fix vs. previous revision: ``math`` is imported here — the module-level
    imports never include it, so the Metropolis acceptance test raised
    NameError on the first iteration.
    """
    import math  # not in the module-level imports; needed for math.exp below

    start = time.perf_counter()
    original = circuit.stats()

    current = circuit.clone_weights()
    current_mag = sum(t.abs().sum().item() for t in current.values())
    current_fitness = evaluator.evaluate_single(current)

    if current_fitness < cfg.fitness_threshold:
        current_energy = 1e6 + current_mag
    else:
        current_energy = current_mag

    best = {k: v.clone() for k, v in current.items()}
    best_energy = current_energy
    best_fitness = current_fitness

    temp = cfg.annealing_initial_temp
    history = []

    if cfg.verbose:
        print(f"  Iterations: {cfg.annealing_iterations}, Initial temp: {temp}")

    for i in range(cfg.annealing_iterations):
        # Propose a neighbor: mutate one random weight (0 means "zero it").
        neighbor = {k: v.clone() for k, v in current.items()}
        name = random.choice(list(neighbor.keys()))
        flat = neighbor[name].flatten()
        idx = random.randint(0, len(flat) - 1)

        mutation = random.choice([-2, -1, 0, 1, 2])
        if mutation == 0:
            flat[idx] = 0
        else:
            flat[idx] = flat[idx] + mutation
        neighbor[name] = flat.view(neighbor[name].shape)

        neighbor_fitness = evaluator.evaluate_single(neighbor)
        neighbor_mag = sum(t.abs().sum().item() for t in neighbor.values())

        if neighbor_fitness < cfg.fitness_threshold:
            neighbor_energy = 1e6 + neighbor_mag
        else:
            neighbor_energy = neighbor_mag

        # Metropolis criterion; temp floor avoids division by zero late in the run.
        delta = neighbor_energy - current_energy
        if delta < 0 or random.random() < math.exp(-delta / max(temp, 1e-10)):
            current = neighbor
            current_energy = neighbor_energy
            current_fitness = neighbor_fitness

            # Track the best feasible (threshold-passing) state seen so far.
            if neighbor_fitness >= cfg.fitness_threshold and neighbor_energy < best_energy:
                best = {k: v.clone() for k, v in current.items()}
                best_energy = neighbor_energy
                best_fitness = neighbor_fitness

        temp *= cfg.annealing_cooling

        if i % 5000 == 0:
            stats = circuit.stats(best)
            history.append({'iter': i, 'temp': temp, 'magnitude': stats['magnitude']})
            if cfg.verbose:
                print(f"    Iter {i}: temp={temp:.4f}, best_mag={stats['magnitude']:.0f}")

    return PruneResult(
        method='annealing',
        original_stats=original,
        final_stats=circuit.stats(best),
        final_weights=best,
        fitness=best_fitness,
        time_seconds=time.perf_counter() - start,
        history=history
    )
1006
 
1007
 
1008
def prune_neuron(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                 cfg: Config) -> PruneResult:
    """
    Neuron-level pruning.

    Groups parameter tensors by neuron prefix (everything before the last
    '.'), zeroes each group wholesale, and keeps the removal whenever the
    circuit still meets the fitness threshold.
    """
    start = time.perf_counter()
    weights = circuit.clone_weights()
    original = circuit.stats(weights)

    # Map neuron prefix -> list of parameter keys belonging to it.
    neuron_groups: Dict[str, List[str]] = {}
    for key in weights:
        head, sep, _tail = key.rpartition('.')
        group = head if sep else key
        neuron_groups.setdefault(group, []).append(key)

    if cfg.verbose:
        print(f"  Found {len(neuron_groups)} neuron groups")

    removed = 0
    for neuron_name, keys in neuron_groups.items():
        # Snapshot the group, zero it, then check the circuit still works.
        saved = {k: weights[k].clone() for k in keys}
        for k in keys:
            weights[k] = torch.zeros_like(weights[k])

        if evaluator.evaluate_single(weights) >= cfg.fitness_threshold:
            removed += 1
            if cfg.verbose:
                print(f"    Removed neuron: {neuron_name}")
        else:
            for k in keys:
                weights[k] = saved[k]

    if cfg.verbose:
        stats = circuit.stats(weights)
        print(f"  Removed {removed} neurons, mag={stats['magnitude']:.0f}")

    return PruneResult(
        method='neuron',
        original_stats=original,
        final_stats=circuit.stats(weights),
        final_weights=weights,
        fitness=evaluator.evaluate_single(weights),
        time_seconds=time.perf_counter() - start,
        metadata={'neurons_removed': removed}
    )
+ )
1062
+
1063
+
1064
def prune_lottery(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                  cfg: Config) -> PruneResult:
    """
    Lottery Ticket Hypothesis pruning.

    Each round removes the smallest-magnitude ``lottery_prune_rate`` fraction
    of the remaining nonzero weights (by masking against the initial weight
    values), then checks whether the resulting subnetwork still meets the
    fitness threshold.  If it does not, the round is reverted and the loop
    stops.
    """
    start = time.perf_counter()
    original = circuit.stats()

    # initial_weights is the reference the mask is applied against; since the
    # circuit is not retrained between rounds it equals the working copy.
    weights = circuit.clone_weights()
    initial_weights = circuit.clone_weights()

    history = []

    if cfg.verbose:
        print(f"  Lottery ticket: {cfg.lottery_rounds} rounds, {cfg.lottery_prune_rate*100:.0f}% per round")

    for round_num in range(cfg.lottery_rounds):
        # Collect (|value|, tensor name, flat index, shape) for every
        # surviving (nonzero) weight.
        all_weights = []
        for name, tensor in weights.items():
            flat = tensor.flatten()
            for i in range(len(flat)):
                val = abs(flat[i].item())
                if val > 0:
                    all_weights.append((val, name, i, tensor.shape))

        if not all_weights:
            break

        # Smallest magnitudes first; prune the bottom fraction this round.
        all_weights.sort(key=lambda x: x[0])
        n_prune = int(len(all_weights) * cfg.lottery_prune_rate)

        if n_prune == 0:
            break

        to_prune = all_weights[:n_prune]

        # Build a 0/1 survival mask from the current sparsity pattern...
        mask = {}
        for name in weights:
            mask[name] = (weights[name] != 0).float()

        # ...then clear the entries selected for pruning this round.
        for _, name, idx, shape in to_prune:
            flat_mask = mask[name].flatten()
            flat_mask[idx] = 0
            mask[name] = flat_mask.view(shape)

        # Apply the mask against the initial weights (lottery-ticket style).
        for name in weights:
            weights[name] = initial_weights[name] * mask[name]

        fitness = evaluator.evaluate_single(weights)
        stats = circuit.stats(weights)

        history.append({
            'round': round_num,
            'pruned': n_prune,
            'remaining': len(all_weights) - n_prune,
            'fitness': fitness,
            'magnitude': stats['magnitude']
        })

        if cfg.verbose:
            print(f"    Round {round_num}: pruned {n_prune}, fitness={fitness:.4f}, mag={stats['magnitude']:.0f}")

        # If this round broke the circuit, restore the pruned entries in the
        # mask, reapply it, and stop pruning.
        if fitness < cfg.fitness_threshold:
            for _, name, idx, shape in to_prune:
                flat_mask = mask[name].flatten()
                flat_mask[idx] = 1
                mask[name] = flat_mask.view(shape)

            for name in weights:
                weights[name] = initial_weights[name] * mask[name]

            if cfg.verbose:
                print(f"    Reverted round {round_num} (fitness dropped)")
            break

    return PruneResult(
        method='lottery',
        original_stats=original,
        final_stats=circuit.stats(weights),
        final_weights=weights,
        fitness=evaluator.evaluate_single(weights),
        time_seconds=time.perf_counter() - start,
        history=history
    )
1150
 
1151
 
1152
def prune_topology(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                   cfg: Config) -> PruneResult:
    """
    Topology search - NEAT-style evolution of circuit structure.

    This is a simplified version that works with fixed topology but
    can zero out entire connection patterns.  Each generation randomly
    disables and/or re-enables one weight/bias group; a candidate is
    accepted only if it still meets the fitness threshold AND lowers the
    total magnitude.
    """
    start = time.perf_counter()
    original = circuit.stats()

    weights = circuit.clone_weights()

    # Pair each '<base>.weight' tensor with its matching '<base>.bias'.
    connection_groups = {}
    for key in weights.keys():
        if 'weight' in key:
            base = key.replace('.weight', '')
            if base not in connection_groups:
                connection_groups[base] = {'weight': None, 'bias': None}
            connection_groups[base]['weight'] = key
            bias_key = key.replace('weight', 'bias')
            if bias_key in weights:
                connection_groups[base]['bias'] = bias_key

    if cfg.verbose:
        print(f"  Found {len(connection_groups)} connection groups")

    # All groups start enabled; the search toggles them off/on.
    active = {k: True for k in connection_groups}

    best_weights = {k: v.clone() for k, v in weights.items()}
    best_active = dict(active)
    # Score = negated magnitude, so "higher score" means "smaller circuit".
    best_score = -sum(t.abs().sum().item() for t in weights.values())

    for gen in range(cfg.topology_generations):
        test_active = dict(active)

        # Mutation 1: disable one currently-active group.
        if random.random() < cfg.topology_remove_neuron_prob:
            candidates = [k for k, v in test_active.items() if v]
            if candidates:
                to_remove = random.choice(candidates)
                test_active[to_remove] = False

        # Mutation 2: re-enable one currently-disabled group.
        if random.random() < cfg.topology_add_neuron_prob:
            candidates = [k for k, v in test_active.items() if not v]
            if candidates:
                to_add = random.choice(candidates)
                test_active[to_add] = True

        # Materialize the candidate: zero every disabled group's tensors.
        test_weights = {k: v.clone() for k, v in weights.items()}
        for group_name, is_active in test_active.items():
            if not is_active:
                info = connection_groups[group_name]
                if info['weight']:
                    test_weights[info['weight']] = torch.zeros_like(test_weights[info['weight']])
                if info['bias']:
                    test_weights[info['bias']] = torch.zeros_like(test_weights[info['bias']])

        fitness = evaluator.evaluate_single(test_weights)

        # Accept only threshold-passing candidates that shrink the circuit;
        # 'active' advances only on acceptance (hill climbing).
        if fitness >= cfg.fitness_threshold:
            mag = sum(t.abs().sum().item() for t in test_weights.values())
            score = -mag

            if score > best_score:
                best_score = score
                best_weights = test_weights
                best_active = dict(test_active)
                active = test_active

        if cfg.verbose and gen % 50 == 0:
            n_active = sum(1 for v in best_active.values() if v)
            stats = circuit.stats(best_weights)
            print(f"    Gen {gen}: {n_active}/{len(connection_groups)} active, mag={stats['magnitude']:.0f}")

    n_removed = sum(1 for v in best_active.values() if not v)

    return PruneResult(
        method='topology',
        original_stats=original,
        final_stats=circuit.stats(best_weights),
        final_weights=best_weights,
        fitness=evaluator.evaluate_single(best_weights),
        time_seconds=time.perf_counter() - start,
        metadata={'connections_removed': n_removed, 'active_groups': best_active}
    )
+ )
1237
 
1238
 
1239
def prune_pareto(circuit: ThresholdCircuit, evaluator: VectorizedEvaluator,
                 cfg: Config) -> PruneResult:
    """Explore Pareto frontier of correctness vs. size.

    Re-runs magnitude reduction once per fitness target in
    ``cfg.pareto_levels`` and records the (target, achieved fitness, size)
    trade-off points in the returned result's ``history``.
    """
    start = time.perf_counter()
    original = circuit.stats()
    frontier = []

    if cfg.verbose:
        print(f"  Exploring Pareto frontier...")

    for target in cfg.pareto_levels:
        # Same config but with the fitness bar lowered to this target.
        relaxed_cfg = Config(
            device=cfg.device,
            fitness_threshold=target,
            magnitude_passes=30,
            verbose=False,
            vram=cfg.vram
        )

        # Starts from the circuit's pristine weights each time, so frontier
        # points are independent of one another.
        result = prune_magnitude_vectorized(circuit, evaluator, relaxed_cfg)

        frontier.append({
            'target': target,
            'actual': result.fitness,
            'magnitude': result.final_stats['magnitude'],
            'nonzero': result.final_stats['nonzero'],
            'sparsity': result.final_stats['sparsity']
        })

        if cfg.verbose:
            print(f"    Target {target:.2f}: fitness={result.fitness:.4f}, mag={result.final_stats['magnitude']:.0f}")

    # NOTE(review): final_stats here is a frontier entry (a different schema
    # from circuit.stats()), final_weights are the UNPRUNED originals, and
    # fitness is taken from the FIRST frontier level — the frontier stored in
    # 'history' is the real product of this method.  Confirm downstream
    # consumers before changing.
    return PruneResult(
        method='pareto',
        original_stats=original,
        final_stats=frontier[-1] if frontier else original,
        final_weights=circuit.clone_weights(),
        fitness=frontier[0]['actual'] if frontier else 1.0,
        time_seconds=time.perf_counter() - start,
        history=frontier
    )
1280
 
 
 
 
1281
 
1282
+ def run_all_methods(circuit: ThresholdCircuit, cfg: Config) -> Dict[str, PruneResult]:
1283
+ """Run all enabled pruning methods."""
1284
 
1285
  print(f"\n{'='*70}")
1286
  print(f" PRUNING: {circuit.spec.name}")
1287
  print(f"{'='*70}")
1288
 
1289
+ vram = get_vram_status()
1290
+ if vram['available']:
1291
+ print(f" VRAM: {vram['total_gb']:.1f} GB total, {vram['free_gb']:.1f} GB free")
1292
+
1293
  original = circuit.stats()
1294
  print(f" Inputs: {circuit.spec.inputs}, Outputs: {circuit.spec.outputs}")
1295
  print(f" Neurons: {circuit.spec.neurons}, Layers: {circuit.spec.layers}")
1296
  print(f" Parameters: {original['total']}, Non-zero: {original['nonzero']}")
1297
  print(f" Magnitude: {original['magnitude']:.0f}")
1298
+ print(f" Test cases: {circuit.n_cases}")
1299
  print(f"{'='*70}")
1300
 
1301
+ evaluator = VectorizedEvaluator(circuit, cfg)
 
 
 
 
 
 
 
 
1302
 
1303
+ initial_fitness = evaluator.evaluate_single(circuit.weights)
1304
+ print(f"\n Initial fitness: {initial_fitness:.6f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
1305
 
1306
+ if initial_fitness < cfg.fitness_threshold:
 
 
 
1307
  print(" ERROR: Circuit doesn't pass baseline!")
1308
  return {}
1309
 
1310
  results = {}
1311
 
1312
+ methods = [
1313
+ ('magnitude', cfg.run_magnitude, lambda: prune_magnitude_vectorized(circuit, evaluator, cfg)),
1314
+ ('zero', cfg.run_zero, lambda: prune_zero(circuit, evaluator, cfg)),
1315
+ ('quantize', cfg.run_quantize, lambda: prune_quantize(circuit, evaluator, cfg)),
1316
+ ('neuron', cfg.run_neuron, lambda: prune_neuron(circuit, evaluator, cfg)),
1317
+ ('lottery', cfg.run_lottery, lambda: prune_lottery(circuit, evaluator, cfg)),
1318
+ ('topology', cfg.run_topology, lambda: prune_topology(circuit, evaluator, cfg)),
1319
+ ('evolutionary', cfg.run_evolutionary, lambda: prune_evolutionary(circuit, evaluator, cfg)),
1320
+ ('annealing', cfg.run_annealing, lambda: prune_annealing(circuit, evaluator, cfg)),
1321
+ ('pareto', cfg.run_pareto, lambda: prune_pareto(circuit, evaluator, cfg)),
1322
+ ]
1323
+
1324
+ for i, (name, enabled, fn) in enumerate(methods):
1325
+ if enabled:
1326
+ print(f"\n[{i+1}] {name.upper()}")
1327
+ try:
1328
+ clear_vram()
1329
+ results[name] = fn()
1330
+ _print_result(results[name])
1331
+ except Exception as e:
1332
+ print(f" ERROR: {e}")
1333
+ import traceback
1334
+ traceback.print_exc()
1335
+
 
 
 
 
 
 
 
 
 
 
 
 
 
1336
  print(f"\n{'='*70}")
1337
  print(" SUMMARY")
1338
  print(f"{'='*70}")
1339
+ print(f"\n{'Method':<15} {'Fitness':<10} {'Magnitude':<12} {'Nonzero':<10} {'Sparsity':<10} {'Time':<10}")
1340
  print("-" * 70)
1341
+ print(f"{'Original':<15} {'1.0000':<10} {original['magnitude']:<12.0f} {original['nonzero']:<10} {'0.0%':<10} {'-':<10}")
1342
 
1343
  best_method, best_mag = None, float('inf')
1344
  for name, r in sorted(results.items(), key=lambda x: x[1].final_stats.get('magnitude', float('inf'))):
1345
  mag = r.final_stats.get('magnitude', 0)
1346
  nz = r.final_stats.get('nonzero', 0)
1347
+ sp = r.final_stats.get('sparsity', 0) * 100
1348
+ print(f"{name:<15} {r.fitness:<10.4f} {mag:<12.0f} {nz:<10} {sp:<9.1f}% {r.time_seconds:<10.1f}s")
1349
+
1350
  if r.fitness >= cfg.fitness_threshold and mag < best_mag:
1351
  best_mag = mag
1352
  best_method = name
 
1362
  print(f" Fitness: {r.fitness:.6f}")
1363
  print(f" Magnitude: {r.final_stats.get('magnitude', 0):.0f}")
1364
  print(f" Nonzero: {r.final_stats.get('nonzero', 0)}")
1365
+ print(f" Sparsity: {r.final_stats.get('sparsity', 0)*100:.1f}%")
1366
  print(f" Time: {r.time_seconds:.1f}s")
1367
 
1368
 
1369
def discover_circuits(base: Path = CIRCUITS_PATH) -> List[CircuitSpec]:
    """Find all circuit directories under *base*.

    A circuit directory must contain both ``config.json`` and
    ``model.safetensors``.  Directories with unreadable or malformed configs
    are skipped.  Results are sorted by (inputs, neurons).

    Fix vs. previous revision: the bare ``except: pass`` swallowed every
    exception (including KeyboardInterrupt/SystemExit); only expected
    I/O, parse, and schema errors are now ignored.
    """
    circuits = []
    for d in base.iterdir():
        if not (d.is_dir() and (d / 'config.json').exists() and (d / 'model.safetensors').exists()):
            continue
        try:
            with open(d / 'config.json') as f:
                cfg = json.load(f)
            circuits.append(CircuitSpec(
                name=cfg['name'],
                path=d,
                inputs=cfg['inputs'],
                outputs=cfg['outputs'],
                neurons=cfg['neurons'],
                layers=cfg['layers'],
                parameters=cfg['parameters'],
                description=cfg.get('description', '')
            ))
        except (OSError, json.JSONDecodeError, KeyError, TypeError):
            # Unreadable file, invalid JSON, or missing/odd-typed fields:
            # skip this directory and keep scanning.
            continue
    return sorted(circuits, key=lambda x: (x.inputs, x.neurons))
1390
+
1391
 
1392
  def main():
1393
+ parser = argparse.ArgumentParser(description='Prune threshold circuits v2')
1394
  parser.add_argument('circuit', nargs='?', help='Circuit name')
1395
+ parser.add_argument('--list', action='store_true')
1396
+ parser.add_argument('--all', action='store_true')
1397
+ parser.add_argument('--max-inputs', type=int, default=10)
1398
+ parser.add_argument('--device', default='cuda')
1399
+ parser.add_argument('--methods', type=str)
 
1400
  parser.add_argument('--fitness', type=float, default=0.9999)
1401
  parser.add_argument('--quiet', action='store_true')
1402
+ parser.add_argument('--save', action='store_true')
1403
+ parser.add_argument('--evo-pop', type=int, default=0)
1404
+ parser.add_argument('--evo-gens', type=int, default=2000)
1405
+ parser.add_argument('--vram-target', type=float, default=0.75)
1406
 
1407
  args = parser.parse_args()
1408
 
 
1410
  specs = discover_circuits()
1411
  print(f"\nAvailable circuits ({len(specs)}):\n")
1412
  for s in specs:
1413
+ print(f" {s.name:<40} {s.inputs}in/{s.outputs}out {s.neurons}N {s.layers}L {s.parameters}P")
1414
  return
1415
 
1416
+ vram_cfg = VRAMConfig(target_residency=args.vram_target)
1417
+
1418
  cfg = Config(
1419
  device=args.device,
 
1420
  fitness_threshold=args.fitness,
1421
+ verbose=not args.quiet,
1422
+ vram=vram_cfg,
1423
+ evo_pop_size=args.evo_pop,
1424
+ evo_generations=args.evo_gens
1425
  )
1426
 
1427
  if args.methods:
1428
  methods = args.methods.lower().split(',')
1429
+ cfg.run_magnitude = 'mag' in methods or 'magnitude' in methods
 
1430
  cfg.run_zero = 'zero' in methods
1431
+ cfg.run_quantize = 'quant' in methods or 'quantize' in methods
1432
  cfg.run_evolutionary = 'evo' in methods or 'evolutionary' in methods
1433
  cfg.run_annealing = 'anneal' in methods or 'sa' in methods
1434
+ cfg.run_neuron = 'neuron' in methods
1435
+ cfg.run_lottery = 'lottery' in methods
1436
+ cfg.run_topology = 'topology' in methods or 'topo' in methods
1437
  cfg.run_pareto = 'pareto' in methods
1438
 
1439
  RESULTS_PATH.mkdir(exist_ok=True)
 
1443
  print(f"\nRunning on {len(specs)} circuits...")
1444
  for spec in specs:
1445
  try:
1446
+ circuit = ThresholdCircuit(spec.path, cfg.device)
1447
  results = run_all_methods(circuit, cfg)
1448
+ clear_vram()
1449
  except Exception as e:
1450
  print(f"ERROR on {spec.name}: {e}")
1451
  elif args.circuit:
1452
+ path = CIRCUITS_PATH / args.circuit
1453
+ if not path.exists():
1454
+ path = CIRCUITS_PATH / f'threshold-{args.circuit}'
1455
+ if not path.exists():
1456
+ print(f"Circuit not found: {args.circuit}")
1457
+ return
1458
+
1459
+ circuit = ThresholdCircuit(path, cfg.device)
1460
  results = run_all_methods(circuit, cfg)
1461
 
1462
  if args.save and results:
1463
  best = min(results.values(), key=lambda r: r.final_stats.get('magnitude', float('inf')))
1464
  if best.fitness >= cfg.fitness_threshold:
1465
+ path = circuit.save_weights(best.final_weights, f'pruned_{best.method}')
1466
  print(f"\nSaved to: {path}")
1467
  else:
1468
  parser.print_help()
1469
  print("\n\nExamples:")
1470
+ print(" python prune_v2.py --list")
1471
+ print(" python prune_v2.py threshold-hamming74decoder")
1472
+ print(" python prune_v2.py threshold-hamming74decoder --methods evo,neuron,lottery")
1473
+ print(" python prune_v2.py --all --max-inputs 8")
1474
 
1475
 
1476
  if __name__ == '__main__':