Add pdp.py

3a3ad1b verified 13 days ago

13.3 kB

	"""
	PDP: Parameter-free Differentiable Pruning
	Implementation based on the NeurIPS 2023 paper:
	"PDP: Parameter-free Differentiable Pruning is All You Need"
	https://arxiv.org/abs/2305.11203
	"""

	import math
	from typing import Callable, Dict, List, Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	def pdp_soft_mask(weight: torch.Tensor, threshold: float, tau: float) -> torch.Tensor:
	    """
	    Compute the PDP soft pruning mask.

	        m(w) = exp(w^2 / tau) / (exp(w^2 / tau) + exp(t^2 / tau))

	    Dividing numerator and denominator by exp(w^2 / tau) shows this is
	    exactly sigmoid((w^2 - t^2) / tau), which is how it is evaluated here:
	    the sigmoid saturates to 0 / 1 instead of overflowing for large logits,
	    and no constant tensor has to be materialised for the threshold.

	    Args:
	        weight: The weight tensor.
	        threshold: The threshold t for this layer/entity.
	        tau: Temperature hyperparameter controlling mask softness.
	            Must be strictly positive.

	    Returns:
	        Soft mask tensor with same shape as weight, values in [0, 1].
	        Entries with |w| == t map to 0.5.

	    Raises:
	        ValueError: If tau is not strictly positive.
	    """
	    # tau == 0 would divide by zero and tau < 0 would invert the mask
	    # (large weights pruned, small ones kept); fail loudly instead.
	    if tau <= 0:
	        raise ValueError(f"tau must be > 0, got {tau}")
	    return torch.sigmoid((weight ** 2 - threshold ** 2) / tau)


	def compute_threshold(weight: torch.Tensor, sparsity_ratio: float) -> float:
	    """
	    Compute the threshold t for a given sparsity ratio.
	    t is set halfway between the largest pruned weight and the smallest unpruned weight.

	    Args:
	        weight: Absolute weight tensor. Any shape is accepted; it is
	            flattened internally. Values are used as given (no abs() is
	            applied here).
	        sparsity_ratio: Target sparsity ratio in [0, 1).

	    Returns:
	        Threshold value t >= 0. Returns 0.0 when sparsity_ratio <= 0 or the
	        tensor is empty, and max(weight) + 1e-6 when sparsity_ratio >= 1.
	    """
	    if sparsity_ratio <= 0:
	        return 0.0

	    # Work on a detached 1-D view so callers may pass tensors of any shape
	    # and no autograd graph is built for the sort.
	    flat = weight.detach().reshape(-1)
	    n = flat.numel()
	    if n == 0:
	        # Nothing to prune; max()/indexing below would raise on empty input.
	        return 0.0
	    if sparsity_ratio >= 1.0:
	        return flat.max().item() + 1e-6

	    # k = number of weights to prune. It is clamped to at least 1, and to at
	    # most n - 1 except for a single-element tensor, where k == n == 1.
	    k = max(1, min(n - 1, int(math.floor(sparsity_ratio * n))))
	    sorted_vals, _ = torch.sort(flat)
	    pruned_max = sorted_vals[k - 1].item()
	    # min() covers the single-element case, where index k would be out of range.
	    unpruned_min = sorted_vals[min(k, n - 1)].item()
	    return max((pruned_max + unpruned_min) / 2.0, 0.0)


	def _make_masked_forward(module: nn.Module, pruner: "PDPPruner", param_name: str):
	"""
	Monkey-patch module.forward to apply the PDP soft mask during forward pass.
	This preserves the computation graph for differentiable backpropagation.
	"""
	if isinstance(module, nn.Conv2d):
	orig_forward = module.forward
	def forward(x):
	t = pruner.thresholds.get(param_name, 0.0)
	if t <= 0:
	return orig_forward(x)
	mask = pdp_soft_mask(module.weight, t, pruner.tau)
	masked_weight = mask * module.weight
	return F.conv2d(
	x, masked_weight, module.bias,
	module.stride, module.padding,
	module.dilation, module.groups
	)
	return forward

	elif isinstance(module, nn.Conv1d):
	orig_forward = module.forward
	def forward(x):
	t = pruner.thresholds.get(param_name, 0.0)
	if t <= 0:
	return orig_forward(x)
	mask = pdp_soft_mask(module.weight, t, pruner.tau)
	masked_weight = mask * module.weight
	return F.conv1d(
	x, masked_weight, module.bias,
	module.stride, module.padding,
	module.dilation, module.groups
	)
	return forward

	elif isinstance(module, nn.Conv3d):
	orig_forward = module.forward
	def forward(x):
	t = pruner.thresholds.get(param_name, 0.0)
	if t <= 0:
	return orig_forward(x)
	mask = pdp_soft_mask(module.weight, t, pruner.tau)
	masked_weight = mask * module.weight
	return F.conv3d(
	x, masked_weight, module.bias,
	module.stride, module.padding,
	module.dilation, module.groups
	)
	return forward

	elif isinstance(module, nn.Linear):
	orig_forward = module.forward
	def forward(x):
	t = pruner.thresholds.get(param_name, 0.0)
	if t <= 0:
	return orig_forward(x)
	mask = pdp_soft_mask(module.weight, t, pruner.tau)
	masked_weight = mask * module.weight
	return F.linear(x, masked_weight, module.bias)
	return forward

	else:
	return module.forward


	class PDPPruner:
	    """
	    Parameter-free Differentiable Pruning (PDP) pruner.

	    Applies soft pruning masks during training so the task loss directly guides
	    pruning decisions. After training, call hard_prune() for inference.

	    Per-layer *target* sparsities are computed once (at the end of warmup) and
	    kept in ``target_layer_sparsity``; ``layer_sparsity`` holds the values in
	    effect for the current step of the gradual schedule and is always derived
	    from the targets, never scaled in place.

	    Usage:
	        pruner = PDPPruner(model, target_sparsity=0.855, s=16, epsilon=0.015, tau=1e-4)
	        pruner.attach()
	        for epoch in range(num_epochs):
	            for batch in dataloader:
	                loss = model(...)
	                loss.backward()
	                optimizer.step()
	                pruner.step(epoch)  # may also be called once per epoch
	        pruner.hard_prune()
	    """

	    # Layer types whose ``weight`` is eligible for pruning.
	    _PRUNABLE_TYPES = (nn.Conv2d, nn.Conv1d, nn.Conv3d, nn.Linear)

	    def __init__(
	        self,
	        model: nn.Module,
	        target_sparsity: float,
	        s: int = 16,
	        epsilon: float = 0.015,
	        tau: float = 1e-4,
	        excluded_modules: Optional[List[str]] = None,
	    ):
	        """
	        Args:
	            model: The model to prune.
	            target_sparsity: Global target sparsity ratio (e.g. 0.855 for 85.5%).
	            s: Warmup epochs before computing target sparsity (default 16).
	            epsilon: Gradual pruning rate per epoch (default 0.015 = 1.5%).
	            tau: Temperature hyperparameter for soft mask (default 1e-4).
	            excluded_modules: List of module class names to exclude.

	        Raises:
	            ValueError: If target_sparsity is outside [0, 1] or tau <= 0.
	        """
	        if not 0.0 <= target_sparsity <= 1.0:
	            raise ValueError(f"target_sparsity must be in [0, 1], got {target_sparsity}")
	        if tau <= 0:
	            raise ValueError(f"tau must be > 0, got {tau}")

	        self.model = model
	        self.target_sparsity = target_sparsity
	        self.s = s
	        self.epsilon = epsilon
	        self.tau = tau
	        self.excluded_modules = excluded_modules or ["BatchNorm2d", "LayerNorm", "BatchNorm1d"]

	        # Maps param_name -> nn.Parameter
	        self.prunable_params: Dict[str, nn.Parameter] = {}
	        # Maps param_name -> float (final per-layer target; fixed once computed)
	        self.target_layer_sparsity: Dict[str, float] = {}
	        # Maps param_name -> float (sparsity in effect for the current step)
	        self.layer_sparsity: Dict[str, float] = {}
	        # Maps param_name -> float (current threshold t)
	        self.thresholds: Dict[str, float] = {}
	        # Whether target sparsities have been computed
	        self.sparsity_computed = False
	        # Current effective global sparsity (gradual schedule)
	        self.current_effective_sparsity = 0.0
	        # Store original forward methods to restore later
	        self._orig_forwards: Dict[str, Callable] = {}

	        self._find_prunable_params()

	    def _find_prunable_params(self):
	        """Identify Conv and Linear weight parameters to prune.

	        Modules whose class name appears in ``excluded_modules`` are skipped.
	        """
	        for name, module in self.model.named_modules():
	            if not isinstance(module, self._PRUNABLE_TYPES):
	                continue
	            if type(module).__name__ in self.excluded_modules:
	                continue
	            if getattr(module, "weight", None) is None:
	                continue
	            self.prunable_params[f"{name}.weight"] = module.weight

	    def _compute_layer_sparsities(self):
	        """
	        Compute per-layer target sparsity by sorting all weights globally by magnitude.
	        This is the PDP-base strategy from the paper.

	        The k = floor(target_sparsity * N) smallest magnitudes across all
	        prunable layers fall at or below the global threshold; each layer's
	        target is the fraction of its own weights at or below that value
	        (capped at 99.9%).
	        """
	        if not self.prunable_params:
	            return

	        all_weights_cat = torch.cat(
	            [param.data.abs().flatten() for param in self.prunable_params.values()]
	        )
	        n_total = all_weights_cat.numel()
	        k = max(0, min(n_total, int(math.floor(self.target_sparsity * n_total))))

	        global_threshold = 0.0
	        if k > 0:
	            # The k-th smallest magnitude (1-indexed), so "<= threshold"
	            # selects k weights (more only when magnitudes tie).
	            sorted_vals, _ = torch.sort(all_weights_cat)
	            global_threshold = sorted_vals[k - 1].item()

	        targets: Dict[str, float] = {}
	        for name, param in self.prunable_params.items():
	            w_abs = param.data.abs()
	            if k == 0 or w_abs.numel() == 0:
	                targets[name] = 0.0
	                continue
	            below = (w_abs <= global_threshold).sum().item()
	            targets[name] = min(below / w_abs.numel(), 0.999)  # cap at 99.9%

	        self.target_layer_sparsity = targets
	        self.layer_sparsity = dict(targets)
	        self.sparsity_computed = True
	        print(f"[PDP] Computed per-layer sparsities at epoch {self.s}. "
	              f"Global target: {self.target_sparsity:.4f}")

	    def _compute_thresholds(self):
	        """Recompute per-layer thresholds t based on current weight distribution."""
	        for name, param in self.prunable_params.items():
	            ratio = self.layer_sparsity.get(name, 0.0)
	            if ratio <= 0:
	                self.thresholds[name] = 0.0
	                continue
	            self.thresholds[name] = compute_threshold(param.data.abs().flatten(), ratio)

	    def _update_schedule(self, epoch: int):
	        """Set the effective global and per-layer sparsities for ``epoch``.

	        The global value grows by ``epsilon`` per epoch after warmup and is
	        capped at ``target_sparsity``. Per-layer values are the fixed targets
	        scaled by (effective / target); they are rebuilt from the targets on
	        every call, so repeated calls for the same epoch give the same result.
	        """
	        steps_since_s = epoch - self.s + 1
	        self.current_effective_sparsity = max(
	            0.0, min(self.target_sparsity, self.epsilon * steps_since_s)
	        )
	        if self.target_sparsity > 0:
	            scale = self.current_effective_sparsity / self.target_sparsity
	        else:
	            scale = 1.0
	        self.layer_sparsity = {
	            name: min(1.0, target * scale)
	            for name, target in self.target_layer_sparsity.items()
	        }

	    def attach(self):
	        """Monkey-patch forward methods of prunable modules to apply soft masks.

	        Layers that are already patched are left alone, so calling this twice
	        does not overwrite the saved original forwards.
	        """
	        for name, module in self.model.named_modules():
	            if not isinstance(module, self._PRUNABLE_TYPES):
	                continue
	            param_name = f"{name}.weight"
	            if param_name not in self.prunable_params or param_name in self._orig_forwards:
	                continue
	            self._orig_forwards[param_name] = module.forward
	            module.forward = _make_masked_forward(module, self, param_name)
	        print(f"[PDP] Attached masked forwards to {len(self.prunable_params)} prunable layers.")

	    def detach(self):
	        """Restore original forward methods."""
	        for name, module in self.model.named_modules():
	            if not isinstance(module, self._PRUNABLE_TYPES):
	                continue
	            param_name = f"{name}.weight"
	            if param_name in self._orig_forwards:
	                module.forward = self._orig_forwards[param_name]
	        self._orig_forwards.clear()
	        print("[PDP] Detached all masked forwards.")

	    def step(self, epoch: int):
	        """
	        Call this after each optimizer.step() (or at each epoch boundary).
	        Recomputes thresholds and updates gradual sparsity schedule.

	        Args:
	            epoch: Current epoch index. Nothing happens while epoch < s.
	        """
	        # Warmup: first s epochs, no pruning
	        if epoch < self.s:
	            return

	        # One-time computation of per-layer targets. Checked on every call
	        # (not only when epoch == s) so a run that resumes after warmup, or
	        # skips the exact epoch, still gets its targets.
	        if not self.sparsity_computed:
	            self._compute_layer_sparsities()

	        self._update_schedule(epoch)

	        # Recompute thresholds based on current weight distribution
	        self._compute_thresholds()

	    def get_sparsity(self) -> float:
	        """Return the current actual sparsity (fraction of weights below threshold)."""
	        total = 0
	        pruned = 0
	        for name, param in self.prunable_params.items():
	            t = self.thresholds.get(name, 0.0)
	            total += param.numel()
	            if t > 0:
	                pruned += (param.data.abs() <= t).sum().item()
	        return pruned / total if total > 0 else 0.0

	    def hard_prune(self):
	        """
	        After training, apply hard pruning masks for inference.
	        Sets pruned weights to exactly zero.

	        The full per-layer targets are used regardless of where the gradual
	        schedule currently stands; if they were never computed (e.g. training
	        stopped during warmup) they are computed from the current weights.
	        Masked forwards installed by attach() are not removed here; call
	        detach() to run the model on the plain hard-pruned weights.

	        Returns:
	            The resulting sparsity as reported by get_sparsity().
	        """
	        if not self.sparsity_computed:
	            self._compute_layer_sparsities()

	        # Restore full target sparsities
	        self.layer_sparsity = dict(self.target_layer_sparsity)
	        self._compute_thresholds()

	        with torch.no_grad():
	            for name, param in self.prunable_params.items():
	                t = self.thresholds.get(name, 0.0)
	                if t > 0:
	                    param.data.masked_fill_(param.data.abs() <= t, 0.0)

	        final_sparsity = self.get_sparsity()
	        print(f"[PDP] Hard pruning applied. Final sparsity: {final_sparsity:.4f}")
	        return final_sparsity

	    def state_dict(self) -> dict:
	        """Serialize pruner state.

	        Dict-valued entries are copies, so later steps do not mutate a
	        previously returned state.
	        """
	        return {
	            "target_sparsity": self.target_sparsity,
	            "s": self.s,
	            "epsilon": self.epsilon,
	            "tau": self.tau,
	            "sparsity_computed": self.sparsity_computed,
	            "target_layer_sparsity": dict(self.target_layer_sparsity),
	            "layer_sparsity": dict(self.layer_sparsity),
	            "thresholds": dict(self.thresholds),
	            "current_effective_sparsity": self.current_effective_sparsity,
	        }

	    def load_state_dict(self, state: dict):
	        """Restore pruner state.

	        A state without "target_layer_sparsity" carries no per-layer targets;
	        in that case the targets are marked as not computed so the next
	        step() / hard_prune() recomputes them from the current weights.
	        """
	        self.target_sparsity = state["target_sparsity"]
	        self.s = state["s"]
	        self.epsilon = state["epsilon"]
	        self.tau = state["tau"]
	        self.layer_sparsity = dict(state["layer_sparsity"])
	        self.thresholds = dict(state["thresholds"])
	        self.current_effective_sparsity = state["current_effective_sparsity"]

	        targets = state.get("target_layer_sparsity")
	        if targets is None:
	            self.target_layer_sparsity = {}
	            self.sparsity_computed = False
	        else:
	            self.target_layer_sparsity = dict(targets)
	            self.sparsity_computed = state["sparsity_computed"]