"""
Standard BitLinear layer for BitSkip v1 (8-bit activations, NO Hadamard transform).
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class BitLinear(nn.Module):
    """
    Standard BitLinear: ternary weights + 8-bit activations.
    NO Hadamard transform - direct quantization.
    """

|
    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Full-precision latent weights; quantized on the fly in forward().
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02)
        self.bias = nn.Parameter(torch.zeros(out_features)) if bias else None

    def forward(self, x):
        """
        Forward pass with 8-bit activation quantization and ternary weights.
        Uses the STE (Straight-Through Estimator) for gradients.
        """
        # 8-bit absmax activation quantization along the last dimension.
        x_scale = x.abs().max(dim=-1, keepdim=True)[0].clamp(min=1e-5)
        x_quant = (x / x_scale * 127).round().clamp(-128, 127)
        x_quant = x_quant / 127 * x_scale  # dequantize back to the input scale

        # STE: use quantized activations in the forward pass, but let gradients
        # flow through as if no quantization had happened.
        if self.training:
            x_quant = x + (x_quant - x).detach()

        # Ternary weight quantization: scale by the mean |w|, then map each
        # weight to {-1, 0, +1} using a 0.5 * scale threshold.
        w_scale = self.weight.abs().mean().clamp(min=1e-5)
        w_quant = torch.zeros_like(self.weight)
        w_quant[self.weight > 0.5 * w_scale] = 1.0
        w_quant[self.weight < -0.5 * w_scale] = -1.0
        w_quant = w_quant * w_scale

        # STE for the weights as well, so the full-precision weights get gradients.
        if self.training:
            w_quant = self.weight + (w_quant - self.weight).detach()

        return F.linear(x_quant, w_quant, self.bias)
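

# --- Minimal usage sketch (not part of the original module) ------------------
# A quick self-check showing how BitLinear is meant to be used; the shapes and
# seed below are illustrative assumptions. In training mode the STE keeps
# gradients flowing to the full-precision latent weights despite the round()
# and thresholding ops in forward().
if __name__ == "__main__":
    torch.manual_seed(0)
    layer = BitLinear(in_features=64, out_features=32, bias=False)

    x = torch.randn(4, 16, 64, requires_grad=True)  # (batch, seq_len, hidden)
    y = layer(x)                                     # ternary weights, 8-bit activations
    assert y.shape == (4, 16, 32)

    y.sum().backward()                               # gradients reach layer.weight via the STE
    assert layer.weight.grad is not None and torch.isfinite(layer.weight.grad).all()
    print("BitLinear sanity check passed, output shape:", tuple(y.shape))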