|
|
"""
|
|
|
Memory usage benchmarking for BitLinear.
|
|
|
|
|
|
This script measures actual memory usage and compression ratios for BitLinear
|
|
|
compared to standard nn.Linear layers.
|
|
|
"""
|
|
|
|
|
|
import torch
|
|
|
import torch.nn as nn
|
|
|
from bitlinear import BitLinear, MultiTernaryLinear, pack_ternary_base3, estimate_memory_savings
|
|
|
import sys
|
|
|
|
|
|
|
|
|
def get_tensor_memory_mb(tensor):
    """Return the storage footprint of *tensor* in mebibytes (MiB)."""
    n_bytes = tensor.nelement() * tensor.element_size()
    return n_bytes / (1024 * 1024)
|
|
|
|
|
|
|
|
|
def get_model_memory_mb(model):
    """Return the combined memory of all of *model*'s parameters in MiB."""
    per_param_bytes = (p.nelement() * p.element_size() for p in model.parameters())
    return sum(per_param_bytes) / (1024 ** 2)
|
|
|
|
|
|
|
|
|
def analyze_layer_memory(in_features, out_features):
    """Measure and print memory usage for one layer shape.

    Builds an ``nn.Linear`` of the given shape, converts it to ``BitLinear``
    and ``MultiTernaryLinear`` (k=2), prints a comparison — including the
    theoretical size under base-3 packing — then exercises
    ``pack_ternary_base3`` on the actual ternary weights.

    Returns a dict of the measured/derived numbers for summary tables.
    """
    banner = "=" * 100
    print(f"\n{banner}")
    print(f"Layer: {in_features} → {out_features}")
    print(f"{banner}\n")

    # Reference dense layer plus the two ternary conversions of it.
    dense = nn.Linear(in_features, out_features, bias=True)
    ternary = BitLinear.from_linear(dense)
    multi = MultiTernaryLinear.from_linear(dense, k=2)

    mem_linear = get_model_memory_mb(dense)
    mem_bitlinear = get_model_memory_mb(ternary)
    mem_multi = get_model_memory_mb(multi)

    # Theoretical packed size: 5 ternary digits fit in one byte via base-3
    # (3**5 = 243 <= 256); bias and gamma stay fp32, one value per output.
    weight_count = in_features * out_features
    packed_bytes = (weight_count + 4) // 5
    bias_bytes = out_features * 4
    gamma_bytes = out_features * 4
    theoretical_packed_mb = (packed_bytes + bias_bytes + gamma_bytes) / (1024 ** 2)

    compression_current = mem_linear / mem_bitlinear
    compression_packed = mem_linear / theoretical_packed_mb

    print(f"nn.Linear memory: {mem_linear:10.4f} MB")
    print(f"BitLinear memory (current): {mem_bitlinear:10.4f} MB (ratio: {compression_current:5.2f}x)")
    print(f"BitLinear memory (packed): {theoretical_packed_mb:10.4f} MB (ratio: {compression_packed:5.2f}x)")
    print(f"MultiTernaryLinear memory (k=2): {mem_multi:10.4f} MB (ratio: {mem_linear/mem_multi:5.2f}x)")

    # Sanity-check the real packing routine against the theoretical numbers.
    print(f"\nPacking Test:")
    print("-" * 100)

    ternary_weights = ternary.W_ternary
    packed, _shape = pack_ternary_base3(ternary_weights)

    unpacked_size_mb = get_tensor_memory_mb(ternary_weights)
    packed_size_mb = get_tensor_memory_mb(packed)
    actual_compression = unpacked_size_mb / packed_size_mb

    print(f"Unpacked weights: {unpacked_size_mb:10.4f} MB")
    print(f"Packed weights: {packed_size_mb:10.4f} MB")
    print(f"Actual compression: {actual_compression:8.2f}x")

    return {
        'in_features': in_features,
        'out_features': out_features,
        'mem_linear': mem_linear,
        'mem_bitlinear': mem_bitlinear,
        'mem_packed': theoretical_packed_mb,
        'mem_multi': mem_multi,
        'compression_current': compression_current,
        'compression_packed': compression_packed,
    }
|
|
|
|
|
|
|
|
|
def run_memory_benchmarks():
    """Run the full suite of memory benchmarks and print a report.

    Benchmarks a range of square (attention-style) and rectangular
    (feed-forward-style) layer shapes, prints a markdown summary table and
    aggregate statistics, and closes with a back-of-the-envelope estimate
    for a GPT-2-small-style transformer stack.
    """
    rule = "=" * 100
    print(rule)
    print("BitLinear Memory Benchmarks")
    print(rule)
    print(f"\nPyTorch version: {torch.__version__}")

    # (in_features, out_features): square projections plus FFN-shaped layers.
    layer_sizes = [
        (512, 512),
        (768, 768),
        (1024, 1024),
        (2048, 2048),
        (4096, 4096),
        (768, 3072),
        (1024, 4096),
    ]

    results = [analyze_layer_memory(n_in, n_out) for n_in, n_out in layer_sizes]

    print(f"\n\n{rule}")
    print("Memory Compression Summary (Markdown Format)")
    print(f"{rule}\n")

    print("| Layer Size | nn.Linear (MB) | BitLinear Current (MB) | BitLinear Packed (MB) | Compression (Packed) |")
    print("|------------|----------------|------------------------|----------------------|----------------------|")

    for r in results:
        row = (
            f"| {r['in_features']}×{r['out_features']:<4} | {r['mem_linear']:14.4f} | "
            f"{r['mem_bitlinear']:22.4f} | {r['mem_packed']:20.4f} | {r['compression_packed']:20.2f}x |"
        )
        print(row)

    print(f"\n{rule}")
    print("Summary Statistics")
    print(f"{rule}\n")

    packed_ratios = [r['compression_packed'] for r in results]
    avg_compression = sum(packed_ratios) / len(packed_ratios)

    print(f"Average compression ratio: {avg_compression:.2f}x")
    print(f"Minimum compression ratio: {min(packed_ratios):.2f}x")
    print(f"Maximum compression ratio: {max(packed_ratios):.2f}x")

    print(f"\n{rule}")
    print("Real-World Example: GPT-2 Style Transformer")
    print(f"{rule}\n")

    num_layers = 12
    d_model = 768
    d_ff = 3072

    # Weights per transformer block: Q/K/V/O projections + the two FFN mats.
    weights_per_layer = (4 * d_model * d_model) + (d_model * d_ff) + (d_ff * d_model)
    linear_total = weights_per_layer * num_layers

    # fp32 baseline vs. base-3 packed ternary (5 weights per byte).
    linear_mem_mb = (linear_total * 4) / (1024 ** 2)
    packed_mem_mb = ((linear_total + 4) // 5) / (1024 ** 2)

    # fp32 overhead kept alongside the packed weights: one bias and one gamma
    # per output channel of each of the six linears in a block.
    biases_per_layer = (4 * d_model) + d_ff + d_model
    gammas_per_layer = (4 * d_model) + d_ff + d_model
    overhead_mb = ((biases_per_layer + gammas_per_layer) * num_layers * 4) / (1024 ** 2)

    packed_total_mb = packed_mem_mb + overhead_mb
    compression = linear_mem_mb / packed_total_mb

    print(f"Configuration: {num_layers} layers, d_model={d_model}, d_ff={d_ff}")
    print(f"Total linear parameters: {linear_total:,}")
    print(f"\nnn.Linear memory: {linear_mem_mb:10.2f} MB")
    print(f"BitLinear packed: {packed_total_mb:10.2f} MB")
    print(f"Memory saved: {linear_mem_mb - packed_total_mb:10.2f} MB")
    print(f"Compression ratio: {compression:10.2f}x")

    print(f"\n{rule}")
    print("Benchmark Complete!")
    print(f"{rule}")
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: run the full benchmark suite when executed as a script.
    run_memory_benchmarks()
|
|
|
|