""" Memory usage benchmarking for BitLinear. This script measures actual memory usage and compression ratios for BitLinear compared to standard nn.Linear layers. """ import torch import torch.nn as nn from bitlinear import BitLinear, MultiTernaryLinear, pack_ternary_base3, estimate_memory_savings import sys def get_tensor_memory_mb(tensor): """Get memory usage of a tensor in MB.""" return tensor.element_size() * tensor.nelement() / (1024 ** 2) def get_model_memory_mb(model): """Get total memory usage of model parameters in MB.""" total_bytes = sum(p.element_size() * p.nelement() for p in model.parameters()) return total_bytes / (1024 ** 2) def analyze_layer_memory(in_features, out_features): """Analyze memory usage for a single layer.""" print(f"\n{'=' * 100}") print(f"Layer: {in_features} → {out_features}") print(f"{'=' * 100}\n") # Create layers linear = nn.Linear(in_features, out_features, bias=True) bitlinear = BitLinear.from_linear(linear) multi_ternary = MultiTernaryLinear.from_linear(linear, k=2) # Memory for nn.Linear mem_linear = get_model_memory_mb(linear) # Memory for BitLinear (stored as float32 currently, but can be packed) mem_bitlinear = get_model_memory_mb(bitlinear) # Memory for MultiTernaryLinear mem_multi = get_model_memory_mb(multi_ternary) # Theoretical packed memory (base-3 packing) weights_count = in_features * out_features packed_bytes = (weights_count + 4) // 5 # 5 ternary values per byte bias_bytes = out_features * 4 # float32 bias gamma_bytes = out_features * 4 # float32 gamma theoretical_packed_mb = (packed_bytes + bias_bytes + gamma_bytes) / (1024 ** 2) # Calculate compression ratios compression_current = mem_linear / mem_bitlinear compression_packed = mem_linear / theoretical_packed_mb # Print results print(f"nn.Linear memory: {mem_linear:10.4f} MB") print(f"BitLinear memory (current): {mem_bitlinear:10.4f} MB (ratio: {compression_current:5.2f}x)") print(f"BitLinear memory (packed): {theoretical_packed_mb:10.4f} MB (ratio: {compression_packed:5.2f}x)") print(f"MultiTernaryLinear memory (k=2): {mem_multi:10.4f} MB (ratio: {mem_linear/mem_multi:5.2f}x)") # Test actual packing print(f"\nPacking Test:") print(f"-" * 100) W_ternary = bitlinear.W_ternary packed, original_shape = pack_ternary_base3(W_ternary) unpacked_size_mb = get_tensor_memory_mb(W_ternary) packed_size_mb = get_tensor_memory_mb(packed) actual_compression = unpacked_size_mb / packed_size_mb print(f"Unpacked weights: {unpacked_size_mb:10.4f} MB") print(f"Packed weights: {packed_size_mb:10.4f} MB") print(f"Actual compression: {actual_compression:8.2f}x") return { 'in_features': in_features, 'out_features': out_features, 'mem_linear': mem_linear, 'mem_bitlinear': mem_bitlinear, 'mem_packed': theoretical_packed_mb, 'mem_multi': mem_multi, 'compression_current': compression_current, 'compression_packed': compression_packed, } def run_memory_benchmarks(): """Run comprehensive memory benchmarks.""" print("=" * 100) print("BitLinear Memory Benchmarks") print("=" * 100) print(f"\nPyTorch version: {torch.__version__}") # Test configurations layer_sizes = [ (512, 512), (768, 768), (1024, 1024), (2048, 2048), (4096, 4096), (768, 3072), # Typical Transformer FFN (1024, 4096), # Larger Transformer FFN ] results = [] for in_features, out_features in layer_sizes: result = analyze_layer_memory(in_features, out_features) results.append(result) # Generate summary table print(f"\n\n{'=' * 100}") print("Memory Compression Summary (Markdown Format)") print(f"{'=' * 100}\n") print("| Layer Size | nn.Linear (MB) | 
BitLinear Current (MB) | BitLinear Packed (MB) | Compression (Packed) |") print("|------------|----------------|------------------------|----------------------|----------------------|") for r in results: print(f"| {r['in_features']}×{r['out_features']:<4} | {r['mem_linear']:14.4f} | " f"{r['mem_bitlinear']:22.4f} | {r['mem_packed']:20.4f} | {r['compression_packed']:20.2f}x |") # Overall statistics print(f"\n{'=' * 100}") print("Summary Statistics") print(f"{'=' * 100}\n") avg_compression = sum(r['compression_packed'] for r in results) / len(results) min_compression = min(r['compression_packed'] for r in results) max_compression = max(r['compression_packed'] for r in results) print(f"Average compression ratio: {avg_compression:.2f}x") print(f"Minimum compression ratio: {min_compression:.2f}x") print(f"Maximum compression ratio: {max_compression:.2f}x") # Transformer example print(f"\n{'=' * 100}") print("Real-World Example: GPT-2 Style Transformer") print(f"{'=' * 100}\n") # GPT-2 small: 12 layers, d_model=768, d_ff=3072 num_layers = 12 d_model = 768 d_ff = 3072 # Each layer has: Q, K, V, O projections (4 × d_model²) + 2 FFN layers (d_model×d_ff + d_ff×d_model) linear_per_layer = (4 * d_model * d_model) + (d_model * d_ff) + (d_ff * d_model) linear_total = linear_per_layer * num_layers # Calculate memory linear_mem_mb = (linear_total * 4) / (1024 ** 2) # float32 packed_mem_mb = ((linear_total + 4) // 5) / (1024 ** 2) # base-3 packed # Add bias and gamma params_per_layer = (4 * d_model) + d_ff + d_model # biases gammas_per_layer = (4 * d_model) + d_ff + d_model # scaling factors overhead_mb = ((params_per_layer + gammas_per_layer) * num_layers * 4) / (1024 ** 2) packed_total_mb = packed_mem_mb + overhead_mb compression = linear_mem_mb / packed_total_mb print(f"Configuration: {num_layers} layers, d_model={d_model}, d_ff={d_ff}") print(f"Total linear parameters: {linear_total:,}") print(f"\nnn.Linear memory: {linear_mem_mb:10.2f} MB") print(f"BitLinear packed: {packed_total_mb:10.2f} MB") print(f"Memory saved: {linear_mem_mb - packed_total_mb:10.2f} MB") print(f"Compression ratio: {compression:10.2f}x") print(f"\n{'=' * 100}") print("Benchmark Complete!") print(f"{'=' * 100}") if __name__ == "__main__": run_memory_benchmarks()
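

# ---------------------------------------------------------------------------
# Reference sketch (an assumption, not the actual bitlinear API): the "packed"
# numbers above assume base-3 packing that stores 5 ternary values {-1, 0, +1}
# per byte, which works because 3**5 = 243 <= 256. The real pack_ternary_base3
# is imported from the bitlinear module and may be implemented differently;
# the standalone helpers below only illustrate the arithmetic behind
# "(count + 4) // 5" and "5 ternary values per byte". They are not called by
# the benchmark.
# ---------------------------------------------------------------------------
def _pack_ternary_base3_sketch(w_ternary):
    """Pack a ternary tensor into uint8, 5 values per byte (illustrative only)."""
    flat = w_ternary.flatten().to(torch.int64) + 1  # map {-1, 0, 1} -> {0, 1, 2}
    pad = (-flat.numel()) % 5                       # pad length to a multiple of 5
    if pad:
        flat = torch.cat([flat, flat.new_zeros(pad)])
    digits = torch.tensor([1, 3, 9, 27, 81], dtype=torch.int64)
    packed = (flat.view(-1, 5) * digits).sum(dim=1).to(torch.uint8)  # max 242, fits in uint8
    return packed, w_ternary.shape


def _unpack_ternary_base3_sketch(packed, original_shape):
    """Invert the sketch packing back to a float tensor of {-1, 0, +1}."""
    digits = torch.tensor([1, 3, 9, 27, 81], dtype=torch.int64)
    groups = (packed.to(torch.int64).unsqueeze(1) // digits) % 3  # recover the 5 base-3 digits
    flat = groups.flatten()[:original_shape.numel()]              # drop padding
    return (flat - 1).to(torch.float32).view(original_shape)


# Usage sketch (hypothetical helpers defined above):
#   packed, shape = _pack_ternary_base3_sketch(torch.randint(-1, 2, (768, 3072)).float())
#   restored = _unpack_ternary_base3_sketch(packed, shape)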