#!/usr/bin/env python3
"""
Novel GPU Memory Reduction Experiments for FigQuant
===================================================

Standard approaches (gradient checkpointing, mixed precision) are already in use.
This script measures or estimates the benefit of less common ideas:

Experiment A: Pre-allocated buffer dequant — write into one fixed FP16 buffer
Experiment B: FP16 vs FP32 dequant — size of one dequantized weight
Experiment C: Codebook caching — 88 per-layer codebooks vs. one shared codebook
Experiment D: Partial row dequant — full matrix vs. a 128-row slice
Experiment E: Immediate gradient CPU offload — arithmetic estimate
Experiment F: Activation INT8 compression — arithmetic estimate + quality check

Running this file has side effects: it pip-installs dependencies, clones the
littlefig repository into /app/littlefig and installs it in editable mode.
A CUDA device is required.
"""
import gc
import os
import subprocess
import sys
import time

import numpy as np

# Where the littlefig checkout lives; also used for the editable install below.
LITTLEFIG_DIR = "/app/littlefig"

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "transformers", "accelerate", "datasets", "sentencepiece",
                       "protobuf", "psutil", "numpy"])
# `git clone` refuses to write into an existing non-empty directory, which made
# every re-run of this script fail. Reuse an existing checkout instead.
if not os.path.isdir(os.path.join(LITTLEFIG_DIR, ".git")):
    subprocess.check_call(["git", "clone",
                           "https://github.com/ticketguy/littlefig.git",
                           LITTLEFIG_DIR])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e",
                       f"{LITTLEFIG_DIR}[train]"])
sys.path.insert(0, f"{LITTLEFIG_DIR}/src")

import torch
import torch.nn as nn
import torch.nn.functional as F


def log(msg):
    """Print *msg* with the ``[MEM]`` prefix, flushing immediately."""
    print(f"[MEM] {msg}", flush=True)


log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
    log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
else:
    # Every experiment below allocates on "cuda"; stop here with a clear
    # message rather than failing later inside a torch.cuda call.
    log("No CUDA device available; these experiments require a GPU.")
    sys.exit(1)

from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor

# Create a test weight (simulating one layer of TinyLlama)
# TinyLlama: q_proj = [2048, 2048], k_proj = [256, 2048], etc.
HIDDEN = 2048
INTER = 5632
torch.manual_seed(42)

log("\n" + "="*60)
log(" NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
log("="*60)

# Quantize a test weight
W = torch.randn(HIDDEN, HIDDEN)
fq = figquant_quantize(W, group_size=128, n_iters=8)
dev = torch.device("cuda")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT A: Pre-allocated Buffer Dequant
# Instead of creating a NEW tensor on every forward pass (allocation = slow + memory),
# dequant into a FIXED pre-allocated buffer that gets rewritten each layer.
# Memory: one buffer, reused for all layers. Never grows.
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Move quantized data to GPU
indices_gpu = fq.indices.to(dev)
codebook_gpu = fq.codebook.to(dev)
scales_gpu = fq.scales.to(dev)


def _dequant_raw():
    """Unpack the packed 4-bit indices and return the dequantized weight.

    Shared by both dequant variants so they differ only in how the final FP16
    result is produced. The returned tensor has shape ``fq.shape`` and the
    dtype of the ``codebook * scales`` product (no FP16 cast is applied here).
    """
    # Each packed byte holds two 4-bit codebook indices: low nibble first.
    low = (indices_gpu & 0x0F).long()
    high = ((indices_gpu >> 4) & 0x0F).long()
    unpacked = torch.stack([low, high], dim=1).reshape(-1)
    # Drop any padding and lay the indices out one quantization group per row.
    unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
    cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
    return result.reshape(-1)[:fq.numel].reshape(fq.shape)


def dequant_standard():
    """Standard: allocates new FP16 tensor each call."""
    return _dequant_raw().half()


# Pre-allocated buffer approach
buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)


def dequant_buffered():
    """Novel: dequant INTO a pre-allocated buffer and return that buffer.

    ``copy_`` converts to the buffer's FP16 dtype while writing, so no
    temporary FP16 tensor is created (the previous ``.half()`` before
    ``copy_`` allocated exactly the tensor this variant is meant to avoid).
    The unpack/gather intermediates are still allocated on every call.
    """
    buffer.copy_(_dequant_raw())
    return buffer


# Benchmark both
# NOTE(review): both peaks include the int64 unpack intermediates, which are
# likely larger than the FP16 output itself — confirm this comparison isolates
# what you intend to measure.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_standard()
    del w
peak_standard = torch.cuda.max_memory_allocated() / 1e6

torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_buffered()
peak_buffered = torch.cuda.max_memory_allocated() / 1e6

log(f" Standard dequant peak: {peak_standard:.1f} MB")
log(f" Buffered dequant peak: {peak_buffered:.1f} MB")
log(f" Savings: {peak_standard - peak_buffered:.1f} MB ({(peak_standard-peak_buffered)/peak_standard*100:.1f}%)")

# Speed comparison
N_TIMED_CALLS = 100

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(N_TIMED_CALLS):
    dequant_standard()
torch.cuda.synchronize()  # wait for queued kernels before stopping the clock
time_std = (time.perf_counter() - t0) * 1000 / N_TIMED_CALLS  # ms per call

t0 = time.perf_counter()
for _ in range(N_TIMED_CALLS):
    dequant_buffered()
torch.cuda.synchronize()
time_buf = (time.perf_counter() - t0) * 1000 / N_TIMED_CALLS  # ms per call

log(f" Standard speed: {time_std:.2f} ms/call")
log(f" Buffered speed: {time_buf:.2f} ms/call")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT B: FP16 vs FP32 Dequant (our dtype fix already does this)
# Quantify the exact savings of dequanting to FP16 instead of FP32
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

# Measure the memory the dequantized weight keeps resident (allocated after
# minus allocated before). The transient peak is the same for both variants
# because it is set by the unpack intermediates, which hid the difference.
# The peak_32 / peak_16 names are kept because the summary refers to them.
gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
w32 = dequant_standard().float()
peak_32 = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del w32

gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
w16 = dequant_standard()  # already half
peak_16 = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del w16

log(f" FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" Per-layer savings: {peak_32 - peak_16:.1f} MB")
log(f" For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT C: "Codebook-in-Register" — keep codebook in GPU constant memory
# The 16 codebook values (64 bytes) should NEVER leave GPU registers
# Test: does keeping codebook as a cuda constant save memory/speed?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment C: Codebook Caching Strategy ---")

# All layers use nearly identical codebooks (proved earlier: 0.019 L2 between layers)
# What if we use ONE global codebook for ALL layers at inference?
# This means: codebook = 64 bytes, NEVER changes, stays in L1 cache permanently

# Simulate: 88 layers with individual codebooks vs 1 shared.
# Each candidate is measured as a delta against a fresh baseline and freed
# before the next one; reading absolute memory_allocated() counted every tensor
# already resident (including the 88 codebooks while measuring the shared one)
# and made the reported savings negative.
gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
mem_individual = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del individual_codebooks

gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
shared_codebook = torch.randn(16, device=dev)
mem_shared = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del shared_codebook

log(f" 88 individual codebooks: {mem_individual:.3f} MB")
log(f" 1 shared codebook: {mem_shared:.3f} MB")
log(f" Savings: {mem_individual - mem_shared:.3f} MB")
log(f" (Tiny savings — but the REAL benefit is L1 cache residency)")
log(f" A single 64-byte codebook stays in L1 cache permanently = faster lookups")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT D: "Lazy Row Dequant" — Only dequant rows needed for current batch
# For matvec: out = W @ x, we need ALL rows of W.
# But for attention: Q = x @ W_q^T, we only need W_q for the current positions.
# What if we only dequant the ROWS that the attention scores point to?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment D: Partial Row Dequant ---")

# In attention, after computing scores, we only need V[attended_positions]
# If seq_len=512 but attention is sparse (top-k), we can dequant fewer rows

# Simulate: dequant all 2048 rows vs only top-128 rows.
# As in Experiment C, each allocation is measured as a delta and released
# before the next; otherwise the full matrix was still resident while the
# partial one was measured and "partial" looked larger than "full".
gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
mem_full = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del W_big

gc.collect()
torch.cuda.empty_cache()
_resident_before = torch.cuda.memory_allocated()
# Partial: only 128 rows (6.25% of the matrix)
rows_needed = 128
W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
mem_partial = (torch.cuda.memory_allocated() - _resident_before) / 1e6
del W_partial

log(f" Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
log(f" Partial ({rows_needed}×{HIDDEN}): {mem_partial:.1f} MB")
log(f" Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
log(f" For 88 layers × 4 projections: {(mem_full-mem_partial)*88*4:.0f} MB potential savings")
log(f" CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT E: "Gradient Accumulation with CPU Offload"
# Standard: all gradients on GPU during accumulation
# Novel: after each micro-batch, move gradients to CPU immediately
# GPU only holds: model + 1 batch activations + 1 micro-batch gradient
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment E: Immediate Gradient CPU Offload ---")

# Simulate: accumulate gradients on GPU vs CPU (pure arithmetic, no allocation)
n_params = 4_500_000  # LoRA params for TinyLlama
param_size = n_params * 2  # FP16

# On GPU: all 4 micro-batch gradients in VRAM simultaneously
# NOTE(review): PyTorch normally accumulates into a single .grad buffer, so
# this 4x baseline may be pessimistic — confirm against the real training loop.
grad_on_gpu = param_size * 4 / 1e6  # 4 micro-batches accumulated
log(f" Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")

# With offload: only 1 grad on GPU at a time, rest on CPU
grad_offload = param_size * 1 / 1e6
log(f" Offload (1 grad on GPU): {grad_offload:.1f} MB")
log(f" Savings: {grad_on_gpu - grad_offload:.1f} MB")
# Report the size actually computed above; the previous hard-coded "18MB"
# disagreed with param_size (4.5M params × 2 bytes = 9 MB).
log(f" Note: LoRA params are small ({param_size / 1e6:.0f}MB) so grad savings are modest")
log(f" The BIG savings come from activation memory, not gradient memory")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT F: "Activation Compression" — Compress activations in-flight
# Between layers, activations sit in memory waiting for backward pass.
# What if we quantize them to INT8 between layers and dequant on backward?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")

gc.collect()
torch.cuda.empty_cache()

# Simulate: store FP16 activations vs INT8 activations between layers
batch_seq = 4 * 512  # batch=4, seq=512
act_fp16 = batch_seq * HIDDEN * 2 / 1e6  # FP16: 2 bytes
act_int8 = batch_seq * HIDDEN * 1 / 1e6  # INT8: 1 byte
n_stored_layers = 22  # layers that need stored activations (with gradient checkpointing)

log(f" One layer activation (FP16): {act_fp16:.1f} MB")
log(f" One layer activation (INT8): {act_int8:.1f} MB")
log(f" With {n_stored_layers} checkpointed layers:")
log(f" FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
log(f" INT8 total: {act_int8 * n_stored_layers:.0f} MB")
log(f" Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

# Test quality: does INT8 quantization of activations hurt training?
# Random FP16 activations shaped (batch=4, seq=512, HIDDEN).
test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)

# Quantize to INT8 with one scale per last-dim vector (largest magnitude → 127).
scale = torch.clamp(torch.amax(torch.abs(test_act), dim=-1, keepdim=True), min=1e-5) / 127.0
quantized = torch.clamp(torch.round(test_act / scale), -128, 127).to(torch.int8)

# Dequantize back to floating point.
reconstructed = quantized.to(torch.float32) * scale

# Measure reconstruction error against the original activations.
mse = F.mse_loss(reconstructed, test_act.float()).item()
cos = F.cosine_similarity(
    test_act.reshape(-1).float().unsqueeze(0),
    reconstructed.reshape(-1).unsqueeze(0),
).item()

log(f" Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
if cos > 0.999:
    log(f" {'✅ Negligible error'}")
else:
    log(f" {'⚠️ Notable error'}")

del test_act, quantized, reconstructed

# ═══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "=" * 60)
log(" SUMMARY: GPU Memory Reduction Strategies")
log("=" * 60)

# One row per experiment, filled in from the values measured/estimated above.
log(f"""
 Strategy Savings Effort Worth it?
 ─────────────────────────────────────────────────────────────
 A. Pre-allocated buffer ~{peak_standard-peak_buffered:.0f} MB/layer Low ✅ Yes (simple, effective)
 B. FP16 dequant (not FP32) ~{(peak_32-peak_16)*88:.0f} MB total Already done ✅ Already implemented
 C. Shared codebook Tiny Already done ✅ Speed benefit > memory
 D. Partial row dequant ~{(mem_full-mem_partial)*88*4:.0f} MB potential High ⚠️ Only for attention V
 E. Grad CPU offload ~{grad_on_gpu-grad_offload:.0f} MB Medium ❌ LoRA grads are already small
 F. Activation INT8 compress ~{(act_fp16-act_int8)*n_stored_layers:.0f} MB total Medium ✅ Best bang for buck

 RECOMMENDATION: Combine A (buffer reuse) + F (INT8 activation compression)
 for maximum savings with minimal complexity.
 Together they save ~{peak_standard-peak_buffered + (act_fp16-act_int8)*n_stored_layers:.0f} MB — enough to comfortably
 fit TinyLlama 1.1B training in under 6GB GPU memory.
""")