littlefig-bench / memory_experiments.py
ticketguy's picture
Novel GPU memory reduction experiments
6de6495 verified
#!/usr/bin/env python3
"""
Novel GPU Memory Reduction Experiments for FigQuant
===================================================

Standard approaches (gradient checkpointing, mixed precision) are already in use.
These experiments test NON-STANDARD ideas unique to FigQuant's architecture:

Experiment A: "Streaming Dequant" — only dequant the current layer, not all at once
Experiment B: "Ping-Pong" — keep even layers on GPU, odd layers on CPU, swap during forward
Experiment C: "Lazy Materialization" — dequant into a pre-allocated buffer, overwrite per-layer
Experiment D: "Partial Dequant" — only dequant the rows needed by the current token's attention

NOTE: the sections implemented below are labelled A-F and do not map one-to-one
onto the list above (for example, the pre-allocated buffer idea is run as
Experiment A, and partial row dequant as Experiment D).
"""
import os, sys, subprocess, time, gc
import numpy as np
# --- Runtime bootstrap --------------------------------------------------------
# The script installs its own dependencies before the heavy imports further down,
# so it can be launched on a machine where only Python, pip and git are present.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install", "-q"]
_LITTLEFIG_DIR = "/app/littlefig"

subprocess.check_call(_PIP_INSTALL + [
    "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy",
])

# `git clone` aborts when the destination already exists and is not empty, which
# made every run after the first one crash here. Only clone when no checkout is
# present; an existing checkout is reused as-is.
if not os.path.isdir(os.path.join(_LITTLEFIG_DIR, ".git")):
    subprocess.check_call(
        ["git", "clone", "https://github.com/ticketguy/littlefig.git", _LITTLEFIG_DIR]
    )

# Editable install with the "train" extra, then expose the sources directly as well.
subprocess.check_call(_PIP_INSTALL + ["-e", f"{_LITTLEFIG_DIR}[train]"])
sys.path.insert(0, f"{_LITTLEFIG_DIR}/src")
import torch
import torch.nn as nn
import torch.nn.functional as F
def log(msg):
    """Write *msg* to stdout behind a ``[MEM]`` tag and flush immediately.

    Flushing on every call keeps the output ordered with respect to anything
    the subprocesses or CUDA runtime print.
    """
    tagged_line = f"[MEM] {msg}"
    print(tagged_line, flush=True)
# Record the runtime environment first so the benchmark log is self-describing.
log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Name comes from the current device, total memory from device index 0.
    _gpu_name = torch.cuda.get_device_name()
    _gpu_total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    log(f"GPU: {_gpu_name} ({_gpu_total_gb:.1f}GB)")
from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor
# --- Shared fixture for all experiments ---------------------------------------
# Create a test weight (simulating one layer of TinyLlama)
# TinyLlama: q_proj = [2048, 2048], k_proj = [256, 2048], etc.
HIDDEN = 2048
INTER = 5632  # not referenced in the visible part of this script; presumably the MLP width
torch.manual_seed(42)  # fixed seed so W (and therefore the quantized payload) is reproducible

log("\n" + "="*60)
log(" NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
log("="*60)

# Every experiment below allocates on the GPU and reads torch.cuda memory
# counters. Without a CUDA device the script would only fail later, after the
# quantization work, inside a torch.cuda call. Stop here with a clear message.
if not torch.cuda.is_available():
    raise SystemExit("[MEM] A CUDA device is required: these experiments measure GPU memory.")

# Quantize a test weight (created on the CPU; the packed payload is moved to the GPU later).
W = torch.randn(HIDDEN, HIDDEN)
fq = figquant_quantize(W, group_size=128, n_iters=8)
dev = torch.device("cuda")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT A: Pre-allocated Buffer Dequant
# Instead of creating a NEW tensor on every forward pass (allocation = slow + memory),
# dequant into a FIXED pre-allocated buffer that gets rewritten each layer.
# Memory: one buffer, reused for all layers. Never grows.
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

# Start from a clean allocator state: drop Python garbage, return cached blocks
# to the driver, and zero the peak-memory counter.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Move the quantized payload to the GPU once; both dequant variants below read
# these three module-level tensors.
indices_gpu, codebook_gpu, scales_gpu = (
    part.to(dev) for part in (fq.indices, fq.codebook, fq.scales)
)
def dequant_standard():
    """Dequantize the GPU-resident payload into a freshly allocated FP16 tensor.

    Reads the module-level ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and
    the metadata on ``fq`` (``n_groups``, ``group_size``, ``numel``, ``shape``).

    Returns:
        A new ``torch.float16`` tensor of shape ``fq.shape``.

    Note: this function is itself a benchmark subject, so the order of the
    operations and the lifetime of each intermediate are deliberately kept
    as they are.
    """
    # Every packed element holds two 4-bit codes; widen to int64 because
    # torch.gather needs int64 indices.
    lo_nibble = (indices_gpu & 0x0F).long()
    hi_nibble = ((indices_gpu >> 4) & 0x0F).long()
    # For a 1-D `indices_gpu` this interleaves the codes as low, high, low, high, ...
    flat_codes = torch.stack([lo_nibble, hi_nibble], dim=1).reshape(-1)
    # Drop any trailing codes beyond the grouped payload, then lay out one row per group.
    n_codes = fq.n_groups * fq.group_size
    grouped_codes = flat_codes[:n_codes].reshape(fq.n_groups, fq.group_size)
    # Broadcast view of the single codebook across all groups (expand does not copy).
    per_group_codebook = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    # Codebook lookup followed by the per-group scale, kept as one expression so
    # the gather output is released as soon as the product exists.
    dequantized = torch.gather(per_group_codebook, dim=1, index=grouped_codes) * scales_gpu.unsqueeze(1)
    # Trim to the original element count, restore the weight shape, cast to FP16.
    return dequantized.reshape(-1)[:fq.numel].reshape(fq.shape).half()
# Pre-allocated buffer approach: a single FP16 destination, created once and
# overwritten by every call to dequant_buffered().
buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)


def dequant_buffered():
    """Dequantize INTO the pre-allocated module-level ``buffer``.

    Reads the module-level ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and
    the metadata on ``fq``; overwrites ``buffer`` in place.

    Returns:
        ``buffer`` itself (the same tensor object on every call), so the result
        is only valid until the next call.

    The unpack/gather/scale intermediates are still allocated on each call;
    what this variant avoids is allocating the FP16 *output*. The original
    code called ``.half()`` before ``copy_``, which materialized a temporary
    FP16 tensor and defeated that purpose.
    """
    low = (indices_gpu & 0x0F).long()
    high = ((indices_gpu >> 4) & 0x0F).long()
    unpacked = torch.stack([low, high], dim=1).reshape(-1)
    unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
    cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
    # copy_ accepts a source of a different dtype and casts while writing, so the
    # FP32 -> FP16 conversion lands straight in `buffer` with no temporary.
    buffer.copy_(result.reshape(-1)[:fq.numel].reshape(fq.shape))
    return buffer
# Benchmark both variants: peak allocator usage across 10 back-to-back calls.
# The peak counter is zeroed before each variant so the two readings are independent.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_standard()
    del w  # release the fresh output before the next call so outputs never overlap
peak_standard = torch.cuda.max_memory_allocated() / 1e6

torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_buffered()  # always the same pre-allocated tensor
peak_buffered = torch.cuda.max_memory_allocated() / 1e6

_saved_mb = peak_standard - peak_buffered
_saved_pct = _saved_mb / peak_standard * 100
log(f" Standard dequant peak: {peak_standard:.1f} MB")
log(f" Buffered dequant peak: {peak_buffered:.1f} MB")
log(f" Savings: {_saved_mb:.1f} MB ({_saved_pct:.1f}%)")
# Speed comparison: wall-clock time per call, averaged over a fixed number of calls.
# time.perf_counter() is used instead of time.time() because it is monotonic and
# has the highest available resolution, which matters for millisecond-scale timings.
_N_TIMED_CALLS = 100

# CUDA kernels launch asynchronously: synchronize before starting the clock and
# again before stopping it so the interval covers the actual GPU work.
torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(_N_TIMED_CALLS):
    dequant_standard()
torch.cuda.synchronize()
time_std = (time.perf_counter() - t0) / _N_TIMED_CALLS * 1e3  # ms per call

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(_N_TIMED_CALLS):
    dequant_buffered()
torch.cuda.synchronize()
time_buf = (time.perf_counter() - t0) / _N_TIMED_CALLS * 1e3  # ms per call

log(f" Standard speed: {time_std:.2f} ms/call")
log(f" Buffered speed: {time_buf:.2f} ms/call")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT B: FP16 vs FP32 Dequant (our dtype fix already does this)
# Quantify the exact savings of dequanting to FP16 instead of FP32
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

# What is being compared is the memory the dequantized weight OCCUPIES in each
# dtype. The original code read max_memory_allocated(), but that peak is reached
# inside dequant_standard() (int64 index tensors and FP32 temporaries), is the
# same for both dtypes, and made the reported savings 0. Measure the growth of
# memory_allocated() across the call instead: only the returned tensor is still
# alive at that point.
# (The names peak_32 / peak_16 are kept because the summary at the end of the
# script reads them.)
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
w32 = dequant_standard().float()  # the temporary FP16 result is freed once the FP32 copy exists
peak_32 = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6
del w32

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
w16 = dequant_standard()  # already half
peak_16 = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6
del w16

log(f" FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" Per-layer savings: {peak_32 - peak_16:.1f} MB")
log(f" For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT C: "Codebook-in-Register" — keep codebook in GPU constant memory
# The 16 codebook values (64 bytes) should NEVER leave GPU registers
# Test: does keeping codebook as a cuda constant save memory/speed?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment C: Codebook Caching Strategy ---")
# All layers use nearly identical codebooks (proved earlier: 0.019 L2 between layers)
# What if we use ONE global codebook for ALL layers at inference?
# This means: codebook = 64 bytes, NEVER changes, stays in L1 cache permanently
# Simulate: 88 layers with individual codebooks vs 1 shared
#
# Both readings are taken as growth over a baseline. memory_allocated() on its own
# also counts every tensor created earlier in the script, and the original code
# measured the shared codebook while the 88 individual ones were still alive,
# which made "shared" look larger than "individual".
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
mem_individual = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6
del individual_codebooks  # free before measuring the shared variant

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
shared_codebook = torch.randn(16, device=dev)
mem_shared = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6

log(f" 88 individual codebooks: {mem_individual:.3f} MB")
log(f" 1 shared codebook: {mem_shared:.3f} MB")
log(f" Savings: {mem_individual - mem_shared:.3f} MB")
log(f" (Tiny savings — but the REAL benefit is L1 cache residency)")
log(f" A single 64-byte codebook stays in L1 cache permanently = faster lookups")
del shared_codebook
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT D: "Lazy Row Dequant" — Only dequant rows needed for current batch
# For matvec: out = W @ x, we need ALL rows of W.
# But for attention: Q = x @ W_q^T, we only need W_q for the current positions.
# What if we only dequant the ROWS that the attention scores point to?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment D: Partial Row Dequant ---")
# In attention, after computing scores, we only need V[attended_positions]
# If seq_len=512 but attention is sparse (top-k), we can dequant fewer rows
# Simulate: dequant all 2048 rows vs only top-128 rows
#
# Each footprint is measured as growth over a baseline, and the full matrix is
# released before the partial one is created. The original code read the raw
# memory_allocated() total while W_big was still alive, so the "partial" figure
# included the full matrix and the reported savings were negative.
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
mem_full = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6
del W_big

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
_baseline_bytes = torch.cuda.memory_allocated()
# Partial: only 128 rows (6.25% of the matrix)
rows_needed = 128
W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
mem_partial = (torch.cuda.memory_allocated() - _baseline_bytes) / 1e6

log(f" Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
log(f" Partial ({rows_needed}×{HIDDEN}): {mem_partial:.1f} MB")
log(f" Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
log(f" For 88 layers × 4 projections: {(mem_full-mem_partial)*88*4:.0f} MB potential savings")
log(f" CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")
del W_partial
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT E: "Gradient Accumulation with CPU Offload"
# Standard: all gradients on GPU during accumulation
# Novel: after each micro-batch, move gradients to CPU immediately
# GPU only holds: model + 1 batch activations + 1 micro-batch gradient
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment E: Immediate Gradient CPU Offload ---")
# Back-of-the-envelope estimate only: nothing is allocated on the GPU in this section.
n_params = 4_500_000  # LoRA params for TinyLlama
param_size = n_params * 2  # bytes, at 2 bytes per FP16 value

# Footprint in MB for each strategy:
#   standard - gradients of all 4 micro-batches counted as resident on the GPU
#   offload  - a single micro-batch gradient on the GPU, the rest moved to the CPU
grad_on_gpu = param_size * 4 / 1e6
grad_offload = param_size * 1 / 1e6

log(f" Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")
log(f" Offload (1 grad on GPU): {grad_offload:.1f} MB")
log(f" Savings: {grad_on_gpu - grad_offload:.1f} MB")
log(f" Note: LoRA params are small (18MB) so grad savings are modest")
log(f" The BIG savings come from activation memory, not gradient memory")
# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT F: "Activation Compression" — Compress activations in-flight
# Between layers, activations sit in memory waiting for backward pass.
# What if we quantize them to INT8 between layers and dequant on backward?
# ═══════════════════════════════════════════════════════════════════════════════
log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")
gc.collect(); torch.cuda.empty_cache()

# Part 1: arithmetic estimate of stored-activation memory, FP16 vs INT8.
batch_seq = 4 * 512  # batch=4, seq=512
act_fp16 = batch_seq * HIDDEN * 2 / 1e6  # FP16: 2 bytes
act_int8 = batch_seq * HIDDEN * 1 / 1e6  # INT8: 1 byte
n_stored_layers = 22  # layers that need stored activations (with gradient checkpointing)
log(f" One layer activation (FP16): {act_fp16:.1f} MB")
log(f" One layer activation (INT8): {act_int8:.1f} MB")
log(f" With {n_stored_layers} checkpointed layers:")
log(f" FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
log(f" INT8 total: {act_int8 * n_stored_layers:.0f} MB")
log(f" Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

# Part 2: round-trip error of INT8 quantization on a random activation tensor.
test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)
# Symmetric quantization with one scale per vector along the last dimension:
# scale = max|x| / 127, floored at 1e-5 before the division to avoid dividing by zero.
scale = test_act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
quantized = (test_act / scale).round().clamp(-128, 127).to(torch.int8)
# Dequantize (INT8 -> FP32, multiplied by the FP16 scale; the product is FP32)
reconstructed = quantized.float() * scale
# Measure error against the FP32 view of the original activations
mse = F.mse_loss(reconstructed, test_act.float()).item()
cos = F.cosine_similarity(test_act.reshape(-1).float().unsqueeze(0),
                          reconstructed.reshape(-1).unsqueeze(0)).item()
log(f" Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
# The check-mark was mis-encoded in the original string literal; restored here.
log(f" {'✅ Negligible error' if cos > 0.999 else '⚠️ Notable error'}")
# Release all temporaries of this section, including the scale tensor.
del test_act, quantized, reconstructed, scale
# ═══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════════════════════
log("\n" + "="*60)
log(" SUMMARY: GPU Memory Reduction Strategies")
log("="*60)
# Final report. The figures are interpolated from the values computed by
# experiments A-F above. The check-mark and em-dash characters were mis-encoded
# in the original string literal and are restored here; the wording is unchanged.
log(f"""
Strategy Savings Effort Worth it?
─────────────────────────────────────────────────────────────
A. Pre-allocated buffer ~{peak_standard-peak_buffered:.0f} MB/layer Low ✅ Yes (simple, effective)
B. FP16 dequant (not FP32) ~{(peak_32-peak_16)*88:.0f} MB total Already done ✅ Already implemented
C. Shared codebook Tiny Already done ✅ Speed benefit > memory
D. Partial row dequant ~{(mem_full-mem_partial)*88*4:.0f} MB potential High ⚠️ Only for attention V
E. Grad CPU offload ~{grad_on_gpu-grad_offload:.0f} MB Medium ❌ LoRA grads are already small
F. Activation INT8 compress ~{(act_fp16-act_int8)*n_stored_layers:.0f} MB total Medium ✅ Best bang for buck
RECOMMENDATION:
Combine A (buffer reuse) + F (INT8 activation compression) for
maximum savings with minimal complexity. Together they save
~{peak_standard-peak_buffered + (act_fp16-act_int8)*n_stored_layers:.0f} MB — enough to comfortably fit TinyLlama 1.1B
training in under 6GB GPU memory.
""")