littlefig-bench / memory_experiments.py

Novel GPU memory reduction experiments

6de6495 verified 15 days ago

15.7 kB

	#!/usr/bin/env python3
	"""
	Novel GPU Memory Reduction Experiments for FigQuant
	===================================================

	Standard approaches (gradient checkpointing, mixed precision) are already in use.
	These experiments test NON-STANDARD ideas unique to FigQuant's architecture:

	Experiment A: "Streaming Dequant" — only dequant the current layer, not all at once
	Experiment B: "Ping-Pong" — keep even layers on GPU, odd layers on CPU, swap during forward
	Experiment C: "Lazy Materialization" — dequant into a pre-allocated buffer, overwrite per-layer
	Experiment D: "Partial Dequant" — only dequant the rows needed by the current token's attention
	"""
	import os, sys, subprocess, time, gc
	import numpy as np

	subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
	"transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
	subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
	subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
	sys.path.insert(0, "/app/littlefig/src")

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	def log(msg): print(f"[MEM] {msg}", flush=True)

	log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
	if torch.cuda.is_available():
	log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")

	from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor

	# Create a test weight (simulating one layer of TinyLlama)
	# TinyLlama: q_proj = [2048, 2048], k_proj = [256, 2048], etc.
	HIDDEN = 2048
	INTER = 5632
	torch.manual_seed(42)

	log("\n" + "="*60)
	log(" NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
	log("="*60)

	# Quantize a test weight
	W = torch.randn(HIDDEN, HIDDEN)
	fq = figquant_quantize(W, group_size=128, n_iters=8)

	dev = torch.device("cuda")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT A: Pre-allocated Buffer Dequant
	# Instead of creating a NEW tensor on every forward pass (allocation = slow + memory),
	# dequant into a FIXED pre-allocated buffer that gets rewritten each layer.
	# Memory: one buffer, reused for all layers. Never grows.
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

	# Standard approach: dequant creates new tensor every time
	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	# Move quantized data to GPU
	indices_gpu = fq.indices.to(dev)
	codebook_gpu = fq.codebook.to(dev)
	scales_gpu = fq.scales.to(dev)

	def dequant_standard():
	"""Standard: allocates new FP16 tensor each call."""
	low = (indices_gpu & 0x0F).long()
	high = ((indices_gpu >> 4) & 0x0F).long()
	unpacked = torch.stack([low, high], dim=1).reshape(-1)
	unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
	cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
	result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
	return result.reshape(-1)[:fq.numel].reshape(fq.shape).half()

	# Pre-allocated buffer approach
	buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)

	def dequant_buffered():
	"""Novel: dequant INTO a pre-allocated buffer. No new allocation."""
	low = (indices_gpu & 0x0F).long()
	high = ((indices_gpu >> 4) & 0x0F).long()
	unpacked = torch.stack([low, high], dim=1).reshape(-1)
	unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
	cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
	result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
	buffer.copy_(result.reshape(-1)[:fq.numel].reshape(fq.shape).half())
	return buffer

	# Benchmark both
	torch.cuda.reset_peak_memory_stats()
	for _ in range(10):
	w = dequant_standard()
	del w
	peak_standard = torch.cuda.max_memory_allocated() / 1e6

	torch.cuda.reset_peak_memory_stats()
	for _ in range(10):
	w = dequant_buffered()
	peak_buffered = torch.cuda.max_memory_allocated() / 1e6

	log(f" Standard dequant peak: {peak_standard:.1f} MB")
	log(f" Buffered dequant peak: {peak_buffered:.1f} MB")
	log(f" Savings: {peak_standard - peak_buffered:.1f} MB ({(peak_standard-peak_buffered)/peak_standard*100:.1f}%)")

	# Speed comparison
	torch.cuda.synchronize()
	t0 = time.time()
	for _ in range(100): dequant_standard()
	torch.cuda.synchronize()
	time_std = (time.time() - t0) * 10 # ms per call

	t0 = time.time()
	for _ in range(100): dequant_buffered()
	torch.cuda.synchronize()
	time_buf = (time.time() - t0) * 10

	log(f" Standard speed: {time_std:.2f} ms/call")
	log(f" Buffered speed: {time_buf:.2f} ms/call")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT B: FP16 vs FP32 Dequant (our dtype fix already does this)
	# Quantify the exact savings of dequanting to FP16 instead of FP32
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
	w32 = dequant_standard().float()
	peak_32 = torch.cuda.max_memory_allocated() / 1e6
	del w32

	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
	w16 = dequant_standard() # already half
	peak_16 = torch.cuda.max_memory_allocated() / 1e6
	del w16

	log(f" FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
	log(f" FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
	log(f" Per-layer savings: {peak_32 - peak_16:.1f} MB")
	log(f" For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT C: "Codebook-in-Register" — keep codebook in GPU constant memory
	# The 16 codebook values (64 bytes) should NEVER leave GPU registers
	# Test: does keeping codebook as a cuda constant save memory/speed?
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment C: Codebook Caching Strategy ---")

	# All layers use nearly identical codebooks (proved earlier: 0.019 L2 between layers)
	# What if we use ONE global codebook for ALL layers at inference?
	# This means: codebook = 64 bytes, NEVER changes, stays in L1 cache permanently

	# Simulate: 88 layers with individual codebooks vs 1 shared
	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
	mem_individual = torch.cuda.memory_allocated() / 1e6

	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
	shared_codebook = torch.randn(16, device=dev)
	mem_shared = torch.cuda.memory_allocated() / 1e6

	log(f" 88 individual codebooks: {mem_individual:.3f} MB")
	log(f" 1 shared codebook: {mem_shared:.3f} MB")
	log(f" Savings: {mem_individual - mem_shared:.3f} MB")
	log(f" (Tiny savings — but the REAL benefit is L1 cache residency)")
	log(f" A single 64-byte codebook stays in L1 cache permanently = faster lookups")

	del individual_codebooks, shared_codebook

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT D: "Lazy Row Dequant" — Only dequant rows needed for current batch
	# For matvec: out = W @ x, we need ALL rows of W.
	# But for attention: Q = x @ W_q^T, we only need W_q for the current positions.
	# What if we only dequant the ROWS that the attention scores point to?
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment D: Partial Row Dequant ---")

	# In attention, after computing scores, we only need V[attended_positions]
	# If seq_len=512 but attention is sparse (top-k), we can dequant fewer rows

	# Simulate: dequant all 2048 rows vs only top-128 rows
	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
	mem_full = torch.cuda.memory_allocated() / 1e6

	gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

	# Partial: only 128 rows (6.25% of the matrix)
	rows_needed = 128
	W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
	mem_partial = torch.cuda.memory_allocated() / 1e6

	log(f" Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
	log(f" Partial ({rows_needed}×{HIDDEN}): {mem_partial:.1f} MB")
	log(f" Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
	log(f" For 88 layers × 4 projections: {(mem_full-mem_partial)884:.0f} MB potential savings")
	log(f" CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")

	del W_big, W_partial

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT E: "Gradient Accumulation with CPU Offload"
	# Standard: all gradients on GPU during accumulation
	# Novel: after each micro-batch, move gradients to CPU immediately
	# GPU only holds: model + 1 batch activations + 1 micro-batch gradient
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment E: Immediate Gradient CPU Offload ---")

	# Simulate: accumulate gradients on GPU vs CPU
	n_params = 4_500_000 # LoRA params for TinyLlama
	param_size = n_params * 2 # FP16

	# On GPU: all 4 micro-batch gradients in VRAM simultaneously
	grad_on_gpu = param_size * 4 / 1e6 # 4 micro-batches accumulated
	log(f" Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")

	# With offload: only 1 grad on GPU at a time, rest on CPU
	grad_offload = param_size * 1 / 1e6
	log(f" Offload (1 grad on GPU): {grad_offload:.1f} MB")
	log(f" Savings: {grad_on_gpu - grad_offload:.1f} MB")
	log(f" Note: LoRA params are small (18MB) so grad savings are modest")
	log(f" The BIG savings come from activation memory, not gradient memory")

	# ═══════════════════════════════════════════════════════════════════════════════
	# EXPERIMENT F: "Activation Compression" — Compress activations in-flight
	# Between layers, activations sit in memory waiting for backward pass.
	# What if we quantize them to INT8 between layers and dequant on backward?
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")

	gc.collect(); torch.cuda.empty_cache()

	# Simulate: store FP16 activations vs INT8 activations between layers
	batch_seq = 4 * 512 # batch=4, seq=512
	act_fp16 = batch_seq * HIDDEN * 2 / 1e6 # FP16: 2 bytes
	act_int8 = batch_seq * HIDDEN * 1 / 1e6 # INT8: 1 byte
	n_stored_layers = 22 # layers that need stored activations (with gradient checkpointing)

	log(f" One layer activation (FP16): {act_fp16:.1f} MB")
	log(f" One layer activation (INT8): {act_int8:.1f} MB")
	log(f" With {n_stored_layers} checkpointed layers:")
	log(f" FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
	log(f" INT8 total: {act_int8 * n_stored_layers:.0f} MB")
	log(f" Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

	# Test quality: does INT8 quantization of activations hurt training?
	test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)
	# Quantize to INT8
	scale = test_act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
	quantized = (test_act / scale).round().clamp(-128, 127).to(torch.int8)
	# Dequantize
	reconstructed = quantized.float() * scale

	# Measure error
	mse = F.mse_loss(reconstructed, test_act.float()).item()
	cos = F.cosine_similarity(test_act.reshape(-1).float().unsqueeze(0),
	reconstructed.reshape(-1).unsqueeze(0)).item()
	log(f" Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
	log(f" {'✅ Negligible error' if cos > 0.999 else '⚠️ Notable error'}")

	del test_act, quantized, reconstructed

	# ═══════════════════════════════════════════════════════════════════════════════
	# SUMMARY
	# ═══════════════════════════════════════════════════════════════════════════════

	log("\n" + "="*60)
	log(" SUMMARY: GPU Memory Reduction Strategies")
	log("="*60)
	log(f"""
	Strategy Savings Effort Worth it?
	─────────────────────────────────────────────────────────────
	A. Pre-allocated buffer ~{peak_standard-peak_buffered:.0f} MB/layer Low ✅ Yes (simple, effective)
	B. FP16 dequant (not FP32) ~{(peak_32-peak_16)*88:.0f} MB total Already done ✅ Already implemented
	C. Shared codebook Tiny Already done ✅ Speed benefit > memory
	D. Partial row dequant ~{(mem_full-mem_partial)884:.0f} MB potential High ⚠️ Only for attention V
	E. Grad CPU offload ~{grad_on_gpu-grad_offload:.0f} MB Medium ❌ LoRA grads are already small
	F. Activation INT8 compress ~{(act_fp16-act_int8)*n_stored_layers:.0f} MB total Medium ✅ Best bang for buck

	RECOMMENDATION:
	Combine A (buffer reuse) + F (INT8 activation compression) for
	maximum savings with minimal complexity. Together they save
	~{peak_standard-peak_buffered + (act_fp16-act_int8)*n_stored_layers:.0f} MB — enough to comfortably fit TinyLlama 1.1B
	training in under 6GB GPU memory.
	""")