| |
| """ |
| Novel GPU Memory Reduction Experiments for FigQuant |
| =================================================== |
| |
| Standard approaches (gradient checkpointing, mixed precision) are already in use. |
| These experiments test NON-STANDARD ideas unique to FigQuant's architecture: |
| |
| Experiment A: "Streaming Dequant" — only dequant the current layer, not all at once |
| Experiment B: "Ping-Pong" — keep even layers on GPU, odd layers on CPU, swap during forward |
| Experiment C: "Lazy Materialization" — dequant into a pre-allocated buffer, overwrite per-layer |
| Experiment D: "Partial Dequant" — only dequant the rows needed by the current token's attention |
| """ |
| import os, sys, subprocess, time, gc |
| import numpy as np |
|
|
# --- Environment bootstrap ---------------------------------------------------
# Install the runtime dependencies, then fetch the littlefig repository and
# install it in editable mode so that `little_fig` can be imported further down.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "transformers", "accelerate", "datasets", "sentencepiece",
                       "protobuf", "psutil", "numpy"])

# `git clone` refuses to write into an existing, non-empty directory, so a
# second run of this script used to abort right here. Clone only when the
# checkout is not already present; everything else is unchanged.
if not os.path.isdir(os.path.join("/app/littlefig", ".git")):
    subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
# Make the package importable from its source tree.
sys.path.insert(0, "/app/littlefig/src")
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
def log(msg):
    """Print *msg* on stdout prefixed with ``[MEM] `` and flush immediately."""
    line = f"[MEM] {msg}"
    print(line, flush=True)
|
|
# Report the runtime environment first so the numbers logged below can be tied
# to a specific PyTorch build and GPU.
cuda_ok = torch.cuda.is_available()
log(f"PyTorch {torch.__version__}, CUDA={cuda_ok}")
if cuda_ok:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    log(f"GPU: {torch.cuda.get_device_name()} ({total_gb:.1f}GB)")
|
|
| from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor |
|
|
| |
| |
# Matrix dimensions for the experiments. INTER is defined here but is not
# referenced by the code visible in this file.
HIDDEN = 2048
INTER = 5632
torch.manual_seed(42)  # fixed seed so W (and therefore its quantization) is reproducible

log("\n" + "="*60)
log(" NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
log("="*60)

# Every experiment below allocates on the GPU and reads torch.cuda memory
# statistics. Stop here with a clear message instead of failing part-way
# through, after the CPU-side quantization has already been paid for.
if not torch.cuda.is_available():
    raise SystemExit("[MEM] A CUDA device is required: these experiments measure GPU memory.")

# Quantize one random HIDDEN x HIDDEN weight on the CPU; the dequant
# experiments use it as their test subject.
W = torch.randn(HIDDEN, HIDDEN)
fq = figquant_quantize(W, group_size=128, n_iters=8)

dev = torch.device("cuda")
|
|
| |
| |
| |
| |
| |
| |
|
|
log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

# Start from a clean allocator state so the peak readings taken later are not
# polluted by garbage left over from the setup above.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Move the three parts of the quantized tensor to the GPU once. Both dequant
# functions below read these module-level tensors.
indices_gpu, codebook_gpu, scales_gpu = (
    part.to(dev) for part in (fq.indices, fq.codebook, fq.scales)
)
|
|
def dequant_standard():
    """Dequantize the module-level quantized tensor into a new FP16 tensor.

    Reads ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and ``fq`` from
    module scope. Every element of ``indices_gpu`` is treated as two 4-bit
    codebook indices (bits 0-3 first, then bits 4-7).

    Returns:
        A ``torch.float16`` tensor of shape ``fq.shape``. The final ``.half()``
        produces a fresh tensor on every call.

    Note:
        The statements are deliberately kept one-per-step: every named local
        stays alive until the function returns, which is part of what the
        peak-memory measurements in this script observe.
    """
    n_groups, group_size = fq.n_groups, fq.group_size
    lo_nibble = (indices_gpu & 0x0F).long()
    hi_nibble = ((indices_gpu >> 4) & 0x0F).long()
    # Interleave: low nibble, then high nibble, for each packed element.
    codes = torch.stack([lo_nibble, hi_nibble], dim=1).reshape(-1)
    # Keep exactly n_groups * group_size codes and lay them out one group per row.
    codes = codes[:n_groups * group_size].reshape(n_groups, group_size)
    # Broadcast view of the codebook, one (virtual) copy per group, for gather().
    lut = codebook_gpu.unsqueeze(0).expand(n_groups, -1)
    # Look every code up in the codebook, then apply the per-group scale.
    dequantized = torch.gather(lut, dim=1, index=codes) * scales_gpu.unsqueeze(1)
    return dequantized.reshape(-1)[:fq.numel].reshape(fq.shape).half()
|
|
| |
# Destination buffer, allocated once up front and reused by every
# dequant_buffered() call.
buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)


def dequant_buffered():
    """Dequantize the module-level quantized tensor INTO the pre-allocated ``buffer``.

    Reads ``indices_gpu``, ``codebook_gpu``, ``scales_gpu``, ``fq`` and
    ``buffer`` from module scope. The unpack / gather / scale steps are the
    same as in ``dequant_standard`` and still allocate their intermediates;
    what this variant avoids is a new FP16 tensor for the *output*.

    Returns:
        ``buffer`` itself (shape ``(HIDDEN, HIDDEN)``, ``torch.float16``),
        overwritten in place. The contents are replaced by the next call.
    """
    low = (indices_gpu & 0x0F).long()
    high = ((indices_gpu >> 4) & 0x0F).long()
    # Interleave: low nibble, then high nibble, for each packed element.
    unpacked = torch.stack([low, high], dim=1).reshape(-1)
    unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
    cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
    # copy_() converts dtype while copying, so the cast to FP16 lands directly
    # in `buffer`. Calling .half() first (as this function used to) allocates
    # a throw-away FP16 tensor of the full weight size: exactly the
    # allocation the pre-allocated buffer is meant to eliminate.
    buffer.copy_(result.reshape(-1)[:fq.numel].reshape(fq.shape))
    return buffer
|
|
| |
# --- Peak memory ---------------------------------------------------------------
# Both loops run with `buffer` already allocated, so it is part of both peaks.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_standard()
    del w  # drop the result before the next call so only one output is alive at a time
peak_standard = torch.cuda.max_memory_allocated() / 1e6

torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_buffered()  # always the same `buffer` object; nothing to free
peak_buffered = torch.cuda.max_memory_allocated() / 1e6

log(f" Standard dequant peak: {peak_standard:.1f} MB")
log(f" Buffered dequant peak: {peak_buffered:.1f} MB")
log(f" Savings: {peak_standard - peak_buffered:.1f} MB ({(peak_standard-peak_buffered)/peak_standard*100:.1f}%)")

# --- Speed ---------------------------------------------------------------------
# time.perf_counter() is the monotonic, high-resolution clock intended for
# measuring intervals; time.time() is wall-clock time and can jump.
# CUDA kernels run asynchronously, so synchronize before reading the clock.
n_timed_calls = 100

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(n_timed_calls):
    dequant_standard()
torch.cuda.synchronize()
time_std = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

# The device is idle here (synchronized just above), so start timing directly.
t0 = time.perf_counter()
for _ in range(n_timed_calls):
    dequant_buffered()
torch.cuda.synchronize()
time_buf = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

log(f" Standard speed: {time_std:.2f} ms/call")
log(f" Buffered speed: {time_buf:.2f} ms/call")
|
|
| |
| |
| |
| |
|
|
log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

# What matters here is how much memory the finished weight occupies, so measure
# the allocation delta while the result is held.
# max_memory_allocated() cannot show that: inside dequant_standard() the int64
# index tensors (8 bytes per weight element, alive until the function returns)
# plus the dequantized result dwarf the 2-byte FP16 / 4-byte FP32 output, so
# the peak is set inside the function and comes out identical for both dtypes.
# The names peak_32 / peak_16 are kept because the summary at the end of the
# script reads them.
gc.collect()
torch.cuda.empty_cache()
base_b = torch.cuda.memory_allocated()
w32 = dequant_standard().float()
peak_32 = (torch.cuda.memory_allocated() - base_b) / 1e6
del w32

gc.collect()
torch.cuda.empty_cache()
base_b = torch.cuda.memory_allocated()
w16 = dequant_standard()
peak_16 = (torch.cuda.memory_allocated() - base_b) / 1e6
del w16

log(f" FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f" Per-layer savings: {peak_32 - peak_16:.1f} MB")
log(f" For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")
|
|
| |
| |
| |
| |
| |
|
|
log("\n--- Experiment C: Codebook Caching Strategy ---")

# Compare 88 separate 16-entry codebooks against a single shared one.
# Each measurement is a delta over a baseline taken immediately before the
# allocation, so tensors that are already resident on the GPU do not leak into
# the numbers.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
base_c = torch.cuda.memory_allocated()
individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
mem_individual = (torch.cuda.memory_allocated() - base_c) / 1e6
# Free the 88 codebooks BEFORE measuring the shared one; otherwise they are
# counted a second time and the "savings" come out negative.
del individual_codebooks

gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
base_c = torch.cuda.memory_allocated()
shared_codebook = torch.randn(16, device=dev)
mem_shared = (torch.cuda.memory_allocated() - base_c) / 1e6

log(f" 88 individual codebooks: {mem_individual:.3f} MB")
log(f" 1 shared codebook: {mem_shared:.3f} MB")
log(f" Savings: {mem_individual - mem_shared:.3f} MB")
log(" (Tiny savings — but the REAL benefit is L1 cache residency)")
log(" A single 64-byte codebook stays in L1 cache permanently = faster lookups")

del shared_codebook
|
|
| |
| |
| |
| |
| |
| |
|
|
log("\n--- Experiment D: Partial Row Dequant ---")

# Compare holding a full HIDDEN x HIDDEN FP16 matrix against holding only
# `rows_needed` of its rows. Random tensors stand in for dequantized weights;
# only their size matters. Each measurement is a delta over a baseline taken
# right before the allocation.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
base_d = torch.cuda.memory_allocated()
W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
mem_full = (torch.cuda.memory_allocated() - base_d) / 1e6
# Free the full matrix BEFORE measuring the partial one; otherwise the partial
# reading includes the full matrix and the "savings" come out negative.
del W_big

gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
base_d = torch.cuda.memory_allocated()
rows_needed = 128
W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
mem_partial = (torch.cuda.memory_allocated() - base_d) / 1e6

log(f" Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
log(f" Partial ({rows_needed}×{HIDDEN}): {mem_partial:.1f} MB")
log(f" Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
log(f" For 88 layers × 4 projections: {(mem_full-mem_partial)*88*4:.0f} MB potential savings")
log(" CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")

del W_partial
|
|
| |
| |
| |
| |
| |
| |
|
|
log("\n--- Experiment E: Immediate Gradient CPU Offload ---")

# Back-of-the-envelope arithmetic only: nothing is allocated on the GPU here.
n_params = 4_500_000
param_size = n_params * 2  # bytes, at 2 bytes per parameter

# Scenario 1: four gradient copies resident on the GPU.
grad_on_gpu = param_size * 4 / 1e6
log(f" Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")

# Scenario 2: a single gradient copy resident on the GPU.
grad_offload = param_size * 1 / 1e6
log(f" Offload (1 grad on GPU): {grad_offload:.1f} MB")

grad_saving = grad_on_gpu - grad_offload
log(f" Savings: {grad_saving:.1f} MB")
log(" Note: LoRA params are small (18MB) so grad savings are modest")
log(" The BIG savings come from activation memory, not gradient memory")
|
|
| |
| |
| |
| |
| |
|
|
log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")

gc.collect()
torch.cuda.empty_cache()

# --- Size estimate (arithmetic only) ------------------------------------------
batch_seq = 4 * 512                        # batch of 4 sequences x 512 tokens
act_fp16 = batch_seq * HIDDEN * 2 / 1e6    # MB at 2 bytes per value
act_int8 = batch_seq * HIDDEN * 1 / 1e6    # MB at 1 byte per value
# NOTE: the INT8 figure leaves out the per-row scale stored next to the
# quantized values (one FP16 number per row of HIDDEN values, see `scale`).
n_stored_layers = 22

log(f" One layer activation (FP16): {act_fp16:.1f} MB")
log(f" One layer activation (INT8): {act_int8:.1f} MB")
log(f" With {n_stored_layers} checkpointed layers:")
log(f" FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
log(f" INT8 total: {act_int8 * n_stored_layers:.0f} MB")
log(f" Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

# --- Quality check: round-trip a random activation through INT8 ---------------
test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)
# Symmetric quantization with one scale per row along the last dimension;
# the clamp keeps the scale away from zero for an all-zero row.
scale = test_act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
quantized = (test_act / scale).round().clamp(-128, 127).to(torch.int8)
reconstructed = quantized.float() * scale

mse = F.mse_loss(reconstructed, test_act.float()).item()
cos = F.cosine_similarity(test_act.reshape(-1).float().unsqueeze(0),
                          reconstructed.reshape(-1).unsqueeze(0)).item()
log(f" Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
# The verdict literal was split across two lines by a garbled emoji, which is a
# syntax error inside a single-line f-string; it is rebuilt here as one string.
verdict = "✅ Negligible error" if cos > 0.999 else "⚠️ Notable error"
log(f" {verdict}")

del test_act, quantized, reconstructed
|
|
| |
| |
| |
|
|
log("\n" + "="*60)
log(" SUMMARY: GPU Memory Reduction Strategies")
log("="*60)

# Headline numbers, computed once and reused in the table and the recommendation.
buffer_saving = peak_standard - peak_buffered
act_saving = (act_fp16 - act_int8) * n_stored_layers

# One tuple per table row: (strategy, savings, effort, verdict). The first
# tuple is the header. Building the rows as data keeps the columns aligned
# whatever the width of the formatted numbers, and keeps every status glyph on
# the same line as its row.
summary_rows = [
    ("Strategy", "Savings", "Effort", "Worth it?"),
    ("A. Pre-allocated buffer", f"~{buffer_saving:.0f} MB/layer", "Low", "✅ Yes (simple, effective)"),
    ("B. FP16 dequant (not FP32)", f"~{(peak_32 - peak_16) * 88:.0f} MB total", "Already done", "✅ Already implemented"),
    ("C. Shared codebook", "Tiny", "Already done", "✅ Speed benefit > memory"),
    ("D. Partial row dequant", f"~{(mem_full - mem_partial) * 88 * 4:.0f} MB potential", "High", "⚠️ Only for attention V"),
    ("E. Grad CPU offload", f"~{grad_on_gpu - grad_offload:.0f} MB", "Medium", "❌ LoRA grads are already small"),
    ("F. Activation INT8 compress", f"~{act_saving:.0f} MB total", "Medium", "✅ Best bang for buck"),
]
table_lines = [
    f"  {strategy:<30}{saving:<22}{effort:<14}{worth}"
    for strategy, saving, effort, worth in summary_rows
]
table_lines.insert(1, "  " + "─" * 61)  # rule between the header and the rows

# Emitted through a single log() call, like the original multi-line summary.
log("\n" + "\n".join(table_lines) + f"""

  RECOMMENDATION:
  Combine A (buffer reuse) + F (INT8 activation compression) for
  maximum savings with minimal complexity. Together they save
  ~{buffer_saving + act_saving:.0f} MB — enough to comfortably fit TinyLlama 1.1B
  training in under 6GB GPU memory.
""")
|
|