File size: 15,677 Bytes
6de6495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python3
"""
Novel GPU Memory Reduction Experiments for FigQuant
===================================================

Standard approaches (gradient checkpointing, mixed precision) are already in use.
These experiments test NON-STANDARD ideas unique to FigQuant's architecture:

Experiment A: "Streaming Dequant" — dequantize only the current layer, not all at once
Experiment B: "Ping-Pong" — keep even layers on GPU, odd layers on CPU, swap during forward
Experiment C: "Lazy Materialization" — dequantize into a pre-allocated buffer, overwritten per layer
Experiment D: "Partial Dequant" — dequantize only the rows needed by the current token's attention
"""
import os, sys, subprocess, time, gc
import numpy as np

# Bootstrap the environment at start-up: install the third-party packages,
# fetch the littlefig sources, install them in editable mode, and make the
# package importable from its src/ directory.
_LITTLEFIG_DIR = "/app/littlefig"

subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
    "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
# `git clone` aborts when the destination already contains a checkout, which
# made every second run of this script fail before any experiment started.
# Only clone when no checkout is present.
if not os.path.isdir(os.path.join(_LITTLEFIG_DIR, ".git")):
    subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", _LITTLEFIG_DIR])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", _LITTLEFIG_DIR + "[train]"])
sys.path.insert(0, _LITTLEFIG_DIR + "/src")

import torch
import torch.nn as nn
import torch.nn.functional as F

def log(msg):
    """Print *msg* to stdout prefixed with ``[MEM] `` and flush immediately."""
    line = "[MEM] {}".format(msg)
    print(line, flush=True)

# Report the PyTorch build and, when present, the GPU name and total memory.
log(f"PyTorch {torch.__version__}, CUDA={torch.cuda.is_available()}")
if torch.cuda.is_available():
    log(f"GPU: {torch.cuda.get_device_name()} ({torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB)")
else:
    # The rest of the script allocates on torch.device("cuda") and calls
    # torch.cuda memory statistics unconditionally, so it cannot run on a
    # CPU-only host. Stop here with a clear message instead of failing later
    # with a traceback after the quantization work has already been done.
    sys.exit("[MEM] A CUDA device is required for these experiments; none was found.")

from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor

# Create a test weight (simulating one layer of TinyLlama)
# TinyLlama: q_proj = [2048, 2048], k_proj = [256, 2048], etc.
# HIDDEN is the side length of the square test weight and the last dimension
# of every simulated activation/weight tensor created later in this script.
HIDDEN = 2048
# NOTE(review): INTER is not referenced anywhere else in the visible script;
# presumably the MLP intermediate size — confirm before removing.
INTER = 5632
# Fixed seed so the random test weight below is reproducible between runs.
torch.manual_seed(42)

# Banner for the log output.
log("\n" + "="*60)
log("  NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
log("="*60)

# Quantize a test weight
# W is created on the default (CPU) device; fq is the quantized result whose
# indices / codebook / scales / n_groups / group_size / numel / shape
# attributes are read by the experiments below.
W = torch.randn(HIDDEN, HIDDEN)
fq = figquant_quantize(W, group_size=128, n_iters=8)

# All GPU tensors in the experiments are placed on this device. It is set
# unconditionally, so the script requires a CUDA-capable host.
dev = torch.device("cuda")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT A: Pre-allocated Buffer Dequant
# Instead of creating a NEW tensor on every forward pass (allocation = slow + memory),
# dequant into a FIXED pre-allocated buffer that gets rewritten each layer.
# Memory: one buffer, reused for all layers. Never grows.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

# Start from a clean allocator state: drop unreachable Python objects, release
# cached CUDA blocks, and reset the peak-memory counter.
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

# Move quantized data to GPU
# These three module-level tensors are read by dequant_standard() and
# dequant_buffered() on every call.
indices_gpu = fq.indices.to(dev)
codebook_gpu = fq.codebook.to(dev)
scales_gpu = fq.scales.to(dev)

def dequant_standard():
    """Dequantize the module-level quantized weight into a fresh FP16 tensor.

    Reads ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and ``fq`` from
    module scope. Each element of ``indices_gpu`` carries two 4-bit codebook
    indices (low nibble first). The codes are looked up in the codebook,
    multiplied by the per-group scale, trimmed to ``fq.numel`` values and
    reshaped to ``fq.shape``. A new tensor is produced on every call.
    """
    n_groups = fq.n_groups
    group_size = fq.group_size

    # Split every packed element into its two 4-bit codes; gather needs int64.
    lo_nibble = (indices_gpu & 0x0F).long()
    hi_nibble = ((indices_gpu >> 4) & 0x0F).long()

    # Interleave (low, high) pairs, drop padding, and lay the codes out per group.
    codes = torch.stack([lo_nibble, hi_nibble], dim=1).reshape(-1)
    codes = codes[:n_groups * group_size].reshape(n_groups, group_size)

    # One (broadcast) copy of the codebook per group, then look up and scale.
    table = codebook_gpu[None].expand(n_groups, -1)
    values = torch.gather(table, dim=1, index=codes) * scales_gpu[:, None]

    flat = values.reshape(-1)[:fq.numel]
    return flat.reshape(fq.shape).half()

# Pre-allocated buffer approach: one FP16 output buffer, allocated once and
# overwritten by every dequant_buffered() call.
buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)

def dequant_buffered():
    """Dequantize the module-level quantized weight INTO the shared ``buffer``.

    Reads ``indices_gpu``, ``codebook_gpu``, ``scales_gpu``, ``fq`` and
    ``buffer`` from module scope. The output tensor is never re-allocated: the
    result is written into ``buffer`` and ``buffer`` itself is returned, so the
    returned tensor is overwritten by the next call.

    Note that the unpacking/gather temporaries are still allocated on every
    call; only the final FP16 output is reused.

    Returns:
        The module-level ``buffer`` tensor holding the dequantized weight.
    """
    low = (indices_gpu & 0x0F).long()
    high = ((indices_gpu >> 4) & 0x0F).long()
    unpacked = torch.stack([low, high], dim=1).reshape(-1)
    unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
    cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
    # copy_ converts dtype while writing, so the result is cast to FP16 directly
    # into the buffer. The previous `.half()` first materialised a full-size
    # FP16 temporary, which is exactly the allocation this variant is meant to
    # avoid.
    buffer.copy_(result.reshape(-1)[:fq.numel].reshape(fq.shape))
    return buffer

# Benchmark both
# Peak memory: reset the peak counter, run each variant 10 times, and read the
# high-water mark. Both readings include whatever is already resident
# (quantized tensors, the shared buffer), so only their difference is meaningful.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_standard()
    del w
peak_standard = torch.cuda.max_memory_allocated() / 1e6

torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_buffered()
peak_buffered = torch.cuda.max_memory_allocated() / 1e6

log(f"  Standard dequant peak: {peak_standard:.1f} MB")
log(f"  Buffered dequant peak: {peak_buffered:.1f} MB")
log(f"  Savings: {peak_standard - peak_buffered:.1f} MB ({(peak_standard-peak_buffered)/peak_standard*100:.1f}%)")

# Speed comparison
# time.perf_counter() is monotonic and high-resolution, unlike time.time(),
# which can jump when the system clock is adjusted. CUDA kernels run
# asynchronously, so synchronize before starting and before stopping each timer.
n_timed_calls = 100

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(n_timed_calls): dequant_standard()
torch.cuda.synchronize()
time_std = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(n_timed_calls): dequant_buffered()
torch.cuda.synchronize()
time_buf = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

log(f"  Standard speed: {time_std:.2f} ms/call")
log(f"  Buffered speed: {time_buf:.2f} ms/call")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT B: FP16 vs FP32 Dequant (our dtype fix already does this)
# Quantify the exact savings of dequanting to FP16 instead of FP32
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

# Measure the *resident* size of the materialised weight as the growth of
# memory_allocated() across the call. The peak statistic is unsuitable here:
# it also counts tensors that were already resident and the temporaries
# created inside dequant_standard() (and, for the FP32 case, the FP16 tensor
# that .float() converts from), which can mask the FP16-vs-FP32 difference
# this experiment is meant to report.
# NOTE: peak_32 / peak_16 therefore hold resident sizes in MB; the names are
# kept because the summary at the end of the script reads them.
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
before_32 = torch.cuda.memory_allocated()
w32 = dequant_standard().float()
peak_32 = (torch.cuda.memory_allocated() - before_32) / 1e6
del w32

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
before_16 = torch.cuda.memory_allocated()
w16 = dequant_standard()  # already half
peak_16 = (torch.cuda.memory_allocated() - before_16) / 1e6
del w16

log(f"  FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f"  FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f"  Per-layer savings: {peak_32 - peak_16:.1f} MB")
log(f"  For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT C: "Codebook-in-Register" — keep codebook in GPU constant memory
# The 16 codebook values (64 bytes) should NEVER leave GPU registers
# Test: does keeping codebook as a cuda constant save memory/speed?
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment C: Codebook Caching Strategy ---")

# All layers use nearly identical codebooks (proved earlier: 0.019 L2 between layers)
# What if we use ONE global codebook for ALL layers at inference?
# This means: codebook = 64 bytes, NEVER changes, stays in L1 cache permanently

# Simulate: 88 layers with individual codebooks vs 1 shared
# Each figure is the growth of memory_allocated() over a baseline taken right
# before the allocation, so tensors that are already resident do not leak into
# the result. The individual codebooks are freed before the shared one is
# measured; previously they were still alive, which made the "shared" figure
# larger than the "individual" one.
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
baseline_individual = torch.cuda.memory_allocated()
individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
mem_individual = (torch.cuda.memory_allocated() - baseline_individual) / 1e6
del individual_codebooks

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
baseline_shared = torch.cuda.memory_allocated()
shared_codebook = torch.randn(16, device=dev)
mem_shared = (torch.cuda.memory_allocated() - baseline_shared) / 1e6

log(f"  88 individual codebooks: {mem_individual:.3f} MB")
log(f"  1 shared codebook:       {mem_shared:.3f} MB")
log(f"  Savings: {mem_individual - mem_shared:.3f} MB")
log(f"  (Tiny savings — but the REAL benefit is L1 cache residency)")
log(f"  A single 64-byte codebook stays in L1 cache permanently = faster lookups")

del shared_codebook

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT D: "Lazy Row Dequant" — Only dequant rows needed for current batch
# For matvec: out = W @ x, we need ALL rows of W.
# But for attention: Q = x @ W_q^T, we only need W_q for the current positions.
# What if we only dequant the ROWS that the attention scores point to?
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment D: Partial Row Dequant ---")

# In attention, after computing scores, we only need V[attended_positions]
# If seq_len=512 but attention is sparse (top-k), we can dequant fewer rows

# Simulate: dequant all 2048 rows vs only top-128 rows
# Each figure is the growth of memory_allocated() over a baseline taken right
# before the allocation. The full matrix is freed before the partial one is
# measured; previously it was still alive and no baseline was subtracted, so
# the partial figure came out larger than the full one.
gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()
baseline_full = torch.cuda.memory_allocated()
W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
mem_full = (torch.cuda.memory_allocated() - baseline_full) / 1e6
del W_big

gc.collect(); torch.cuda.empty_cache(); torch.cuda.reset_peak_memory_stats()

# Partial: only 128 rows (6.25% of the matrix)
rows_needed = 128
baseline_partial = torch.cuda.memory_allocated()
W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
mem_partial = (torch.cuda.memory_allocated() - baseline_partial) / 1e6
del W_partial

log(f"  Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
log(f"  Partial ({rows_needed}×{HIDDEN}):   {mem_partial:.1f} MB")
log(f"  Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
log(f"  For 88 layers × 4 projections: {(mem_full-mem_partial)*88*4:.0f} MB potential savings")
log(f"  CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT E: "Gradient Accumulation with CPU Offload"
# Standard: all gradients on GPU during accumulation
# Novel: after each micro-batch, move gradients to CPU immediately
# GPU only holds: model + 1 batch activations + 1 micro-batch gradient
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment E: Immediate Gradient CPU Offload ---")

# Simulate: accumulate gradients on GPU vs CPU
# This experiment is pure arithmetic; nothing is allocated on the GPU.
n_params = 4_500_000  # LoRA params for TinyLlama
param_size = n_params * 2  # FP16: bytes needed for one full set of gradients

# On GPU: all 4 micro-batch gradients in VRAM simultaneously
# NOTE(review): this models one gradient copy per micro-batch; confirm that
# this matches how the training loop actually accumulates gradients.
grad_on_gpu = param_size * 4 / 1e6  # 4 micro-batches accumulated
log(f"  Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")

# With offload: only 1 grad on GPU at a time, rest on CPU
grad_offload = param_size * 1 / 1e6
log(f"  Offload (1 grad on GPU):   {grad_offload:.1f} MB")
log(f"  Savings: {grad_on_gpu - grad_offload:.1f} MB")
# The size in this note is derived from param_size so it cannot drift from the
# numbers above (it used to be a hard-coded "18MB", which disagreed with the
# 4.5M x 2-byte figure computed here).
log(f"  Note: LoRA params are small ({param_size / 1e6:.0f}MB) so grad savings are modest")
log(f"  The BIG savings come from activation memory, not gradient memory")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT F: "Activation Compression" — Compress activations in-flight
# Between layers, activations sit in memory waiting for backward pass.
# What if we quantize them to INT8 between layers and dequant on backward?
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")

gc.collect(); torch.cuda.empty_cache()

# Simulate: store FP16 activations vs INT8 activations between layers
# The sizes below are computed arithmetically, not measured on the device.
batch_seq = 4 * 512  # batch=4, seq=512
act_fp16 = batch_seq * HIDDEN * 2 / 1e6  # FP16: 2 bytes
act_int8 = batch_seq * HIDDEN * 1 / 1e6  # INT8: 1 byte
n_stored_layers = 22  # layers that need stored activations (with gradient checkpointing)

log(f"  One layer activation (FP16): {act_fp16:.1f} MB")
log(f"  One layer activation (INT8): {act_int8:.1f} MB")
log(f"  With {n_stored_layers} checkpointed layers:")
log(f"    FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
log(f"    INT8 total: {act_int8 * n_stored_layers:.0f} MB")
log(f"    Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

# Test quality: does INT8 quantization of activations hurt training?
# This checks reconstruction error on random data only; it does not run training.
test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)
# Quantize to INT8
# One scale per vector along the last dimension, mapping its largest magnitude
# to 127; the clamp on the scale avoids dividing by zero for an all-zero vector.
scale = test_act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
quantized = (test_act / scale).round().clamp(-128, 127).to(torch.int8)
# Dequantize
reconstructed = quantized.float() * scale

# Measure error
# Both metrics are computed in FP32 against the original FP16 activations.
mse = F.mse_loss(reconstructed, test_act.float()).item()
cos = F.cosine_similarity(test_act.reshape(-1).float().unsqueeze(0),
                           reconstructed.reshape(-1).unsqueeze(0)).item()
log(f"  Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
# Status glyphs repaired: the literals had been stored as mis-decoded UTF-8
# and printed as garbage.
log(f"  {'✅ Negligible error' if cos > 0.999 else '⚠️ Notable error'}")

del test_act, quantized, reconstructed

# ═══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# Prints one table row per experiment, filled in from the module-level results
# computed above (peak_standard, peak_buffered, peak_32, peak_16, mem_full,
# mem_partial, grad_on_gpu, grad_offload, act_fp16, act_int8, n_stored_layers).
# The rule line, status glyphs and dash in the emitted text were stored as
# mis-decoded UTF-8 and are restored here.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n" + "="*60)
log("  SUMMARY: GPU Memory Reduction Strategies")
log("="*60)
log(f"""
  Strategy                    Savings       Effort    Worth it?
  ─────────────────────────────────────────────────────────────
  A. Pre-allocated buffer     ~{peak_standard-peak_buffered:.0f} MB/layer    Low       ✅ Yes (simple, effective)
  B. FP16 dequant (not FP32)  ~{(peak_32-peak_16)*88:.0f} MB total   Already done ✅ Already implemented
  C. Shared codebook          Tiny           Already done ✅ Speed benefit > memory
  D. Partial row dequant      ~{(mem_full-mem_partial)*88*4:.0f} MB potential  High      ⚠️ Only for attention V
  E. Grad CPU offload         ~{grad_on_gpu-grad_offload:.0f} MB         Medium    ❌ LoRA grads are already small
  F. Activation INT8 compress ~{(act_fp16-act_int8)*n_stored_layers:.0f} MB total    Medium    ✅ Best bang for buck

  RECOMMENDATION:
  Combine A (buffer reuse) + F (INT8 activation compression) for
  maximum savings with minimal complexity. Together they save
  ~{peak_standard-peak_buffered + (act_fp16-act_int8)*n_stored_layers:.0f} MB — enough to comfortably fit TinyLlama 1.1B
  training in under 6GB GPU memory.
""")