#!/usr/bin/env python3
"""
Novel GPU Memory Reduction Experiments for FigQuant
===================================================

Standard approaches (gradient checkpointing, mixed precision) are already in use.
These experiments test NON-STANDARD ideas unique to FigQuant's architecture:

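Experiment A: "Pre-allocated Buffer" — dequant into one reusable FP16 buffer instead of a new tensor per call
Experiment B: "FP16 vs FP32 Dequant" — quantify the per-weight savings of materializing in FP16 instead of FP32
Experiment C: "Shared Codebook" — compare 88 per-layer codebooks against a single global codebook
Experiment D: "Partial Row Dequant" — materialize only the rows that are needed instead of the full matrix
Experiment E: "Gradient CPU Offload" — estimate the savings of moving micro-batch gradients to the CPU
Experiment F: "Activation Compression" — store inter-layer activations as INT8 and check the reconstruction error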
"""
import os, sys, subprocess, time, gc
import numpy as np

# Bootstrap the environment at start-up: install the Python dependencies, fetch the
# littlefig sources, install them in editable mode, and expose them on sys.path.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
    "transformers", "accelerate", "datasets", "sentencepiece", "protobuf", "psutil", "numpy"])
# `git clone` exits non-zero when the destination already holds a checkout, which made
# every re-run of this script abort here; only clone when no checkout is present yet.
if not os.path.isdir("/app/littlefig/.git"):
    subprocess.check_call(["git", "clone", "https://github.com/ticketguy/littlefig.git", "/app/littlefig"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "-e", "/app/littlefig[train]"])
sys.path.insert(0, "/app/littlefig/src")

import torch
import torch.nn as nn
import torch.nn.functional as F

def log(msg):
    """Print *msg* on stdout with the ``[MEM]`` prefix and flush right away."""
    line = f"[MEM] {msg}"
    print(line, flush=True)

# Report the PyTorch version and, when a CUDA device is visible, its name and total memory.
cuda_ok = torch.cuda.is_available()
log(f"PyTorch {torch.__version__}, CUDA={cuda_ok}")
if cuda_ok:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    log(f"GPU: {torch.cuda.get_device_name()} ({total_gb:.1f}GB)")

from little_fig.engine.figquant import figquant_quantize, figquant_dequantize, FigQuantTensor

# Create a test weight (simulating one layer of TinyLlama)
# TinyLlama: q_proj = [2048, 2048], k_proj = [256, 2048], etc.
HIDDEN = 2048
INTER = 5632
torch.manual_seed(42)  # fixed seed so the random test tensors are reproducible

log("\n" + "="*60)
log("  NOVEL GPU MEMORY REDUCTION EXPERIMENTS")
log("="*60)

# Every experiment below allocates on the GPU and queries torch.cuda memory statistics,
# so stop here with a clear message instead of quantizing first and then failing on the
# first CUDA call.
if not torch.cuda.is_available():
    sys.exit("[MEM] CUDA device required: these experiments measure GPU memory.")

# Quantize a test weight (a HIDDEN×HIDDEN random matrix, 128-element groups, 8 iterations)
W = torch.randn(HIDDEN, HIDDEN)
fq = figquant_quantize(W, group_size=128, n_iters=8)

dev = torch.device("cuda")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT A: Pre-allocated Buffer Dequant
# A plain dequant hands back a brand-new tensor on every forward pass. The variant
# tested here writes into ONE fixed buffer that is allocated up front and simply
# overwritten for each layer, so the output allocation never grows.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment A: Pre-allocated Buffer Dequant ---")

# Start from a clean allocator state before anything is placed on the GPU.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Upload the three quantized components: packed indices, codebook and scales.
indices_gpu, codebook_gpu, scales_gpu = (
    part.to(dev) for part in (fq.indices, fq.codebook, fq.scales)
)

def dequant_standard():
    """Baseline dequantization: build and return a fresh FP16 weight on every call.

    Reads the module-level ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and ``fq``.

    Returns:
        A newly allocated ``torch.float16`` tensor of shape ``fq.shape``.
    """
    groups, per_group = fq.n_groups, fq.group_size

    # Each packed element carries two 4-bit codes: low nibble first, then high nibble.
    lo_codes = (indices_gpu & 0x0F).long()
    hi_codes = ((indices_gpu >> 4) & 0x0F).long()
    codes = torch.stack([lo_codes, hi_codes], dim=1).reshape(-1)

    # Keep only the first groups * per_group codes, arranged one row per group.
    codes = codes[:groups * per_group].reshape(groups, per_group)

    # Look every code up in the codebook, then multiply each row by its scale.
    table = codebook_gpu.unsqueeze(0).expand(groups, -1)
    scaled = torch.gather(table, dim=1, index=codes) * scales_gpu.unsqueeze(1)

    # Trim to the original element count, restore the weight shape, cast to FP16.
    flat = scaled.reshape(-1)[:fq.numel]
    return flat.reshape(fq.shape).half()

# Pre-allocated buffer approach: one FP16 buffer, allocated once and overwritten by
# every call to dequant_buffered().
buffer = torch.empty(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)

def dequant_buffered():
    """Dequantize INTO the pre-allocated module-level ``buffer`` and return it.

    The FP16 output is written straight into ``buffer``; no per-call output tensor is
    created. The intermediate tensors (unpacked codes, gathered/scaled values) are
    still allocated on every call.

    Reads the module-level ``indices_gpu``, ``codebook_gpu``, ``scales_gpu`` and ``fq``.

    Returns:
        ``buffer`` itself (the same tensor object on every call), holding the
        dequantized weight as ``torch.float16``.
    """
    # Each packed element carries two 4-bit codes: low nibble first, then high nibble.
    low = (indices_gpu & 0x0F).long()
    high = ((indices_gpu >> 4) & 0x0F).long()
    unpacked = torch.stack([low, high], dim=1).reshape(-1)
    unpacked = unpacked[:fq.n_groups * fq.group_size].reshape(fq.n_groups, fq.group_size)
    cb = codebook_gpu.unsqueeze(0).expand(fq.n_groups, -1)
    result = torch.gather(cb, dim=1, index=unpacked) * scales_gpu.unsqueeze(1)
    # copy_() converts to the destination dtype while writing, so the values land in
    # `buffer` as FP16 directly. The former `.half()` here first materialized a separate
    # full-size FP16 tensor, i.e. exactly the per-call allocation this variant avoids.
    buffer.copy_(result.reshape(-1)[:fq.numel].reshape(fq.shape))
    return buffer

# Peak-memory benchmark: run each variant 10 times and read the allocator's peak.

# Standard path — every result is dropped right after it is produced.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_standard()
    del w
peak_standard = torch.cuda.max_memory_allocated() / 1e6

# Buffered path — `w` is just another reference to the shared buffer.
torch.cuda.reset_peak_memory_stats()
for _ in range(10):
    w = dequant_buffered()
peak_buffered = torch.cuda.max_memory_allocated() / 1e6

saved_mb = peak_standard - peak_buffered
saved_pct = saved_mb / peak_standard * 100
log(f"  Standard dequant peak: {peak_standard:.1f} MB")
log(f"  Buffered dequant peak: {peak_buffered:.1f} MB")
log(f"  Savings: {saved_mb:.1f} MB ({saved_pct:.1f}%)")

# Speed comparison: average time per call over a fixed number of calls.
# time.perf_counter() is used because it is monotonic and high-resolution, whereas the
# wall clock (time.time()) can jump when the system clock is adjusted. The GPU is
# synchronized before each start/stop reading so queued kernels are included.
n_timed_calls = 100

torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(n_timed_calls):
    dequant_standard()
torch.cuda.synchronize()
time_std = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

t0 = time.perf_counter()
for _ in range(n_timed_calls):
    dequant_buffered()
torch.cuda.synchronize()
time_buf = (time.perf_counter() - t0) * 1000 / n_timed_calls  # ms per call

log(f"  Standard speed: {time_std:.2f} ms/call")
log(f"  Buffered speed: {time_buf:.2f} ms/call")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT B: FP16 vs FP32 Dequant (our dtype fix already does this)
# Quantify the exact savings of dequanting to FP16 instead of FP32
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment B: FP16 vs FP32 Dequant Savings ---")

# Measure the memory that stays resident for the materialized weight: allocated bytes
# after the dequant minus allocated bytes before it. The earlier peak-based reading
# (max_memory_allocated) was set inside dequant_standard() by its temporaries (int64
# code tensors, gathered values), which are the same for both variants, so both peaks
# were equal and the reported saving was zero.
# peak_32 / peak_16 keep their names because the summary reads them.
gc.collect()
torch.cuda.empty_cache()
base_alloc_b = torch.cuda.memory_allocated()
w32 = dequant_standard().float()
peak_32 = (torch.cuda.memory_allocated() - base_alloc_b) / 1e6
del w32

gc.collect()
torch.cuda.empty_cache()
base_alloc_b = torch.cuda.memory_allocated()
w16 = dequant_standard()  # already half
peak_16 = (torch.cuda.memory_allocated() - base_alloc_b) / 1e6
del w16

log(f"  FP32 dequant: {peak_32:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f"  FP16 dequant: {peak_16:.1f} MB for one {HIDDEN}×{HIDDEN} weight")
log(f"  Per-layer savings: {peak_32 - peak_16:.1f} MB")
log(f"  For 88 layers: {(peak_32 - peak_16) * 88:.0f} MB total savings")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT C: "Codebook-in-Register" — keep codebook in GPU constant memory
# The 16 codebook values (64 bytes) should NEVER leave GPU registers
# Test: does keeping codebook as a cuda constant save memory/speed?
# NOTE(review): the code below only compares allocator bytes for 88 small tensors
# versus one; it does not exercise constant memory, registers or cache behavior.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment C: Codebook Caching Strategy ---")

# All layers use nearly identical codebooks (proved earlier: 0.019 L2 between layers)
# What if we use ONE global codebook for ALL layers at inference?
# This means: codebook = 64 bytes, NEVER changes, stays in L1 cache permanently

# Simulate: 88 layers with individual codebooks vs 1 shared.
# memory_allocated() counts everything currently alive on the device, so each figure is
# taken as a delta against a baseline read just before the allocation. Previously the
# raw totals were compared while the 88 codebooks were still alive during the "shared"
# measurement, which made the shared case look larger and the savings negative.
gc.collect()
torch.cuda.empty_cache()
base_alloc_c = torch.cuda.memory_allocated()
individual_codebooks = [torch.randn(16, device=dev) for _ in range(88)]
mem_individual = (torch.cuda.memory_allocated() - base_alloc_c) / 1e6
# Release the per-layer codebooks before the shared measurement starts.
del individual_codebooks

gc.collect()
torch.cuda.empty_cache()
base_alloc_c = torch.cuda.memory_allocated()
shared_codebook = torch.randn(16, device=dev)
mem_shared = (torch.cuda.memory_allocated() - base_alloc_c) / 1e6

log(f"  88 individual codebooks: {mem_individual:.3f} MB")
log(f"  1 shared codebook:       {mem_shared:.3f} MB")
log(f"  Savings: {mem_individual - mem_shared:.3f} MB")
log(f"  (Tiny savings — but the REAL benefit is L1 cache residency)")
log(f"  A single 64-byte codebook stays in L1 cache permanently = faster lookups")

del shared_codebook

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT D: "Lazy Row Dequant" — Only dequant rows needed for current batch
# For matvec: out = W @ x, we need ALL rows of W.
# But for attention: Q = x @ W_q^T, we only need W_q for the current positions.
# What if we only dequant the ROWS that the attention scores point to?
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment D: Partial Row Dequant ---")

# In attention, after computing scores, we only need V[attended_positions]
# If seq_len=512 but attention is sparse (top-k), we can dequant fewer rows

# Simulate: dequant all 2048 rows vs only top-128 rows.
# Both sizes are measured as deltas against a baseline taken right before the
# allocation, and the full matrix is freed before the partial one is created.
# Previously the raw memory_allocated() totals were compared while W_big was still
# alive, so the "partial" figure was larger than the "full" one and the savings came
# out negative.
gc.collect()
torch.cuda.empty_cache()
base_alloc_d = torch.cuda.memory_allocated()
W_big = torch.randn(HIDDEN, HIDDEN, dtype=torch.float16, device=dev)
mem_full = (torch.cuda.memory_allocated() - base_alloc_d) / 1e6
del W_big

gc.collect()
torch.cuda.empty_cache()
base_alloc_d = torch.cuda.memory_allocated()

# Partial: only 128 rows (6.25% of the matrix)
rows_needed = 128
W_partial = torch.randn(rows_needed, HIDDEN, dtype=torch.float16, device=dev)
mem_partial = (torch.cuda.memory_allocated() - base_alloc_d) / 1e6

log(f"  Full matrix ({HIDDEN}×{HIDDEN}): {mem_full:.1f} MB")
log(f"  Partial ({rows_needed}×{HIDDEN}):   {mem_partial:.1f} MB")
log(f"  Savings: {mem_full - mem_partial:.1f} MB ({(1-mem_partial/mem_full)*100:.0f}%)")
log(f"  For 88 layers × 4 projections: {(mem_full-mem_partial)*88*4:.0f} MB potential savings")
log(f"  CAVEAT: Only works for attention V projection after scoring, not for Q/K/O")

del W_partial

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT E: "Gradient Accumulation with CPU Offload"
# Baseline: every accumulated gradient stays on the GPU.
# Idea: push each micro-batch gradient to the CPU as soon as it is produced, so the
# GPU holds only the model, one batch of activations and one micro-batch gradient.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment E: Immediate Gradient CPU Offload ---")

# Back-of-the-envelope estimate only: nothing is allocated in this experiment.
n_params = 4_500_000  # LoRA params for TinyLlama
param_size = n_params * 2  # bytes, at 2 bytes (FP16) per parameter

# Baseline: gradients of 4 micro-batches counted as resident in VRAM at once.
# NOTE(review): frameworks commonly accumulate into a single gradient buffer;
# confirm that four simultaneous copies is the intended baseline.
grad_on_gpu = param_size * 4 / 1e6
# Offload: one gradient on the GPU at a time, the remainder on the CPU.
grad_offload = param_size * 1 / 1e6

log(f"  Standard (4 grads on GPU): {grad_on_gpu:.1f} MB")
log(f"  Offload (1 grad on GPU):   {grad_offload:.1f} MB")
log(f"  Savings: {grad_on_gpu - grad_offload:.1f} MB")
# NOTE(review): the "18MB" quoted below is fixed text; param_size above works out to
# 9 MB (4.5M × 2 bytes). Confirm which precision the note refers to.
log(f"  Note: LoRA params are small (18MB) so grad savings are modest")
log(f"  The BIG savings come from activation memory, not gradient memory")

# ═══════════════════════════════════════════════════════════════════════════════
# EXPERIMENT F: "Activation Compression" — Compress activations in-flight
# Activations are kept between layers until the backward pass needs them.
# Idea: hold them as INT8 in the meantime and dequantize when backward runs.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n--- Experiment F: Activation Compression (INT8 between layers) ---")

gc.collect()
torch.cuda.empty_cache()

# Part 1 — analytic size estimate (no tensors involved).
batch_seq = 4 * 512  # batch=4, seq=512
act_fp16 = batch_seq * HIDDEN * 2 / 1e6  # 2 bytes per value
act_int8 = batch_seq * HIDDEN * 1 / 1e6  # 1 byte per value
n_stored_layers = 22  # layers that need stored activations (with gradient checkpointing)

log(f"  One layer activation (FP16): {act_fp16:.1f} MB")
log(f"  One layer activation (INT8): {act_int8:.1f} MB")
log(f"  With {n_stored_layers} checkpointed layers:")
log(f"    FP16 total: {act_fp16 * n_stored_layers:.0f} MB")
log(f"    INT8 total: {act_int8 * n_stored_layers:.0f} MB")
log(f"    Savings: {(act_fp16 - act_int8) * n_stored_layers:.0f} MB")

# Part 2 — round-trip error on random FP16 activations.
test_act = torch.randn(4, 512, HIDDEN, device=dev, dtype=torch.float16)
# One scale per vector along the last dim: its largest magnitude maps to 127.
scale = test_act.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
quantized = torch.clamp(torch.round(test_act / scale), -128, 127).to(torch.int8)
# Back to floating point for the comparison.
reconstructed = quantized.float() * scale

# Error metrics over the whole tensor: mean squared error and cosine similarity.
mse = F.mse_loss(reconstructed, test_act.float()).item()
cos = F.cosine_similarity(
    test_act.reshape(-1).float().unsqueeze(0),
    reconstructed.reshape(-1).unsqueeze(0),
).item()
log(f"  Activation INT8 quality: MSE={mse:.6e}, cosine={cos:.6f}")
verdict = '✅ Negligible error' if cos > 0.999 else '⚠️ Notable error'
log(f"  {verdict}")

del test_act, quantized, reconstructed

# ═══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# Prints a table of every experiment's savings plus a recommendation. The numbers are
# interpolated from module-level results computed earlier in this script:
# peak_standard, peak_buffered, peak_32, peak_16, mem_full, mem_partial,
# grad_on_gpu, grad_offload, act_fp16, act_int8 and n_stored_layers.
# ═══════════════════════════════════════════════════════════════════════════════

log("\n" + "="*60)
log("  SUMMARY: GPU Memory Reduction Strategies")
log("="*60)
# NOTE(review): the Effort / Worth-it columns and the "under 6GB" statement are fixed
# text, not derived from the measurements. The recommendation total adds a per-layer
# figure (A) to a whole-model figure (F) — confirm that mix is intended.
log(f"""
  Strategy                    Savings       Effort    Worth it?
  ─────────────────────────────────────────────────────────────
  A. Pre-allocated buffer     ~{peak_standard-peak_buffered:.0f} MB/layer    Low       ✅ Yes (simple, effective)
  B. FP16 dequant (not FP32)  ~{(peak_32-peak_16)*88:.0f} MB total   Already done ✅ Already implemented
  C. Shared codebook          Tiny           Already done ✅ Speed benefit > memory
  D. Partial row dequant      ~{(mem_full-mem_partial)*88*4:.0f} MB potential  High      ⚠️ Only for attention V
  E. Grad CPU offload         ~{grad_on_gpu-grad_offload:.0f} MB         Medium    ❌ LoRA grads are already small
  F. Activation INT8 compress ~{(act_fp16-act_int8)*n_stored_layers:.0f} MB total    Medium    ✅ Best bang for buck

  RECOMMENDATION:
  Combine A (buffer reuse) + F (INT8 activation compression) for
  maximum savings with minimal complexity. Together they save
  ~{peak_standard-peak_buffered + (act_fp16-act_int8)*n_stored_layers:.0f} MB — enough to comfortably fit TinyLlama 1.1B
  training in under 6GB GPU memory.
""")