Condensate / test_condenser.py
Executor-Tyrant-Framework's picture
Fix HF Space: CPU-only torch, lazy imports
efd23fa
"""
Condensate Layer 3: Condenser Tests
The moment of truth — does condensation actually save RAM?
Run: python3 test_condenser.py
"""
import numpy as np
import time
import os
import sys
sys.path.insert(0, os.path.dirname(__file__))
from condenser import Condenser
def test_basic_compression():
    """Test 1: Check that WARM and COLD round-trips return the original array.

    Registers a random 256x256 float32 array under ``"test.weights"``,
    compresses it to the WARM tier, promotes it back to HOT, then repeats the
    trip through the COLD tier. After each promotion the restored array must
    equal the original element for element.

    Raises:
        AssertionError: If a region reports an unexpected tier, still holds
            hot data after compression, uses RAM while COLD, or comes back
            different from the original.
    """
    print("\n--- Test 1: Lossless Compression Round-Trip ---")
    condenser = Condenser(demotion_idle_ms=1)
    try:
        # Register a copy so the pristine array stays available for comparison.
        original_data = np.random.randn(256, 256).astype(np.float32)
        condenser.register("test.weights", original_data.copy())
        region = condenser.regions["test.weights"]
        original_size = region.original_size

        # HOT -> WARM
        saved = region.compress_to_warm()
        assert region.tier == "WARM"
        assert region.hot_data is None
        assert region.warm_data is not None
        print(f" Original: {original_size / 1024:.1f} KB")
        print(f" Compressed: {region.compressed_size / 1024:.1f} KB")
        print(f" Ratio: {original_size / region.compressed_size:.1f}:1")
        print(f" Saved: {saved / 1024:.1f} KB")

        # WARM -> HOT
        restored = region.promote_to_hot()
        assert region.tier == "HOT"
        assert np.array_equal(restored, original_data), "Data corrupted after round-trip!"
        print(" Round-trip: LOSSLESS (arrays match exactly)")

        # HOT -> COLD (written under the condenser's cold_dir)
        region.compress_to_cold(condenser.cold_dir)
        assert region.tier == "COLD"
        assert region.current_ram_usage == 0
        print(" Cold (on disk): 0 KB RAM")

        # COLD -> HOT
        restored2 = region.promote_to_hot()
        assert region.tier == "HOT"
        assert np.array_equal(restored2, original_data), "Data corrupted after cold round-trip!"
        print(" Cold round-trip: LOSSLESS")
    finally:
        # Run cleanup even when an assertion fails, so a failing test does not
        # leave the condenser's resources behind (presumably the cold_dir
        # files - confirm against Condenser.cleanup).
        condenser.cleanup()
    print(" PASS")
def test_selective_condensation():
    """Test 2: Hot regions stay hot, cold regions compress.

    Builds 16 sparse regions. The workload reads blocks 0-3 on every call and
    one of blocks 4-15 with 5% probability. After the benchmark, the last
    entry of the promotion log must report at least 8 regions in the WARM or
    COLD tier.

    Raises:
        AssertionError: If fewer than 8 regions ended up WARM or COLD.
    """
    print("\n--- Test 2: Selective Condensation ---")
    # 16 regions x 32 KB each (128 x 64 float32) = 512 KB total.
    # Structured data (sparse + patterns) rather than pure noise.
    state = {}
    for i in range(16):
        arr = np.zeros((128, 64), dtype=np.float32)
        # Sparse: only ~20% of the entries are nonzero.
        mask = np.random.random((128, 64)) < 0.2
        arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
        state[f"block_{i}"] = arr

    hot_blocks = {0, 1, 2, 3}
    # Built once here instead of on every workload call.
    cold_blocks = list(range(4, 16))

    def workload(wrapped):
        # Hot blocks: accessed every iteration.
        for i in hot_blocks:
            _ = wrapped[f"block_{i}"]
        # Cold blocks: rarely accessed.
        if np.random.random() < 0.05:
            idx = np.random.choice(cold_blocks)
            _ = wrapped[f"block_{idx}"]
        time.sleep(0.001)

    condenser = Condenser(demotion_idle_ms=10, warmup_iters=15)
    try:
        results = condenser.run_benchmark(state, workload, iterations=30,
                                          name="selective")
        condenser.print_results(results)

        # Tier counts come from the most recent promotion-log entry; an empty
        # log counts as zero condensed regions.
        promotion_log = results["promotion_log"]
        last_log = promotion_log[-1] if promotion_log else {}
        warm_cold = last_log.get("warm", 0) + last_log.get("cold", 0)
        print(f" Condensed regions (WARM+COLD): {warm_cold} of {results['total_regions']}")
        print(f" RAM saved: {results['saved_mb']:.2f} MB ({results['saved_pct']:.1f}%)")
        assert warm_cold >= 8, f"Should condense at least 8 cold regions, got {warm_cold}"
    finally:
        # Run cleanup even when the assertion above fails.
        condenser.cleanup()
    print(" PASS")
def test_inference_workload():
    """Test 3: Simulated AI inference benchmark (report only, no assertions).

    State layout (all float32):
      * 6 layers, each with q/k/v matrices of 128x128 (64 KB each) and two
        FFN matrices of 128x512 and 512x128 (256 KB each), sparsely filled.
      * 2 all-zero KV-cache arrays of 256x128 (128 KB each) per layer.
      * 20 metadata vectors of 32 elements.

    Each workload call walks all six layers three times, then reads
    ``meta_0`` and ``meta_1``; ``meta_2`` .. ``meta_19`` are never accessed.
    The benchmark results are printed but not checked.
    """
    print("\n--- Test 3: AI Inference Workload (The Real Test) ---")
    state = {}
    # Model layers: sparse structured weights.
    for i in range(6):
        # Attention projections: 128x128, ~25% nonzero.
        for name in ["q", "k", "v"]:
            arr = np.zeros((128, 128), dtype=np.float32)
            mask = np.random.random((128, 128)) < 0.25
            arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
            state[f"layer_{i}_{name}"] = arr
        # Feed-forward matrices: ~20% nonzero.
        for name, shape in [("ffn_up", (128, 512)), ("ffn_down", (512, 128))]:
            arr = np.zeros(shape, dtype=np.float32)
            mask = np.random.random(shape) < 0.2
            arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
            state[f"layer_{i}_{name}"] = arr
    # KV cache: all zeros, which should compress very well.
    for i in range(6):
        state[f"kv_{i}_keys"] = np.zeros((256, 128), dtype=np.float32)
        state[f"kv_{i}_vals"] = np.zeros((256, 128), dtype=np.float32)
    # Config and metadata (small).
    for i in range(20):
        state[f"meta_{i}"] = np.zeros(32, dtype=np.float32)

    def workload(wrapped):
        # Token generation: sequential pass through the layers, three tokens.
        for _ in range(3):
            for layer_idx in range(6):
                _q = wrapped[f"layer_{layer_idx}_q"]
                _k = wrapped[f"layer_{layer_idx}_k"]
                _v = wrapped[f"layer_{layer_idx}_v"]
                _keys = wrapped[f"kv_{layer_idx}_keys"]
                _vals = wrapped[f"kv_{layer_idx}_vals"]
                _up = wrapped[f"layer_{layer_idx}_ffn_up"]
                _down = wrapped[f"layer_{layer_idx}_ffn_down"]
                time.sleep(0.0001)
        # Metadata accessed once per request.
        _ = wrapped["meta_0"]
        _ = wrapped["meta_1"]

    print(f" State: {len(state)} regions, "
          f"{sum(v.nbytes for v in state.values()) / 1024 / 1024:.2f} MB total")
    condenser = Condenser(demotion_idle_ms=5, warmup_iters=10)
    try:
        results = condenser.run_benchmark(state, workload, iterations=20,
                                          name="inference")
        condenser.print_results(results)
        print("\n *** INFERENCE RESULTS ***")
        print(f" Baseline RAM: {results['baseline_ram_mb']:.2f} MB")
        print(f" Condensed RAM: {results['avg_condensed_ram_mb']:.2f} MB")
        print(f" Saved: {results['saved_mb']:.2f} MB ({results['saved_pct']:.1f}%)")
        print(f" Prediction acc: {results['prediction_accuracy']}%")
    finally:
        # Run cleanup even if the benchmark or the reporting above raises.
        condenser.cleanup()
    print(" PASS")
def test_large_state():
    """Test 4: Larger state stress test (report only, no assertions).

    64 regions x 128 KB (256 x 128 float32) = 8 MB total state.
    Only 8 regions are hot at any time = 1 MB, which leaves roughly 7 MB
    eligible for condensation. The hot set switches between regions 0-7 and
    regions 32-39 every 10 workload calls.
    """
    print("\n--- Test 4: Large State Stress Test ---")
    # 64 regions x 128 KB each = 8 MB.
    # Structured sparse data (~15% nonzero).
    state = {}
    for i in range(64):
        arr = np.zeros((256, 128), dtype=np.float32)
        mask = np.random.random((256, 128)) < 0.15
        arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
        state[f"region_{i}"] = arr

    # Two hot sets of 8 regions that take turns.
    hot_set_a = set(range(0, 8))
    hot_set_b = set(range(32, 40))
    calls = 0

    def workload(wrapped):
        nonlocal calls
        calls += 1
        # Alternate between the two hot sets in runs of 10 calls.
        hot = hot_set_a if (calls % 20) < 10 else hot_set_b
        for i in hot:
            _ = wrapped[f"region_{i}"]
        time.sleep(0.002)

    total_mb = sum(v.nbytes for v in state.values()) / 1024 / 1024
    # Derive the hot-set footprint from the data so the message cannot drift
    # away from the actual array shapes.
    hot_mb = sum(state[f"region_{i}"].nbytes for i in hot_set_a) / 1024 / 1024
    print(f" State: {len(state)} regions, {total_mb:.1f} MB total")
    print(f" Only {len(hot_set_a)} regions hot at any time ({hot_mb:.0f} MB needed)")
    condenser = Condenser(demotion_idle_ms=15, warmup_iters=15)
    try:
        results = condenser.run_benchmark(state, workload, iterations=40,
                                          name="large")
        condenser.print_results(results)
        print("\n *** LARGE STATE RESULTS ***")
        print(f" Baseline RAM: {results['baseline_ram_mb']:.1f} MB (all in RAM)")
        print(f" Condensed RAM: {results['avg_condensed_ram_mb']:.1f} MB")
        print(f" Saved: {results['saved_mb']:.1f} MB ({results['saved_pct']:.1f}%)")
    finally:
        # Run cleanup even if the benchmark or the reporting above raises.
        condenser.cleanup()
    print(" PASS")
def test_prediction_value():
    """Test 5: Measure prediction-driven vs reactive promotions.

    Runs a fixed access pattern over 20 random 64x64 float32 chunks and
    reports what share of promotions the benchmark attributes to prediction
    versus reaction (cache miss). Nothing is asserted; the split is only
    printed.
    """
    print("\n--- Test 5: Prediction Value Measurement ---")
    state = {f"chunk_{i}": np.random.randn(64, 64).astype(np.float32)
             for i in range(20)}

    # Predictable pattern: 0 -> 1 -> 2 -> 3, then 10 -> 11 -> 12 -> 13.
    def workload(wrapped):
        for i in range(4):
            _ = wrapped[f"chunk_{i}"]
            time.sleep(0.001)
        time.sleep(0.005)
        for i in range(10, 14):
            _ = wrapped[f"chunk_{i}"]
            time.sleep(0.001)
        time.sleep(0.005)

    condenser = Condenser(demotion_idle_ms=8, warmup_iters=15)
    try:
        results = condenser.run_benchmark(state, workload, iterations=25,
                                          name="predval")
        condenser.print_results(results)
        pred = results["prediction_promotions"]
        react = results["reactive_promotions"]
        total = pred + react
        if total > 0:
            pred_pct = pred / total * 100
            print(f"\n Promotions: {total} total")
            print(f" Prediction-driven: {pred} ({pred_pct:.0f}%)")
            print(f" Reactive (miss): {react} ({100-pred_pct:.0f}%)")
            if pred_pct > 50:
                print(" GOOD — Majority of promotions are prediction-driven")
            else:
                print(" Prediction helps but reactive still dominates")
        else:
            # Avoids dividing by zero when no promotion happened at all.
            print(" No promotions needed (everything stayed HOT)")
    finally:
        # Run cleanup even if the benchmark or the reporting above raises.
        condenser.cleanup()
    print(" PASS")
if __name__ == "__main__":
    # Script entry point: run every test in order. Any uncaught exception
    # (including AssertionError) stops the run before the final banner.
    banner = "=" * 60
    print(banner)
    print(" CONDENSATE — Layer 3 Condenser Tests")
    print(" The Moment of Truth: Does It Actually Save RAM?")
    print(banner)
    for run_test in (
        test_basic_compression,
        test_selective_condensation,
        test_inference_workload,
        test_large_state,
        test_prediction_value,
    ):
        run_test()
    print("\n" + banner)
    print(" ALL TESTS PASSED")
    print(banner)