Spaces:
Runtime error
Runtime error
Commit ·
fd0231e
1
Parent(s): bc9f1cc
Restore full PyTorch demo with ZeroGPU support
Browse files
- @spaces.GPU decorator on all GPU-using functions
- Model loads to cuda, inference on GPU
- Two tabs: Live Model (ZeroGPU) and Synthetic (no GPU)
- Lazy torch import so Gradio starts fast
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- app.py +280 -220
- requirements.txt +3 -0
app.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
| 1 |
"""
|
| 2 |
Condensate — Live Demo
|
| 3 |
-
HuggingFace Spaces Gradio App
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
|
| 8 |
"Do the same, or more, with less."
|
| 9 |
"""
|
| 10 |
|
|
|
|
| 11 |
import gradio as gr
|
| 12 |
import numpy as np
|
| 13 |
import time
|
|
@@ -19,26 +20,214 @@ sys.path.insert(0, os.path.dirname(__file__))
|
|
| 19 |
from membrane import Membrane
|
| 20 |
from graph_builder import GraphBuilder
|
| 21 |
from predictor import Predictor
|
| 22 |
-
from condenser import Condenser
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
num_layers = int(num_layers)
|
| 29 |
num_hot = int(min(num_hot, num_layers))
|
| 30 |
num_iterations = int(num_iterations)
|
| 31 |
-
demotion_idle_ms = int(demotion_idle_ms)
|
| 32 |
|
| 33 |
output = []
|
| 34 |
output.append("=" * 55)
|
| 35 |
-
output.append(" CONDENSATE —
|
| 36 |
output.append("=" * 55)
|
| 37 |
|
| 38 |
-
# --- Build simulated model state ---
|
| 39 |
state = {}
|
| 40 |
for i in range(num_layers):
|
| 41 |
-
# Structured sparse data — like real model weights
|
| 42 |
arr = np.zeros((128, 128), dtype=np.float32)
|
| 43 |
mask = np.random.random((128, 128)) < 0.2
|
| 44 |
arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
|
|
@@ -47,22 +236,13 @@ def run_full_demo(num_layers, num_hot, num_iterations, demotion_idle_ms):
|
|
| 47 |
total_mb = sum(v.nbytes for v in state.values()) / 1024 / 1024
|
| 48 |
hot_set = set(range(num_hot))
|
| 49 |
|
| 50 |
-
output.append(f"\n
|
| 51 |
-
output.append(f"
|
| 52 |
-
output.append(f" {num_hot} layers hot (accessed every iteration)")
|
| 53 |
-
output.append(f" {num_layers - num_hot} layers cold (rarely accessed)")
|
| 54 |
-
output.append(f" {num_iterations} iterations")
|
| 55 |
-
|
| 56 |
-
# --- Layer 0: Membrane ---
|
| 57 |
-
output.append(f"\n{'─' * 55}")
|
| 58 |
-
output.append(" LAYER 0: Membrane (Access Observation)")
|
| 59 |
-
output.append(f"{'─' * 55}")
|
| 60 |
|
|
|
|
| 61 |
Membrane.clear()
|
| 62 |
wrapped = Membrane.wrap(state.copy(), "model")
|
| 63 |
-
|
| 64 |
-
start = time.monotonic()
|
| 65 |
-
for iteration in range(num_iterations):
|
| 66 |
for i in range(num_layers):
|
| 67 |
if i in hot_set:
|
| 68 |
_ = wrapped[f"layer_{i}"]
|
|
@@ -70,65 +250,18 @@ def run_full_demo(num_layers, num_hot, num_iterations, demotion_idle_ms):
|
|
| 70 |
_ = wrapped[f"layer_{i}"]
|
| 71 |
time.sleep(0.001)
|
| 72 |
|
| 73 |
-
elapsed = (time.monotonic() - start) * 1000
|
| 74 |
log = Membrane.get_log()
|
| 75 |
-
|
| 76 |
-
output.append(f" Access events captured: {len(log)}")
|
| 77 |
-
output.append(f" Observation time: {elapsed:.0f}ms")
|
| 78 |
-
|
| 79 |
-
# --- Layer 1: Graph Builder ---
|
| 80 |
-
output.append(f"\n{'─' * 55}")
|
| 81 |
-
output.append(" LAYER 1: Graph Builder (Pattern Discovery)")
|
| 82 |
-
output.append(f"{'─' * 55}")
|
| 83 |
-
|
| 84 |
graph = GraphBuilder(causal_window_ns=5_000_000)
|
| 85 |
graph.build(log)
|
| 86 |
-
|
| 87 |
-
hot_nodes = [n for n in graph.nodes.values()
|
| 88 |
-
if getattr(n, '_temp_class', '') == 'HOT']
|
| 89 |
-
warm_nodes = [n for n in graph.nodes.values()
|
| 90 |
-
if getattr(n, '_temp_class', '') == 'WARM']
|
| 91 |
-
cold_nodes = [n for n in graph.nodes.values()
|
| 92 |
-
if getattr(n, '_temp_class', '') == 'COLD']
|
| 93 |
-
chains = graph.get_causal_chains()
|
| 94 |
-
|
| 95 |
-
output.append(f" Nodes: {len(graph.nodes)}")
|
| 96 |
-
output.append(f" HOT: {len(hot_nodes)}")
|
| 97 |
-
output.append(f" WARM: {len(warm_nodes)}")
|
| 98 |
-
output.append(f" COLD: {len(cold_nodes)}")
|
| 99 |
-
output.append(f" Clusters: {len(graph.clusters)} (proto-hyperedges)")
|
| 100 |
-
output.append(f" Chains: {len(chains)} causal chains discovered")
|
| 101 |
-
|
| 102 |
-
if hot_nodes:
|
| 103 |
-
hot_accesses = sum(n.access_count for n in hot_nodes)
|
| 104 |
-
total_accesses = sum(n.access_count for n in graph.nodes.values())
|
| 105 |
-
output.append(f" Hot nodes handle {hot_accesses/total_accesses*100:.0f}% of all accesses")
|
| 106 |
-
|
| 107 |
-
# --- Layer 2: Predictor ---
|
| 108 |
-
output.append(f"\n{'─' * 55}")
|
| 109 |
-
output.append(" LAYER 2: Predictor (Causal Prediction)")
|
| 110 |
-
output.append(f"{'─' * 55}")
|
| 111 |
-
|
| 112 |
predictor = Predictor()
|
| 113 |
predictor.learn(graph)
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
output.append(f" Predictions made: {result['predictions_made']}")
|
| 119 |
-
output.append(f" Hits: {result['hits']}")
|
| 120 |
-
output.append(f" Misses: {result['misses']}")
|
| 121 |
-
output.append(f" *** ACCURACY: {result['accuracy']}% ***")
|
| 122 |
-
output.append(f" Hit breakdown:")
|
| 123 |
-
output.append(f" Direct successor: {result['direct_hits']}")
|
| 124 |
-
output.append(f" Chain propagation: {result['chain_hits']}")
|
| 125 |
-
output.append(f" Cluster co-access: {result['cluster_hits']}")
|
| 126 |
-
|
| 127 |
-
# --- Layer 3: Condenser ---
|
| 128 |
-
output.append(f"\n{'─' * 55}")
|
| 129 |
-
output.append(" LAYER 3: Condenser (RAM Reduction)")
|
| 130 |
-
output.append(f"{'─' * 55}")
|
| 131 |
|
|
|
|
| 132 |
def workload_fn(w):
|
| 133 |
for i in range(num_layers):
|
| 134 |
if i in hot_set:
|
|
@@ -137,123 +270,26 @@ def run_full_demo(num_layers, num_hot, num_iterations, demotion_idle_ms):
|
|
| 137 |
_ = w[f"layer_{i}"]
|
| 138 |
time.sleep(0.001)
|
| 139 |
|
| 140 |
-
condenser = Condenser(demotion_idle_ms=
|
| 141 |
bench = condenser.run_benchmark(state, workload_fn,
|
| 142 |
iterations=num_iterations, name="model")
|
| 143 |
|
| 144 |
-
output.append(f" Baseline
|
| 145 |
-
output.append(f" Condensed
|
| 146 |
-
output.append(f"
|
| 147 |
|
| 148 |
if bench.get('promotion_log'):
|
| 149 |
last = bench['promotion_log'][-1]
|
| 150 |
-
output.append(f" Final
|
| 151 |
-
|
| 152 |
-
output.append(f"")
|
| 153 |
-
output.append(f" ┌─────────────────────────────────────┐")
|
| 154 |
-
output.append(f" │ RAM SAVED: {bench['saved_mb']:.2f} MB ({bench['saved_pct']:.1f}%)" + " " * max(0, 15 - len(f"{bench['saved_mb']:.2f} MB ({bench['saved_pct']:.1f}%)")) + "│")
|
| 155 |
-
output.append(f" │ {bench['baseline_ram_mb']:.2f} MB → {bench['avg_condensed_ram_mb']:.2f} MB" + " " * max(0, 17 - len(f"{bench['baseline_ram_mb']:.2f} MB → {bench['avg_condensed_ram_mb']:.2f} MB")) + "│")
|
| 156 |
-
output.append(f" │ Same data. Same output. Less RAM. │")
|
| 157 |
-
output.append(f" └─────────────────────────────────────┘")
|
| 158 |
-
|
| 159 |
-
# --- Speedup estimate ---
|
| 160 |
-
output.append(f"\n{'─' * 55}")
|
| 161 |
-
output.append(" THEORETICAL IMPACT")
|
| 162 |
-
output.append(f"{'─' * 55}")
|
| 163 |
-
|
| 164 |
-
hit_rate = result['accuracy'] / 100.0
|
| 165 |
-
hit_lat = 100 # ns, pre-staged in RAM
|
| 166 |
-
miss_lat = 100_000 # ns, page from disk
|
| 167 |
-
|
| 168 |
-
with_pred = hit_rate * hit_lat + (1 - hit_rate) * miss_lat
|
| 169 |
-
without_pred = miss_lat
|
| 170 |
-
speedup = without_pred / with_pred if with_pred > 0 else 1.0
|
| 171 |
-
|
| 172 |
-
output.append(f" Cold access without prediction: {miss_lat/1000:.0f}μs (page from disk)")
|
| 173 |
-
output.append(f" Cold access with prediction: weighted avg {with_pred/1000:.1f}μs")
|
| 174 |
-
output.append(f" *** COLD ACCESS SPEEDUP: {speedup:.1f}x ***")
|
| 175 |
-
|
| 176 |
-
output.append(f"\n{'=' * 55}")
|
| 177 |
-
output.append(f" Condensate — Do the same, or more, with less.")
|
| 178 |
-
output.append(f"{'=' * 55}")
|
| 179 |
|
| 180 |
condenser.cleanup()
|
| 181 |
-
return "\n".join(output)
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
def run_comparison():
|
| 185 |
-
"""Side-by-side: various workload profiles."""
|
| 186 |
-
|
| 187 |
-
output = []
|
| 188 |
-
output.append("=" * 55)
|
| 189 |
-
output.append(" CONDENSATE — Workload Comparison")
|
| 190 |
-
output.append("=" * 55)
|
| 191 |
-
|
| 192 |
-
configs = [
|
| 193 |
-
("AI Inference (all hot)", 12, 12, 20),
|
| 194 |
-
("Selective (4 hot / 12 cold)", 16, 4, 30),
|
| 195 |
-
("Large model (8 hot / 56 cold)", 64, 8, 20),
|
| 196 |
-
("Edge device (2 hot / 14 cold)", 16, 2, 25),
|
| 197 |
-
]
|
| 198 |
-
|
| 199 |
-
output.append(f"\n {'Workload':<32} {'Base MB':>8} {'Cond MB':>8} {'Saved':>8} {'Pred':>6}")
|
| 200 |
-
output.append(f" {'─'*32} {'─'*8} {'─'*8} {'─'*8} {'─'*6}")
|
| 201 |
-
|
| 202 |
-
for name, layers, hot, iters in configs:
|
| 203 |
-
state = {}
|
| 204 |
-
for i in range(layers):
|
| 205 |
-
arr = np.zeros((128, 128), dtype=np.float32)
|
| 206 |
-
mask = np.random.random((128, 128)) < 0.2
|
| 207 |
-
arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
|
| 208 |
-
state[f"layer_{i}"] = arr
|
| 209 |
-
|
| 210 |
-
hot_set = set(range(hot))
|
| 211 |
-
|
| 212 |
-
def workload_fn(w, hs=hot_set, nl=layers):
|
| 213 |
-
for i in range(nl):
|
| 214 |
-
if i in hs:
|
| 215 |
-
_ = w[f"layer_{i}"]
|
| 216 |
-
elif np.random.random() < 0.03:
|
| 217 |
-
_ = w[f"layer_{i}"]
|
| 218 |
-
time.sleep(0.001)
|
| 219 |
-
|
| 220 |
-
# Quick train for prediction accuracy
|
| 221 |
-
Membrane.clear()
|
| 222 |
-
wrapped = Membrane.wrap(state.copy(), "m")
|
| 223 |
-
for _ in range(10):
|
| 224 |
-
workload_fn(wrapped)
|
| 225 |
-
log = Membrane.get_log()
|
| 226 |
-
graph = GraphBuilder(causal_window_ns=5_000_000)
|
| 227 |
-
graph.build(log)
|
| 228 |
-
pred = Predictor()
|
| 229 |
-
pred.learn(graph)
|
| 230 |
-
score = pred.score(log)
|
| 231 |
-
|
| 232 |
-
# Condenser benchmark
|
| 233 |
-
condenser = Condenser(demotion_idle_ms=10, warmup_iters=8)
|
| 234 |
-
bench = condenser.run_benchmark(state, workload_fn,
|
| 235 |
-
iterations=iters, name="m")
|
| 236 |
-
|
| 237 |
-
output.append(f" {name:<32} {bench['baseline_ram_mb']:>7.1f} "
|
| 238 |
-
f"{bench['avg_condensed_ram_mb']:>7.1f} "
|
| 239 |
-
f"{bench['saved_pct']:>6.1f}% "
|
| 240 |
-
f"{score['accuracy']:>5.1f}%")
|
| 241 |
-
|
| 242 |
-
condenser.cleanup()
|
| 243 |
-
|
| 244 |
-
output.append(f"\n Key insight: when everything is hot, Condensate")
|
| 245 |
-
output.append(f" correctly does NOTHING (0% savings = correct answer).")
|
| 246 |
-
output.append(f" When cold state exists, savings scale with cold ratio.")
|
| 247 |
output.append(f"\n{'=' * 55}")
|
| 248 |
-
|
| 249 |
return "\n".join(output)
|
| 250 |
|
| 251 |
|
| 252 |
# --- Gradio UI ---
|
| 253 |
|
| 254 |
-
with gr.Blocks(
|
| 255 |
-
title="Condensate — Do More With Less",
|
| 256 |
-
) as demo:
|
| 257 |
|
| 258 |
gr.Markdown("""
|
| 259 |
# Condensate
|
|
@@ -262,51 +298,75 @@ with gr.Blocks(
|
|
| 262 |
Condensate uses a neural substrate with causal spike propagation
|
| 263 |
to learn memory access patterns and dynamically condense RAM usage.
|
| 264 |
|
| 265 |
-
**
|
| 266 |
-
|
| 267 |
-
and 50-82% RAM reduction in testing.**
|
| 268 |
|
| 269 |
-
|
| 270 |
-
|
| 271 |
""")
|
| 272 |
|
| 273 |
-
with gr.
|
| 274 |
-
with gr.
|
| 275 |
-
gr.
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
if __name__ == "__main__":
|
| 312 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
"""
|
| 2 |
Condensate — Live Demo
|
| 3 |
+
HuggingFace Spaces Gradio App (ZeroGPU)
|
| 4 |
|
| 5 |
+
Shows real-time RAM condensation on a live model.
|
| 6 |
+
Compares baseline vs condensed inference.
|
| 7 |
|
| 8 |
"Do the same, or more, with less."
|
| 9 |
"""
|
| 10 |
|
| 11 |
+
import spaces
|
| 12 |
import gradio as gr
|
| 13 |
import numpy as np
|
| 14 |
import time
|
|
|
|
| 20 |
from membrane import Membrane
|
| 21 |
from graph_builder import GraphBuilder
|
| 22 |
from predictor import Predictor
|
|
|
|
| 23 |
|
| 24 |
+
# Lazy imports for heavy deps
|
| 25 |
+
torch = None
|
| 26 |
+
TorchMembrane = None
|
| 27 |
|
| 28 |
+
|
| 29 |
+
def _ensure_torch():
    """Import the heavy dependencies on first use and cache them in module globals.

    Binds the module-level names ``torch`` and ``TorchMembrane`` (both start
    out as ``None``) so that importing this module does not itself import
    PyTorch. Once both names are bound, further calls return immediately.

    Raises:
        ImportError: if ``torch`` or ``torch_membrane`` cannot be imported.
    """
    global torch, TorchMembrane

    # Check both names: testing only `torch` would skip the retry forever if
    # the torch import had succeeded but the torch_membrane import had failed,
    # leaving TorchMembrane as None for every later caller.
    if torch is not None and TorchMembrane is not None:
        return

    import torch as _torch
    from torch_membrane import TorchMembrane as _TM

    # Publish only after both imports succeeded so the globals are never left
    # half-initialised.
    torch = _torch
    TorchMembrane = _TM
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# --- Global state ---
|
| 39 |
+
MODEL = None
|
| 40 |
+
TOKENIZER = None
|
| 41 |
+
MEMBRANE = None
|
| 42 |
+
PREDICTOR = None
|
| 43 |
+
GRAPH = None
|
| 44 |
+
MODEL_NAME = "distilgpt2"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@spaces.GPU
def load_model():
    """Load the tokenizer and causal LM named by MODEL_NAME and wrap the model.

    Populates the module globals TOKENIZER, MODEL and MEMBRANE: the model is
    loaded in float32 with attention outputs enabled, switched to eval mode,
    moved to "cuda", and handed to TorchMembrane.

    Returns:
        str: a one-line status message including the parameter count in
        millions.
    """
    # NOTE(review): on ZeroGPU, functions decorated with @spaces.GPU may run
    # in a separate worker process — confirm that the globals assigned here
    # are still visible to later calls made from the Gradio handlers.
    global MODEL, TOKENIZER, MEMBRANE

    _ensure_torch()
    from transformers import AutoModelForCausalLM, AutoTokenizer

    TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if TOKENIZER.pad_token is None:
        TOKENIZER.pad_token = TOKENIZER.eos_token

    load_kwargs = {
        "torch_dtype": torch.float32,
        "output_attentions": True,
    }
    MODEL = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)
    MODEL.eval()
    MODEL.to("cuda")

    MEMBRANE = TorchMembrane(MODEL)

    # Total element count across all parameters, reported in millions.
    total_params = 0
    for tensor in MODEL.parameters():
        total_params += tensor.numel()
    millions = total_params / 1e6

    return f"Loaded {MODEL_NAME} ({millions:.1f}M params) on ZeroGPU"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
@spaces.GPU
def train_predictor(num_prompts=5):
    """Run several fixed prompts through the model and train the predictor.

    Generates 20 greedy tokens for each selected prompt, converts what the
    membrane recorded into an access log, builds a GraphBuilder from that
    log, and fits a Predictor on the graph. The results are stored in the
    module globals GRAPH and PREDICTOR. The model is loaded first if it has
    not been loaded yet.

    Args:
        num_prompts: how many of the built-in prompts to use. Coerced to
            ``int`` and clamped to ``1..len(prompts)``; ``None`` selects all
            of them.

    Returns:
        str: a multi-line summary (prompt count, access-event count,
        prediction accuracy, causal chains, clusters).
    """
    global PREDICTOR, GRAPH

    _ensure_torch()

    if MODEL is None:
        load_model()

    prompts = [
        "The quick brown fox jumps over the lazy",
        "In the beginning there was darkness and then",
        "Machine learning models can be optimized by",
        "The capital of France is Paris and the",
        "Once upon a time in a land far far",
        "Artificial intelligence will transform the way we",
        "The most important thing about programming is",
        "When the sun sets over the mountains the",
    ]

    # UI widgets may hand over floats, and a zero/negative count would leave
    # nothing to learn from, so normalise before slicing.
    if num_prompts is None:
        count = len(prompts)
    else:
        count = max(1, min(int(num_prompts), len(prompts)))
    training_prompts = prompts[:count]

    # Start from a clean membrane so the log only reflects these prompts.
    MEMBRANE.reset()

    for prompt in training_prompts:
        inputs = TOKENIZER(prompt, return_tensors="pt", padding=True).to("cuda")
        with torch.no_grad():
            MODEL.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=False,
                pad_token_id=TOKENIZER.pad_token_id,
            )

    log = MEMBRANE.to_access_log()

    GRAPH = GraphBuilder(causal_window_ns=5_000_000)
    GRAPH.build(log)

    PREDICTOR = Predictor()
    PREDICTOR.learn(GRAPH)

    # Scored against the same log the predictor was trained on.
    result = PREDICTOR.score(log)

    return (f"Trained on {len(training_prompts)} prompts, "
            f"{len(log)} access events observed.\n"
            f"Prediction accuracy: {result['accuracy']}%\n"
            f"Causal chains discovered: {len(GRAPH.get_causal_chains())}\n"
            f"Clusters (proto-hyperedges): {len(GRAPH.clusters)}")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@spaces.GPU
def run_analysis(prompt, max_tokens=30):
    """Run sampled generation for ``prompt`` and report what the membrane saw.

    Loads the model and trains the predictor first if either is missing,
    resets the membrane, generates up to ``max_tokens`` new tokens, and then
    formats two text reports from the membrane's activation map and
    condensation potential.

    Args:
        prompt: text to continue. An empty or whitespace-only prompt returns
            a short message instead of running the model.
        max_tokens: maximum number of new tokens; coerced to ``int``.

    Returns:
        tuple[str, str]: ``(comparison_report, activation_map_report)``.
    """
    # Guard before any model work: tokenising an empty prompt would hand
    # generate() zero input tokens.
    if not prompt or not prompt.strip():
        return "Please enter a prompt.", ""

    _ensure_torch()

    if MODEL is None:
        load_model()
    if PREDICTOR is None:
        train_predictor()

    # Clear earlier recordings so the report covers only this generation.
    MEMBRANE.reset()

    inputs = TOKENIZER(prompt, return_tensors="pt", padding=True).to("cuda")
    start = time.monotonic()

    with torch.no_grad():
        outputs = MODEL.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=TOKENIZER.pad_token_id,
        )

    elapsed_ms = (time.monotonic() - start) * 1000
    generated_text = TOKENIZER.decode(outputs[0], skip_special_tokens=True)

    activation_map = MEMBRANE.get_activation_map()
    potential = MEMBRANE.get_condensation_potential()

    log = MEMBRANE.to_access_log()
    pred_result = PREDICTOR.score(log)

    baseline_mb = potential['total_mb']
    condensed_mb = potential['hot_mb']
    saved_pct = potential['savings_pct']

    # --- Comparison report ---
    box_top = " ┌─────────────────────────────────────┐"
    box_bottom = " └─────────────────────────────────────┘\n"
    inner_width = box_top.count("─")

    def boxed(text):
        # Pad to the border width so the right edge lines up regardless of
        # how many digits the formatted numbers take.
        return f" │{text:<{inner_width}}│"

    comparison = [
        "=" * 55,
        " BASELINE vs CONDENSATE",
        "=" * 55,
        f"\n Generated: {generated_text}",
        f" Time: {elapsed_ms:.0f}ms\n",
        " WITHOUT Condensate:",
        f" All {potential['total_layers']} layers in RAM: {baseline_mb:.2f} MB",
        " (Every weight loaded, whether needed or not)\n",
        " WITH Condensate:",
        f" {potential['hot_layers']} HOT layers in RAM: {condensed_mb:.2f} MB",
        f" {potential['cold_layers']} COLD layers paged: {potential['cold_mb']:.2f} MB saved",
        " (Cold layers compressed or on disk,",
        " pre-staged back to RAM before needed)\n",
        box_top,
        boxed(f" RAM REDUCTION: {saved_pct:.1f}%"),
        boxed(f" {baseline_mb:.2f} MB → {condensed_mb:.2f} MB"),
        boxed(" Same output. Same quality."),
        box_bottom,
        f" Prediction accuracy: {pred_result['accuracy']}%",
        f" Access events: {len(log)}",
    ]

    # --- Activation-map report ---
    max_rows = 40  # cap the table; the remainder is summarised in one line
    analysis = [
        "=" * 55,
        " LAYER ACTIVATION MAP",
        "=" * 55,
        f"\n {'Layer':<35} {'Fwd':>4} {'Activation':>10} {'MB':>6} {'Tier':>5}",
        f" {'-'*35} {'-'*4} {'-'*10} {'-'*6} {'-'*5}",
    ]

    for layer in activation_map[:max_rows]:
        name = layer['name']
        # Keep the tail of long names so the column stays 35 characters wide.
        if len(name) > 35:
            name = "..." + name[-32:]
        attn = " [A]" if layer['is_attention'] else ""
        analysis.append(f" {name:<35} {layer['forward_count']:>4} "
                        f"{layer['avg_activation']:>10.3f} "
                        f"{layer['param_mb']:>6.3f} "
                        f"{layer['temperature']:>5}{attn}")

    if len(activation_map) > max_rows:
        analysis.append(f" ... and {len(activation_map) - max_rows} more layers")

    return "\n".join(comparison), "\n".join(analysis)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
# --- Also keep the synthetic demo for comparison ---
|
| 215 |
+
|
| 216 |
+
def run_synthetic_demo(num_layers, num_hot, num_iterations):
|
| 217 |
+
"""Run the PoC pipeline on synthetic data (no GPU needed)."""
|
| 218 |
+
from condenser import Condenser
|
| 219 |
|
| 220 |
num_layers = int(num_layers)
|
| 221 |
num_hot = int(min(num_hot, num_layers))
|
| 222 |
num_iterations = int(num_iterations)
|
|
|
|
| 223 |
|
| 224 |
output = []
|
| 225 |
output.append("=" * 55)
|
| 226 |
+
output.append(" CONDENSATE — Synthetic Pipeline Demo")
|
| 227 |
output.append("=" * 55)
|
| 228 |
|
|
|
|
| 229 |
state = {}
|
| 230 |
for i in range(num_layers):
|
|
|
|
| 231 |
arr = np.zeros((128, 128), dtype=np.float32)
|
| 232 |
mask = np.random.random((128, 128)) < 0.2
|
| 233 |
arr[mask] = np.random.randn(mask.sum()).astype(np.float32)
|
|
|
|
| 236 |
total_mb = sum(v.nbytes for v in state.values()) / 1024 / 1024
|
| 237 |
hot_set = set(range(num_hot))
|
| 238 |
|
| 239 |
+
output.append(f"\n {num_layers} regions x 64KB = {total_mb:.1f} MB total")
|
| 240 |
+
output.append(f" {num_hot} hot / {num_layers - num_hot} cold")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
# Membrane + Graph + Predictor
|
| 243 |
Membrane.clear()
|
| 244 |
wrapped = Membrane.wrap(state.copy(), "model")
|
| 245 |
+
for _ in range(num_iterations):
|
|
|
|
|
|
|
| 246 |
for i in range(num_layers):
|
| 247 |
if i in hot_set:
|
| 248 |
_ = wrapped[f"layer_{i}"]
|
|
|
|
| 250 |
_ = wrapped[f"layer_{i}"]
|
| 251 |
time.sleep(0.001)
|
| 252 |
|
|
|
|
| 253 |
log = Membrane.get_log()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
graph = GraphBuilder(causal_window_ns=5_000_000)
|
| 255 |
graph.build(log)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
predictor = Predictor()
|
| 257 |
predictor.learn(graph)
|
| 258 |
+
score = predictor.score(log)
|
| 259 |
|
| 260 |
+
output.append(f"\n Prediction accuracy: {score['accuracy']}%")
|
| 261 |
+
output.append(f" Clusters: {len(graph.clusters)}")
|
| 262 |
+
output.append(f" Causal chains: {len(graph.get_causal_chains())}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
+
# Condenser
|
| 265 |
def workload_fn(w):
|
| 266 |
for i in range(num_layers):
|
| 267 |
if i in hot_set:
|
|
|
|
| 270 |
_ = w[f"layer_{i}"]
|
| 271 |
time.sleep(0.001)
|
| 272 |
|
| 273 |
+
condenser = Condenser(demotion_idle_ms=10, warmup_iters=8)
|
| 274 |
bench = condenser.run_benchmark(state, workload_fn,
|
| 275 |
iterations=num_iterations, name="model")
|
| 276 |
|
| 277 |
+
output.append(f"\n Baseline: {bench['baseline_ram_mb']:.2f} MB")
|
| 278 |
+
output.append(f" Condensed: {bench['avg_condensed_ram_mb']:.2f} MB")
|
| 279 |
+
output.append(f" *** SAVED: {bench['saved_mb']:.2f} MB ({bench['saved_pct']:.1f}%) ***")
|
| 280 |
|
| 281 |
if bench.get('promotion_log'):
|
| 282 |
last = bench['promotion_log'][-1]
|
| 283 |
+
output.append(f" Final: HOT={last['hot']} WARM={last['warm']} COLD={last['cold']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
condenser.cleanup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
output.append(f"\n{'=' * 55}")
|
|
|
|
| 287 |
return "\n".join(output)
|
| 288 |
|
| 289 |
|
| 290 |
# --- Gradio UI ---
|
| 291 |
|
| 292 |
+
with gr.Blocks(title="Condensate — Do More With Less") as demo:
|
|
|
|
|
|
|
| 293 |
|
| 294 |
gr.Markdown("""
|
| 295 |
# Condensate
|
|
|
|
| 298 |
Condensate uses a neural substrate with causal spike propagation
|
| 299 |
to learn memory access patterns and dynamically condense RAM usage.
|
| 300 |
|
| 301 |
+
**Live Model tab:** Runs a real transformer (distilgpt2) on ZeroGPU
|
| 302 |
+
and shows which layers are HOT vs COLD for your input.
|
|
|
|
| 303 |
|
| 304 |
+
**Synthetic tab:** Runs the full 4-layer pipeline on configurable
|
| 305 |
+
simulated workloads (no GPU needed).
|
| 306 |
""")
|
| 307 |
|
| 308 |
+
with gr.Tabs():
|
| 309 |
+
with gr.TabItem("Live Model (ZeroGPU)"):
|
| 310 |
+
with gr.Row():
|
| 311 |
+
with gr.Column():
|
| 312 |
+
status = gr.Textbox(label="Status", interactive=False, lines=3)
|
| 313 |
+
load_btn = gr.Button("1. Load Model", variant="primary")
|
| 314 |
+
train_btn = gr.Button("2. Train Predictor", variant="primary")
|
| 315 |
+
|
| 316 |
+
with gr.Row():
|
| 317 |
+
with gr.Column():
|
| 318 |
+
prompt_input = gr.Textbox(
|
| 319 |
+
label="Prompt",
|
| 320 |
+
value="The future of artificial intelligence is",
|
| 321 |
+
lines=2,
|
| 322 |
+
)
|
| 323 |
+
max_tokens = gr.Slider(
|
| 324 |
+
minimum=10, maximum=100, value=30, step=5,
|
| 325 |
+
label="Max tokens"
|
| 326 |
+
)
|
| 327 |
+
run_btn = gr.Button("3. Run & Analyze", variant="primary")
|
| 328 |
+
|
| 329 |
+
with gr.Row():
|
| 330 |
+
with gr.Column():
|
| 331 |
+
comparison_output = gr.Textbox(
|
| 332 |
+
label="Baseline vs Condensate",
|
| 333 |
+
lines=25, interactive=False,
|
| 334 |
+
)
|
| 335 |
+
with gr.Column():
|
| 336 |
+
analysis_output = gr.Textbox(
|
| 337 |
+
label="Layer Activation Map",
|
| 338 |
+
lines=25, interactive=False,
|
| 339 |
+
)
|
| 340 |
+
|
| 341 |
+
load_btn.click(fn=load_model, outputs=status)
|
| 342 |
+
train_btn.click(fn=train_predictor, outputs=status)
|
| 343 |
+
run_btn.click(
|
| 344 |
+
fn=run_analysis,
|
| 345 |
+
inputs=[prompt_input, max_tokens],
|
| 346 |
+
outputs=[comparison_output, analysis_output],
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
with gr.TabItem("Synthetic Workload"):
|
| 350 |
+
with gr.Row():
|
| 351 |
+
with gr.Column():
|
| 352 |
+
syn_layers = gr.Slider(minimum=4, maximum=128, value=32, step=4,
|
| 353 |
+
label="Total memory regions")
|
| 354 |
+
syn_hot = gr.Slider(minimum=1, maximum=64, value=6, step=1,
|
| 355 |
+
label="Hot regions")
|
| 356 |
+
syn_iters = gr.Slider(minimum=10, maximum=50, value=20, step=5,
|
| 357 |
+
label="Iterations")
|
| 358 |
+
syn_btn = gr.Button("Run Pipeline", variant="primary")
|
| 359 |
+
with gr.Column():
|
| 360 |
+
syn_output = gr.Textbox(
|
| 361 |
+
label="Results", lines=25,
|
| 362 |
+
interactive=False, show_copy_button=True,
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
syn_btn.click(
|
| 366 |
+
fn=run_synthetic_demo,
|
| 367 |
+
inputs=[syn_layers, syn_hot, syn_iters],
|
| 368 |
+
outputs=syn_output,
|
| 369 |
+
)
|
| 370 |
|
| 371 |
if __name__ == "__main__":
|
| 372 |
demo.launch(server_name="0.0.0.0", server_port=7860)
|
requirements.txt
CHANGED
|
@@ -1,2 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
numpy
|
| 2 |
lz4
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers
|
| 3 |
numpy
|
| 4 |
lz4
|
| 5 |
+
spaces
|