Executor-Tyrant-Framework Claude Opus 4.6 (1M context) committed on
Commit
0f055ca
·
1 Parent(s): b921d17

Restructure: single GPU call for load+train, no nesting

Browse files

Problem: train_predictor called load_model inside a @spaces.GPU
function, and the nested calls confused ZeroGPU allocation. The GPU
popup never appeared for training.

Fix: One "Load & Train" button = one @spaces.GPU(duration=120)
call that loads the model to CUDA, runs the training prompts, and
builds the predictor. No nesting. Clean GPU lifecycle.

Two GPU calls total: load_and_train, run_analysis.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +56 -84
app.py CHANGED
@@ -21,20 +21,6 @@ from membrane import Membrane
21
  from graph_builder import GraphBuilder
22
  from predictor import Predictor
23
 
24
- # Lazy imports for heavy deps
25
- torch = None
26
- TorchMembrane = None
27
-
28
-
29
- def _ensure_torch():
30
- global torch, TorchMembrane
31
- if torch is None:
32
- import torch as _torch
33
- torch = _torch
34
- from torch_membrane import TorchMembrane as _TM
35
- TorchMembrane = _TM
36
-
37
-
38
  # --- Global state ---
39
  MODEL = None
40
  TOKENIZER = None
@@ -44,49 +30,42 @@ GRAPH = None
44
  MODEL_NAME = "gpt2-large"
45
 
46
 
47
- def load_model():
48
- """Load model on CPU — ZeroGPU moves to GPU when needed."""
49
- global MODEL, TOKENIZER, MEMBRANE
 
50
 
51
- _ensure_torch()
52
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
53
 
 
54
  TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
55
  if TOKENIZER.pad_token is None:
56
  TOKENIZER.pad_token = TOKENIZER.eos_token
57
 
 
58
  MODEL = AutoModelForCausalLM.from_pretrained(
59
  MODEL_NAME,
60
- dtype=torch.float32,
61
  )
62
  MODEL.eval()
63
-
64
- MEMBRANE = TorchMembrane(MODEL)
65
 
66
  param_count = sum(p.numel() for p in MODEL.parameters()) / 1e6
67
- return f"Loaded {MODEL_NAME} ({param_count:.1f}M params)"
68
-
69
 
70
- @spaces.GPU(duration=120)
71
- def train_predictor(num_prompts=5):
72
- """Run several prompts to train the predictor on access patterns."""
73
- global PREDICTOR, GRAPH, MEMBRANE
74
-
75
- _ensure_torch()
76
-
77
- if MODEL is None:
78
- load_model()
79
-
80
- MODEL.to("cuda")
81
  MEMBRANE.reset()
82
 
 
83
  training_prompts = [
84
  "The quick brown fox jumps over the lazy",
85
  "In the beginning there was darkness and then",
86
  "Machine learning models can be optimized by",
87
  "The capital of France is Paris and the",
88
  "Once upon a time in a land far far",
89
- ][:num_prompts]
90
 
91
  for prompt in training_prompts:
92
  inputs = TOKENIZER(prompt, return_tensors="pt", padding=True).to("cuda")
@@ -98,6 +77,7 @@ def train_predictor(num_prompts=5):
98
  pad_token_id=TOKENIZER.pad_token_id,
99
  )
100
 
 
101
  log = MEMBRANE.to_access_log()
102
 
103
  GRAPH = GraphBuilder(causal_window_ns=5_000_000)
@@ -108,11 +88,12 @@ def train_predictor(num_prompts=5):
108
 
109
  result = PREDICTOR.score(log)
110
 
111
- return (f"Trained on {len(training_prompts)} prompts, "
112
- f"{len(log)} access events observed.\n"
 
113
  f"Prediction accuracy: {result['accuracy']}%\n"
114
- f"Causal chains discovered: {len(GRAPH.get_causal_chains())}\n"
115
- f"Clusters (proto-hyperedges): {len(GRAPH.clusters)}")
116
 
117
 
118
  @spaces.GPU(duration=120)
@@ -120,12 +101,10 @@ def run_analysis(prompt, max_tokens=30):
120
  """Run inference, show activation map + condensation potential."""
121
  global MEMBRANE, PREDICTOR
122
 
123
- _ensure_torch()
124
 
125
- if MODEL is None:
126
- load_model()
127
- if PREDICTOR is None:
128
- train_predictor()
129
 
130
  MODEL.to("cuda")
131
  MEMBRANE.reset()
@@ -155,7 +134,7 @@ def run_analysis(prompt, max_tokens=30):
155
  log = MEMBRANE.to_access_log()
156
  pred_result = PREDICTOR.score(log)
157
 
158
- # Build comparison output — showing BOTH granularities
159
  comparison = []
160
  comparison.append("=" * 55)
161
  comparison.append(" BASELINE vs CONDENSATE")
@@ -163,55 +142,52 @@ def run_analysis(prompt, max_tokens=30):
163
  comparison.append(f"\n Generated: {generated_text}")
164
  comparison.append(f" Time: {elapsed_ms:.0f}ms\n")
165
 
166
- # Layer-level (the floor)
167
  layer_baseline = potential['total_mb']
168
  layer_saved_pct = potential['savings_pct']
169
 
170
  comparison.append(f" WITHOUT Condensate:")
171
  comparison.append(f" All params in RAM: {layer_baseline:.2f} MB\n")
172
 
173
- comparison.append(f" ── Layer-Level (v1 floor) ──")
174
  comparison.append(f" HOT layers: {potential['hot_layers']} "
175
  f"COLD layers: {potential['cold_layers']}")
176
  comparison.append(f" Savings: {potential['cold_mb']:.2f} MB ({layer_saved_pct:.1f}%)\n")
177
 
178
- # Head-level (the real number)
179
  if head_potential['total_heads'] > 0:
180
- comparison.append(f" ── Head-Level (v2) ──")
181
  comparison.append(f" HOT heads: {head_potential['hot_heads']} "
182
  f"COLD heads: {head_potential['cold_heads']} "
183
  f"(of {head_potential['total_heads']} total)")
184
- comparison.append(f" Cold attention: {head_potential['attn_cold_mb']:.2f} MB")
185
  comparison.append(f" Cold non-attention: {head_potential['non_attn_cold_mb']:.2f} MB")
186
  comparison.append(f" Total cold: {head_potential['cold_mb']:.2f} MB\n")
187
 
188
- comparison.append(f" ┌─────────────────────────────────────────┐")
189
- comparison.append(f" HEAD-LEVEL RAM REDUCTION:")
190
- comparison.append(f" {head_potential['savings_pct']:.1f}% "
191
  f"({head_potential['cold_mb']:.2f} MB saved)"
192
- + " " * max(0, 14 - len(f"{head_potential['savings_pct']:.1f}% ({head_potential['cold_mb']:.2f} MB saved)"))
193
- + "")
194
- comparison.append(f" {head_potential['total_mb']:.2f} MB "
195
  f"{head_potential['hot_mb']:.2f} MB"
196
- + " " * max(0, 19 - len(f"{head_potential['total_mb']:.2f} MB {head_potential['hot_mb']:.2f} MB"))
197
- + "")
198
- comparison.append(f" Same output. Same quality.")
199
- comparison.append(f" └─────────────────────────────────────────┘\n")
200
 
201
  comparison.append(f" Layer-level floor: {layer_saved_pct:.1f}%")
202
  comparison.append(f" Head-level actual: {head_potential['savings_pct']:.1f}%")
203
  else:
204
- comparison.append(f" ┌─────────────────────────────────────┐")
205
- comparison.append(f" RAM REDUCTION: {layer_saved_pct:.1f}%")
206
- comparison.append(f" (Layer-level only — no heads found)")
207
- comparison.append(f" └─────────────────────────────────────┘\n")
208
 
209
  comparison.append(f"\n Prediction accuracy: {pred_result['accuracy']}%")
210
  comparison.append(f" Access events: {len(log)}")
211
 
212
- # Build analysis output — head-level detail
213
  analysis = []
214
-
215
  head_map = MEMBRANE.get_head_map()
216
  cold_heads = MEMBRANE.get_cold_heads()
217
  hot_heads = [h for h in head_map if h['temperature'] == 'HOT']
@@ -224,7 +200,6 @@ def run_analysis(prompt, max_tokens=30):
224
  analysis.append(f" {head_potential['hot_heads']} HOT / "
225
  f"{head_potential['cold_heads']} COLD\n")
226
 
227
- # Show coldest heads
228
  if cold_heads:
229
  analysis.append(f" COLDEST HEADS (condensable):")
230
  analysis.append(f" {'Head':<35} {'AvgAct':>10} {'MB':>6}")
@@ -236,7 +211,6 @@ def run_analysis(prompt, max_tokens=30):
236
  if len(cold_heads) > 20:
237
  analysis.append(f" ... and {len(cold_heads) - 20} more cold heads")
238
 
239
- # Show hottest for comparison
240
  if hot_heads:
241
  analysis.append(f"\n HOTTEST HEADS (must stay in RAM):")
242
  analysis.append(f" {'Head':<35} {'AvgAct':>10} {'MB':>6}")
@@ -246,7 +220,6 @@ def run_analysis(prompt, max_tokens=30):
246
  analysis.append(f" {name:<35} {h['avg_activation']:>10.4f} "
247
  f"{h['param_mb']:>6.4f}")
248
  else:
249
- # Fall back to layer-level
250
  analysis.append("=" * 55)
251
  analysis.append(" LAYER ACTIVATION MAP")
252
  analysis.append("=" * 55)
@@ -264,10 +237,10 @@ def run_analysis(prompt, max_tokens=30):
264
  return "\n".join(comparison), "\n".join(analysis)
265
 
266
 
267
- # --- Also keep the synthetic demo for comparison ---
268
 
269
  def run_synthetic_demo(num_layers, num_hot, num_iterations):
270
- """Run the PoC pipeline on synthetic data (no GPU needed)."""
271
  from condenser import Condenser
272
 
273
  num_layers = int(num_layers)
@@ -292,7 +265,6 @@ def run_synthetic_demo(num_layers, num_hot, num_iterations):
292
  output.append(f"\n {num_layers} regions x 64KB = {total_mb:.1f} MB total")
293
  output.append(f" {num_hot} hot / {num_layers - num_hot} cold")
294
 
295
- # Membrane + Graph + Predictor
296
  Membrane.clear()
297
  wrapped = Membrane.wrap(state.copy(), "model")
298
  for _ in range(num_iterations):
@@ -314,7 +286,6 @@ def run_synthetic_demo(num_layers, num_hot, num_iterations):
314
  output.append(f" Clusters: {len(graph.clusters)}")
315
  output.append(f" Causal chains: {len(graph.get_causal_chains())}")
316
 
317
- # Condenser
318
  def workload_fn(w):
319
  for i in range(num_layers):
320
  if i in hot_set:
@@ -351,8 +322,8 @@ with gr.Blocks(title="Condensate — Do More With Less") as demo:
351
  Condensate uses a neural substrate with causal spike propagation
352
  to learn memory access patterns and dynamically condense RAM usage.
353
 
354
- **Live Model tab:** Runs GPT-2 Large (774M params) on ZeroGPU
355
- and shows which layers AND attention heads are HOT vs COLD for your input.
356
 
357
  **Synthetic tab:** Runs the full 4-layer pipeline on configurable
358
  simulated workloads (no GPU needed).
@@ -362,9 +333,11 @@ with gr.Blocks(title="Condensate — Do More With Less") as demo:
362
  with gr.TabItem("Live Model (ZeroGPU)"):
363
  with gr.Row():
364
  with gr.Column():
365
- status = gr.Textbox(label="Status", interactive=False, lines=3)
366
- load_btn = gr.Button("1. Load Model", variant="primary")
367
- train_btn = gr.Button("2. Train Predictor", variant="primary")
 
 
368
 
369
  with gr.Row():
370
  with gr.Column():
@@ -377,22 +350,21 @@ with gr.Blocks(title="Condensate — Do More With Less") as demo:
377
  minimum=10, maximum=100, value=30, step=5,
378
  label="Max tokens"
379
  )
380
- run_btn = gr.Button("3. Run & Analyze", variant="primary")
381
 
382
  with gr.Row():
383
  with gr.Column():
384
  comparison_output = gr.Textbox(
385
  label="Baseline vs Condensate",
386
- lines=25, interactive=False,
387
  )
388
  with gr.Column():
389
  analysis_output = gr.Textbox(
390
- label="Layer Activation Map",
391
- lines=25, interactive=False,
392
  )
393
 
394
- load_btn.click(fn=load_model, outputs=status)
395
- train_btn.click(fn=train_predictor, outputs=status)
396
  run_btn.click(
397
  fn=run_analysis,
398
  inputs=[prompt_input, max_tokens],
 
21
  from graph_builder import GraphBuilder
22
  from predictor import Predictor
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # --- Global state ---
25
  MODEL = None
26
  TOKENIZER = None
 
30
  MODEL_NAME = "gpt2-large"
31
 
32
 
33
+ @spaces.GPU(duration=120)
34
+ def load_and_train():
35
+ """Load model + train predictor in a single GPU call."""
36
+ global MODEL, TOKENIZER, MEMBRANE, PREDICTOR, GRAPH
37
 
38
+ import torch
39
  from transformers import AutoModelForCausalLM, AutoTokenizer
40
+ from torch_membrane import TorchMembrane
41
 
42
+ # Load tokenizer
43
  TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
44
  if TOKENIZER.pad_token is None:
45
  TOKENIZER.pad_token = TOKENIZER.eos_token
46
 
47
+ # Load model directly to GPU
48
  MODEL = AutoModelForCausalLM.from_pretrained(
49
  MODEL_NAME,
50
+ torch_dtype=torch.float32,
51
  )
52
  MODEL.eval()
53
+ MODEL.to("cuda")
 
54
 
55
  param_count = sum(p.numel() for p in MODEL.parameters()) / 1e6
 
 
56
 
57
+ # Install membrane
58
+ MEMBRANE = TorchMembrane(MODEL)
 
 
 
 
 
 
 
 
 
59
  MEMBRANE.reset()
60
 
61
+ # Train on diverse prompts
62
  training_prompts = [
63
  "The quick brown fox jumps over the lazy",
64
  "In the beginning there was darkness and then",
65
  "Machine learning models can be optimized by",
66
  "The capital of France is Paris and the",
67
  "Once upon a time in a land far far",
68
+ ]
69
 
70
  for prompt in training_prompts:
71
  inputs = TOKENIZER(prompt, return_tensors="pt", padding=True).to("cuda")
 
77
  pad_token_id=TOKENIZER.pad_token_id,
78
  )
79
 
80
+ # Build graph and predictor
81
  log = MEMBRANE.to_access_log()
82
 
83
  GRAPH = GraphBuilder(causal_window_ns=5_000_000)
 
88
 
89
  result = PREDICTOR.score(log)
90
 
91
+ return (f"Loaded {MODEL_NAME} ({param_count:.1f}M params)\n"
92
+ f"Trained on {len(training_prompts)} prompts, "
93
+ f"{len(log)} access events.\n"
94
  f"Prediction accuracy: {result['accuracy']}%\n"
95
+ f"Chains: {len(GRAPH.get_causal_chains())} | "
96
+ f"Clusters: {len(GRAPH.clusters)}")
97
 
98
 
99
  @spaces.GPU(duration=120)
 
101
  """Run inference, show activation map + condensation potential."""
102
  global MEMBRANE, PREDICTOR
103
 
104
+ import torch
105
 
106
+ if MODEL is None or PREDICTOR is None:
107
+ return "Please click 'Load & Train' first.", ""
 
 
108
 
109
  MODEL.to("cuda")
110
  MEMBRANE.reset()
 
134
  log = MEMBRANE.to_access_log()
135
  pred_result = PREDICTOR.score(log)
136
 
137
+ # Build comparison output
138
  comparison = []
139
  comparison.append("=" * 55)
140
  comparison.append(" BASELINE vs CONDENSATE")
 
142
  comparison.append(f"\n Generated: {generated_text}")
143
  comparison.append(f" Time: {elapsed_ms:.0f}ms\n")
144
 
 
145
  layer_baseline = potential['total_mb']
146
  layer_saved_pct = potential['savings_pct']
147
 
148
  comparison.append(f" WITHOUT Condensate:")
149
  comparison.append(f" All params in RAM: {layer_baseline:.2f} MB\n")
150
 
151
+ comparison.append(f" -- Layer-Level (v1 floor) --")
152
  comparison.append(f" HOT layers: {potential['hot_layers']} "
153
  f"COLD layers: {potential['cold_layers']}")
154
  comparison.append(f" Savings: {potential['cold_mb']:.2f} MB ({layer_saved_pct:.1f}%)\n")
155
 
 
156
  if head_potential['total_heads'] > 0:
157
+ comparison.append(f" -- Head-Level (v2) --")
158
  comparison.append(f" HOT heads: {head_potential['hot_heads']} "
159
  f"COLD heads: {head_potential['cold_heads']} "
160
  f"(of {head_potential['total_heads']} total)")
161
+ comparison.append(f" Cold attention: {head_potential['attn_cold_mb']:.2f} MB")
162
  comparison.append(f" Cold non-attention: {head_potential['non_attn_cold_mb']:.2f} MB")
163
  comparison.append(f" Total cold: {head_potential['cold_mb']:.2f} MB\n")
164
 
165
+ comparison.append(f" +-------------------------------------------+")
166
+ comparison.append(f" | HEAD-LEVEL RAM REDUCTION: |")
167
+ comparison.append(f" | {head_potential['savings_pct']:.1f}% "
168
  f"({head_potential['cold_mb']:.2f} MB saved)"
169
+ + " " * max(0, 18 - len(f"{head_potential['savings_pct']:.1f}% ({head_potential['cold_mb']:.2f} MB saved)"))
170
+ + "|")
171
+ comparison.append(f" | {head_potential['total_mb']:.2f} MB -> "
172
  f"{head_potential['hot_mb']:.2f} MB"
173
+ + " " * max(0, 22 - len(f"{head_potential['total_mb']:.2f} MB -> {head_potential['hot_mb']:.2f} MB"))
174
+ + "|")
175
+ comparison.append(f" | Same output. Same quality. |")
176
+ comparison.append(f" +-------------------------------------------+\n")
177
 
178
  comparison.append(f" Layer-level floor: {layer_saved_pct:.1f}%")
179
  comparison.append(f" Head-level actual: {head_potential['savings_pct']:.1f}%")
180
  else:
181
+ comparison.append(f" +-------------------------------------------+")
182
+ comparison.append(f" | RAM REDUCTION: {layer_saved_pct:.1f}% |")
183
+ comparison.append(f" | (Layer-level only) |")
184
+ comparison.append(f" +-------------------------------------------+\n")
185
 
186
  comparison.append(f"\n Prediction accuracy: {pred_result['accuracy']}%")
187
  comparison.append(f" Access events: {len(log)}")
188
 
189
+ # Build head-level analysis output
190
  analysis = []
 
191
  head_map = MEMBRANE.get_head_map()
192
  cold_heads = MEMBRANE.get_cold_heads()
193
  hot_heads = [h for h in head_map if h['temperature'] == 'HOT']
 
200
  analysis.append(f" {head_potential['hot_heads']} HOT / "
201
  f"{head_potential['cold_heads']} COLD\n")
202
 
 
203
  if cold_heads:
204
  analysis.append(f" COLDEST HEADS (condensable):")
205
  analysis.append(f" {'Head':<35} {'AvgAct':>10} {'MB':>6}")
 
211
  if len(cold_heads) > 20:
212
  analysis.append(f" ... and {len(cold_heads) - 20} more cold heads")
213
 
 
214
  if hot_heads:
215
  analysis.append(f"\n HOTTEST HEADS (must stay in RAM):")
216
  analysis.append(f" {'Head':<35} {'AvgAct':>10} {'MB':>6}")
 
220
  analysis.append(f" {name:<35} {h['avg_activation']:>10.4f} "
221
  f"{h['param_mb']:>6.4f}")
222
  else:
 
223
  analysis.append("=" * 55)
224
  analysis.append(" LAYER ACTIVATION MAP")
225
  analysis.append("=" * 55)
 
237
  return "\n".join(comparison), "\n".join(analysis)
238
 
239
 
240
+ # --- Synthetic demo (no GPU needed) ---
241
 
242
  def run_synthetic_demo(num_layers, num_hot, num_iterations):
243
+ """Run the PoC pipeline on synthetic data."""
244
  from condenser import Condenser
245
 
246
  num_layers = int(num_layers)
 
265
  output.append(f"\n {num_layers} regions x 64KB = {total_mb:.1f} MB total")
266
  output.append(f" {num_hot} hot / {num_layers - num_hot} cold")
267
 
 
268
  Membrane.clear()
269
  wrapped = Membrane.wrap(state.copy(), "model")
270
  for _ in range(num_iterations):
 
286
  output.append(f" Clusters: {len(graph.clusters)}")
287
  output.append(f" Causal chains: {len(graph.get_causal_chains())}")
288
 
 
289
  def workload_fn(w):
290
  for i in range(num_layers):
291
  if i in hot_set:
 
322
  Condensate uses a neural substrate with causal spike propagation
323
  to learn memory access patterns and dynamically condense RAM usage.
324
 
325
+ **Live Model tab:** Runs GPT-2 Large (774M, 36 layers, 20 heads)
326
+ on ZeroGPU. Shows layer-level AND head-level activation analysis.
327
 
328
  **Synthetic tab:** Runs the full 4-layer pipeline on configurable
329
  simulated workloads (no GPU needed).
 
333
  with gr.TabItem("Live Model (ZeroGPU)"):
334
  with gr.Row():
335
  with gr.Column():
336
+ status = gr.Textbox(label="Status", interactive=False, lines=5)
337
+ load_train_btn = gr.Button(
338
+ "1. Load Model & Train Predictor (uses GPU)",
339
+ variant="primary"
340
+ )
341
 
342
  with gr.Row():
343
  with gr.Column():
 
350
  minimum=10, maximum=100, value=30, step=5,
351
  label="Max tokens"
352
  )
353
+ run_btn = gr.Button("2. Run & Analyze (uses GPU)", variant="primary")
354
 
355
  with gr.Row():
356
  with gr.Column():
357
  comparison_output = gr.Textbox(
358
  label="Baseline vs Condensate",
359
+ lines=30, interactive=False,
360
  )
361
  with gr.Column():
362
  analysis_output = gr.Textbox(
363
+ label="Head-Level Activation Map",
364
+ lines=30, interactive=False,
365
  )
366
 
367
+ load_train_btn.click(fn=load_and_train, outputs=status)
 
368
  run_btn.click(
369
  fn=run_analysis,
370
  inputs=[prompt_input, max_tokens],