a couple slow ones
Browse files
constellation_relays_activation_effects_analysis.py
CHANGED
|
@@ -12,6 +12,9 @@ Systematic test of:
|
|
| 12 |
Each test uses the same random seed and input for fair comparison.
|
| 13 |
"""
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
import torch
|
| 16 |
import torch.nn as nn
|
| 17 |
import torch.nn.functional as F
|
|
@@ -444,6 +447,9 @@ for i, depth in enumerate(depths_to_check):
|
|
| 444 |
# TEST 5: TRAINED RELAY — ACTIVATION EFFECT ON LEARNING
|
| 445 |
# ──────────────────────────────────────────────────────────────────
|
| 446 |
|
|
|
|
|
|
|
|
|
|
| 447 |
print(f"\n{'─'*80}")
|
| 448 |
print(f"TEST 5: Trained Relay — does activation choice affect what the relay LEARNS?")
|
| 449 |
print(f" Setup: 4-layer relay trained to classify 256d embeddings into 10 classes")
|
|
@@ -464,9 +470,12 @@ for c in range(N_CLASSES):
|
|
| 464 |
noise = torch.randn(N_TRAIN // N_CLASSES, D, device=DEVICE) * 0.3
|
| 465 |
pts = F.normalize(class_centers[c].unsqueeze(0) + noise, dim=-1)
|
| 466 |
train_x.append(pts)
|
| 467 |
-
train_y.append(torch.full((N_TRAIN // N_CLASSES,), c, device=DEVICE))
|
| 468 |
train_x = torch.cat(train_x)
|
| 469 |
train_y = torch.cat(train_y)
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
print(f"\n {'pw_act':>14} {'acc':>8} {'loss':>8} {'cos_orig':>10} "
|
| 472 |
f"{'CV':>8} {'eff_dim':>8} {'drift':>8} {'gate':>8}")
|
|
@@ -491,11 +500,15 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu", "star_r
|
|
| 491 |
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
| 492 |
|
| 493 |
for step in range(TRAIN_STEPS):
|
| 494 |
-
idx = torch.randint(0, len(train_x), (128,)
|
| 495 |
logits = model(train_x[idx])
|
| 496 |
loss = F.cross_entropy(logits, train_y[idx])
|
|
|
|
|
|
|
|
|
|
| 497 |
opt.zero_grad()
|
| 498 |
loss.backward()
|
|
|
|
| 499 |
opt.step()
|
| 500 |
|
| 501 |
# Evaluate
|
|
@@ -523,6 +536,8 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu", "star_r
|
|
| 523 |
# TEST 6: HYBRID RELAY — INFORMATION RETENTION
|
| 524 |
# ──────────────────────────────────────────────────────────────────
|
| 525 |
|
|
|
|
|
|
|
| 526 |
print(f"\n{'─'*80}")
|
| 527 |
print(f"TEST 6: Hybrid Relay — Information Retention")
|
| 528 |
print(f" Setup: 8 layers of hybrid relay (attention + constellation)")
|
|
@@ -598,12 +613,14 @@ keys_a = F.normalize(torch.randn(N_CLS, D, device=DEVICE), dim=-1)
|
|
| 598 |
keys_b = F.normalize(torch.randn(N_CLS, D, device=DEVICE), dim=-1)
|
| 599 |
|
| 600 |
task_x = F.normalize(torch.randn(N_SAMPLES, S_TASK, D, device=DEVICE), dim=-1).clone()
|
| 601 |
-
label_a = torch.randint(0, N_CLS, (N_SAMPLES,), device=DEVICE)
|
| 602 |
-
label_b = torch.randint(0, N_CLS, (N_SAMPLES,), device=DEVICE)
|
| 603 |
task_x[:, 0] = keys_a[label_a] + torch.randn(N_SAMPLES, D, device=DEVICE) * 0.2
|
| 604 |
task_x[:, 1] = keys_b[label_b] + torch.randn(N_SAMPLES, D, device=DEVICE) * 0.2
|
| 605 |
task_x = F.normalize(task_x, dim=-1)
|
| 606 |
-
task_y = (label_a + label_b) % N_CLS
|
|
|
|
|
|
|
| 607 |
|
| 608 |
print(f"\n {'relay_act':>14} {'acc':>8} {'loss':>8} {'g_relay':>8} "
|
| 609 |
f"{'g_attn':>8} {'cross_Δ':>10}")
|
|
@@ -629,11 +646,15 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu"]:
|
|
| 629 |
opt = torch.optim.Adam(model.parameters(), lr=3e-4)
|
| 630 |
|
| 631 |
for step in range(STEPS):
|
| 632 |
-
idx = torch.randint(0, N_SAMPLES, (128,)
|
| 633 |
logits = model(task_x[idx])
|
| 634 |
loss = F.cross_entropy(logits, task_y[idx])
|
|
|
|
|
|
|
|
|
|
| 635 |
opt.zero_grad()
|
| 636 |
loss.backward()
|
|
|
|
| 637 |
opt.step()
|
| 638 |
|
| 639 |
model.eval()
|
|
@@ -696,11 +717,15 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu", "star_r
|
|
| 696 |
|
| 697 |
drift_log = {}
|
| 698 |
for step in range(TRAIN_STEPS):
|
| 699 |
-
idx = torch.randint(0, len(train_x), (128,)
|
| 700 |
logits = model(train_x[idx])
|
| 701 |
loss = F.cross_entropy(logits, train_y[idx])
|
|
|
|
|
|
|
|
|
|
| 702 |
opt.zero_grad()
|
| 703 |
loss.backward()
|
|
|
|
| 704 |
opt.step()
|
| 705 |
|
| 706 |
if (step + 1) in [50, 100, 200, 300, 500]:
|
|
@@ -743,6 +768,7 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu", "star_r
|
|
| 743 |
h = x
|
| 744 |
for layer in layers:
|
| 745 |
h = layer(h)
|
|
|
|
| 746 |
|
| 747 |
loss = h.sum()
|
| 748 |
loss.backward()
|
|
@@ -751,7 +777,6 @@ for act_name in ["none", "relu", "gelu", "silu", "tanh", "squared_relu", "star_r
|
|
| 751 |
anchor_grads = [l.anchors.grad.norm().item() for l in layers if l.anchors.grad is not None]
|
| 752 |
gate_grads = [l.gate.grad.item() for l in layers if l.gate.grad is not None]
|
| 753 |
|
| 754 |
-
# Output gradient (last layer's contribution)
|
| 755 |
grad_out = h.grad.norm().item() if h.grad is not None else 0
|
| 756 |
|
| 757 |
print(f" {act_name:>14} {grad_in:>10.4f} {grad_out:>10.4f} "
|
|
|
|
| 12 |
Each test uses the same random seed and input for fair comparison.
|
| 13 |
"""
|
| 14 |
|
| 15 |
+
import os
|
| 16 |
+
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
|
| 17 |
+
|
| 18 |
import torch
|
| 19 |
import torch.nn as nn
|
| 20 |
import torch.nn.functional as F
|
|
|
|
| 447 |
# TEST 5: TRAINED RELAY — ACTIVATION EFFECT ON LEARNING
|
| 448 |
# ──────────────────────────────────────────────────────────────────
|
| 449 |
|
| 450 |
+
# Cleanup from Tests 1-4
|
| 451 |
+
torch.cuda.empty_cache()
|
| 452 |
+
|
| 453 |
print(f"\n{'─'*80}")
|
| 454 |
print(f"TEST 5: Trained Relay — does activation choice affect what the relay LEARNS?")
|
| 455 |
print(f" Setup: 4-layer relay trained to classify 256d embeddings into 10 classes")
|
|
|
|
| 470 |
noise = torch.randn(N_TRAIN // N_CLASSES, D, device=DEVICE) * 0.3
|
| 471 |
pts = F.normalize(class_centers[c].unsqueeze(0) + noise, dim=-1)
|
| 472 |
train_x.append(pts)
|
| 473 |
+
train_y.append(torch.full((N_TRAIN // N_CLASSES,), c, dtype=torch.long, device=DEVICE))
|
| 474 |
train_x = torch.cat(train_x)
|
| 475 |
train_y = torch.cat(train_y)
|
| 476 |
+
assert train_y.max() < N_CLASSES, f"Label OOB: max={train_y.max()}, n_classes={N_CLASSES}"
|
| 477 |
+
assert train_y.min() >= 0, f"Negative label: min={train_y.min()}"
|
| 478 |
+
torch.cuda.synchronize()
|
| 479 |
|
| 480 |
print(f"\n {'pw_act':>14} {'acc':>8} {'loss':>8} {'cos_orig':>10} "
|
| 481 |
f"{'CV':>8} {'eff_dim':>8} {'drift':>8} {'gate':>8}")
|
|
|
|
| 500 |
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
|
| 501 |
|
| 502 |
for step in range(TRAIN_STEPS):
|
| 503 |
+
idx = torch.randint(0, len(train_x), (128,))
|
| 504 |
logits = model(train_x[idx])
|
| 505 |
loss = F.cross_entropy(logits, train_y[idx])
|
| 506 |
+
if torch.isnan(loss) or torch.isinf(loss):
|
| 507 |
+
print(f"  ⚠ Bad loss at step {step}, act={act_name}")
|
| 508 |
+
break
|
| 509 |
opt.zero_grad()
|
| 510 |
loss.backward()
|
| 511 |
+
nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
| 512 |
opt.step()
|
| 513 |
|
| 514 |
# Evaluate
|
|
|
|
| 536 |
# TEST 6: HYBRID RELAY — INFORMATION RETENTION
|
| 537 |
# ──────────────────────────────────────────────────────────────────
|
| 538 |
|
| 539 |
+
torch.cuda.empty_cache()
|
| 540 |
+
|
| 541 |
print(f"\n{'─'*80}")
|
| 542 |
print(f"TEST 6: Hybrid Relay — Information Retention")
|
| 543 |
print(f" Setup: 8 layers of hybrid relay (attention + constellation)")
|
|
|
|
| 613 |
keys_b = F.normalize(torch.randn(N_CLS, D, device=DEVICE), dim=-1)
|
| 614 |
|
| 615 |
task_x = F.normalize(torch.randn(N_SAMPLES, S_TASK, D, device=DEVICE), dim=-1).clone()
|
| 616 |
+
label_a = torch.randint(0, N_CLS, (N_SAMPLES,), dtype=torch.long, device=DEVICE)
|
| 617 |
+
label_b = torch.randint(0, N_CLS, (N_SAMPLES,), dtype=torch.long, device=DEVICE)
|
| 618 |
task_x[:, 0] = keys_a[label_a] + torch.randn(N_SAMPLES, D, device=DEVICE) * 0.2
|
| 619 |
task_x[:, 1] = keys_b[label_b] + torch.randn(N_SAMPLES, D, device=DEVICE) * 0.2
|
| 620 |
task_x = F.normalize(task_x, dim=-1)
|
| 621 |
+
task_y = ((label_a + label_b) % N_CLS).long()
|
| 622 |
+
assert task_y.max() < N_CLS and task_y.min() >= 0
|
| 623 |
+
torch.cuda.synchronize()
|
| 624 |
|
| 625 |
print(f"\n {'relay_act':>14} {'acc':>8} {'loss':>8} {'g_relay':>8} "
|
| 626 |
f"{'g_attn':>8} {'cross_Δ':>10}")
|
|
|
|
| 646 |
opt = torch.optim.Adam(model.parameters(), lr=3e-4)
|
| 647 |
|
| 648 |
for step in range(STEPS):
|
| 649 |
+
idx = torch.randint(0, N_SAMPLES, (128,))
|
| 650 |
logits = model(task_x[idx])
|
| 651 |
loss = F.cross_entropy(logits, task_y[idx])
|
| 652 |
+
if torch.isnan(loss) or torch.isinf(loss):
|
| 653 |
+
print(f"  ⚠ Bad loss at step {step}, act={act_name}")
|
| 654 |
+
break
|
| 655 |
opt.zero_grad()
|
| 656 |
loss.backward()
|
| 657 |
+
nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
| 658 |
opt.step()
|
| 659 |
|
| 660 |
model.eval()
|
|
|
|
| 717 |
|
| 718 |
drift_log = {}
|
| 719 |
for step in range(TRAIN_STEPS):
|
| 720 |
+
idx = torch.randint(0, len(train_x), (128,))
|
| 721 |
logits = model(train_x[idx])
|
| 722 |
loss = F.cross_entropy(logits, train_y[idx])
|
| 723 |
+
if torch.isnan(loss) or torch.isinf(loss):
|
| 724 |
+
print(f"  ⚠ Bad loss at step {step}, act={act_name}")
|
| 725 |
+
break
|
| 726 |
opt.zero_grad()
|
| 727 |
loss.backward()
|
| 728 |
+
nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
| 729 |
opt.step()
|
| 730 |
|
| 731 |
if (step + 1) in [50, 100, 200, 300, 500]:
|
|
|
|
| 768 |
h = x
|
| 769 |
for layer in layers:
|
| 770 |
h = layer(h)
|
| 771 |
+
h.retain_grad()
|
| 772 |
|
| 773 |
loss = h.sum()
|
| 774 |
loss.backward()
|
|
|
|
| 777 |
anchor_grads = [l.anchors.grad.norm().item() for l in layers if l.anchors.grad is not None]
|
| 778 |
gate_grads = [l.gate.grad.item() for l in layers if l.gate.grad is not None]
|
| 779 |
|
|
|
|
| 780 |
grad_out = h.grad.norm().item() if h.grad is not None else 0
|
| 781 |
|
| 782 |
print(f" {act_name:>14} {grad_in:>10.4f} {grad_out:>10.4f} "
|