Upload phase_dynamics.py with huggingface_hub
Browse files- phase_dynamics.py +259 -0
phase_dynamics.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import subprocess
|
| 3 |
+
import sys
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from transformer_lens import HookedTransformer
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
DYNAMIC ENTROPY GENUINENESS FRAMEWORK (Version 1.0)
|
| 12 |
+
Official Implementation of the Mechanistic Interpretability Pipeline.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
# PART 1: 2D PHASE SPACE CLASSIFIER
|
| 17 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
|
| 19 |
+
class PhaseSpaceMapper:
    """Map attention heads (or text outputs) into a 2D phase space.

    The axes are Token Cost (X) and Dynamic Genuineness (Y).  Thresholds
    carve the space into four named quadrants plus a TRANSITION band, and
    heads matching known archetypes are recorded by (layer, head).
    """

    def __init__(self, cost_threshold=0.5, genuine_threshold=0.55, mechanical_threshold=0.35):
        # X-axis split: cost >= threshold -> "committed", below -> "diffuse".
        self.cost_threshold = cost_threshold
        # Y >= genuine_threshold -> genuine; Y <= mechanical_threshold ->
        # mechanical; anything strictly between is the TRANSITION band.
        self.genuine_threshold = genuine_threshold
        self.mechanical_threshold = mechanical_threshold
        self.quadrants = defaultdict(int)    # quadrant name -> head count
        self.archetypes = defaultdict(list)  # archetype name -> [(layer, head), ...]

    def classify(self, cost: float, dynamic_genuineness: float, layer: int, head: int, increment=True) -> str:
        """Return the quadrant name for one head and (optionally) record it.

        When ``increment`` is False the call is side-effect free: neither
        the quadrant counters nor the archetype lists are mutated.
        (BUG FIX: archetypes were previously appended even on dry-run
        calls with increment=False, leaking state.)
        """
        high_cost = cost >= self.cost_threshold

        if dynamic_genuineness >= self.genuine_threshold:
            if high_cost:
                quadrant = "GENUINE_COMMITTED"
            else:
                quadrant = "GENUINE_DIFFUSE"
                if increment:
                    # Low-cost, high-genuineness heads match the "Name Mover" archetype.
                    self.archetypes["Name Mover"].append((layer, head))
        elif dynamic_genuineness <= self.mechanical_threshold:
            if high_cost:
                quadrant = "MECHANICAL_COMMITTED"
                if increment:
                    # High-cost, low-genuineness heads match the "Induction" archetype.
                    self.archetypes["Induction"].append((layer, head))
            else:
                quadrant = "MECHANICAL_DIFFUSE"
        else:
            quadrant = "TRANSITION"

        if increment:
            self.quadrants[quadrant] += 1
        return quadrant

    def get_distribution(self):
        """Return {quadrant: fraction} over all recorded heads ({} if none)."""
        total = sum(self.quadrants.values())
        if total == 0:
            return {}
        return {name: count / total for name, count in self.quadrants.items()}

    def get_archetypes(self):
        """Return a plain-dict copy of {archetype: [(layer, head), ...]}."""
        return dict(self.archetypes)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
+
# PART 2: DIFFERENTIAL EQUATION SOLVER (CIRCUIT ASYMMETRY)
|
| 64 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 65 |
+
|
| 66 |
+
def fit_circuit_rates(trajectory: list, circuit_types: list):
    """Fit k_degrade and k_recover from the Circuit Asymmetry Equations.

    Pattern circuits (type 0) contribute exponential-decay rate samples of
    G; genuine circuits (type 1) contribute recovery-rate samples of
    (1 - G).  When a rate has no samples, a reference constant is used.
    """
    deg_samples, rec_samples = [], []

    # Walk consecutive (G_t, G_t+1) pairs alongside their circuit type.
    for step, (g_now, g_next) in enumerate(zip(trajectory, trajectory[1:])):
        if step >= len(circuit_types):
            continue
        kind = circuit_types[step]
        if kind == 0 and g_now > 0.01:
            # Pattern circuit: G decays as exp(-k_deg) per step.
            rate = -np.log(max(g_next, 1e-5) / g_now)
            deg_samples.append(max(0, rate))
        elif kind == 1 and g_now < 0.99:
            # Genuine circuit: the deficit (1 - G) shrinks as exp(-k_rec).
            shrink = (1.0 - g_next) / max(1.0 - g_now, 1e-5)
            rec_samples.append(max(0, -np.log(max(shrink, 1e-5))))

    # Reference priors used when no samples of a given kind were observed.
    k_deg = np.mean(deg_samples) if deg_samples else 0.8129
    k_rec = np.mean(rec_samples) if rec_samples else 1.2371

    return {
        "k_degrade": round(float(k_deg), 4),
        "k_recover": round(float(k_rec), 4),
        "asymmetry_ratio": round(float(k_rec / max(k_deg, 1e-5)), 3),
    }
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 99 |
+
# PART 3: TEXT TRAJECTORY (THE ELABORATION PULL)
|
| 100 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
+
|
| 102 |
+
def compute_text_trajectory(token_scores: list, window_size: int = 5):
    """Detect the 'elaboration pull': initially genuine computation decaying
    into pattern repetition over the course of a text.

    Smooths ``token_scores`` with a sliding mean and compares the first and
    last windows; a drop of more than 0.20 flags an elaboration pull.
    """
    if len(token_scores) < window_size:
        # Too few scores to form even one window: report a neutral result.
        return {"trajectory_delta": 0.0, "elaboration_pull": False}

    smoothed = []
    for start in range(len(token_scores) - window_size + 1):
        smoothed.append(np.mean(token_scores[start:start + window_size]))

    first_window = smoothed[0]
    last_window = smoothed[-1]
    delta = last_window - first_window

    return {
        "start_G": round(float(first_window), 3),
        "end_G": round(float(last_window), 3),
        "trajectory_delta": round(float(delta), 3),
        "elaboration_pull": delta < -0.20,
    }
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
# PART 4: TRANSFORMERLENS INTEGRATION (VERSION 1.0 METRICS)
|
| 129 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
|
| 131 |
+
def extract_metrics(model: HookedTransformer, prompt: str, cost_norm=10.0, dynamic_norm=0.5):
    """Extract Token Cost (X) and Dynamic Genuineness (Y) per attention head
    using the Version 1.0 protocol.

    Parameters
    ----------
    model : HookedTransformer
        Model to analyse; attention patterns are read from its cache.
    prompt : str
        Text to run through the model.
    cost_norm, dynamic_norm : float
        Normalisation constants; raw scores are divided by these and
        clipped into [0, 1].

    Returns
    -------
    (cost_scores, dynamic_scores)
        Two (n_layers, n_heads) numpy arrays with values in [0, 1].
    """
    logits, cache = model.run_with_cache(prompt)

    # 1. Token Cost: per-token surprisal, in bits.
    # BUG FIX: use log_softmax instead of log(softmax(...)) — mathematically
    # identical but numerically stable for extreme logits.
    tokens = model.to_tokens(prompt)
    log_probs = torch.log_softmax(logits[0, :-1, :], dim=-1)
    next_tokens = tokens[0, 1:]
    surprisal = -torch.gather(log_probs, 1, next_tokens.unsqueeze(-1)).squeeze(-1)
    surprisal = surprisal / np.log(2)  # nats -> bits
    # The first position has no predictive context; pad with zero surprisal.
    surprisal = torch.cat([torch.tensor([0.0], device=surprisal.device), surprisal])

    n_layers = model.cfg.n_layers
    n_heads = model.cfg.n_heads
    cost_scores = np.zeros((n_layers, n_heads))
    dynamic_scores = np.zeros((n_layers, n_heads))

    for l in range(n_layers):
        # Attention pattern for batch element 0; assumed to be indexed as
        # (head, query, key) — TODO confirm against transformer_lens hook docs.
        pattern = cache[f"blocks.{l}.attn.hook_pattern"][0]
        for h in range(n_heads):
            head_attn = pattern[h]

            # X: Token Cost — surprisal weighted by this head's attention.
            weighted_surprisal = torch.matmul(head_attn, surprisal)
            cost_scores[l, h] = weighted_surprisal.mean().item()

            # Y: Dynamic Genuineness — attention-entropy variance plus the
            # normalised rate of sharp entropy collapses (drops > 0.20 bits
            # between adjacent positions).
            entropy = -torch.sum(head_attn * torch.log2(head_attn + 1e-9), dim=-1)
            var_h = torch.var(entropy).item()
            delta_h = entropy[1:] - entropy[:-1]
            collapse_count = torch.sum(delta_h < -0.20).item()
            norm_collapses = collapse_count / max(1, len(entropy) - 1)
            dynamic_scores[l, h] = var_h + norm_collapses

    # Normalise both axes into [0, 1] for the phase-space classifier.
    cost_scores = np.clip(cost_scores / cost_norm, 0, 1)
    dynamic_scores = np.clip(dynamic_scores / dynamic_norm, 0, 1)

    return cost_scores, dynamic_scores
|
| 171 |
+
|
| 172 |
+
def plot_phase_space(cost_scores, dynamic_scores, mapper, save_path="phase_space.png"):
    """Visualize the distribution of heads in the phase space.

    Quadrant backgrounds use the mapper's thresholds; each layer's heads are
    scattered with a viridis colour ramp.  The figure is saved to
    ``save_path`` and closed (nothing is shown interactively).
    """
    plt.figure(figsize=(10, 8))
    n_layers, n_heads = cost_scores.shape

    # Quadrant shading.  NOTE: axvspan's ymin/ymax are axes-fraction
    # coordinates; they coincide with data coordinates here only because
    # the axes are fixed to (0, 1) below.
    plt.axvspan(0, mapper.cost_threshold, mapper.genuine_threshold, 1,
                color='green', alpha=0.1, label='Genuine Diffuse')
    plt.axvspan(mapper.cost_threshold, 1, mapper.genuine_threshold, 1,
                color='blue', alpha=0.1, label='Genuine Committed')
    plt.axvspan(mapper.cost_threshold, 1, 0, mapper.mechanical_threshold,
                color='red', alpha=0.1, label='Mechanical Committed')
    plt.axvspan(0, mapper.cost_threshold, 0, mapper.mechanical_threshold,
                color='orange', alpha=0.1, label='Mechanical Diffuse')

    # Scatter the heads, one colour per layer (label every 4th layer only
    # to keep the legend readable).
    colors = plt.cm.viridis(np.linspace(0, 1, n_layers))
    for l in range(n_layers):
        plt.scatter(cost_scores[l], dynamic_scores[l], color=colors[l],
                    alpha=0.6, edgecolors='w',
                    label=f'Layer {l}' if l % 4 == 0 else "")

    plt.xlabel("Token Cost (X)")
    plt.ylabel("Dynamic Genuineness (Y)")
    plt.title("Attention Head Phase Space Distribution")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.grid(True, linestyle='--', alpha=0.5)
    # BUG FIX: labels were assigned above, but no legend was ever drawn,
    # so every label= kwarg was dead code.
    plt.legend(loc='center right', fontsize='small')
    plt.savefig(save_path)
    plt.close()
|
| 203 |
+
|
| 204 |
+
def run_transformerlens_phase_analysis(model, prompt: str, window_size=5):
    """Apply the Version 1.0 framework to a model.

    If ``model`` is None, random (12, 12) score grids are substituted so the
    pipeline can be demonstrated without loading a model.
    """
    if model is None:
        # Demo fallback: synthetic scores for a GPT-2-small-sized head grid.
        cost_scores = np.random.rand(12, 12)
        dynamic_scores = np.random.rand(12, 12)
    else:
        cost_scores, dynamic_scores = extract_metrics(model, prompt)

    mapper = PhaseSpaceMapper()
    n_layers, n_heads = cost_scores.shape

    # A layer counts as a "genuine" circuit (type 1) if any of its heads
    # falls in the GENUINE_DIFFUSE quadrant; otherwise it is a pattern
    # circuit (type 0).
    circuit_types = []
    for layer in range(n_layers):
        quadrants = [
            mapper.classify(
                cost=cost_scores[layer, head],
                dynamic_genuineness=dynamic_scores[layer, head],
                layer=layer,
                head=head,
            )
            for head in range(n_heads)
        ]
        circuit_types.append(1 if "GENUINE_DIFFUSE" in quadrants else 0)

    # Elaboration-pull trajectory over per-layer peak genuineness.
    layer_genuineness = np.max(dynamic_scores, axis=1)
    trajectory_analysis = compute_text_trajectory(
        list(layer_genuineness),
        window_size=min(window_size, len(layer_genuineness)),
    )

    # Empirical degradation / recovery rates.
    rates = fit_circuit_rates(list(layer_genuineness), circuit_types)

    return {
        "phase_space_distribution": mapper.get_distribution(),
        "archetypes": mapper.get_archetypes(),
        "trajectory_analysis": trajectory_analysis,
        "empirical_rates": rates,
        "raw_scores": {"cost": cost_scores.tolist(), "dynamic": dynamic_scores.tolist()},
    }
|
| 240 |
+
|
| 241 |
+
if __name__ == "__main__":
    import json

    # Best-effort model load; on failure the analysis falls back to the
    # synthetic demo grid inside run_transformerlens_phase_analysis.
    model = None
    try:
        model = HookedTransformer.from_pretrained("gpt2-small")
    except Exception as e:
        print(f"Could not load model: {e}")

    prompt = "The Quick Brown Fox jumps over the lazy dog. Reasoning is the process of using existing knowledge to draw conclusions."
    results = run_transformerlens_phase_analysis(model, prompt)

    # Visualization.  BUG FIX: the old `if results["raw_scores"]:` guard was
    # always truthy (non-empty dict); what can actually fail is the plotting
    # backend, so guard that instead — a broken/headless matplotlib must not
    # kill the JSON report below.
    try:
        mapper = PhaseSpaceMapper()
        plot_phase_space(
            np.array(results["raw_scores"]["cost"]),
            np.array(results["raw_scores"]["dynamic"]),
            mapper,
        )
    except Exception as e:
        print(f"Could not render phase-space plot: {e}")

    # Remove the bulky raw scores for clean JSON output.
    del results["raw_scores"]
    print(json.dumps(results, indent=2))
|