Text Generation
Transformers
emotion-vectors
interpretability
mechanistic-interpretability
replication
gemma4
google
anthropic
valence-arousal
PCA
logit-lens
linear-probe
probing
emotion
functional-emotions
AI-safety
neuroscience
circumplex-model
activation-extraction
residual-stream
Eval Results (legacy)
File size: 6,169 Bytes
#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os
import numpy as np
# Directory containing this script; all inputs/outputs live in its "results" subfolder.
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")
def load_vectors():
    """Load the saved emotion vectors from results/emotion_vectors.npz.

    Returns a dict mapping emotion name -> numpy array.
    """
    archive_path = os.path.join(RESULTS_DIR, "emotion_vectors.npz")
    archive = np.load(archive_path)
    return {key: archive[key] for key in archive.files}
def cosine_sim(a, b):
    """Cosine similarity of vectors *a* and *b*.

    A tiny epsilon in the denominator guards against division by zero
    for degenerate (all-zero) vectors.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
def cosine_similarity_matrix(vectors):
    """Compute the full pairwise cosine-similarity matrix.

    Returns (emotions, matrix) where `emotions` is the sorted list of
    keys and matrix[i, j] is the cosine similarity between the vectors
    for emotions[i] and emotions[j].
    """
    emotions = sorted(vectors)
    n = len(emotions)
    matrix = np.zeros((n, n))
    norm = np.linalg.norm
    for i, name_i in enumerate(emotions):
        v_i = vectors[name_i]
        for j, name_j in enumerate(emotions):
            v_j = vectors[name_j]
            # Same epsilon-guarded cosine as cosine_sim(), inlined here.
            matrix[i, j] = np.dot(v_i, v_j) / (norm(v_i) * norm(v_j) + 1e-8)
    return emotions, matrix
def print_similarity_matrix(emotions, matrix):
    """Pretty-print the similarity matrix with truncated names as column headers."""
    print("\n=== Cosine Similarity Matrix ===\n")
    # Column header: emotion names clipped to 6 chars, right-aligned in 7 cols.
    columns = "".join(f"{e[:6]:>7s}" for e in emotions)
    print(f"{'':12s}" + columns)
    n = len(emotions)
    for i, emotion in enumerate(emotions):
        cells = "".join(f"{matrix[i,j]:7.2f}" for j in range(n))
        print(f"{emotion:12s}" + cells)
def find_clusters(emotions, matrix, threshold=0.5):
    """Report emotion pairs whose similarity exceeds *threshold*.

    Prints each qualifying pair and returns the list of
    (emotion_a, emotion_b, similarity) tuples, highest similarity first.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    n = len(emotions)
    # Upper triangle only (j > i): each unordered pair considered once.
    pairs = [
        (emotions[i], emotions[j], matrix[i, j])
        for i in range(n)
        for j in range(i + 1, n)
        if matrix[i, j] > threshold
    ]
    pairs.sort(key=lambda p: p[2], reverse=True)
    for e1, e2, sim in pairs:
        print(f" {e1:12s} <-> {e2:12s} sim={sim:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def find_opposites(emotions, matrix, threshold=-0.3):
    """Report emotion pairs with similarity below *threshold* (opposing directions).

    Prints each qualifying pair and returns the list of
    (emotion_a, emotion_b, similarity) tuples, most negative first.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    n = len(emotions)
    # Upper triangle only (j > i): each unordered pair considered once.
    pairs = [
        (emotions[i], emotions[j], matrix[i, j])
        for i in range(n)
        for j in range(i + 1, n)
        if matrix[i, j] < threshold
    ]
    pairs.sort(key=lambda p: p[2])
    for e1, e2, sim in pairs:
        print(f" {e1:12s} <-> {e2:12s} sim={sim:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def valence_arousal_check(emotions, pca_results):
    """Test whether PC1/PC2 line up with the circumplex valence/arousal axes.

    Compares group-mean separation along each PC using hand-labelled
    valence and arousal groupings. Note: the `emotions` argument is kept
    for interface compatibility; the emotion ordering actually used comes
    from pca_results["emotions"].
    """
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # inspired dropped from high arousal (more contemplative/medium);
    # disgusted/confused/playful/spiteful added to high arousal;
    # loving/hopeful added to low arousal.
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}

    labels = pca_results["emotions"]

    def group_mean(pc_vals, group):
        # Mean PC value over the emotions belonging to `group` (0 if none present).
        members = [pc_vals[i] for i, name in enumerate(labels) if name in group]
        return np.mean(members) if members else 0

    for pc_name in ("PC1", "PC2"):
        pc_vals = pca_results[pc_name.lower()]
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f" → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f" → {pc_name} ≈ AROUSAL axis")
        else:
            print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
def main():
    """Run the full analysis: load vectors, similarity stats, and PCA structure checks."""
    print("=== Emotion Vector Analysis ===\n")
    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")
    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)
    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)
    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])
    # Summary
    print("\n=== SUMMARY ===\n")
    # Mean over the strict upper triangle: each unordered pair counted once,
    # self-similarity excluded.
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")
    # BUG FIX: the original read results['pca'] unconditionally here, raising
    # KeyError when "pca" was absent even though the valence/arousal step
    # above explicitly guards on its presence. Guard the summary too.
    if "pca" in results:
        var_12 = results["pca"]["explained_variance_pc1"] + results["pca"]["explained_variance_pc2"]
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions;
        # with 20 emotions we'd expect higher concentration.
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()