Text Generation
Transformers
emotion-vectors
interpretability
mechanistic-interpretability
replication
gemma4
google
anthropic
valence-arousal
PCA
logit-lens
linear-probe
probing
emotion
functional-emotions
AI-safety
neuroscience
circumplex-model
activation-extraction
residual-stream
Eval Results (legacy)
#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os

import numpy as np

# Paths are resolved relative to this script so it can be run from any cwd;
# all inputs/outputs live in a sibling "results" directory.
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")
def load_vectors():
    """Load per-emotion vectors from results/emotion_vectors.npz.

    Returns:
        dict mapping emotion name -> numpy array (one vector per emotion).
    """
    archive = np.load(os.path.join(RESULTS_DIR, "emotion_vectors.npz"))
    return {key: archive[key] for key in archive.files}
def cosine_sim(a, b):
    """Cosine similarity of two vectors; 1e-8 in the denominator guards against zero norms."""
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
def cosine_similarity_matrix(vectors):
    """Build the full pairwise cosine-similarity matrix over all emotions.

    Args:
        vectors: dict mapping emotion name -> numpy vector.
    Returns:
        (names, matrix): alphabetically sorted emotion names and the
        corresponding n x n similarity matrix.
    """
    names = sorted(vectors)
    count = len(names)
    sims = np.zeros((count, count))
    for row, first in enumerate(names):
        for col, second in enumerate(names):
            u, v = vectors[first], vectors[second]
            # Same epsilon-guarded cosine as cosine_sim(), written inline.
            sims[row, col] = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8)
    return names, sims
def print_similarity_matrix(emotions, matrix):
    """Pretty-print the similarity matrix, truncating names to 6 chars in the header."""
    print("\n=== Cosine Similarity Matrix ===\n")
    # Column headers: blank corner cell, then one 7-wide cell per emotion.
    columns = "".join(f"{name[:6]:>7s}" for name in emotions)
    print(f"{'':12s}" + columns)
    width = len(emotions)
    for idx, name in enumerate(emotions):
        cells = "".join(f"{matrix[idx,j]:7.2f}" for j in range(width))
        print(f"{name:12s}" + cells)
def find_clusters(emotions, matrix, threshold=0.5):
    """Find emotion pairs with high similarity.

    Prints each qualifying pair and returns them as (name1, name2, sim)
    tuples, most similar first.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    count = len(emotions)
    hits = [
        (emotions[a], emotions[b], matrix[a, b])
        for a in range(count)
        for b in range(a + 1, count)
        if matrix[a, b] > threshold
    ]
    hits.sort(key=lambda item: item[2], reverse=True)
    for first, second, score in hits:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not hits:
        print(" (none found)")
    return hits
def find_opposites(emotions, matrix, threshold=-0.3):
    """Find emotion pairs with negative similarity (opposites).

    Prints each qualifying pair and returns them as (name1, name2, sim)
    tuples, most negative first.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    found = []
    count = len(emotions)
    for a in range(count):
        for b in range(a + 1, count):
            score = matrix[a, b]
            if score < threshold:
                found.append((emotions[a], emotions[b], score))
    found.sort(key=lambda entry: entry[2])
    for first, second, score in found:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not found:
        print(" (none found)")
    return found
def valence_arousal_check(emotions, pca_results):
    """Check if PC1≈valence, PC2≈arousal based on known emotion groupings.

    For each of the first two principal components, compares the mean PC
    score of positive vs negative emotions (valence) and high- vs low-arousal
    emotions, then labels the axis by whichever separation dominates.
    """
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # - Removed inspired from high arousal (more contemplative/medium)
    # - Added disgusted, confused, playful, spiteful to high arousal
    # - Added loving, hopeful to low arousal
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}
    names = pca_results["emotions"]

    def group_mean(component, members):
        # Mean PC score over the emotions in *members*; 0 if none are present.
        vals = [component[i] for i, name in enumerate(names) if name in members]
        return np.mean(vals) if vals else 0

    for pc_name, pc_vals in (("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])):
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
        # NOTE: the two separation thresholds are deliberately asymmetric (1.0 vs 0.5).
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f" → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f" → {pc_name} ≈ AROUSAL axis")
        else:
            print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
def main():
    """Run the full analysis: load vectors, similarity matrix, clusters/opposites, PCA checks."""
    print("=== Emotion Vector Analysis ===\n")
    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")
    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)
    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)
    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])
    # Summary
    print("\n=== SUMMARY ===\n")
    # Mean over the strict upper triangle (excludes the diagonal of self-similarities).
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")
    # BUG FIX: the original indexed results['pca'] unconditionally here even though
    # the valence-arousal check above guarded on its presence, so a results file
    # without a "pca" section raised KeyError mid-summary. Guard the PCA summary
    # the same way, and compute the PC1+PC2 sum once instead of twice.
    if "pca" in results:
        var_12 = results["pca"]["explained_variance_pc1"] + results["pca"]["explained_variance_pc2"]
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions
        # With 20 emotions, we'd expect higher concentration
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()