# emotion-vector-replication / analyze_vectors.py
# Uploaded by rain1955 — commit 8e7c29e (verified)
# Commit message: Fix arousal groupings per @dejanseo feedback: revised emotion
# categories, added threshold check for unclear axes
#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os
import numpy as np
# Directory containing this script; all analysis inputs live under its
# "results" subdirectory (written by the extraction step).
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")
def load_vectors(vectors_file=None):
    """Load extracted emotion steering vectors from an ``.npz`` archive.

    Args:
        vectors_file: Optional path to the archive. Defaults to
            ``results/emotion_vectors.npz`` under the experiment directory,
            preserving the original behavior for existing callers.

    Returns:
        dict mapping emotion name -> 1-D numpy vector (one entry per
        array stored in the archive).
    """
    if vectors_file is None:
        vectors_file = os.path.join(RESULTS_DIR, "emotion_vectors.npz")
    data = np.load(vectors_file)
    return {name: data[name] for name in data.files}
def cosine_sim(a, b):
    """Cosine similarity of two vectors, with a small epsilon in the
    denominator to guard against zero-norm inputs."""
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
def cosine_similarity_matrix(vectors):
    """Compute the full pairwise cosine-similarity matrix.

    Returns (names, matrix) where names is the alphabetically sorted list
    of emotion labels and matrix[i, j] is the cosine similarity between
    the vectors for names[i] and names[j] (epsilon-guarded denominator).
    """
    names = sorted(vectors)
    count = len(names)
    sim = np.zeros((count, count))
    for row in range(count):
        for col in range(count):
            va, vb = vectors[names[row]], vectors[names[col]]
            sim[row, col] = np.dot(va, vb) / (np.linalg.norm(va) * np.linalg.norm(vb) + 1e-8)
    return names, sim
def print_similarity_matrix(emotions, matrix):
    """Pretty-print the pairwise similarity matrix as an aligned table.

    Column headers are emotion names truncated to 6 characters; each cell
    is the similarity formatted to two decimals.
    """
    print("\n=== Cosine Similarity Matrix ===\n")
    cols = "".join(f"{name[:6]:>7s}" for name in emotions)
    print(f"{'':12s}" + cols)
    for idx, name in enumerate(emotions):
        cells = "".join(f"{matrix[idx,k]:7.2f}" for k in range(len(emotions)))
        print(f"{name:12s}" + cells)
def find_clusters(emotions, matrix, threshold=0.5):
    """Report emotion pairs whose similarity exceeds *threshold*.

    Prints the qualifying pairs sorted by descending similarity and
    returns them as (emotion_a, emotion_b, similarity) tuples.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    count = len(emotions)
    pairs = [
        (emotions[a], emotions[b], matrix[a, b])
        for a in range(count)
        for b in range(a + 1, count)
        if matrix[a, b] > threshold
    ]
    pairs.sort(key=lambda entry: -entry[2])
    for first, second, score in pairs:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def find_opposites(emotions, matrix, threshold=-0.3):
    """Report emotion pairs whose similarity falls below *threshold*.

    Prints the qualifying (opposite) pairs sorted from most to least
    negative and returns them as (emotion_a, emotion_b, similarity)
    tuples.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    count = len(emotions)
    pairs = [
        (emotions[a], emotions[b], matrix[a, b])
        for a in range(count)
        for b in range(a + 1, count)
        if matrix[a, b] < threshold
    ]
    pairs.sort(key=lambda entry: entry[2])
    for first, second, score in pairs:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def valence_arousal_check(emotions, pca_results):
    """Check if PC1≈valence, PC2≈arousal based on known emotion groupings."""
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # - inspired dropped from high arousal (more contemplative/medium)
    # - disgusted, confused, playful, spiteful added to high arousal
    # - loving, hopeful added to low arousal
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}
    labels = pca_results["emotions"]

    def group_mean(component, group):
        # Mean of component values for emotions in `group`; 0 when no
        # labeled emotion belongs to the group.
        members = [component[k] for k, name in enumerate(labels) if name in group]
        return np.mean(members) if members else 0

    for pc_name, pc_vals in (("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])):
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
        # Label the axis only when one separation both dominates the other
        # and clears its own threshold; otherwise report it as unclear.
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f" → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f" → {pc_name} ≈ AROUSAL axis")
        else:
            print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
def main():
    """Run the full analysis: load vectors, print similarity matrix,
    report clusters/opposites, and check valence-arousal structure."""
    print("=== Emotion Vector Analysis ===\n")
    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")
    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)
    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)
    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])
    # Summary
    print("\n=== SUMMARY ===\n")
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")
    # Bug fix: the summary previously indexed results['pca'] unconditionally
    # even though the valence-arousal check above guards on its presence —
    # a results file without PCA data raised KeyError here. Guard it too.
    if "pca" in results:
        var_12 = results["pca"]["explained_variance_pc1"] + results["pca"]["explained_variance_pc2"]
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions;
        # with 20 emotions we'd expect higher concentration.
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")
    print("\n=== ANALYSIS COMPLETE ===")
# Standard entry-point guard: run the analysis only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    main()