Text Generation
Transformers
emotion-vectors
interpretability
mechanistic-interpretability
replication
gemma4
google
anthropic
valence-arousal
PCA
logit-lens
linear-probe
probing
emotion
functional-emotions
AI-safety
neuroscience
circumplex-model
activation-extraction
residual-stream
Eval Results (legacy)
#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os

import numpy as np

# Paths are resolved relative to this script so it can be run from any cwd;
# all inputs/outputs live in a sibling "results" directory.
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")
def load_vectors():
    """Load per-emotion vectors from results/emotion_vectors.npz.

    Returns:
        dict mapping emotion name -> numpy array (one vector per emotion).
    """
    archive = np.load(os.path.join(RESULTS_DIR, "emotion_vectors.npz"))
    return {key: archive[key] for key in archive.files}
def cosine_sim(a, b):
    """Cosine similarity of two vectors; 1e-8 in the denominator guards against zero norms."""
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
def cosine_similarity_matrix(vectors):
    """Build the full pairwise cosine-similarity matrix over all emotions.

    Args:
        vectors: dict mapping emotion name -> numpy vector.
    Returns:
        (names, matrix): alphabetically sorted emotion names and the
        corresponding n x n similarity matrix.
    """
    names = sorted(vectors)
    count = len(names)
    sims = np.zeros((count, count))
    for row, first in enumerate(names):
        for col, second in enumerate(names):
            u, v = vectors[first], vectors[second]
            # Same epsilon-guarded cosine as cosine_sim(), written inline.
            sims[row, col] = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8)
    return names, sims
def print_similarity_matrix(emotions, matrix):
    """Pretty-print the similarity matrix, truncating names to 6 chars in the header."""
    print("\n=== Cosine Similarity Matrix ===\n")
    # Column headers: blank corner cell, then one 7-wide cell per emotion.
    columns = "".join(f"{name[:6]:>7s}" for name in emotions)
    print(f"{'':12s}" + columns)
    width = len(emotions)
    for idx, name in enumerate(emotions):
        cells = "".join(f"{matrix[idx,j]:7.2f}" for j in range(width))
        print(f"{name:12s}" + cells)
def find_clusters(emotions, matrix, threshold=0.5):
    """Find emotion pairs with high similarity.

    Prints each qualifying pair and returns them as (name1, name2, sim)
    tuples, most similar first.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    count = len(emotions)
    hits = [
        (emotions[a], emotions[b], matrix[a, b])
        for a in range(count)
        for b in range(a + 1, count)
        if matrix[a, b] > threshold
    ]
    hits.sort(key=lambda item: item[2], reverse=True)
    for first, second, score in hits:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not hits:
        print(" (none found)")
    return hits
def find_opposites(emotions, matrix, threshold=-0.3):
    """Find emotion pairs with negative similarity (opposites).

    Prints each qualifying pair and returns them as (name1, name2, sim)
    tuples, most negative first.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    found = []
    count = len(emotions)
    for a in range(count):
        for b in range(a + 1, count):
            score = matrix[a, b]
            if score < threshold:
                found.append((emotions[a], emotions[b], score))
    found.sort(key=lambda entry: entry[2])
    for first, second, score in found:
        print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    if not found:
        print(" (none found)")
    return found
def valence_arousal_check(emotions, pca_results):
    """Check if PC1≈valence, PC2≈arousal based on known emotion groupings.

    For each of the first two principal components, compares the mean PC
    score of positive vs negative emotions (valence) and high- vs low-arousal
    emotions, then labels the axis by whichever separation dominates.
    """
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # - Removed inspired from high arousal (more contemplative/medium)
    # - Added disgusted, confused, playful, spiteful to high arousal
    # - Added loving, hopeful to low arousal
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}
    names = pca_results["emotions"]

    def group_mean(component, members):
        # Mean PC score over the emotions in *members*; 0 if none are present.
        vals = [component[i] for i, name in enumerate(names) if name in members]
        return np.mean(vals) if vals else 0

    for pc_name, pc_vals in (("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])):
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
        # NOTE: the two separation thresholds are deliberately asymmetric (1.0 vs 0.5).
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f" → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f" → {pc_name} ≈ AROUSAL axis")
        else:
            print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
def main():
    """Run the full analysis: load vectors, similarity matrix, clusters/opposites, PCA checks."""
    print("=== Emotion Vector Analysis ===\n")
    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")
    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)
    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)
    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])
    # Summary
    print("\n=== SUMMARY ===\n")
    # Mean over the strict upper triangle (excludes the diagonal of self-similarities).
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")
    # BUG FIX: the original indexed results['pca'] unconditionally here even though
    # the valence-arousal check above guarded on its presence, so a results file
    # without a "pca" section raised KeyError mid-summary. Guard the PCA summary
    # the same way, and compute the PC1+PC2 sum once instead of twice.
    if "pca" in results:
        var_12 = results["pca"]["explained_variance_pc1"] + results["pca"]["explained_variance_pc2"]
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions
        # With 20 emotions, we'd expect higher concentration
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()