#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os

import numpy as np

# Directory containing this script; results are expected alongside it.
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")


def load_vectors():
    """Load emotion vectors from the .npz archive in RESULTS_DIR.

    Returns a dict mapping emotion name -> numpy vector (one entry per
    array stored in the archive).
    """
    vectors_file = os.path.join(RESULTS_DIR, "emotion_vectors.npz")
    data = np.load(vectors_file)
    return {name: data[name] for name in data.files}


def cosine_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    The 1e-8 term guards against division by zero for (near-)zero vectors.
    """
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)


def cosine_similarity_matrix(vectors):
    """Build the full pairwise cosine-similarity matrix.

    Args:
        vectors: dict of emotion name -> vector.

    Returns:
        (emotions, matrix): emotion names sorted alphabetically, and an
        (n, n) matrix with matrix[i, j] = cos_sim(emotions[i], emotions[j]).
    """
    emotions = sorted(vectors.keys())
    n = len(emotions)
    matrix = np.zeros((n, n))
    for i, e1 in enumerate(emotions):
        for j, e2 in enumerate(emotions):
            matrix[i, j] = cosine_sim(vectors[e1], vectors[e2])
    return emotions, matrix


def print_similarity_matrix(emotions, matrix):
    """Pretty-print the similarity matrix with truncated column labels."""
    print("\n=== Cosine Similarity Matrix ===\n")
    # Header: blank 12-char gutter, then each emotion truncated to 6 chars.
    header = f"{'':12s}" + "".join(f"{e[:6]:>7s}" for e in emotions)
    print(header)
    for i, e in enumerate(emotions):
        row = f"{e:12s}" + "".join(f"{matrix[i,j]:7.2f}" for j in range(len(emotions)))
        print(row)


def find_clusters(emotions, matrix, threshold=0.5):
    """Find emotion pairs with high similarity.

    Prints pairs with similarity above `threshold`, sorted descending,
    and returns them as a list of (e1, e2, sim) tuples.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    pairs = []
    for i in range(len(emotions)):
        for j in range(i + 1, len(emotions)):  # upper triangle only (no self/dup pairs)
            if matrix[i, j] > threshold:
                pairs.append((emotions[i], emotions[j], matrix[i, j]))
    pairs.sort(key=lambda x: -x[2])
    for e1, e2, sim in pairs:
        print(f"  {e1:12s} <-> {e2:12s}  sim={sim:.3f}")
    if not pairs:
        print("  (none found)")
    return pairs


def find_opposites(emotions, matrix, threshold=-0.3):
    """Find emotion pairs with negative similarity (opposites).

    Prints pairs with similarity below `threshold`, most negative first,
    and returns them as a list of (e1, e2, sim) tuples.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    pairs = []
    for i in range(len(emotions)):
        for j in range(i + 1, len(emotions)):  # upper triangle only
            if matrix[i, j] < threshold:
                pairs.append((emotions[i], emotions[j], matrix[i, j]))
    pairs.sort(key=lambda x: x[2])
    for e1, e2, sim in pairs:
        print(f"  {e1:12s} <-> {e2:12s}  sim={sim:.3f}")
    if not pairs:
        print("  (none found)")
    return pairs


def valence_arousal_check(emotions, pca_results):
    """Check if PC1≈valence, PC2≈arousal based on known emotion groupings.

    Args:
        emotions: emotion names (kept for interface compatibility; the PC
            coordinates are matched against pca_results["emotions"]).
        pca_results: dict with keys "pc1", "pc2" (per-emotion coordinates)
            and "emotions" (names aligned with those coordinate lists).
    """
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # - Removed inspired from high arousal (more contemplative/medium)
    # - Added disgusted, confused, playful, spiteful to high arousal
    # - Added loving, hopeful to low arousal
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious",
                    "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}

    for pc_name, pc_vals in [("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])]:
        pos_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in positive]
        neg_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in negative]
        hi_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in high_arousal]
        lo_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in low_arousal]
        # Empty groups fall back to 0 so a sparse emotion set doesn't crash np.mean.
        pos_mean = np.mean(pos_vals) if pos_vals else 0
        neg_mean = np.mean(neg_vals) if neg_vals else 0
        hi_mean = np.mean(hi_vals) if hi_vals else 0
        lo_mean = np.mean(lo_vals) if lo_vals else 0
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f"  {pc_name}:")
        print(f"    Positive mean: {pos_mean:+.3f}   Negative mean: {neg_mean:+.3f}   → Valence separation: {valence_sep:.3f}")
        print(f"    High-A mean: {hi_mean:+.3f}   Low-A mean: {lo_mean:+.3f}   → Arousal separation: {arousal_sep:.3f}")
        # Axis labels require the dominant separation to also clear an
        # absolute magnitude threshold (1.0 for valence, 0.5 for arousal).
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f"    → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f"    → {pc_name} ≈ AROUSAL axis")
        else:
            print(f"    → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")


def main():
    print("=== Emotion Vector Analysis ===\n")

    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")

    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)

    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)

    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])

    # Summary
    print("\n=== SUMMARY ===\n")
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f"  Average pairwise similarity: {avg_sim:.3f}")
    # BUGFIX: the original accessed results['pca'] unconditionally here even
    # though the valence-arousal step above treats "pca" as optional — a
    # results file without PCA data raised KeyError. Guard it the same way.
    if "pca" in results:
        var_12 = results['pca']['explained_variance_pc1'] + results['pca']['explained_variance_pc2']
        print(f"  Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions
        # With 20 emotions, we'd expect higher concentration
        if var_12 > 0.3:
            print("  ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print("  ✗ Weak 2D structure (<30% in PC1+PC2)")

    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()