rain1955 committed on
Commit
c339092
·
verified ·
1 Parent(s): 051c56b

Add analyze_vectors.py

Browse files
Files changed (1) hide show
  1. analyze_vectors.py +148 -0
analyze_vectors.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
3
+
4
+ import json
5
+ import os
6
+ import numpy as np
7
+
8
# Absolute path of the directory that contains this script.
EXP_DIR = os.path.abspath(os.path.dirname(__file__))
# "results" sub-directory next to the script; the input files are read from here.
RESULTS_DIR = os.path.join(EXP_DIR, "results")
10
+
11
+
12
def load_vectors(vectors_file=None):
    """Load the saved emotion vectors into a plain dictionary.

    Args:
        vectors_file: Optional path to the ``.npz`` archive. When omitted,
            ``emotion_vectors.npz`` inside ``RESULTS_DIR`` is used.

    Returns:
        A dict mapping every array name stored in the archive to its array.
    """
    if vectors_file is None:
        vectors_file = os.path.join(RESULTS_DIR, "emotion_vectors.npz")
    # np.load on an .npz archive returns a lazy NpzFile that keeps the file
    # open; the context manager closes it once every array has been read.
    with np.load(vectors_file) as data:
        return {name: data[name] for name in data.files}
16
+
17
+
18
def cosine_sim(a, b):
    """Return the cosine similarity between ``a`` and ``b``.

    A small constant (1e-8) is added to the product of the two norms, so a
    zero vector produces 0 rather than a division by zero.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / (norm_product + 1e-8)
20
+
21
+
22
def cosine_similarity_matrix(vectors):
    """Compute the pairwise cosine similarity of all vectors.

    Args:
        vectors: Mapping from emotion name to its vector.

    Returns:
        A tuple ``(emotions, matrix)`` where ``emotions`` is the sorted list
        of names and ``matrix[i, j]`` is the cosine similarity between the
        vectors of ``emotions[i]`` and ``emotions[j]``.
    """
    emotions = sorted(vectors)
    size = len(emotions)
    matrix = np.zeros((size, size))

    ordered = [vectors[name] for name in emotions]
    norms = [np.linalg.norm(vec) for vec in ordered]

    for row, (vec_a, norm_a) in enumerate(zip(ordered, norms)):
        for col, (vec_b, norm_b) in enumerate(zip(ordered, norms)):
            # Same formula as cosine_sim: the 1e-8 term guards against a
            # zero denominator.
            matrix[row, col] = np.dot(vec_a, vec_b) / (norm_a * norm_b + 1e-8)
    return emotions, matrix
30
+
31
+
32
def print_similarity_matrix(emotions, matrix):
    """Print the similarity matrix as a fixed-width text table.

    Column headers show at most the first six characters of each emotion
    name; row labels show the name padded to twelve characters, and every
    cell is printed with two decimals.
    """
    print("\n=== Cosine Similarity Matrix ===\n")

    column_titles = [f"{name[:6]:>7s}" for name in emotions]
    print(f"{'':12s}" + "".join(column_titles))

    width = len(emotions)
    for row_idx, name in enumerate(emotions):
        cells = [f"{matrix[row_idx, col_idx]:7.2f}" for col_idx in range(width)]
        print(f"{name:12s}" + "".join(cells))
40
+
41
+
42
def find_clusters(emotions, matrix, threshold=0.5):
    """Report emotion pairs whose similarity is above ``threshold``.

    Only entries above the diagonal are inspected, so each unordered pair
    appears once. The pairs are printed and returned as
    ``(emotion_a, emotion_b, similarity)`` tuples, highest similarity first.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")

    count = len(emotions)
    pairs = [
        (emotions[row], emotions[col], matrix[row, col])
        for row in range(count)
        for col in range(row + 1, count)
        if matrix[row, col] > threshold
    ]
    pairs.sort(key=lambda item: -item[2])

    if pairs:
        for first, second, score in pairs:
            print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    else:
        print(" (none found)")
    return pairs
56
+
57
+
58
def find_opposites(emotions, matrix, threshold=-0.3):
    """Report emotion pairs whose similarity is below ``threshold``.

    Only entries above the diagonal are inspected, so each unordered pair
    appears once. The pairs are printed and returned as
    ``(emotion_a, emotion_b, similarity)`` tuples, lowest similarity first.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")

    count = len(emotions)
    pairs = [
        (emotions[row], emotions[col], matrix[row, col])
        for row in range(count)
        for col in range(row + 1, count)
        if matrix[row, col] < threshold
    ]
    pairs.sort(key=lambda item: item[2])

    if pairs:
        for first, second, score in pairs:
            print(f" {first:12s} <-> {second:12s} sim={score:.3f}")
    else:
        print(" (none found)")
    return pairs
72
+
73
+
74
def valence_arousal_check(emotions, pca_results):
    """Print, for PC1 and PC2, whether the axis separates valence or arousal better.

    For each component, the mean coordinate of four hard-coded emotion groups
    (positive, negative, high arousal, low arousal) is computed. The component
    is labelled VALENCE when the positive/negative gap is strictly larger than
    the high/low arousal gap, and AROUSAL otherwise.

    Args:
        emotions: Not read by this function; kept in the signature for callers.
        pca_results: Mapping with the keys "emotions", "pc1" and "pc2"; the
            component values are indexed by position in "emotions".
    """
    print("\n=== Valence-Arousal Structure Check ===\n")

    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    high_arousal = {"angry", "afraid", "surprised", "desperate", "inspired", "nervous", "anxious"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty"}

    components = (("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"]))
    labels = pca_results["emotions"]

    def group_mean(values, members):
        # Mean coordinate of the listed emotions; 0 when none of them is present.
        selected = [values[idx] for idx, label in enumerate(labels) if label in members]
        return np.mean(selected) if selected else 0

    for pc_name, pc_vals in components:
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)

        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)

        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")

        # A tie counts as arousal, because the comparison is strict.
        verdict = (
            f" → {pc_name} ≈ VALENCE axis"
            if valence_sep > arousal_sep
            else f" → {pc_name} ≈ AROUSAL axis"
        )
        print(verdict)
105
+
106
+
107
def main():
    """Run the full emotion-vector analysis and print a report.

    Loads the vectors with ``load_vectors()``, reads
    ``experiment_results.json`` from ``RESULTS_DIR``, then prints the cosine
    similarity matrix, the high-similarity and opposite pairs, and a summary.
    The valence/arousal check and the PC1+PC2 variance summary are only
    produced when the results file contains a "pca" entry.
    """
    print("=== Emotion Vector Analysis ===\n")

    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    if not vectors:
        # An empty archive would otherwise raise StopIteration on the next line.
        print("No emotion vectors found; nothing to analyze.")
        return
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")

    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(results_file, "r", encoding="utf-8") as f:
        results = json.load(f)

    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)

    # Valence-Arousal check (PCA data is optional in the results file)
    pca = results["pca"] if "pca" in results else None
    if pca is not None:
        valence_arousal_check(emotions, pca)

    # Summary
    print("\n=== SUMMARY ===\n")
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")

    if pca is None:
        # Previously this path raised KeyError because the summary read
        # results['pca'] unconditionally.
        print(" (no PCA results available; skipping variance summary)")
    else:
        var_12 = pca['explained_variance_pc1'] + pca['explained_variance_pc2']
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")

        # Anthropic found ~30% variance in first 2 PCs for 171 emotions
        # With 20 emotions, we'd expect higher concentration
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")

    print("\n=== ANALYSIS COMPLETE ===")
145
+
146
+
147
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()