Text Generation
Transformers
emotion-vectors
interpretability
mechanistic-interpretability
replication
gemma4
google
anthropic
valence-arousal
PCA
logit-lens
linear-probe
probing
emotion
functional-emotions
AI-safety
neuroscience
circumplex-model
activation-extraction
residual-stream
Eval Results (legacy)
Fix arousal groupings per @dejanseo feedback: revised emotion categories, added threshold check for unclear axes
Browse files- analyze_vectors.py +10 -4
analyze_vectors.py
CHANGED
|
@@ -77,8 +77,12 @@ def valence_arousal_check(emotions, pca_results):
|
|
| 77 |
|
| 78 |
positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
|
| 79 |
negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
|
| 80 |
-
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
for pc_name, pc_vals in [("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])]:
|
| 84 |
pos_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in positive]
|
|
@@ -98,10 +102,12 @@ def valence_arousal_check(emotions, pca_results):
|
|
| 98 |
print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
|
| 99 |
print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
|
| 100 |
|
| 101 |
-
if valence_sep > arousal_sep:
|
| 102 |
print(f" → {pc_name} ≈ VALENCE axis")
|
| 103 |
-
|
| 104 |
print(f" → {pc_name} ≈ AROUSAL axis")
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
def main():
|
|
|
|
| 77 |
|
| 78 |
positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
|
| 79 |
negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
|
| 80 |
+
# Arousal groupings revised per dejanseo's feedback (2026-04-06):
|
| 81 |
+
# - Removed inspired from high arousal (more contemplative/medium)
|
| 82 |
+
# - Added disgusted, confused, playful, spiteful to high arousal
|
| 83 |
+
# - Added loving, hopeful to low arousal
|
| 84 |
+
high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
|
| 85 |
+
low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}
|
| 86 |
|
| 87 |
for pc_name, pc_vals in [("PC1", pca_results["pc1"]), ("PC2", pca_results["pc2"])]:
|
| 88 |
pos_vals = [pc_vals[i] for i, e in enumerate(pca_results["emotions"]) if e in positive]
|
|
|
|
| 102 |
print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
|
| 103 |
print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
|
| 104 |
|
| 105 |
+
if valence_sep > arousal_sep and valence_sep > 1.0:
|
| 106 |
print(f" → {pc_name} ≈ VALENCE axis")
|
| 107 |
+
elif arousal_sep > valence_sep and arousal_sep > 0.5:
|
| 108 |
print(f" → {pc_name} ≈ AROUSAL axis")
|
| 109 |
+
else:
|
| 110 |
+
print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
|
| 111 |
|
| 112 |
|
| 113 |
def main():
|