Text Generation
Transformers
emotion-vectors
interpretability
mechanistic-interpretability
replication
gemma4
google
anthropic
valence-arousal
PCA
logit-lens
linear-probe
probing
emotion
functional-emotions
AI-safety
neuroscience
circumplex-model
activation-extraction
residual-stream
Eval Results (legacy)
File size: 6,169 Bytes
#!/usr/bin/env python3
"""Analyze extracted emotion vectors: cosine similarity, clustering, visualization."""
import json
import os
import numpy as np
# Directory containing this script; all inputs/outputs live in its "results" subfolder.
EXP_DIR = os.path.dirname(os.path.abspath(__file__))
RESULTS_DIR = os.path.join(EXP_DIR, "results")
def load_vectors():
    """Load the saved emotion vectors from results/emotion_vectors.npz.

    Returns a dict mapping emotion name -> numpy array.
    """
    archive_path = os.path.join(RESULTS_DIR, "emotion_vectors.npz")
    archive = np.load(archive_path)
    return {key: archive[key] for key in archive.files}
def cosine_sim(a, b):
    """Cosine similarity of vectors *a* and *b*.

    A tiny epsilon in the denominator guards against division by zero
    for degenerate (all-zero) vectors.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-8
    return np.dot(a, b) / denom
def cosine_similarity_matrix(vectors):
    """Compute the full pairwise cosine-similarity matrix.

    Returns (emotions, matrix) where `emotions` is the sorted list of
    keys and matrix[i, j] is the cosine similarity between the vectors
    for emotions[i] and emotions[j].
    """
    emotions = sorted(vectors)
    n = len(emotions)
    matrix = np.zeros((n, n))
    norm = np.linalg.norm
    for i, name_i in enumerate(emotions):
        v_i = vectors[name_i]
        for j, name_j in enumerate(emotions):
            v_j = vectors[name_j]
            # Same epsilon-guarded cosine as cosine_sim(), inlined here.
            matrix[i, j] = np.dot(v_i, v_j) / (norm(v_i) * norm(v_j) + 1e-8)
    return emotions, matrix
def print_similarity_matrix(emotions, matrix):
    """Pretty-print the similarity matrix with truncated names as column headers."""
    print("\n=== Cosine Similarity Matrix ===\n")
    # Column header: emotion names clipped to 6 chars, right-aligned in 7 cols.
    columns = "".join(f"{e[:6]:>7s}" for e in emotions)
    print(f"{'':12s}" + columns)
    n = len(emotions)
    for i, emotion in enumerate(emotions):
        cells = "".join(f"{matrix[i,j]:7.2f}" for j in range(n))
        print(f"{emotion:12s}" + cells)
def find_clusters(emotions, matrix, threshold=0.5):
    """Report emotion pairs whose similarity exceeds *threshold*.

    Prints each qualifying pair and returns the list of
    (emotion_a, emotion_b, similarity) tuples, highest similarity first.
    """
    print(f"\n=== High Similarity Pairs (>{threshold}) ===\n")
    n = len(emotions)
    # Upper triangle only (j > i): each unordered pair considered once.
    pairs = [
        (emotions[i], emotions[j], matrix[i, j])
        for i in range(n)
        for j in range(i + 1, n)
        if matrix[i, j] > threshold
    ]
    pairs.sort(key=lambda p: p[2], reverse=True)
    for e1, e2, sim in pairs:
        print(f" {e1:12s} <-> {e2:12s} sim={sim:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def find_opposites(emotions, matrix, threshold=-0.3):
    """Report emotion pairs with similarity below *threshold* (opposing directions).

    Prints each qualifying pair and returns the list of
    (emotion_a, emotion_b, similarity) tuples, most negative first.
    """
    print(f"\n=== Opposite Pairs (<{threshold}) ===\n")
    n = len(emotions)
    # Upper triangle only (j > i): each unordered pair considered once.
    pairs = [
        (emotions[i], emotions[j], matrix[i, j])
        for i in range(n)
        for j in range(i + 1, n)
        if matrix[i, j] < threshold
    ]
    pairs.sort(key=lambda p: p[2])
    for e1, e2, sim in pairs:
        print(f" {e1:12s} <-> {e2:12s} sim={sim:.3f}")
    if not pairs:
        print(" (none found)")
    return pairs
def valence_arousal_check(emotions, pca_results):
    """Test whether PC1/PC2 line up with the circumplex valence/arousal axes.

    Compares group-mean separation along each PC using hand-labelled
    valence and arousal groupings. Note: the `emotions` argument is kept
    for interface compatibility; the emotion ordering actually used comes
    from pca_results["emotions"].
    """
    print("\n=== Valence-Arousal Structure Check ===\n")
    positive = {"happy", "proud", "inspired", "loving", "hopeful", "calm", "playful"}
    negative = {"sad", "angry", "afraid", "desperate", "guilty", "disgusted", "lonely", "spiteful"}
    # Arousal groupings revised per dejanseo's feedback (2026-04-06):
    # inspired dropped from high arousal (more contemplative/medium);
    # disgusted/confused/playful/spiteful added to high arousal;
    # loving/hopeful added to low arousal.
    high_arousal = {"angry", "afraid", "surprised", "desperate", "nervous", "anxious", "disgusted", "confused", "playful", "spiteful"}
    low_arousal = {"calm", "sad", "brooding", "lonely", "guilty", "loving", "hopeful"}

    labels = pca_results["emotions"]

    def group_mean(pc_vals, group):
        # Mean PC value over the emotions belonging to `group` (0 if none present).
        members = [pc_vals[i] for i, name in enumerate(labels) if name in group]
        return np.mean(members) if members else 0

    for pc_name in ("PC1", "PC2"):
        pc_vals = pca_results[pc_name.lower()]
        pos_mean = group_mean(pc_vals, positive)
        neg_mean = group_mean(pc_vals, negative)
        hi_mean = group_mean(pc_vals, high_arousal)
        lo_mean = group_mean(pc_vals, low_arousal)
        valence_sep = abs(pos_mean - neg_mean)
        arousal_sep = abs(hi_mean - lo_mean)
        print(f" {pc_name}:")
        print(f" Positive mean: {pos_mean:+.3f} Negative mean: {neg_mean:+.3f} → Valence separation: {valence_sep:.3f}")
        print(f" High-A mean: {hi_mean:+.3f} Low-A mean: {lo_mean:+.3f} → Arousal separation: {arousal_sep:.3f}")
        if valence_sep > arousal_sep and valence_sep > 1.0:
            print(f" → {pc_name} ≈ VALENCE axis")
        elif arousal_sep > valence_sep and arousal_sep > 0.5:
            print(f" → {pc_name} ≈ AROUSAL axis")
        else:
            print(f" → {pc_name} ≈ UNCLEAR — neither valence nor arousal dominant (model may have learned its own geometry)")
def main():
    """Run the full analysis: load vectors, similarity stats, and PCA structure checks."""
    print("=== Emotion Vector Analysis ===\n")
    # Load vectors
    vectors = load_vectors()
    print(f"Loaded {len(vectors)} emotion vectors")
    print(f"Vector dimension: {next(iter(vectors.values())).shape[0]}")
    # Load experiment results for PCA
    results_file = os.path.join(RESULTS_DIR, "experiment_results.json")
    with open(results_file, "r") as f:
        results = json.load(f)
    # Similarity analysis
    emotions, matrix = cosine_similarity_matrix(vectors)
    print_similarity_matrix(emotions, matrix)
    find_clusters(emotions, matrix, threshold=0.4)
    find_opposites(emotions, matrix, threshold=-0.2)
    # Valence-Arousal check
    if "pca" in results:
        valence_arousal_check(emotions, results["pca"])
    # Summary
    print("\n=== SUMMARY ===\n")
    # Mean over the strict upper triangle: each unordered pair counted once,
    # self-similarity excluded.
    avg_sim = matrix[np.triu_indices_from(matrix, k=1)].mean()
    print(f" Average pairwise similarity: {avg_sim:.3f}")
    # BUG FIX: the original read results['pca'] unconditionally here, raising
    # KeyError when "pca" was absent even though the valence/arousal step
    # above explicitly guards on its presence. Guard the summary too.
    if "pca" in results:
        var_12 = results["pca"]["explained_variance_pc1"] + results["pca"]["explained_variance_pc2"]
        print(f" Variance explained by PC1+PC2: {var_12*100:.1f}%")
        # Anthropic found ~30% variance in first 2 PCs for 171 emotions;
        # with 20 emotions we'd expect higher concentration.
        if var_12 > 0.3:
            print(" ✓ Strong 2D structure detected (>30% in PC1+PC2)")
        else:
            print(" ✗ Weak 2D structure (<30% in PC1+PC2)")
    print("\n=== ANALYSIS COMPLETE ===")


if __name__ == "__main__":
    main()