File size: 3,158 Bytes
7964128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d773b1
7964128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d773b1
7964128
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import numpy as np
import logging
from pathlib import Path
from tqdm import tqdm

# Emit INFO-level progress messages; the script reports each stage through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Visualizer")

# Input locations, all relative to the current working directory.
DATA_DIR = Path("data")
SYNTHETIC_DATA_PATH = DATA_DIR.joinpath("synthetic", "user_sequences.parquet")
CATALOG_PATH = DATA_DIR.joinpath("catalog", "books_catalog.csv")
EMBEDDINGS_PATH = DATA_DIR.joinpath("embeddings_cache.npy")

# Where the rendered scatter plot is written.
OUTPUT_DIR = Path("docs")
OUTPUT_IMAGE = OUTPUT_DIR.joinpath("user_clusters_tsne.png")

def _mean_history_vector(history, title_to_idx, embeddings):
    """
    Averages the embedding rows of the books in `history` that appear in `title_to_idx`.

    Titles are matched case-insensitively with surrounding whitespace ignored.
    Returns None when `history` is missing or none of its entries can be matched.
    """
    if history is None:
        return None

    matched_rows = [
        title_to_idx[key]
        for key in (book.lower().strip() for book in history if isinstance(book, str))
        if key in title_to_idx
    ]
    if not matched_rows:
        return None
    return embeddings[matched_rows].mean(axis=0)


def visualize_clusters(sample_size=2000, perplexity=30):
    """
    Generates a 2D t-SNE projection of user vectors, colored by Persona.

    Each user vector is the mean of the embedding rows of the books in the user's
    `book_sequence` that can be matched (by normalized title) to a catalog row.
    Users without any matching book are skipped. The plot is saved to OUTPUT_IMAGE.

    Args:
        sample_size: Maximum number of users to plot; larger datasets are
            down-sampled with a fixed random seed.
        perplexity: Requested t-SNE perplexity. It is lowered automatically when
            fewer users are available, because scikit-learn requires the
            perplexity to be smaller than the number of samples.

    Returns:
        None. Problems with missing libraries, missing input files or too few
        usable users are logged and the function returns early.
    """
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from sklearn.manifold import TSNE
    except ImportError as e:
        logger.error("Missing visualization libraries! (%s)", e)
        # sklearn is imported above as well, so the hint must include it.
        logger.error("Please run: uv pip install matplotlib seaborn scikit-learn")
        return

    logger.info("Loading Data...")
    missing_files = [
        path
        for path in (CATALOG_PATH, EMBEDDINGS_PATH, SYNTHETIC_DATA_PATH)
        if not path.exists()
    ]
    if missing_files:
        logger.error("Missing Data! Run download scripts first.")
        logger.error("Not found: %s", ", ".join(str(path) for path in missing_files))
        return

    df_catalog = pd.read_csv(CATALOG_PATH)
    embeddings = np.load(EMBEDDINGS_PATH)

    # Catalog row i is looked up as embeddings[i]; rows without an embedding
    # (or without a usable string title) cannot contribute to a user vector.
    n_embedded = len(embeddings)
    if len(df_catalog) != n_embedded:
        logger.warning(
            "Catalog has %d rows but embeddings cache has %d; unmatched rows are ignored.",
            len(df_catalog),
            n_embedded,
        )
    title_to_idx = {
        title.lower().strip(): idx
        for idx, title in enumerate(df_catalog['title'])
        if isinstance(title, str) and idx < n_embedded
    }

    df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)

    if len(df_users) > sample_size:
        df_users = df_users.sample(sample_size, random_state=42)

    logger.info("Processing %d users...", len(df_users))

    user_vectors = []
    user_personas = []

    user_rows = zip(df_users['book_sequence'], df_users['persona'])
    for history, persona in tqdm(user_rows, total=len(df_users)):
        user_vec = _mean_history_vector(history, title_to_idx, embeddings)
        if user_vec is None:
            continue
        user_vectors.append(user_vec)
        user_personas.append(persona)

    valid_users = len(user_vectors)
    if valid_users < 2:
        logger.error(
            "Only %d user(s) could be matched to the catalog; t-SNE needs at least 2.",
            valid_users,
        )
        return

    X = np.array(user_vectors)

    # scikit-learn rejects perplexity >= n_samples, so cap it for small samples.
    effective_perplexity = min(perplexity, valid_users - 1)
    if effective_perplexity != perplexity:
        logger.warning(
            "Reducing perplexity from %s to %s for %d users.",
            perplexity,
            effective_perplexity,
            valid_users,
        )

    logger.info("Running t-SNE")
    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    X_embedded = tsne.fit_transform(X)

    logger.info("Generating Plot...")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(12, 8))
    try:
        sns.scatterplot(
            x=X_embedded[:, 0],
            y=X_embedded[:, 1],
            hue=user_personas,
            palette="viridis",
            alpha=0.7,
            s=60
        )

        plt.title(f"Semantic User Clusters (t-SNE Projection of {valid_users} Users)", fontsize=16)
        plt.xlabel("Dimension 1")
        plt.ylabel("Dimension 2")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Persona")
        plt.tight_layout()

        plt.savefig(OUTPUT_IMAGE, dpi=300)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    logger.info("Visualization saved to %s", OUTPUT_IMAGE)
    print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")

# Script entry point: when executed directly (not imported), build the plot
# using the function's default arguments.
if __name__ == "__main__":
    visualize_clusters()