import pandas as pd
import numpy as np
import logging
from pathlib import Path
from tqdm import tqdm
# Configure root logging at INFO and create the logger used throughout this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Visualizer")

# Input locations, all relative to the current working directory.
DATA_DIR = Path("data")
# Parquet file loaded by visualize_clusters as the per-user data.
SYNTHETIC_DATA_PATH = DATA_DIR.joinpath("synthetic", "user_sequences.parquet")
# CSV whose 'title' column is used to map book titles to row positions.
CATALOG_PATH = DATA_DIR.joinpath("catalog", "books_catalog.csv")
# NumPy array file indexed by catalog row position.
EMBEDDINGS_PATH = DATA_DIR.joinpath("embeddings_cache.npy")

# Output location for the rendered scatter plot.
OUTPUT_DIR = Path("docs")
OUTPUT_IMAGE = OUTPUT_DIR.joinpath("user_clusters_tsne.png")
def _normalize_title(title):
    """Return *title* lower-cased and stripped, or None when it is not a string."""
    if not isinstance(title, str):
        return None
    return title.lower().strip()


def visualize_clusters(sample_size=2000):
    """
    Generate a 2D t-SNE projection of user vectors, colored by persona.

    Each user vector is the mean of the embedding rows of the books in the
    user's 'book_sequence' whose normalized title appears in the catalog.
    Users with no matching book are skipped. The scatter plot is written to
    OUTPUT_IMAGE.

    Args:
        sample_size: Maximum number of users to plot. When the user table is
            larger, a fixed-seed random sample of this size is taken.

    Returns:
        None. The function logs an error and returns early when the plotting
        libraries are not importable, when an input file is missing, or when
        fewer than two users can be embedded.
    """
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from sklearn.manifold import TSNE
    except ImportError as e:
        # Report which import actually failed; scikit-learn is needed too.
        logger.error("Missing visualization libraries! (%s)", e)
        logger.error("Please run: uv pip install matplotlib seaborn scikit-learn")
        return

    logger.info("Loading Data...")
    # All three inputs are read below, so all three must exist.
    required_paths = (CATALOG_PATH, EMBEDDINGS_PATH, SYNTHETIC_DATA_PATH)
    missing_paths = [str(path) for path in required_paths if not path.exists()]
    if missing_paths:
        logger.error("Missing Data! Run download scripts first.")
        logger.error("Not found: %s", ", ".join(missing_paths))
        return

    df_catalog = pd.read_csv(CATALOG_PATH)
    embeddings = np.load(EMBEDDINGS_PATH)
    titles = df_catalog['title'].tolist()

    n_embeddings = len(embeddings)
    if len(titles) != n_embeddings:
        logger.warning(
            "Catalog has %d rows but embeddings has %d; "
            "titles without an embedding row are ignored.",
            len(titles),
            n_embeddings,
        )

    # Map normalized title -> catalog row position. Non-string titles (e.g.
    # NaN from empty CSV cells) and rows past the end of the embeddings array
    # are left out so later lookups cannot fail.
    title_to_idx = {}
    for i, raw_title in enumerate(titles):
        key = _normalize_title(raw_title)
        if key is not None and i < n_embeddings:
            title_to_idx[key] = i

    df_users = pd.read_parquet(SYNTHETIC_DATA_PATH)
    if len(df_users) > sample_size:
        df_users = df_users.sample(sample_size, random_state=42)
    logger.info(f"Processing {len(df_users)} users...")

    user_vectors = []
    user_personas = []
    rows = zip(df_users['book_sequence'], df_users['persona'])
    for history, persona in tqdm(rows, total=len(df_users)):
        if history is None:
            continue
        normalized = (_normalize_title(book) for book in history)
        valid_indices = [title_to_idx[key] for key in normalized if key in title_to_idx]
        if not valid_indices:
            continue
        user_vectors.append(np.mean(embeddings[valid_indices], axis=0))
        user_personas.append(persona)

    valid_users = len(user_vectors)
    if valid_users < 2:
        logger.error(
            "Need at least 2 users with known books to run t-SNE; found %d.",
            valid_users,
        )
        return

    X = np.array(user_vectors)
    logger.info("Running t-SNE")
    # t-SNE requires perplexity < number of samples; keep 30 when possible.
    perplexity = min(30, valid_users - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    X_embedded = tsne.fit_transform(X)

    logger.info("Generating Plot...")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    fig = plt.figure(figsize=(12, 8))
    try:
        sns.scatterplot(
            x=X_embedded[:, 0],
            y=X_embedded[:, 1],
            hue=user_personas,
            palette="viridis",
            alpha=0.7,
            s=60,
        )
        plt.title(
            f"Semantic User Clusters (t-SNE Projection of {valid_users} Users)",
            fontsize=16,
        )
        plt.xlabel("Dimension 1")
        plt.ylabel("Dimension 2")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Persona")
        plt.tight_layout()
        plt.savefig(OUTPUT_IMAGE, dpi=300)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    logger.info(f"Visualization saved to {OUTPUT_IMAGE}")
    print(f"Success! Check {OUTPUT_IMAGE} to see your user clusters.")
# Script entry point: build the plot with the default sample size.
if __name__ == "__main__":
    visualize_clusters()