resonate-api / app /api /profile_analytics.py
sandy898's picture
Initial backend deployment
d597de7
Raw
History Blame Contribute Delete
8.54 kB
"""
app/api/profile_analytics.py
FastAPI route for /profile/analytics.
Covers: Vibe Radar, Flavor Fingerprint (with coverage), Blind Spot,
Adventurousness, and Temporal Drift.
Adapted for the continuous rating pipeline.
"""
from __future__ import annotations
import numpy as np
from fastapi import APIRouter
from pydantic import BaseModel
from typing import Optional, List
from app.core.ml_manager import ml_manager
# Router on which this module registers its POST /profile/analytics endpoint.
router = APIRouter()
# ── Excluded tag names (generic / noise) ──────────────────────────────────────
# Broad genre / format labels that say little about an individual taste.
# _flavor_fingerprint skips any tag whose lower-cased name appears here.
_GENERIC_TAGS = {
    "action",
    "adventure",
    "animation",
    "based on a true story",
    "biopic",
    "comedy",
    "documentary",
    "drama",
    "family",
    "fantasy",
    "horror",
    "independent film",
    "mystery",
    "romance",
    "thriller",
}
# ── Minimum history sizes ──────────────────────────────────────────────────────
# Both constants are consumed by _temporal_drift: histories shorter than
# _MIN_DRIFT yield no drift report, and the last _RECENT_WINDOW films form the
# "recent" slice. Keeping _MIN_DRIFT > _RECENT_WINDOW leaves at least one film
# in the "historic" slice.
_MIN_DRIFT = 8 # need at least this many films to show drift
_RECENT_WINDOW = 5 # always use the last N films as "recent"
# THE FIX: Request models updated to match the frontend's new Zustand payload
# One rated film as sent by the client.
# (Documented with comments rather than a class docstring so the generated
# schema description stays exactly as it was.)
class Interaction(BaseModel):
    # Row index of the film; the endpoint only keeps values that fall inside
    # the plot-embedding matrix.
    itemIndex: int
    # User rating; the endpoint treats >= 3.0 as "rated favorably".
    rating: float
    # Sort key used to order the history chronologically.
    # NOTE(review): units (seconds vs. milliseconds) are not visible here.
    timestamp: int
# Request body of POST /profile/analytics: the user's rated films, in any
# order (the endpoint sorts them by timestamp itself).
class HistoryRequest(BaseModel):
    interactions: List[Interaction]
# ── Helpers ───────────────────────────────────────────────────────────────────
def _user_vector(item_indices: list[int]) -> np.ndarray:
    """Collapse the plot embeddings of the given films into one taste vector.

    The embedding rows selected by ``item_indices`` are averaged and the
    average is rescaled to unit length. If the average has a norm of at most
    1e-9 it is returned as-is, so no division by (near) zero takes place.
    """
    # Rows are expected to be unit-norm already (normalized at load time).
    rows = ml_manager.plot_embeddings[item_indices]
    mean_vec = rows.mean(axis=0)
    length = np.linalg.norm(mean_vec)
    if length > 1e-9:
        return mean_vec / length
    return mean_vec
def _project_macro(user_vec: np.ndarray) -> list[dict]:
    """Score the user vector against every macro axis on a 0-100 scale.

    The raw scores are the products of ``ml_manager.macro_matrix`` with
    ``user_vec``. They are min-max scaled per user, so the weakest axis is
    always 0 and the strongest always 100; when all raw scores are equal
    every axis gets 50.

    Returns:
        One ``{"axis": name, "score": value}`` dict per macro axis, with the
        score rounded to one decimal.
    """
    projections = ml_manager.macro_matrix @ user_vec
    low = projections.min()
    high = projections.max()

    if not (high > low):
        # Flat profile: no spread to scale by, so park every axis mid-range.
        rescaled = np.full_like(projections, 50.0)
    else:
        rescaled = (projections - low) / (high - low) * 100

    radar = []
    for axis_name, value in zip(ml_manager.macro_axes_names, rescaled):
        radar.append({"axis": axis_name, "score": round(float(value), 1)})
    return radar
def _top_macro_name(user_vec: np.ndarray) -> str:
    """Return the name of the macro axis that ``user_vec`` projects onto most strongly."""
    axis_scores = ml_manager.macro_matrix @ user_vec
    strongest = int(axis_scores.argmax())
    return ml_manager.macro_axes_names[strongest]
def _adventurousness(item_indices: list[int]) -> int:
    """Rate how eclectic the watched films are on a 0-100 scale.

    The score is ``(1 - mean pairwise similarity) * 100``, where the mean is
    taken over every ordered pair of distinct positions in ``item_indices``
    and similarity is the dot product of the films' plot embeddings (a cosine
    similarity as long as the rows are unit-norm, which they are expected to
    be). Higher means more varied taste.

    Args:
        item_indices: Row indices into ``ml_manager.plot_embeddings``.

    Returns:
        The score rounded to the nearest integer and clamped to [0, 100];
        0 when fewer than two films are given.
    """
    n = len(item_indices)
    # With fewer than two films there is no pair to compare, so bail out
    # before doing the embedding lookup and the n x n product.
    if n < 2:
        return 0

    vecs = ml_manager.plot_embeddings[item_indices]
    sim_matrix = vecs @ vecs.T

    # Drop the diagonal (self-similarity) and average the n*(n-1) remaining
    # entries.
    avg_sim = (sim_matrix.sum() - np.trace(sim_matrix)) / (n * (n - 1))
    score = (1.0 - float(avg_sim)) * 100
    return int(round(min(max(score, 0), 100)))
def _blind_spot(user_vec: np.ndarray, item_indices: list[int]) -> Optional[dict]:
    """Find the cluster the user should like but has barely explored.

    For every key in ``ml_manager.fc_keys``:
      - affinity = the matching entry of ``ml_manager.fc_matrix @ user_vec``
      - density  = watched films in the cluster / films listed for the cluster

    Clusters with affinity below 0.4 or density above 0.25 are ignored. Among
    the rest, the one with the largest ``affinity - density`` gap wins (the
    earliest cluster on ties).

    Returns:
        A dict with ``cluster_name``, ``description``, ``affinity_score`` and
        ``watch_density`` (both rounded to 4 decimals), or None when no
        cluster qualifies.
    """
    cluster_affinity = ml_manager.fc_matrix @ user_vec
    seen = set(item_indices)

    winner = None  # (fc_key, affinity, density) of the best candidate so far
    winning_gap = -1.0
    for pos, fc_key in enumerate(ml_manager.fc_keys):
        affinity = float(cluster_affinity[pos])
        if affinity < 0.4:
            continue

        members = ml_manager.ontology[fc_key].get("item_indices", [])
        # A cluster with no listed films counts as a catalog of one, which
        # keeps the division safe and gives it a density of 0.
        catalog_size = len(members) if members else 1
        density = len(seen & set(members)) / catalog_size
        if density > 0.25:
            continue

        gap = affinity - density
        if gap > winning_gap:
            winning_gap = gap
            winner = (fc_key, affinity, density)

    if winner is None:
        return None

    fc_key, affinity, density = winner
    return {
        "cluster_name": ml_manager.fc_names.get(fc_key, fc_key),
        "description": ml_manager.ontology[fc_key].get("description", ""),
        "affinity_score": round(affinity, 4),
        "watch_density": round(density, 4),
    }
def _flavor_fingerprint(item_indices: list[int]) -> dict:
    """Pick the user's most characteristic tags and report tag coverage.

    Only films whose row in ``ml_manager.tag_scores`` has a positive maximum
    count as covered. Tag scores are averaged over the covered films and tags
    are taken from the highest average downwards, skipping tags that have no
    name or whose lower-cased name is in ``_GENERIC_TAGS``, until 15 are
    collected.

    Returns:
        ``{"top_tags": [...], "genome_covered_count": int, "total_count": int}``.
        ``top_tags`` is empty when no tag scores are loaded or no film is
        covered.
    """
    total = len(item_indices)

    def _payload(tags: list, covered: int) -> dict:
        return {
            "top_tags": tags,
            "genome_covered_count": covered,
            "total_count": total,
        }

    scores = ml_manager.tag_scores
    if scores is None:
        return _payload([], 0)

    # A film is covered when at least one of its tag scores is positive.
    with_genome = [idx for idx in item_indices if scores[idx].max() > 0]
    if not with_genome:
        return _payload([], 0)

    mean_per_tag = scores[with_genome].mean(axis=0)

    picked = []
    for col in np.argsort(mean_per_tag)[::-1]:  # strongest tag first
        if len(picked) >= 15:
            break
        tag_id = ml_manager.tag_col_to_id.get(int(col), "")
        label = ml_manager.tag_id_to_name.get(str(tag_id), "").strip()
        if label and label.lower() not in _GENERIC_TAGS:
            picked.append(label)

    return _payload(picked, len(with_genome))
def _temporal_drift(item_indices: list[int]) -> Optional[dict]:
    """Compare the user's most recent films with everything watched before.

    The history is split with a fixed recency window:
      - recent   = the last ``_RECENT_WINDOW`` entries of ``item_indices``
      - historic = every entry before those

    Args:
        item_indices: Film indices, oldest first.

    Returns:
        None when the history has fewer than ``_MIN_DRIFT`` films or the
        historic slice is empty. Otherwise a dict with:
          - ``early_focus``: strongest macro axis of the historic slice
          - ``recent_focus``: strongest macro axis of the recent slice
          - ``shift_magnitude``: cosine distance between the two slice
            vectors, rounded to 4 decimals (0 = same direction,
            1 = orthogonal, 2 = opposite).
    """
    if len(item_indices) < _MIN_DRIFT:
        return None

    recent_items = item_indices[-_RECENT_WINDOW:]
    historic_items = item_indices[:-_RECENT_WINDOW]
    if not historic_items:
        return None

    recent_vec = _user_vector(recent_items)
    historic_vec = _user_vector(historic_items)

    # Cosine distance = 1 - cosine similarity (both vectors are unit-norm
    # unless _user_vector hit its near-zero fallback).
    cos_sim = float(np.dot(recent_vec, historic_vec))
    # Rounding error can push the dot product of unit vectors slightly past
    # +/-1, which would surface as a negative (or -0.0) distance; clamp to the
    # valid [0, 2] range before rounding.
    magnitude = round(min(max(1.0 - cos_sim, 0.0), 2.0), 4)

    return {
        "early_focus": _top_macro_name(historic_vec),
        "recent_focus": _top_macro_name(recent_vec),
        "shift_magnitude": magnitude,
    }
# ── Endpoint ──────────────────────────────────────────────────────────────────
@router.post("/profile/analytics")
def profile_analytics(req: HistoryRequest):
    # Builds the full analytics payload (radar, top tags, tag coverage, blind
    # spot, adventurousness, drift) from the user's rating history. Responds
    # with {"status": "empty"} when no usable film remains after filtering.
    # Kept as comments on purpose: a docstring here would be published as the
    # route description in the generated API schema.

    # Oldest first, so the tail of the list is the user's most recent
    # activity; _temporal_drift relies on this order.
    chronological = sorted(req.interactions, key=lambda inter: inter.timestamp)

    # Only films the user rated favorably (>= 3.0) shape the profile.
    liked = [inter.itemIndex for inter in chronological if inter.rating >= 3.0]
    if not liked:
        return {"status": "empty"}

    # Discard indices that do not address a row of the embedding matrix.
    n_items = ml_manager.plot_embeddings.shape[0]
    in_range = [idx for idx in liked if 0 <= idx < n_items]
    if not in_range:
        return {"status": "empty"}

    user_vec = _user_vector(in_range)
    radar = _project_macro(user_vec)
    fingerprint = _flavor_fingerprint(in_range)

    return {
        "status": "ok",
        "radar": radar,
        "top_tags": fingerprint["top_tags"],
        "tag_coverage": {
            "covered": fingerprint["genome_covered_count"],
            "total": fingerprint["total_count"],
        },
        "blind_spot": _blind_spot(user_vec, in_range),
        "adventurousness": _adventurousness(in_range),
        "drift": _temporal_drift(in_range),
    }