Spaces:
Running
Running
"""Persona engine — turn a user's review history into a behavioral fingerprint.

The persona is the spine of the whole system. Both tasks ask it different
questions:
    Task A: "Given this persona and this item, how would the user rate and review it?"
    Task B: "Given this persona, what items would the user want next?"

A persona has two layers:
    1. Quantitative signals (computed deterministically from history)
        - rating cadence: mean, std, distribution shape
        - review length: mean, std
        - vocabulary fingerprint: top distinctive terms
        - domain mix: which categories the user engages with
        - verified-purchase rate, helpful-vote signal
    2. Qualitative summary (LLM-generated, cached)
        - tone descriptor (snarky / earnest / analytical / casual / ...)
        - common preferences (themes, styles)
        - common complaints (deal-breakers)
        - recommended audience for THIS user (one-liner persona pitch)

The qualitative layer is what makes generated reviews feel like the actual
user wrote them. Without it, you get generic LLM prose. With it, you get
behavioral fidelity — which is one of Task A's three scored axes.
"""
| from __future__ import annotations | |
| import logging | |
| from collections import Counter | |
| from dataclasses import dataclass, field, asdict | |
| from typing import Any | |
| import pandas as pd | |
| from pydantic import BaseModel, Field | |
| from core.llm import LLMClient | |
# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)
# ──────────────────────────────────────────────────────────────────────────────
# Schemas
# ──────────────────────────────────────────────────────────────────────────────
class QualitativeSummary(BaseModel):
    """Structured output schema for the LLM-written half of a persona.

    Each field's ``description`` is passed to ``Field`` and is therefore part
    of the schema itself, not just documentation.
    """

    tone: str = Field(
        description=(
            "One-word tone descriptor: snarky, earnest, analytical, casual, "
            "enthusiastic, terse, verbose, etc."
        ),
    )
    preferred_themes: list[str] = Field(
        description="3-5 themes/styles/qualities this user gravitates toward",
    )
    common_complaints: list[str] = Field(
        description="2-4 recurring deal-breakers or critique patterns",
    )
    voice_one_liner: str = Field(
        description=(
            "A single sentence describing this user's reviewing voice "
            "as if pitching them to a casting director"
        ),
    )
@dataclass
class UserPersona:
    """Complete persona — quantitative signals + qualitative summary + history.

    This is a dataclass: the annotated names below are constructor keyword
    arguments, and ``as_dict`` relies on ``dataclasses.asdict``. The
    quantitative fields are required; the qualitative fields and
    ``history_samples`` default to empty values.
    """

    user_id: str

    # Quantitative
    n_reviews: int
    avg_rating: float
    std_rating: float
    avg_review_length: float
    std_review_length: float
    verified_rate: float
    domains: list[str]
    n_domains: int
    rating_distribution: dict[int, float]  # {1: 0.05, 2: 0.1, ..., 5: 0.4}
    top_terms: list[str]  # vocabulary fingerprint

    # Qualitative (lazily filled by PersonaEngine.enrich)
    tone: str = ""
    preferred_themes: list[str] = field(default_factory=list)
    common_complaints: list[str] = field(default_factory=list)
    voice_one_liner: str = ""

    # Sample history for retrieval/grounding (subset of training reviews)
    history_samples: list[dict[str, Any]] = field(default_factory=list)

    def to_prompt_block(self) -> str:
        """Render the persona as a structured prompt section.

        This text is what the LLM sees when generating reviews /
        recommendations. Keeping it formatted consistently is what makes
        generation behaviorally faithful.

        Returns:
            A multi-line string starting with ``USER PERSONA``. Empty
            qualitative fields are rendered as ``unspecified``; at most the
            first 15 ``top_terms`` are listed.
        """
        # Sorted by star value so the distribution always reads 1★ … 5★.
        dist = " ".join(
            f"{r}★:{p:.0%}" for r, p in sorted(self.rating_distribution.items())
        )
        return (
            f"USER PERSONA\n"
            f" Reviews written: {self.n_reviews}\n"
            f" Avg rating: {self.avg_rating:.2f} (±{self.std_rating:.2f})\n"
            f" Rating distribution: {dist}\n"
            f" Avg review length: {self.avg_review_length:.0f} words (±{self.std_review_length:.0f})\n"
            f" Verified-purchase rate: {self.verified_rate:.0%}\n"
            f" Active domains: {', '.join(self.domains)}\n"
            f" Vocabulary fingerprint: {', '.join(self.top_terms[:15])}\n"
            f" Tone: {self.tone or 'unspecified'}\n"
            f" Preferred themes: {', '.join(self.preferred_themes) or 'unspecified'}\n"
            f" Common complaints: {', '.join(self.common_complaints) or 'unspecified'}\n"
            f" Voice: {self.voice_one_liner or 'unspecified'}\n"
        )

    def as_dict(self) -> dict:
        """Return the persona as a plain dict (recursive copy via ``asdict``)."""
        return asdict(self)
# ──────────────────────────────────────────────────────────────────────────────
# Engine
# ──────────────────────────────────────────────────────────────────────────────
# Generic English stopwords plus contraction-style noise tokens with the
# apostrophes already stripped. Defined in-module so nothing has to go
# through nltk's download flow.
_STOPWORDS = set(
    " ".join((
        "a an the and or but if then else when while of in on at by to for with from",
        "into onto over under is are was were be been being have has had do does did",
        "i you he she it we they me him her us them my your his its our their this",
        "that these those there here what which who whom whose how why so as too very",
        "just also more most some any all each every other another such no not nor only",
        "own same can will would could should might may must one two three really get",
        "got gets just like dont didnt isnt arent wasnt werent havent hadnt hasnt cant",
        "couldnt wouldnt shouldnt wont thats whats theres heres ive ill ive youve im",
    )).split()
)
class PersonaEngine:
    """Build personas from review history.

    Two entry points:
        from_dataframe(user_id, training_reviews_df) -> UserPersona
        enrich(persona) -> UserPersona   # adds qualitative summary via LLM
    """

    def __init__(self, llm: LLMClient | None = None,
                 top_terms_k: int = 20,
                 history_samples_k: int = 8):
        """Configure the engine.

        Args:
            llm: Client used by ``enrich``. A default ``LLMClient()`` is
                constructed only when this is ``None``.
            top_terms_k: Number of vocabulary-fingerprint terms to keep.
            history_samples_k: Number of most-recent reviews kept as samples.
        """
        # Explicit None check: a caller-supplied client is kept even if it
        # happens to evaluate as falsy.
        self.llm = llm if llm is not None else LLMClient()
        self.top_terms_k = top_terms_k
        self.history_samples_k = history_samples_k
        # Enrichment cache, keyed by user_id only. A user already enriched by
        # this engine instance reuses the stored qualitative summary instead
        # of triggering another LLM call. Because the key ignores the review
        # history, a persona rebuilt from different reviews for the same user
        # gets the earlier summary.
        self._enrichment_cache: dict[str, dict] = {}

    # ─────────────────────────── Quantitative ────────────────────────────

    def from_dataframe(self, user_id: str,
                       reviews: pd.DataFrame) -> UserPersona:
        """Build a UserPersona from a DataFrame of training reviews.

        Expected columns: user_id, parent_asin, rating, text,
        verified_purchase, domain, timestamp. Rows for other users are
        filtered out.

        Raises:
            ValueError: if ``reviews`` has no rows for ``user_id``.
        """
        user_reviews = reviews[reviews["user_id"] == user_id]
        if user_reviews.empty:
            raise ValueError(f"No reviews found for user_id={user_id!r}")

        ratings = user_reviews["rating"].astype(float)
        # Word counts; missing text counts as an empty review.
        lengths = user_reviews["text"].fillna("").str.split().str.len()

        # Rating distribution as proportions, keyed by the rounded star value.
        dist = ratings.round().astype(int).value_counts(normalize=True).to_dict()
        rating_dist = {int(k): float(v) for k, v in dist.items()}

        # Vocabulary fingerprint: most common non-stopword tokens.
        top_terms = self._top_terms(user_reviews["text"].tolist())

        # Sample history items for retrieval grounding — keep the most recent.
        history = (
            user_reviews.sort_values("timestamp", ascending=False)
            .head(self.history_samples_k)
        )
        history_samples = [
            {
                "parent_asin": row["parent_asin"],
                "rating": float(row["rating"]),
                # Missing text arrives as None/NaN, which cannot be sliced;
                # store an empty string so a blank review cannot crash the
                # build (the length and term computations already allow it).
                "text": row["text"][:500] if isinstance(row["text"], str) else "",
                "domain": row["domain"],
            }
            for _, row in history.iterrows()
        ]

        return UserPersona(
            user_id=user_id,
            n_reviews=len(user_reviews),
            avg_rating=float(ratings.mean()),
            # Sample std is NaN for a single observation; report 0.0 instead.
            std_rating=float(ratings.std()) if len(ratings) > 1 else 0.0,
            avg_review_length=float(lengths.mean()),
            std_review_length=float(lengths.std()) if len(lengths) > 1 else 0.0,
            verified_rate=float(user_reviews["verified_purchase"].mean()),
            domains=sorted(user_reviews["domain"].unique().tolist()),
            n_domains=int(user_reviews["domain"].nunique()),
            rating_distribution=rating_dist,
            top_terms=top_terms,
            history_samples=history_samples,
        )

    def _top_terms(self, texts: list[str]) -> list[str]:
        """Return the ``top_terms_k`` most frequent content tokens.

        Tokens are whitespace-split, lower-cased and stripped of surrounding
        punctuation. Only purely alphabetic tokens longer than two characters
        that are not in ``_STOPWORDS`` are counted. Non-string entries
        (e.g. NaN for missing text) are skipped.
        """
        counter: Counter = Counter()
        for txt in texts:
            if not isinstance(txt, str):
                continue
            tokens = (t.lower().strip(".,!?\"'()[]{}:;") for t in txt.split())
            counter.update(
                t for t in tokens
                if len(t) > 2 and t not in _STOPWORDS and t.isalpha()
            )
        return [w for w, _ in counter.most_common(self.top_terms_k)]

    # ─────────────────────────── Qualitative ─────────────────────────────

    def enrich(self, persona: UserPersona) -> UserPersona:
        """Add an LLM-generated qualitative summary to an existing persona.

        The request is sent with ``model="reasoning"`` (a model alias that
        the LLM client resolves). The persona is mutated in place and
        returned. Whatever happens — no history samples, a cache hit, a
        successful call or a failed one — any of tone / preferred themes /
        voice that is still empty afterwards is filled by the deterministic
        fallback, so those fields are never left blank.
        """
        if not persona.history_samples:
            log.warning("User %s has no history samples; skipping enrichment",
                        persona.user_id)
            return self._apply_deterministic_fallback(persona)

        # Cache hit — reuse the summary computed earlier for this user_id.
        cached = self._enrichment_cache.get(persona.user_id)
        if cached is not None:
            log.info("Persona enrichment cache hit for %s", persona.user_id)
            self._apply_qualitative(persona, cached)
            return self._apply_deterministic_fallback(persona)

        prompt = self._build_enrichment_prompt(persona)
        try:
            summary = self.llm.structured(
                prompt, QualitativeSummary, model="reasoning",
                system="You are a behavioral analyst specializing in online review patterns. Always produce valid output.",
            )
            self._apply_qualitative(persona, {
                "tone": summary.tone,
                "preferred_themes": summary.preferred_themes,
                "common_complaints": summary.common_complaints,
                "voice_one_liner": summary.voice_one_liner,
            })
            # Cache copies of the lists so that later mutation of this
            # persona cannot alter what other personas receive on a hit.
            self._enrichment_cache[persona.user_id] = {
                "tone": persona.tone,
                "preferred_themes": list(persona.preferred_themes),
                "common_complaints": list(persona.common_complaints),
                "voice_one_liner": persona.voice_one_liner,
            }
        except Exception as e:
            # Deliberate best-effort: any failure of the LLM call degrades to
            # the deterministic summary rather than propagating.
            log.warning("LLM enrichment failed for %s (%s); using deterministic fallback",
                        persona.user_id, type(e).__name__)
        # Only fills fields that are still empty, so it is a no-op after a
        # complete LLM summary.
        return self._apply_deterministic_fallback(persona)

    @staticmethod
    def _build_enrichment_prompt(persona: UserPersona) -> str:
        """Render the history samples and key signals into the LLM prompt."""
        sample_block = "\n\n".join(
            f"[{i+1}] Rating: {s['rating']}★ Domain: {s['domain']}\n{s['text'][:400]}"
            for i, s in enumerate(persona.history_samples)
        )
        return (
            f"Below are review samples from a single user. Read them carefully "
            f"and infer their reviewing voice.\n\n"
            f"{sample_block}\n\n"
            f"Quantitative signals about this user:\n"
            f"- Average rating: {persona.avg_rating:.2f} of 5\n"
            f"- Average review length: {persona.avg_review_length:.0f} words\n"
            f"- Vocabulary they use often: {', '.join(persona.top_terms[:15])}\n\n"
            f"Produce a qualitative summary of their reviewer voice. "
            f"Be concise and concrete. If the samples are too sparse or generic, "
            f"infer the most plausible voice rather than refusing."
        )

    @staticmethod
    def _apply_qualitative(persona: UserPersona, values: dict) -> None:
        """Copy the non-empty qualitative ``values`` onto ``persona``.

        An empty value leaves the persona's current field in place. Lists are
        copied so the persona never shares a list object with ``values``.
        """
        persona.tone = values["tone"] or persona.tone
        persona.preferred_themes = list(
            values["preferred_themes"] or persona.preferred_themes)
        persona.common_complaints = list(
            values["common_complaints"] or persona.common_complaints)
        persona.voice_one_liner = values["voice_one_liner"] or persona.voice_one_liner

    @staticmethod
    def _apply_deterministic_fallback(persona: UserPersona) -> UserPersona:
        """Fill in tone/themes/voice from quantitative signals.

        This isn't as rich as an LLM summary, but it guarantees downstream
        query construction has SOMETHING to work with — much better than an
        empty string. Only fields that are currently empty are written;
        ``common_complaints`` is never touched. The persona is mutated in
        place and returned.
        """
        # Tone bucket from avg rating
        if persona.avg_rating >= 4.5:
            tone = "enthusiastic"
        elif persona.avg_rating >= 3.8:
            tone = "earnest"
        elif persona.avg_rating >= 3.0:
            tone = "measured"
        else:
            tone = "critical"

        # Use top terms as proxy themes, dropping words too generic to say
        # anything about taste; fall back to the raw top terms if that
        # leaves nothing.
        generic_terms = {"book", "read", "story", "movie", "film", "great", "good",
                         "really", "much", "first", "next", "through", "about"}
        candidate_themes = [t for t in persona.top_terms if t not in generic_terms][:5]
        themes = candidate_themes or persona.top_terms[:3]

        # Domain-grounded voice
        domain_str = "/".join(persona.domains) if persona.domains else "general"
        if persona.avg_review_length < 30:
            length_descriptor = "writes brief reviews"
        elif persona.avg_review_length > 150:
            length_descriptor = "writes detailed reviews"
        else:
            length_descriptor = "writes moderate-length reviews"
        # "An enthusiastic" / "An earnest" vs "A measured" / "A critical".
        article = "An" if tone[0] in "aeiou" else "A"
        voice = (
            f"{article} {tone} {domain_str} reviewer who {length_descriptor} "
            f"(avg {persona.avg_rating:.1f}★ over {persona.n_reviews} reviews)."
        )

        if not persona.tone:
            persona.tone = tone
        if not persona.preferred_themes:
            persona.preferred_themes = themes
        if not persona.voice_one_liner:
            persona.voice_one_liner = voice
        return persona