Israelbliz committed on
Commit
74e7e35
·
verified ·
1 Parent(s): 72a7c36

Upload persona.py

Browse files
Files changed (1) hide show
  1. core/persona.py +314 -0
core/persona.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Persona engine — turn a user's review history into a behavioral fingerprint.
2
+
3
+ The persona is the spine of the whole system. Both tasks ask it different
4
+ questions:
5
+
6
+ Task A: "Given this persona and this item, how would the user rate and review it?"
7
+ Task B: "Given this persona, what items would the user want next?"
8
+
9
+ A persona has two layers:
10
+
11
+ 1. Quantitative signals (computed deterministically from history)
12
+ - rating cadence: mean, std, distribution shape
13
+ - review length: mean, std
14
+ - vocabulary fingerprint: top distinctive terms
15
+ - domain mix: which categories the user engages with
16
+ - verified-purchase rate, helpful-vote signal
17
+
18
+ 2. Qualitative summary (LLM-generated, cached)
19
+ - tone descriptor (snarky / earnest / analytical / casual / ...)
20
+ - common preferences (themes, styles)
21
+ - common complaints (deal-breakers)
22
+ - recommended audience for THIS user (one-liner persona pitch)
23
+
24
+ The qualitative layer is what makes generated reviews feel like the actual
25
+ user wrote them. Without it, you get generic LLM prose. With it, you get
26
+ behavioral fidelity — which is one of Task A's three scored axes.
27
+ """
28
+ from __future__ import annotations
29
+
30
+ import logging
31
+ from collections import Counter
32
+ from dataclasses import dataclass, field, asdict
33
+ from typing import Any
34
+
35
+ import pandas as pd
36
+ from pydantic import BaseModel, Field
37
+
38
+ from core.llm import LLMClient
39
+
40
+ log = logging.getLogger(__name__)
41
+
42
+
43
+ # ──────────────────────────────────────────────────────────────────────────────
44
+ # Schemas
45
+ # ──────────────────────────────────────────────────────────────────────────────
46
+
47
class QualitativeSummary(BaseModel):
    """LLM-generated qualitative layer of a persona."""

    # NOTE: the class docstring and every ``description`` below are runtime
    # text (pydantic exposes them in the model schema), so they are kept
    # verbatim; only layout and comments differ.

    # Short label for the reviewer's overall register.
    tone: str = Field(
        description=(
            "One-word tone descriptor: snarky, earnest, analytical, casual, "
            "enthusiastic, terse, verbose, etc."
        ),
    )

    # What the reviewer tends to like.
    preferred_themes: list[str] = Field(
        description="3-5 themes/styles/qualities this user gravitates toward",
    )

    # What the reviewer tends to criticise.
    common_complaints: list[str] = Field(
        description="2-4 recurring deal-breakers or critique patterns",
    )

    # One-sentence pitch of the reviewer's voice.
    voice_one_liner: str = Field(
        description=(
            "A single sentence describing this user's reviewing voice as if "
            "pitching them to a casting director"
        ),
    )
53
+
54
+
55
@dataclass
class UserPersona:
    """Complete persona — quantitative signals + qualitative summary + history.

    The quantitative fields are required at construction time. The
    qualitative fields and ``history_samples`` default to empty values so a
    persona can be created first and filled in afterwards.
    """
    user_id: str

    # Quantitative
    n_reviews: int
    avg_rating: float
    std_rating: float
    avg_review_length: float
    std_review_length: float
    verified_rate: float
    domains: list[str]
    n_domains: int
    rating_distribution: dict[int, float]  # {1: 0.05, 2: 0.1, ..., 5: 0.4}
    top_terms: list[str]  # vocabulary fingerprint

    # Qualitative (lazily filled by PersonaEngine.enrich)
    tone: str = ""
    preferred_themes: list[str] = field(default_factory=list)
    common_complaints: list[str] = field(default_factory=list)
    voice_one_liner: str = ""

    # Sample history for retrieval/grounding (subset of training reviews)
    history_samples: list[dict[str, Any]] = field(default_factory=list)

    def to_prompt_block(self) -> str:
        """Render the persona as a structured prompt section.

        This text is what the LLM sees when generating reviews /
        recommendations, so the layout is kept fixed: one labelled line per
        signal, each terminated by a newline.

        Returns:
            A multi-line string starting with ``USER PERSONA``. Any empty
            text or list field is rendered as ``unspecified`` so no label is
            ever followed by a blank value.
        """
        # Sort by star value so the distribution always reads low -> high.
        dist = " ".join(
            f"{r}★:{p:.0%}" for r, p in sorted(self.rating_distribution.items())
        )
        return (
            f"USER PERSONA\n"
            f" Reviews written: {self.n_reviews}\n"
            f" Avg rating: {self.avg_rating:.2f} (±{self.std_rating:.2f})\n"
            f" Rating distribution: {dist}\n"
            f" Avg review length: {self.avg_review_length:.0f} words (±{self.std_review_length:.0f})\n"
            f" Verified-purchase rate: {self.verified_rate:.0%}\n"
            f" Active domains: {', '.join(self.domains) or 'unspecified'}\n"
            # Only the first 15 terms are shown to keep the prompt compact.
            f" Vocabulary fingerprint: {', '.join(self.top_terms[:15]) or 'unspecified'}\n"
            f" Tone: {self.tone or 'unspecified'}\n"
            f" Preferred themes: {', '.join(self.preferred_themes) or 'unspecified'}\n"
            f" Common complaints: {', '.join(self.common_complaints) or 'unspecified'}\n"
            f" Voice: {self.voice_one_liner or 'unspecified'}\n"
        )

    def as_dict(self) -> dict:
        """Return the persona as a plain ``dict`` (via ``dataclasses.asdict``)."""
        return asdict(self)
106
+
107
+
108
+ # ──────────────────────────────────────────────────────────────────────────────
109
+ # Engine
110
+ # ──────────────────────────────────────────────────────────────────────────────
111
+
112
+ # A small set of generic English stopwords + Amazon-review noise. Keeping
113
+ # this in-module avoids pulling in nltk's download flow.
114
+ _STOPWORDS = set("""
115
+ a an the and or but if then else when while of in on at by to for with from
116
+ into onto over under is are was were be been being have has had do does did
117
+ i you he she it we they me him her us them my your his its our their this
118
+ that these those there here what which who whom whose how why so as too very
119
+ just also more most some any all each every other another such no not nor only
120
+ own same can will would could should might may must one two three really get
121
+ got gets just like dont didnt isnt arent wasnt werent havent hadnt hasnt cant
122
+ couldnt wouldnt shouldnt wont thats whats theres heres ive ill ive youve im
123
+ """.split())
124
+
125
+
126
class PersonaEngine:
    """Build personas from review history.

    Two entry points:
        from_dataframe(user_id, training_reviews_df) -> UserPersona
        enrich(persona) -> UserPersona  # adds qualitative summary via LLM
    """

    # Punctuation trimmed from both ends of a token before counting.
    _TOKEN_STRIP_CHARS = ".,!?\"'()[]{}:;"

    # Terms too generic to serve as a fallback "preferred theme".
    _GENERIC_TERMS = frozenset({
        "book", "read", "story", "movie", "film", "great", "good",
        "really", "much", "first", "next", "through", "about",
    })

    def __init__(self, llm: LLMClient | None = None,
                 top_terms_k: int = 20,
                 history_samples_k: int = 8):
        """Configure the engine.

        Args:
            llm: Client used by ``enrich``. A default ``LLMClient()`` is
                created only when this is ``None``.
            top_terms_k: Number of terms kept in the vocabulary fingerprint.
            history_samples_k: Number of most-recent reviews kept as samples.
        """
        # Identity check rather than truthiness: a caller-supplied client is
        # always honoured, even if it happens to evaluate as falsy.
        self.llm = llm if llm is not None else LLMClient()
        self.top_terms_k = top_terms_k
        self.history_samples_k = history_samples_k
        # Enrichment cache — keyed by user_id. enrich() makes an LLM call per
        # user; for a user already seen this session, the cached qualitative
        # summary is reused instead of calling the LLM again.
        self._enrichment_cache: dict[str, dict] = {}

    # ─────────────────────────── Quantitative ────────────────────────────
    def from_dataframe(self, user_id: str,
                       reviews: pd.DataFrame) -> UserPersona:
        """Build a UserPersona from a DataFrame of one user's training reviews.

        Expected columns: user_id, parent_asin, rating, text, verified_purchase,
        domain, timestamp.

        Args:
            user_id: The user whose rows are selected from ``reviews``.
            reviews: Review rows; rows belonging to other users are ignored.

        Returns:
            A persona with the quantitative fields and ``history_samples``
            populated; the qualitative fields keep their empty defaults.

        Raises:
            ValueError: If ``reviews`` contains no row for ``user_id``.
        """
        user_reviews = reviews[reviews["user_id"] == user_id]
        if user_reviews.empty:
            raise ValueError(f"No reviews found for user_id={user_id!r}")

        ratings = user_reviews["rating"].astype(float)
        # Missing text counts as a zero-word review.
        lengths = user_reviews["text"].fillna("").str.split().str.len()

        # Rating distribution as proportions
        dist = ratings.round().astype(int).value_counts(normalize=True).to_dict()
        rating_dist = {int(k): float(v) for k, v in dist.items()}

        # Vocabulary fingerprint: most common non-stopword tokens
        top_terms = self._top_terms(user_reviews["text"].tolist())

        # Sample history items for retrieval grounding — keep the most recent
        history = (
            user_reviews.sort_values("timestamp", ascending=False)
            .head(self.history_samples_k)
        )
        history_samples = [
            {
                "parent_asin": row["parent_asin"],
                "rating": float(row["rating"]),
                # Missing text (NaN/None) is not sliceable; store "" so one
                # blank review cannot abort persona construction.
                "text": (row["text"] if isinstance(row["text"], str) else "")[:500],
                "domain": row["domain"],
            }
            for _, row in history.iterrows()
        ]

        return UserPersona(
            user_id=user_id,
            n_reviews=len(user_reviews),
            avg_rating=float(ratings.mean()),
            # Sample std is undefined for a single review; report 0.0.
            std_rating=float(ratings.std()) if len(ratings) > 1 else 0.0,
            avg_review_length=float(lengths.mean()),
            std_review_length=float(lengths.std()) if len(lengths) > 1 else 0.0,
            verified_rate=float(user_reviews["verified_purchase"].mean()),
            domains=sorted(user_reviews["domain"].unique().tolist()),
            n_domains=int(user_reviews["domain"].nunique()),
            rating_distribution=rating_dist,
            top_terms=top_terms,
            history_samples=history_samples,
        )

    def _top_terms(self, texts: list[str]) -> list[str]:
        """Most frequent content tokens, stopwords removed.

        Tokens are whitespace-split, lower-cased and trimmed of surrounding
        punctuation. A token is counted only if it is longer than two
        characters, purely alphabetic, and not in ``_STOPWORDS``. Non-string
        entries in ``texts`` are skipped.

        Returns:
            Up to ``self.top_terms_k`` tokens, most frequent first.
        """
        counter: Counter = Counter()
        for txt in texts:
            if not isinstance(txt, str):
                continue
            tokens = (
                t.lower().strip(self._TOKEN_STRIP_CHARS) for t in txt.split()
            )
            counter.update(
                t for t in tokens
                if len(t) > 2 and t.isalpha() and t not in _STOPWORDS
            )
        return [w for w, _ in counter.most_common(self.top_terms_k)]

    # ─────────────────────────── Qualitative ─────────────────────────────
    def enrich(self, persona: UserPersona) -> UserPersona:
        """Add LLM-generated qualitative summary to an existing persona.

        Requests a structured ``QualitativeSummary`` from the client's
        "reasoning" model (the original author notes this is gpt-4o). If the
        call fails, or leaves a field blank, a deterministic summary derived
        from the quantitative signals fills the gaps, so tone and voice are
        never left empty.

        Args:
            persona: Persona to enrich; it is modified in place.

        Returns:
            The same persona object, with qualitative fields filled.
        """
        if not persona.history_samples:
            log.warning("User %s has no history samples; skipping enrichment",
                        persona.user_id)
            return self._apply_deterministic_fallback(persona)

        # Cache hit — reuse the qualitative summary computed earlier this
        # session for this user. One fewer LLM call.
        cached = self._enrichment_cache.get(persona.user_id)
        if cached is not None:
            log.info("Persona enrichment cache hit for %s", persona.user_id)
            persona.tone = cached["tone"] or persona.tone
            # Hand out copies so mutating one persona's lists can never
            # corrupt the cache or another persona for the same user.
            persona.preferred_themes = list(cached["preferred_themes"]) or persona.preferred_themes
            persona.common_complaints = list(cached["common_complaints"]) or persona.common_complaints
            persona.voice_one_liner = cached["voice_one_liner"] or persona.voice_one_liner
            return persona

        sample_block = "\n\n".join(
            f"[{i}] Rating: {s['rating']}★ Domain: {s['domain']}\n{s['text'][:400]}"
            for i, s in enumerate(persona.history_samples, start=1)
        )

        prompt = (
            f"Below are review samples from a single user. Read them carefully "
            f"and infer their reviewing voice.\n\n"
            f"{sample_block}\n\n"
            f"Quantitative signals about this user:\n"
            f"- Average rating: {persona.avg_rating:.2f} of 5\n"
            f"- Average review length: {persona.avg_review_length:.0f} words\n"
            f"- Vocabulary they use often: {', '.join(persona.top_terms[:15])}\n\n"
            f"Produce a qualitative summary of their reviewer voice. "
            f"Be concise and concrete. If the samples are too sparse or generic, "
            f"infer the most plausible voice rather than refusing."
        )

        try:
            summary = self.llm.structured(
                prompt, QualitativeSummary, model="reasoning",
                system="You are a behavioral analyst specializing in online review patterns. Always produce valid output.",
            )
            persona.tone = summary.tone or persona.tone
            persona.preferred_themes = summary.preferred_themes or persona.preferred_themes
            persona.common_complaints = summary.common_complaints or persona.common_complaints
            persona.voice_one_liner = summary.voice_one_liner or persona.voice_one_liner
            # The LLM may return blank fields; backfill only those so the
            # "never empty tone/voice" guarantee also holds on success.
            persona = self._apply_deterministic_fallback(persona)
            # Cache the successful summary for reuse this session. Lists are
            # copied so the cache does not alias the persona's own lists.
            self._enrichment_cache[persona.user_id] = {
                "tone": persona.tone,
                "preferred_themes": list(persona.preferred_themes),
                "common_complaints": list(persona.common_complaints),
                "voice_one_liner": persona.voice_one_liner,
            }
        except Exception as e:
            # Deliberate best-effort: any failure degrades to the
            # deterministic summary instead of propagating.
            log.warning("LLM enrichment failed for %s (%s); using deterministic fallback",
                        persona.user_id, type(e).__name__)
            log.debug("LLM enrichment traceback for %s", persona.user_id,
                      exc_info=True)
            persona = self._apply_deterministic_fallback(persona)

        return persona

    @staticmethod
    def _apply_deterministic_fallback(persona: UserPersona) -> UserPersona:
        """Fill in tone/themes/voice from quantitative signals when LLM fails.

        This isn't as rich as an LLM summary, but it guarantees downstream
        query construction has SOMETHING to work with — much better than
        an empty string. Only fields that are currently empty are written;
        ``common_complaints`` is never touched.

        Returns:
            The same persona object, modified in place.
        """
        # Tone bucket from avg rating
        if persona.avg_rating >= 4.5:
            bucket = "enthusiastic"
        elif persona.avg_rating >= 3.8:
            bucket = "earnest"
        elif persona.avg_rating >= 3.0:
            bucket = "measured"
        else:
            bucket = "critical"
        # Prefer a tone that is already set so the generated voice sentence
        # cannot contradict it.
        tone = persona.tone or bucket

        # Use top distinctive terms as proxy themes (filter out true generics)
        candidate_themes = [
            t for t in persona.top_terms
            if t not in PersonaEngine._GENERIC_TERMS
        ][:5]
        themes = candidate_themes or persona.top_terms[:3]

        # Domain-grounded voice
        domain_str = "/".join(persona.domains) if persona.domains else "general"
        if persona.avg_review_length < 30:
            length_descriptor = "writes brief reviews"
        elif persona.avg_review_length > 150:
            length_descriptor = "writes detailed reviews"
        else:
            length_descriptor = "writes moderate-length reviews"
        # "An enthusiastic" / "An earnest", but "A measured" / "A critical".
        article = "An" if tone[0].lower() in "aeiou" else "A"
        voice = (
            f"{article} {tone} {domain_str} reviewer who {length_descriptor} "
            f"(avg {persona.avg_rating:.1f}★ over {persona.n_reviews} reviews)."
        )

        if not persona.tone:
            persona.tone = tone
        if not persona.preferred_themes:
            persona.preferred_themes = themes
        if not persona.voice_one_liner:
            persona.voice_one_liner = voice
        return persona