Spaces:

Israelbliz
/

User-Modeling-Agent

Running

App Files Files Community

Israelbliz committed about 13 hours ago

Commit

74e7e35

verified ·

1 Parent(s): 72a7c36

Upload persona.py

Browse files

Files changed (1) hide show

core/persona.py +314 -0

core/persona.py ADDED Viewed

	@@ -0,0 +1,314 @@

+"""Persona engine — turn a user's review history into a behavioral fingerprint.
+The persona is the spine of the whole system. Both tasks ask it different
+questions:
+    Task A: "Given this persona and this item, how would the user rate and review it?"
+    Task B: "Given this persona, what items would the user want next?"
+A persona has two layers:
+    1. Quantitative signals (computed deterministically from history)
+       - rating cadence: mean, std, distribution shape
+       - review length: mean, std
+       - vocabulary fingerprint: top distinctive terms
+       - domain mix: which categories the user engages with
+       - verified-purchase rate, helpful-vote signal
+    2. Qualitative summary (LLM-generated, cached)
+       - tone descriptor (snarky / earnest / analytical / casual / ...)
+       - common preferences (themes, styles)
+       - common complaints (deal-breakers)
+       - recommended audience for THIS user (one-liner persona pitch)
+The qualitative layer is what makes generated reviews feel like the actual
+user wrote them. Without it, you get generic LLM prose. With it, you get
+behavioral fidelity — which is one of Task A's three scored axes.
+"""
+from __future__ import annotations
+import logging
+from collections import Counter
+from dataclasses import dataclass, field, asdict
+from typing import Any
+import pandas as pd
+from pydantic import BaseModel, Field
+from core.llm import LLMClient
+log = logging.getLogger(__name__)
+# ──────────────────────────────────────────────────────────────────────────────
+# Schemas
+# ──────────────────────────────────────────────────────────────────────────────
class QualitativeSummary(BaseModel):
    """Structured qualitative layer of a persona, as returned by the LLM.

    Each field's ``description`` is part of the schema handed to the model,
    so the wording below is behavior, not commentary.
    """

    # Single-word label for how the reviewer sounds.
    tone: str = Field(
        description="One-word tone descriptor: snarky, earnest, analytical, casual, enthusiastic, terse, verbose, etc.",
    )
    # What the reviewer tends to like.
    preferred_themes: list[str] = Field(
        description="3-5 themes/styles/qualities this user gravitates toward",
    )
    # What the reviewer tends to criticise.
    common_complaints: list[str] = Field(
        description="2-4 recurring deal-breakers or critique patterns",
    )
    # One-sentence pitch of the reviewer's voice.
    voice_one_liner: str = Field(
        description="A single sentence describing this user's reviewing voice as if pitching them to a casting director",
    )
@dataclass
class UserPersona:
    """A user's behavioral fingerprint: computed signals, LLM summary, samples.

    The quantitative fields are required at construction time. The qualitative
    fields default to empty and are meant to be filled in later by
    ``PersonaEngine.enrich``. ``history_samples`` holds a subset of the user's
    training reviews for retrieval/grounding.
    """

    user_id: str

    # --- Quantitative signals -------------------------------------------
    n_reviews: int
    avg_rating: float
    std_rating: float
    avg_review_length: float
    std_review_length: float
    verified_rate: float
    domains: list[str]
    n_domains: int
    rating_distribution: dict[int, float]   # e.g. {1: 0.05, 2: 0.1, ..., 5: 0.4}
    top_terms: list[str]                    # vocabulary fingerprint

    # --- Qualitative summary (empty until enriched) -----------------------
    tone: str = ""
    preferred_themes: list[str] = field(default_factory=list)
    common_complaints: list[str] = field(default_factory=list)
    voice_one_liner: str = ""

    # --- Grounding samples -----------------------------------------------
    history_samples: list[dict[str, Any]] = field(default_factory=list)

    def to_prompt_block(self) -> str:
        """Return the persona as a fixed-layout, newline-terminated text block.

        The layout is intentionally stable: this is the text an LLM is shown
        when generating reviews or recommendations. Only the first 15
        vocabulary terms are listed, and empty qualitative fields are rendered
        as ``unspecified``.
        """
        def _or_unspecified(text: str) -> str:
            return text or "unspecified"

        stars = " ".join(
            f"{rating}★:{share:.0%}"
            for rating, share in sorted(self.rating_distribution.items())
        )
        rows = [
            "USER PERSONA",
            f"  Reviews written: {self.n_reviews}",
            f"  Avg rating: {self.avg_rating:.2f} (±{self.std_rating:.2f})",
            f"  Rating distribution: {stars}",
            f"  Avg review length: {self.avg_review_length:.0f} words (±{self.std_review_length:.0f})",
            f"  Verified-purchase rate: {self.verified_rate:.0%}",
            f"  Active domains: {', '.join(self.domains)}",
            f"  Vocabulary fingerprint: {', '.join(self.top_terms[:15])}",
            f"  Tone: {_or_unspecified(self.tone)}",
            f"  Preferred themes: {_or_unspecified(', '.join(self.preferred_themes))}",
            f"  Common complaints: {_or_unspecified(', '.join(self.common_complaints))}",
            f"  Voice: {_or_unspecified(self.voice_one_liner)}",
        ]
        # Every row, including the last, ends with a newline.
        return "\n".join(rows) + "\n"

    def as_dict(self) -> dict:
        """Return all fields as a plain (recursively converted) dictionary."""
        return asdict(self)
+# ──────────────────────────────────────────────────────────────────────────────
+# Engine
+# ──────────────────────────────────────────────────────────────────────────────
+# A small set of generic English stopwords + Amazon-review noise. Keeping
+# this in-module avoids pulling in nltk's download flow.
+_STOPWORDS = set("""
+a an the and or but if then else when while of in on at by to for with from
+into onto over under is are was were be been being have has had do does did
+i you he she it we they me him her us them my your his its our their this
+that these those there here what which who whom whose how why so as too very
+just also more most some any all each every other another such no not nor only
+own same can will would could should might may must one two three really get
+got gets just like dont didnt isnt arent wasnt werent havent hadnt hasnt cant
+couldnt wouldnt shouldnt wont thats whats theres heres ive ill ive youve im
+""".split())
class PersonaEngine:
    """Build personas from review history.

    Two entry points:
        from_dataframe(user_id, training_reviews_df) -> UserPersona
        enrich(persona) -> UserPersona   # adds qualitative summary via LLM
    """

    def __init__(self, llm: LLMClient | None = None,
                 top_terms_k: int = 20,
                 history_samples_k: int = 8):
        """Create an engine.

        Args:
            llm: Client used by ``enrich``. A default ``LLMClient()`` is built
                only when this is ``None``.
            top_terms_k: Maximum number of vocabulary terms kept per persona.
            history_samples_k: Maximum number of most-recent reviews kept as
                grounding samples.
        """
        # Test for None explicitly so a supplied client is never discarded
        # just because it happens to be falsy.
        self.llm = llm if llm is not None else LLMClient()
        self.top_terms_k = top_terms_k
        self.history_samples_k = history_samples_k
        # Enrichment cache, keyed by user_id. enrich() makes one LLM call per
        # user; for a user already enriched by this engine instance the stored
        # qualitative summary is reused instead of calling the LLM again.
        self._enrichment_cache: dict[str, dict] = {}

    # ─────────────────────────── Quantitative ────────────────────────────
    def from_dataframe(self, user_id: str,
                       reviews: pd.DataFrame) -> UserPersona:
        """Build a UserPersona from a DataFrame of training reviews.

        Rows are filtered to ``user_id`` first, so ``reviews`` may contain
        other users as well.

        Expected columns: user_id, parent_asin, rating, text, verified_purchase,
                          domain, timestamp.

        Returns:
            A persona with the quantitative fields and ``history_samples``
            populated; the qualitative fields are left at their defaults.

        Raises:
            ValueError: If ``reviews`` has no rows for ``user_id``.
        """
        user_reviews = reviews[reviews["user_id"] == user_id]
        if user_reviews.empty:
            raise ValueError(f"No reviews found for user_id={user_id!r}")

        ratings = user_reviews["rating"].astype(float)
        # Missing text counts as a zero-word review.
        lengths = user_reviews["text"].fillna("").str.split().str.len()

        # Rating distribution as proportions. Missing ratings are dropped
        # first: casting NaN to int would raise.
        dist = ratings.dropna().round().astype(int).value_counts(normalize=True)
        rating_dist = {int(k): float(v) for k, v in dist.items()}

        # Vocabulary fingerprint: most common non-stopword tokens.
        top_terms = self._top_terms(user_reviews["text"].tolist())

        # Sample history items for retrieval grounding — keep the most recent.
        history = (
            user_reviews.sort_values("timestamp", ascending=False)
            .head(self.history_samples_k)
        )
        history_samples = []
        for _, row in history.iterrows():
            text = row["text"]
            history_samples.append({
                "parent_asin": row["parent_asin"],
                "rating": float(row["rating"]),
                # A missing text (NaN/None) is not sliceable; store "" so the
                # sample stays usable, matching how lengths/top_terms treat it.
                "text": text[:500] if isinstance(text, str) else "",
                "domain": row["domain"],
            })

        return UserPersona(
            user_id=user_id,
            n_reviews=len(user_reviews),
            avg_rating=float(ratings.mean()),
            # Sample std is undefined (NaN) for a single review; report 0.0.
            std_rating=float(ratings.std()) if len(ratings) > 1 else 0.0,
            avg_review_length=float(lengths.mean()),
            std_review_length=float(lengths.std()) if len(lengths) > 1 else 0.0,
            verified_rate=float(user_reviews["verified_purchase"].mean()),
            domains=sorted(user_reviews["domain"].unique().tolist()),
            n_domains=int(user_reviews["domain"].nunique()),
            rating_distribution=rating_dist,
            top_terms=top_terms,
            history_samples=history_samples,
        )

    def _top_terms(self, texts: list[str]) -> list[str]:
        """Return up to ``top_terms_k`` most frequent content tokens.

        Tokens are whitespace-split, lowercased and stripped of surrounding
        punctuation. A token is kept only if it is longer than two characters,
        purely alphabetic, and not a stopword. Non-string entries in ``texts``
        are skipped.
        """
        counter: Counter = Counter()
        for txt in texts:
            if not isinstance(txt, str):
                continue
            tokens = (t.lower().strip(".,!?\"'()[]{}:;") for t in txt.split())
            counter.update(
                t for t in tokens
                if len(t) > 2 and t not in _STOPWORDS and t.isalpha()
            )
        return [w for w, _ in counter.most_common(self.top_terms_k)]

    # ─────────────────────────── Qualitative ─────────────────────────────
    def enrich(self, persona: UserPersona) -> UserPersona:
        """Add an LLM-generated qualitative summary to an existing persona.

        The persona is modified in place and also returned. The request is
        sent through ``self.llm.structured`` with ``model="reasoning"`` and the
        ``QualitativeSummary`` schema. If the persona has no history samples,
        or the LLM call raises, the deterministic fallback is applied instead
        so tone and voice are never left empty. Successful summaries are
        cached per ``user_id`` on this engine instance.
        """
        if not persona.history_samples:
            log.warning("User %s has no history samples; skipping enrichment",
                        persona.user_id)
            return self._apply_deterministic_fallback(persona)

        # Cache hit — reuse the summary computed earlier for this user.
        cached = self._enrichment_cache.get(persona.user_id)
        if cached is not None:
            log.info("Persona enrichment cache hit for %s", persona.user_id)
            persona.tone = cached["tone"] or persona.tone
            # Hand out copies so mutating one persona's lists can never
            # change the cached summary or another persona built from it.
            persona.preferred_themes = (
                list(cached["preferred_themes"]) or persona.preferred_themes
            )
            persona.common_complaints = (
                list(cached["common_complaints"]) or persona.common_complaints
            )
            persona.voice_one_liner = cached["voice_one_liner"] or persona.voice_one_liner
            return persona

        sample_block = "\n\n".join(
            f"[{i+1}] Rating: {s['rating']}★  Domain: {s['domain']}\n{s['text'][:400]}"
            for i, s in enumerate(persona.history_samples)
        )
        prompt = (
            f"Below are review samples from a single user. Read them carefully "
            f"and infer their reviewing voice.\n\n"
            f"{sample_block}\n\n"
            f"Quantitative signals about this user:\n"
            f"- Average rating: {persona.avg_rating:.2f} of 5\n"
            f"- Average review length: {persona.avg_review_length:.0f} words\n"
            f"- Vocabulary they use often: {', '.join(persona.top_terms[:15])}\n\n"
            f"Produce a qualitative summary of their reviewer voice. "
            f"Be concise and concrete. If the samples are too sparse or generic, "
            f"infer the most plausible voice rather than refusing."
        )
        try:
            summary = self.llm.structured(
                prompt, QualitativeSummary, model="reasoning",
                system="You are a behavioral analyst specializing in online review patterns. Always produce valid output.",
            )
            # Empty fields in the summary do not overwrite existing values.
            persona.tone = summary.tone or persona.tone
            persona.preferred_themes = summary.preferred_themes or persona.preferred_themes
            persona.common_complaints = summary.common_complaints or persona.common_complaints
            persona.voice_one_liner = summary.voice_one_liner or persona.voice_one_liner
            # Cache the successful summary; store list copies so the cache
            # does not alias the lists now owned by this persona.
            self._enrichment_cache[persona.user_id] = {
                "tone": persona.tone,
                "preferred_themes": list(persona.preferred_themes),
                "common_complaints": list(persona.common_complaints),
                "voice_one_liner": persona.voice_one_liner,
            }
        except Exception as e:
            # Deliberately broad: enrichment is best-effort, and any failure
            # in the LLM path degrades to the deterministic summary.
            log.warning("LLM enrichment failed for %s (%s); using deterministic fallback",
                        persona.user_id, type(e).__name__)
            persona = self._apply_deterministic_fallback(persona)
        return persona

    @staticmethod
    def _apply_deterministic_fallback(persona: UserPersona) -> UserPersona:
        """Fill tone/themes/voice from quantitative signals, without an LLM.

        Only fields that are currently empty are written; values already on
        the persona are kept. ``common_complaints`` is not touched. The
        persona is modified in place and returned.

        This isn't as rich as an LLM summary, but it guarantees downstream
        consumers have SOMETHING to work with instead of empty strings.
        """
        # Tone bucket from average rating.
        if persona.avg_rating >= 4.5:
            tone = "enthusiastic"
        elif persona.avg_rating >= 3.8:
            tone = "earnest"
        elif persona.avg_rating >= 3.0:
            tone = "measured"
        else:
            tone = "critical"

        # Use top terms as proxy themes, skipping words too generic to be a
        # theme; if that leaves nothing, fall back to the raw top terms.
        generic_terms = {"book", "read", "story", "movie", "film", "great", "good",
                         "really", "much", "first", "next", "through", "about"}
        candidate_themes = [t for t in persona.top_terms if t not in generic_terms][:5]
        themes = candidate_themes or persona.top_terms[:3]

        # Domain-grounded voice sentence.
        domain_str = "/".join(persona.domains) if persona.domains else "general"
        if persona.avg_review_length < 30:
            length_descriptor = "writes brief reviews"
        elif persona.avg_review_length > 150:
            length_descriptor = "writes detailed reviews"
        else:
            length_descriptor = "writes moderate-length reviews"
        # "An enthusiastic" / "An earnest" vs "A measured" / "A critical".
        article = "An" if tone[0] in "aeiou" else "A"
        voice = (
            f"{article} {tone} {domain_str} reviewer who {length_descriptor} "
            f"(avg {persona.avg_rating:.1f}★ over {persona.n_reviews} reviews)."
        )

        if not persona.tone:
            persona.tone = tone
        if not persona.preferred_themes:
            persona.preferred_themes = themes
        if not persona.voice_one_liner:
            persona.voice_one_liner = voice
        return persona