File size: 12,791 Bytes
1539e17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
"""Task A agent β€” the Impersonator.

Given a UserPersona and an item (title, description, categories, domain),
produce a predicted rating and a generated review that match the user's
behavioral voice.

The workflow is a deterministic 5-step pipeline:

    1. select_similar_history(persona, item)
         → pick up to 3 past reviews from the persona's history
         → "similar" means same domain when possible, else any
         → these ground the generation in the user's actual writing samples
    2. build_prompt(persona, item, similar_history)
         → render the persona + similar reviews + item into a structured prompt
         → the prompt is what the LLM sees
    3. llm.structured(prompt, GeneratedReview)
         → call GPT-4o (reasoning tier) and parse into a Pydantic schema
         → schema enforces (rating: float, review: str, reasoning: str)
    4. reflect_on_review(...)
         → critique + refine the draft rating and review
         → skipped when the agent is created with use_reflection=False
    5. postprocess(output, persona)
         → clamp rating to 1-5 and snap it to the nearest half-star
         → if naija_mode is on, run the review through the style layer

The reasoning field is mandatory and exposed in the API response. This is
how the system demonstrates "intelligence per feature" — every generated
review comes with a sentence explaining why this rating, grounded in the
persona's signals.
"""
from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Optional

from pydantic import BaseModel, Field

from core.llm import LLMClient
from core.persona import UserPersona
from core.nigerian import naija_style_review
from core.reflection import reflect_on_review, ReflectionTrace

log = logging.getLogger(__name__)


# ──────────────────────────────────────────────────────────────────────────────
# Schemas
# ──────────────────────────────────────────────────────────────────────────────

class ItemInput(BaseModel):
    """Item details given to the Impersonator."""
    # NOTE(review): pydantic normally copies the class docstring and each
    # Field(description=...) into the generated JSON schema, so treat that
    # text as runtime data — confirm before rewording it.

    # Required. Item identifier; logged by ImpersonationAgent.run.
    parent_asin: str = Field(description="Item ID")
    # Required. Rendered as the "Title:" line of the prompt.
    title: str = Field(description="Item title")
    # Optional. build_prompt includes only the first 800 characters and
    # leaves the line out entirely when this is empty.
    description: str = Field(default="", description="Item description / synopsis")
    # Optional. Left out of the prompt when empty.
    categories: str = Field(default="", description="Category breadcrumbs")
    # Required. Also compared against each history sample's "domain" by
    # select_similar_history to prefer same-domain reviews.
    domain: str = Field(description="Books / Movies_and_TV / Kindle_Store")
    # Optional. build_prompt shows it (one decimal place) only when truthy,
    # so both None and 0.0 are left out of the prompt.
    average_rating: Optional[float] = Field(default=None, description="Crowd average rating, if known")


class GeneratedReview(BaseModel):
    """Structured output from the LLM."""
    # Passed as `schema=` to llm.structured in ImpersonationAgent.run, and
    # also used as the return type of postprocess.
    # NOTE(review): pydantic normally copies the class docstring and each
    # Field(description=...) into the generated JSON schema, so this text is
    # presumably part of what the LLM sees — confirm before rewording it.

    # The 1.0-5.0 / half-star range is only stated in the description here;
    # postprocess is what actually clamps and snaps the value.
    rating: float = Field(description="Star rating, 1.0 to 5.0, half-stars allowed")
    # Review body; postprocess strips surrounding whitespace from it.
    review: str = Field(description="The full review text in this user's voice")
    # Carried through unchanged by postprocess.
    reasoning: str = Field(description="One-sentence justification grounded in the user's persona signals")


@dataclass
class ImpersonationResult:
    """Outcome of a single impersonation run, as handed back by the agent.

    The first five fields have no defaults and must always be supplied. The
    reflection fields default to values meaning "no reflection took place".
    """

    # Predicted star rating.
    rating: float
    # Generated review text.
    review: str
    # Justification for the rating.
    reasoning: str
    # How many past reviews informed the generation.
    used_history_count: int
    # The naija_mode flag the run was invoked with.
    naija_mode: bool

    # --- Self-reflection metadata (Stage 3b) ---
    # How many critique cycles ran.
    reflection_iterations: int = 0
    # Whether the review was revised.
    reflection_refined: bool = False
    # Critique findings; a fresh list per instance.
    reflection_notes: list[str] = field(default_factory=list)


# ──────────────────────────────────────────────────────────────────────────────
# Workflow steps
# ──────────────────────────────────────────────────────────────────────────────

def select_similar_history(persona: UserPersona, item: ItemInput,
                           k: int = 3) -> list[dict]:
    """Choose at most ``k`` of the persona's past reviews as grounding samples.

    Reviews whose ``"domain"`` equals ``item.domain`` are taken first. If
    fewer than ``k`` of those exist, the remainder is filled from reviews in
    other domains. Inside each group the order of ``persona.history_samples``
    is kept as-is (the persona builder is expected to supply it
    most-recent-first), so "similar" here means "same domain, then recent".

    Returns a new list; an empty one when the persona has no history samples.
    """
    history = persona.history_samples
    if not history:
        return []

    # One pass over the history, bucketing by whether the domain matches.
    matching: list[dict] = []
    others: list[dict] = []
    for sample in history:
        bucket = matching if sample["domain"] == item.domain else others
        bucket.append(sample)

    picked = matching[:k]
    shortfall = k - len(picked)
    if shortfall > 0:
        # Not enough same-domain reviews: top up from the other domains.
        picked += others[:shortfall]
    return picked


def build_prompt(persona: UserPersona, item: ItemInput,
                 similar_history: list[dict]) -> str:
    """Render the impersonation prompt.

    The prompt is assembled from newline-joined sections:
      - an opening instruction line
      - THE USER YOU ARE SIMULATING: ``persona.to_prompt_block()``
      - ACTUAL REVIEWS THIS USER WROTE: one entry per element of
        ``similar_history`` (section omitted when the list is empty)
      - NEW ITEM TO REVIEW: domain and title, plus categories, description
        and crowd average when present
      - YOUR TASK: the fixed rating / review / reasoning instructions

    Args:
        persona: The user being simulated; only ``to_prompt_block()`` is
            called on it here.
        item: The item to review. ``description`` is truncated to 800
            characters; ``categories``, ``description`` and
            ``average_rating`` are skipped when falsy.
        similar_history: Past reviews as dicts with ``"rating"``,
            ``"domain"`` and ``"text"`` keys. Each text is truncated to 600
            characters.

    Returns:
        The complete prompt as a single string.

    Raises:
        KeyError: If a history entry lacks one of the keys above.
    """
    rule = "=" * 60

    parts = ["You are simulating a real Amazon reviewer. Generate a review that authentically reflects their voice, rating tendencies, and behavioral patterns.\n"]

    parts.append(rule)
    parts.append("THE USER YOU ARE SIMULATING")
    parts.append(rule)
    parts.append(persona.to_prompt_block())

    if similar_history:
        parts.append(rule)
        parts.append("ACTUAL REVIEWS THIS USER WROTE (study the voice carefully)")
        parts.append(rule)
        for i, h in enumerate(similar_history, 1):
            parts.append(f"\n[Sample {i}] {h['rating']}★ in {h['domain']}:")
            # Cap each sample so a single long review cannot dominate the prompt.
            parts.append(h["text"][:600])

    parts.append("\n" + rule)
    parts.append("NEW ITEM TO REVIEW")
    parts.append(rule)
    parts.append(f"Domain: {item.domain}")
    parts.append(f"Title: {item.title}")
    if item.categories:
        parts.append(f"Categories: {item.categories}")
    if item.description:
        parts.append(f"Description: {item.description[:800]}")
    # Truthiness check on purpose: None (unknown) and 0.0 are both left out.
    if item.average_rating:
        parts.append(f"Crowd average: {item.average_rating:.1f}★")

    parts.append("\n" + rule)
    parts.append("YOUR TASK")
    parts.append(rule)
    parts.append(
        "Produce three things.\n\n"
        "1. A RATING from 1.0 to 5.0. Predict it in TWO explicit steps:\n"
        "   Step A — The PRIOR: what does this user usually give? Look at their\n"
        "     rating distribution and average. This is your starting point.\n"
        "   Step B — The ITEM EVIDENCE: now read the NEW ITEM carefully. The\n"
        "     title, description, and any crowd average carry signal about\n"
        "     whether THIS specific item is a hit or a miss FOR THIS USER.\n"
        "     - A title or description with negative/lukewarm language\n"
        "       (e.g. 'capable of better', 'lost than found', 'disappointing')\n"
        "       pulls the rating DOWN — even for a generous user.\n"
        "     - Rich, substantive material that fits the user's stated tastes\n"
        "       pulls the rating UP — even for a critical user. A critical\n"
        "       reviewer still gives 4-5★ to things that genuinely engage them.\n"
        "     - Do not assume 'critical tone' means the user dislikes things;\n"
        "       critical users rate highly when the material rewards their\n"
        "       attention. Do not assume a generous user gives 5★ to\n"
        "       everything; they still give 4★ to mild disappointments.\n"
        "   Final rating = the PRIOR adjusted by the ITEM EVIDENCE. If the\n"
        "   item evidence is neutral or absent, stay near the prior. If the\n"
        "   item evidence clearly points somewhere, MOVE toward it.\n\n"
        "2. A REVIEW in this user's voice — match their length, tone,\n"
        "   vocabulary, and quirks visible in their writing samples\n"
        "   (capitalization, sentence structure, how they signal approval or\n"
        "   disapproval). The review's sentiment MUST be consistent with the\n"
        "   rating you chose.\n\n"
        "3. A one-sentence REASONING explaining the rating. It MUST cite BOTH\n"
        "   (a) the persona prior AND (b) the specific item evidence that\n"
        "   adjusted it — e.g. 'This user averages 4.8★, but the title signals\n"
        "   \"capable of better\", a mild letdown, so 4★ not 5★.'"
    )

    return "\n".join(parts)


def postprocess(output: GeneratedReview, persona: UserPersona,
                naija_mode: bool, llm: LLMClient) -> GeneratedReview:
    """Normalise the rating and optionally restyle the review text.

    The rating is bounded to the 1.0-5.0 range and then snapped to a
    half-star step. The review is stripped of surrounding whitespace and,
    when ``naija_mode`` is set and the review is non-empty, passed through
    ``naija_style_review``. A failure in that style transfer is logged as a
    warning and the stripped original text is kept. ``persona`` is accepted
    but not read here; ``reasoning`` is passed through untouched.

    Returns a new ``GeneratedReview``; ``output`` itself is not modified.
    """
    # Upper bound first, then lower bound.
    bounded = min(5.0, output.rating)
    bounded = max(1.0, bounded)
    # Double, round to an integer, halve: lands on a multiple of 0.5.
    # round() sends exact ties to the even integer, so 1.25 -> 1.0 but
    # 1.75 -> 2.0.
    half_star = round(bounded * 2) / 2

    text = output.review.strip()
    if naija_mode and text:
        try:
            text = naija_style_review(text, llm=llm)
        except Exception as e:
            # Best-effort: styling must never lose the generated review.
            log.warning(f"Naija style transfer failed; returning original. ({e})")

    return GeneratedReview(rating=half_star, review=text, reasoning=output.reasoning)


# ──────────────────────────────────────────────────────────────────────────────
# Agent
# ──────────────────────────────────────────────────────────────────────────────

class ImpersonationAgent:
    """The Task A agent.

    Usage:
        agent = ImpersonationAgent()
        result = agent.run(persona, item, naija_mode=False)
        # result.rating, result.review, result.reasoning
    """

    def __init__(self, llm: LLMClient | None = None,
                 history_samples_k: int = 3,
                 use_reflection: bool = True,
                 reflection_max_iterations: int = 2):
        """Configure the agent.

        Args:
            llm: Client used for every LLM call. A default ``LLMClient()``
                is created only when this is ``None``.
            history_samples_k: Maximum number of past reviews passed to the
                prompt as writing samples.
            use_reflection: Whether ``run`` performs the critique/refine
                step after the first generation.
            reflection_max_iterations: Forwarded to ``reflect_on_review`` as
                ``max_iterations``.
        """
        # Explicit None check: `llm or LLMClient()` would silently replace a
        # caller-supplied client that happens to evaluate as falsy.
        self.llm = LLMClient() if llm is None else llm
        self.history_samples_k = history_samples_k
        self.use_reflection = use_reflection
        self.reflection_max_iterations = reflection_max_iterations

    def run(self, persona: UserPersona, item: ItemInput,
            naija_mode: bool = False) -> ImpersonationResult:
        """Generate a rating and review for ``item`` in ``persona``'s voice.

        Runs history selection, prompt building, the structured LLM call,
        optional self-reflection, and post-processing, in that order.

        Args:
            persona: The user to impersonate.
            item: The item to be reviewed.
            naija_mode: When true, post-processing applies the Naija style
                layer to the final review text.

        Returns:
            An ``ImpersonationResult`` with the post-processed rating and
            review, the reasoning from the first LLM call, the number of
            history samples used, and the reflection metadata (left at its
            defaults when reflection is disabled).

        Exceptions raised by the LLM client or by the reflection step are
        not caught here and propagate to the caller.
        """
        # Step 1: select grounding history
        similar = select_similar_history(persona, item, k=self.history_samples_k)
        log.info("Selected %d similar history items for grounding", len(similar))

        # Step 2: build prompt
        prompt = build_prompt(persona, item, similar)

        # Step 3: LLM call with structured output
        log.info("Calling LLM for impersonation of user %s on item %s",
                 persona.user_id, item.parent_asin)
        raw_output = self.llm.structured(
            prompt,
            schema=GeneratedReview,
            model="reasoning",
            system="You are an expert behavioral simulator. You write reviews exactly as the specified user would write them, matching their tone, length, rating patterns, and quirks.",
        )

        # Step 4: self-reflection — critique + refine (Stage 3b)
        reflection_iterations = 0
        reflection_refined = False
        reflection_notes: list[str] = []
        rating, review = raw_output.rating, raw_output.review
        if self.use_reflection:
            log.info("Running self-reflection on generated review")
            rating, review, trace = reflect_on_review(
                self.llm, persona,
                item_title=item.title, item_domain=item.domain,
                rating=rating, review=review,
                max_iterations=self.reflection_max_iterations,
            )
            reflection_iterations = trace.iterations_run
            reflection_refined = trace.refined
            # Copy so the result does not share a list with the trace.
            reflection_notes = list(trace.critiques)

        # NOTE(review): reasoning is taken from the first LLM call even when
        # reflection changed the rating or review, so it can describe the
        # pre-reflection rating — confirm this is intended.
        refined_output = GeneratedReview(
            rating=rating, review=review, reasoning=raw_output.reasoning,
        )

        # Step 5: postprocess (clamp rating, optional naija style)
        final = postprocess(refined_output, persona, naija_mode=naija_mode, llm=self.llm)

        return ImpersonationResult(
            rating=final.rating,
            review=final.review,
            reasoning=final.reasoning,
            used_history_count=len(similar),
            naija_mode=naija_mode,
            reflection_iterations=reflection_iterations,
            reflection_refined=reflection_refined,
            reflection_notes=reflection_notes,
        )