Spaces:
Running
Running
"""Self-reflection — the act → critique → revise loop.

This is what makes NaijaTaste AI an agent rather than a one-shot pipeline.
After a first-pass output, the agent critiques its own work against the
persona and, if the critique finds problems, revises.

Two public entry points:
    reflect_on_review(...)          — Task A: critique + refine a generated review
    reflect_on_recommendations(...) — Task B: critique + refine a top-N list

Each runs at most `max_iterations` revise cycles (default 2). The loop
stops early once the critique passes (no blocking issues). Every cycle is
logged so the paper can report how often refinement triggered and what it
changed.

Reference: Madaan et al. 2023, "Self-Refine: Iterative Refinement with
Self-Feedback"; Shinn et al. 2023, "Reflexion".
"""
| from __future__ import annotations | |
| import logging | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| from core.llm import LLMClient | |
| from core.persona import UserPersona | |
| log = logging.getLogger(__name__) | |
# ──────────────────────────────────────────────────────────────────────────────
# Critique schemas
# ──────────────────────────────────────────────────────────────────────────────
class ReviewCritique(BaseModel):
    """The critique LLM's assessment of a generated review (Task A).

    Holds three boolean checks plus a free-text ``issues`` field. The
    ``description`` strings are field metadata attached via pydantic ``Field``.
    """

    rating_text_consistent: bool = Field(
        description="True if the review text matches the star rating "
                    "(e.g. a 4-star review doesn't read like a 2-star pan)"
    )
    voice_match: bool = Field(
        # The dash here was a mis-encoded character in the description text;
        # restored to an em dash so the field metadata reads cleanly.
        description="True if the review sounds like THIS user — their length, "
                    "register, vocabulary, and quirks"
    )
    on_topic: bool = Field(
        description="True if the review is about the actual item, not generic filler"
    )
    issues: str = Field(
        description="If any check failed, a specific 1-2 sentence description of what "
                    "to fix. If all passed, the string 'none'."
    )

    def passed(self) -> bool:
        """Return True only when all three boolean checks are True.

        This is a regular method, not a property: call it as
        ``critique.passed()``. A bare ``critique.passed`` is a bound-method
        object and is always truthy.
        """
        return self.rating_text_consistent and self.voice_match and self.on_topic
class RecommendationCritique(BaseModel):
    """Structured verdict on a top-N recommendation list (Task B).

    Four boolean quality checks plus a free-text ``issues`` field that says
    what to fix when any check fails.
    """

    titles_are_real: bool = Field(
        description=(
            "True if the recommended items look like real products, "
            "not review-headline fragments"
        ),
    )
    well_matched: bool = Field(
        description="True if the picks genuinely fit the persona's tastes",
    )
    reasoning_grounded: bool = Field(
        description=(
            "True if each pick's reasoning cites specific persona signals, "
            "not generic filler"
        ),
    )
    diverse_enough: bool = Field(
        description="True if the list isn't 10 near-identical items",
    )
    issues: str = Field(
        description=(
            "If any check failed, a specific 1-2 sentence description of what "
            "to fix. If all passed, the string 'none'."
        ),
    )

    def passed(self) -> bool:
        """Report whether every one of the four checks came back True.

        Defined as a plain method, so it has to be invoked
        (``critique.passed()``) to obtain the boolean.
        """
        checks = (
            self.titles_are_real,
            self.well_matched,
            self.reasoning_grounded,
            self.diverse_enough,
        )
        return all(checks)
# ──────────────────────────────────────────────────────────────────────────────
# Reflection trace (for logging / paper reporting)
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class ReflectionTrace:
    """Record of what the reflection loop did — useful for the paper.

    Must be a dataclass: ``critiques`` is declared with
    ``field(default_factory=list)``, which only produces a fresh per-instance
    list when the class is processed by ``@dataclass``. Without the decorator
    the attribute is a shared ``dataclasses.Field`` object and
    ``trace.critiques.append(...)`` raises ``AttributeError``.
    """

    iterations_run: int = 0  # critique passes that completed
    critiques: list[str] = field(default_factory=list)  # issues found each cycle
    passed_final: bool = False  # True once a critique pass reported no failures
    refined: bool = False  # True if at least one revision happened
# ──────────────────────────────────────────────────────────────────────────────
# Task A — review reflection
# ──────────────────────────────────────────────────────────────────────────────
def _critique_review(llm: LLMClient, persona: UserPersona,
                     item_title: str, item_domain: str,
                     rating: float, review: str) -> ReviewCritique:
    """Run one critique pass over a generated review.

    Builds an editor-style prompt from the persona block, the item's domain
    and title, and the candidate rating/review, then requests a structured
    ``ReviewCritique`` from the LLM.

    Args:
        llm: Client whose ``structured(prompt, schema, model=..., system=...)``
            method is called exactly once.
        persona: Supplies ``to_prompt_block()``, ``avg_review_length`` and
            ``tone`` for the prompt.
        item_title: Title of the reviewed item, inserted verbatim.
        item_domain: Domain label of the item, inserted verbatim.
        rating: Star rating of the candidate review.
        review: Text of the candidate review.

    Returns:
        The value returned by ``llm.structured`` for the ``ReviewCritique``
        schema.

    Exceptions raised by ``llm.structured`` or the persona accessors are not
    caught here; they propagate to the caller.
    """
    rule = "=" * 55  # section divider used throughout the prompt

    prompt = (
        # The dash below was a mis-encoded character in the prompt text;
        # restored to an em dash so the model is not sent garbled input.
        "You are a strict editor checking whether an AI-generated review "
        "faithfully imitates a specific user. Be critical — your job is to "
        "catch problems, not to be nice.\n\n"
        f"{rule}\n"
        "THE USER\n"
        f"{rule}\n"
        f"{persona.to_prompt_block()}\n\n"
        f"{rule}\n"
        "ITEM REVIEWED\n"
        f"{rule}\n"
        f"Domain: {item_domain}\n"
        f"Title: {item_title}\n\n"
        f"{rule}\n"
        "THE GENERATED REVIEW (check this)\n"
        f"{rule}\n"
        f"Rating: {rating}\u2605\n"
        f"Review: {review}\n\n"
        f"{rule}\n"
        "YOUR CHECKS\n"
        f"{rule}\n"
        f"1. rating_text_consistent: Does the review TEXT match the {rating}-star "
        "rating? A 4-5 star review should read positive; a 1-2 star review should "
        "read negative; a 3 should read mixed.\n"
        "2. voice_match: Does it sound like THIS user? Check their typical review "
        f"length ({persona.avg_review_length:.0f} words avg), tone ({persona.tone}), "
        "and quirks. A terse user given a long essay = fail. A user who writes in "
        "all-caps given lowercase = fail.\n"
        "3. on_topic: Is the review about the actual item, or is it generic filler "
        "that could apply to anything?\n\n"
        "If any check fails, describe specifically what to fix in 'issues'. "
        "If all pass, set 'issues' to 'none'."
    )
    return llm.structured(
        prompt, ReviewCritique, model="reasoning",
        system="You are a meticulous editor. Catch every inconsistency.",
    )
def _refine_review(llm: LLMClient, persona: UserPersona,
                   item_title: str, item_domain: str,
                   prev_rating: float, prev_review: str,
                   critique_issues: str) -> tuple[float, str]:
    """Regenerate a review given critique feedback.

    Sends the persona block, the item, the previous attempt and the editor's
    feedback to the LLM and asks for a rewritten rating and review.

    Args:
        llm: Client whose ``structured(...)`` method is called exactly once.
        persona: Supplies ``to_prompt_block()`` for the prompt.
        item_title: Title of the reviewed item.
        item_domain: Domain label of the item.
        prev_rating: Rating from the attempt being revised.
        prev_review: Review text from the attempt being revised.
        critique_issues: The critique's description of what to fix.

    Returns:
        ``(rating, review)``. The rating is clamped to the 1.0-5.0 range that
        the response schema describes.

    Exceptions raised by ``llm.structured`` propagate to the caller.
    """
    class RefinedReview(BaseModel):
        # Response schema for the rewrite request.
        rating: float = Field(description="Star rating 1.0-5.0")
        review: str = Field(description="The improved review in the user's voice")

    rule = "=" * 55  # section divider used in the prompt

    prompt = (
        "You previously wrote a review imitating a specific user, but an editor "
        "found problems. Rewrite the review to fix them.\n\n"
        f"{rule}\n"
        "THE USER\n"
        f"{rule}\n"
        f"{persona.to_prompt_block()}\n\n"
        f"ITEM: [{item_domain}] {item_title}\n\n"
        "YOUR PREVIOUS ATTEMPT:\n"
        f" Rating: {prev_rating}\u2605\n"
        f" Review: {prev_review}\n\n"
        # The dash below was a mis-encoded character; restored to an em dash.
        "EDITOR'S FEEDBACK — fix these specific issues:\n"
        f" {critique_issues}\n\n"
        "Rewrite the review addressing the feedback. Keep what worked; fix what "
        "the editor flagged. Stay in the user's authentic voice."
    )
    result = llm.structured(
        prompt, RefinedReview, model="reasoning",
        system="You are an expert behavioral simulator revising your work based on feedback.",
    )
    # The schema only *describes* the 1.0-5.0 range; nothing enforces it, so
    # hold an out-of-range model answer to the valid star scale.
    bounded_rating = min(5.0, max(1.0, result.rating))
    return bounded_rating, result.review
def reflect_on_review(llm: LLMClient, persona: UserPersona,
                      item_title: str, item_domain: str,
                      rating: float, review: str,
                      max_iterations: int = 2) -> tuple[float, str, ReflectionTrace]:
    """Critique a generated review and refine it if needed.

    The loop:
        1. Critique the current review.
        2. If it passes → stop, return as-is.
        3. If it fails → refine using the critique, then critique again.
        4. Stop after ``max_iterations`` even if still imperfect.

    If the last allowed cycle ends in a revision, that revision is returned
    without a further critique, so ``trace.passed_final`` stays False.

    Args:
        llm: LLM client forwarded to the critique and refine helpers.
        persona: Persona of the user being imitated.
        item_title: Title of the reviewed item.
        item_domain: Domain label of the item.
        rating: First-pass star rating.
        review: First-pass review text.
        max_iterations: Maximum number of critique cycles; a value of 0 or
            less returns the inputs unchanged.

    Returns:
        ``(final_rating, final_review, trace)``.

    Failures in the critique or refine step are logged and end the loop; the
    most recent successfully produced rating/review is returned.
    """
    trace = ReflectionTrace()
    cur_rating, cur_review = rating, review

    for i in range(max_iterations):
        try:
            critique = _critique_review(llm, persona, item_title, item_domain,
                                        cur_rating, cur_review)
        except Exception as exc:
            # Deliberate best-effort: reflection is optional, so a failed
            # critique keeps whatever we already have.
            log.warning("Review critique failed (%s); keeping current review",
                        type(exc).__name__)
            break

        trace.iterations_run = i + 1

        # `passed` is a method and must be called. Testing the bare attribute
        # checks a bound-method object, which is always truthy, so the loop
        # would declare success on the first cycle and never refine.
        if critique.passed():
            trace.critiques.append("passed")
            trace.passed_final = True
            log.info("Review reflection: passed on iteration %d", i + 1)
            break

        trace.critiques.append(critique.issues)
        log.info("Review reflection iter %d: issues = %s", i + 1, critique.issues)

        # Refine
        try:
            cur_rating, cur_review = _refine_review(
                llm, persona, item_title, item_domain,
                cur_rating, cur_review, critique.issues,
            )
            trace.refined = True
        except Exception as exc:
            # The tuple assignment above did not happen, so cur_rating and
            # cur_review still hold the pre-refine values.
            log.warning("Review refine failed (%s); keeping pre-refine review",
                        type(exc).__name__)
            break

    return cur_rating, cur_review, trace
# ──────────────────────────────────────────────────────────────────────────────
# Task B — recommendation reflection
# ──────────────────────────────────────────────────────────────────────────────
def _critique_recommendations(llm: LLMClient, persona: UserPersona,
                              recommendations: list[dict],
                              mode: str) -> RecommendationCritique:
    """Run one critique pass over a recommendation list.

    Renders each recommendation as a numbered entry, embeds the list and the
    persona block in an auditor-style prompt, and requests a structured
    ``RecommendationCritique`` from the LLM.

    Args:
        llm: Client whose ``structured(...)`` method is called exactly once.
        persona: Supplies ``to_prompt_block()`` for the prompt.
        recommendations: Dicts that must each contain the keys ``'domain'``,
            ``'title'`` and ``'reasoning'``; a missing key raises ``KeyError``.
        mode: Label inserted into the prompt's section header.

    Returns:
        The value returned by ``llm.structured`` for the
        ``RecommendationCritique`` schema.

    Exceptions are not caught here; they propagate to the caller.
    """
    rule = "=" * 55  # section divider used throughout the prompt

    # Numbered from 1 so the critique can refer to entries as "#4", "#7", ...
    rec_block = "\n".join(
        f" #{position} [{rec['domain']}] {rec['title']}\n Why: {rec['reasoning']}"
        for position, rec in enumerate(recommendations, start=1)
    )

    prompt = (
        # Both dashes in this prompt were mis-encoded characters; restored to
        # em dashes so the model is not sent garbled input.
        "You are a strict reviewer checking the quality of a recommendation "
        "list. Be critical — catch problems.\n\n"
        f"{rule}\n"
        "THE USER\n"
        f"{rule}\n"
        f"{persona.to_prompt_block()}\n\n"
        f"{rule}\n"
        f"THE RECOMMENDATIONS (mode: {mode})\n"
        f"{rule}\n"
        f"{rec_block}\n\n"
        f"{rule}\n"
        "YOUR CHECKS\n"
        f"{rule}\n"
        "1. titles_are_real: Do these look like real product titles? FAIL if any "
        "are review-headline fragments like 'Fast paced great read' or 'An "
        "enjoyable read' or 'Loved it!'.\n"
        "2. well_matched: Do the picks genuinely fit this user's tastes?\n"
        "3. reasoning_grounded: Does each 'Why' cite specific persona signals, "
        "or is it generic filler?\n"
        "4. diverse_enough: Is there real variety, or are these 10 near-identical "
        "items?\n\n"
        "If any check fails, describe specifically what to fix in 'issues' "
        "(e.g. 'items #4, #7, #9 have review-headline titles — replace them'). "
        "If all pass, set 'issues' to 'none'."
    )
    return llm.structured(
        prompt, RecommendationCritique, model="reasoning",
        system="You are a meticulous recommendation-quality auditor.",
    )
def reflect_on_recommendations(llm: LLMClient, persona: UserPersona,
                               recommendations: list[dict], mode: str,
                               refine_fn,
                               max_iterations: int = 2,
                               ) -> tuple[list[dict], ReflectionTrace]:
    """Critique a recommendation list and refine it if needed.

    Unlike review reflection, refinement here can't just rewrite text — it
    needs to re-run reranking with feedback. So the caller passes a
    ``refine_fn(issues: str) -> list[dict]`` that re-runs the rerank with the
    critique injected, and this function orchestrates the loop.

    Args:
        llm: LLM client forwarded to the critique helper.
        persona: Persona the recommendations are meant to fit.
        recommendations: First-pass recommendation list.
        mode: Label forwarded to the critique prompt.
        refine_fn: Callable taking the critique's ``issues`` string and
            returning a new list. A falsy result (e.g. an empty list) is
            ignored: the current list is kept and the loop continues.
        max_iterations: Maximum number of critique cycles; a value of 0 or
            less returns the input list unchanged.

    Returns:
        ``(final_recommendations, trace)``.

    Failures in the critique step or in ``refine_fn`` are logged and end the
    loop; the most recent list is returned.
    """
    trace = ReflectionTrace()
    cur_recs = recommendations

    for i in range(max_iterations):
        try:
            critique = _critique_recommendations(llm, persona, cur_recs, mode)
        except Exception as exc:
            # Deliberate best-effort: reflection is optional, so a failed
            # critique keeps the list we already have.
            log.warning("Recommendation critique failed (%s); keeping current list",
                        type(exc).__name__)
            break

        trace.iterations_run = i + 1

        # `passed` is a method and must be called. Testing the bare attribute
        # checks a bound-method object, which is always truthy, so the loop
        # would declare success on the first cycle and never refine.
        if critique.passed():
            trace.critiques.append("passed")
            trace.passed_final = True
            log.info("Recommendation reflection: passed on iteration %d", i + 1)
            break

        trace.critiques.append(critique.issues)
        log.info("Recommendation reflection iter %d: issues = %s",
                 i + 1, critique.issues)

        # Refine via the caller-supplied function
        try:
            refined = refine_fn(critique.issues)
            if refined:
                cur_recs = refined
                trace.refined = True
        except Exception as exc:
            log.warning("Recommendation refine failed (%s); keeping pre-refine list",
                        type(exc).__name__)
            break

    return cur_recs, trace