"""Post-episode reflection generator.

After each episode the agent calls `generate_reflection()`.
Qwen3 analyses the action log and outcome, then writes a concrete lesson
that gets stored in AgentMemory and injected into future prompts.

This is the core **learning mechanism**:
  Episode N fails  →  reflection generated  →  stored in memory
  Episode N+1      →  reflection in prompt  →  agent avoids past mistake
  ...
  Win rate rises measurably over episodes.
"""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Any, Dict, List

# Put the parent of the directory containing this file at the front of sys.path.
# This must run before the `bedrock_model` import that follows; presumably that
# module lives in _ROOT — confirm against the repository layout.
_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(_ROOT))

from bedrock_model import invoke_qwen

# ---------------------------------------------------------------------------
# Reflection prompt
# ---------------------------------------------------------------------------

# System prompt shared by generate_reflection() and generate_success_reflection();
# both pass it as `system=` to invoke_qwen. It restricts the model to the
# fake-account signals and actions listed in the text and asks for 2-3 plain
# sentences with no headers or bullet points.
# The backslash right after the opening quotes and the one before the closing
# quotes are line continuations, so the string has no leading or trailing newline.
_REFLECTION_SYSTEM = """\
You are a senior detective reviewing a FAKE INSTAGRAM ACCOUNT detection case debrief.
This environment detects coordinated fake social media accounts — NOT financial fraud.

Signals available in this environment (use ONLY these):
  • comment_repeat_score > 0.6  → copy-paste spam comments (gang: 0.6-0.9, real: 0.0-0.08)
  • shared_ip_count > 5         → shares IP subnet (all 10 gang members have count=9)
  • photo_reuse_score > 0.5     → stolen profile photos
  • bio_template_score > 0.4    → copy-paste bio text
  • fake_risk_score > 0.75      → high-confidence gang member (composite score)
  • hub_legitimacy_score > 0.70 → celebrity account, do NOT flag
  • After FLAG: visible neighbors auto-become SUSPECT (priority targets)

Available actions: INSPECT (1 step, reveals profile), INVESTIGATE_NETWORK (2 steps, 2-hop expand),
FLAG, UNFLAG, SUBMIT.

CRITICAL: Write lessons about fake social media signals and INSPECT/INVESTIGATE_NETWORK strategy
ONLY. Do NOT mention transactions, financial transfers, banking, or any concepts not listed above.
Output only the lesson text — no headers, no bullet points, just 2-3 plain sentences.\
"""


def generate_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    won: bool,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Ask Qwen3 for a concrete lesson drawn from one completed episode.

    A debrief prompt is built from the episode metadata and the first 20
    entries of ``action_log`` and sent to ``invoke_qwen`` together with
    ``_REFLECTION_SYSTEM``. If the model call raises, or the model returns
    only whitespace, the canned text from ``_rule_based_reflection`` is
    returned instead so callers never receive an empty lesson.

    Args:
        task: Task difficulty label; upper-cased in the prompt.
        action_log: Human-readable action descriptions in execution order.
            May be empty.
        final_message: Result line emitted at the end of the episode.
        won: Whether the episode counted as a success.
        steps_used: Number of steps consumed.
        max_steps: Step budget for the episode.
        episode_num: Episode index, shown in the prompt header.

    Returns:
        A short, whitespace-stripped lesson string (never empty).
    """
    import logging  # local import: keeps this block self-contained

    outcome = "SUCCESS" if won else "FAILURE"

    # Only the head of the log is sent, to keep the prompt small; the model is
    # told how many entries were dropped.
    preview_limit = 20
    if action_log:
        log_preview = "\n".join(
            f"  {i}. {s}"
            for i, s in enumerate(action_log[:preview_limit], start=1)
        )
        hidden = len(action_log) - preview_limit
        if hidden > 0:
            log_preview += f"\n  … [{hidden} more steps]"
    else:
        # An explicit marker is clearer to the model than a blank section.
        log_preview = "  (no actions recorded)"

    prompt = f"""\
FAKE INSTAGRAM ACCOUNT DETECTION — Episode {episode_num}
Task difficulty: {task.upper()}
Outcome: {outcome}
Steps used: {steps_used}/{max_steps}
Result: {final_message}

AVAILABLE SIGNALS (reference for your lesson):
  comment_repeat_score > 0.6 | shared_ip_count > 5 | photo_reuse_score > 0.5
  fake_risk_score > 0.75 | hub_legitimacy_score > 0.70 (celebrity, skip)
  After FLAG → neighbors become SUSPECT (inspect them immediately)
  INVESTIGATE_NETWORK on a flagged account reveals their 2-hop gang cluster

INVESTIGATION LOG:
{log_preview}

Write a 2-3 sentence lesson for your future self based on this case.
Focus on: which of the above signals were most diagnostic, whether using
INVESTIGATE_NETWORK after the first FLAG would have helped, and how to
better allocate the step budget. Be concrete and actionable.\
"""

    try:
        reflection = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=180,
            temperature=0.6,
        )
        lesson = reflection.strip()
    except Exception as exc:
        # Best-effort by design: a model outage must not break the episode
        # loop, but the failure is recorded so canned lessons are explainable.
        logging.getLogger(__name__).warning(
            "Reflection model call failed for episode %s (%s); "
            "using rule-based fallback",
            episode_num,
            exc,
        )
        lesson = ""

    if lesson:
        return lesson
    # Model unavailable or returned nothing usable.
    return _rule_based_reflection(won, steps_used, max_steps, final_message)


def _rule_based_reflection(
    won: bool, steps_used: int, max_steps: int, final_message: str
) -> str:
    """Return a canned lesson for when no model-written reflection is available.

    One of four fixed texts is chosen:
      * loss whose result message contains ``Recall=0.00`` or ``TP=0``,
      * any other loss,
      * win that used less than 60% of the step budget,
      * any other win.
    ``final_message`` is only examined for losses.
    """
    if not won:
        # Markers in the result message that indicate nothing was found.
        found_nothing = any(
            marker in final_message for marker in ("Recall=0.00", "TP=0")
        )
        if found_nothing:
            return (
                "Zero gang members found — the starting accounts were all real. "
                "After inspecting 3-4 low-signal accounts, use INVESTIGATE_NETWORK to jump to a different part of the graph. "
                "Gang members have photo_reuse > 0.5 and bio_template > 0.4 simultaneously."
            )
        return (
            "Partial recall — found some gang members but missed others. "
            "After flagging the first gang member, immediately use INVESTIGATE_NETWORK: gang members follow each other heavily. "
            "Don't waste steps inspecting low-signal accounts one-by-one."
        )

    # Wins: distinguish efficient runs from ones that nearly exhausted the budget.
    efficiency_cutoff = max_steps * 0.6
    if steps_used < efficiency_cutoff:
        return (
            "Early INVESTIGATE_NETWORK calls efficiently expanded the graph to all gang members. "
            "Flagging accounts with both high photo_reuse AND bio_template scores maintained precision. "
            "Submitting with budget remaining earned an efficiency bonus."
        )
    return (
        "Found all gang members but used most of the step budget. "
        "Look for intra-gang follow density earlier — once you find one member, INVESTIGATE_NETWORK immediately. "
        "Flag faster to leave budget for verification."
    )


# ---------------------------------------------------------------------------
# Post-win reflection (reinforces what worked)
# ---------------------------------------------------------------------------

def generate_success_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Generate a reinforcement reflection after a WIN to capture what worked.

    A prompt containing the first 15 entries of ``action_log`` is sent to
    ``invoke_qwen`` with ``_REFLECTION_SYSTEM``. If the call raises, or the
    model returns only whitespace, the winning-case text from
    ``_rule_based_reflection`` is returned instead.

    Args:
        task: Task difficulty label; upper-cased in the prompt.
        action_log: Human-readable action descriptions in execution order.
            May be empty.
        final_message: Result line emitted at the end of the episode.
        steps_used: Number of steps consumed.
        max_steps: Step budget for the episode.
        episode_num: Episode index, shown in the prompt header.

    Returns:
        A short, whitespace-stripped reflection string (never empty).
    """
    import logging  # local import: keeps this block self-contained

    preview_limit = 15
    if action_log:
        log_preview = "\n".join(
            f"  {i}. {s}"
            for i, s in enumerate(action_log[:preview_limit], start=1)
        )
        hidden = len(action_log) - preview_limit
        if hidden > 0:
            # Same truncation marker generate_reflection() uses, so the model
            # knows the log it sees is incomplete.
            log_preview += f"\n  … [{hidden} more steps]"
    else:
        # An explicit marker is clearer to the model than a blank section.
        log_preview = "  (no actions recorded)"

    prompt = f"""\
SUCCESSFUL CASE — Episode {episode_num}
Task: {task.upper()} | Steps used: {steps_used}/{max_steps}
Result: {final_message}

INVESTIGATION LOG (first 15 steps):
{log_preview}

In 2-3 sentences, describe the specific strategy that led to success.
What did you do right? What should you repeat in future cases?\
"""

    try:
        lesson = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=150,
            temperature=0.5,
        ).strip()
    except Exception as exc:
        # Best-effort by design: fall back rather than fail the episode loop,
        # but record why the canned text was used.
        logging.getLogger(__name__).warning(
            "Success-reflection model call failed for episode %s (%s); "
            "using rule-based fallback",
            episode_num,
            exc,
        )
        lesson = ""

    if lesson:
        return lesson
    # Model unavailable or returned nothing usable.
    return _rule_based_reflection(True, steps_used, max_steps, final_message)