Spaces:
Running
Running
File size: 6,536 Bytes
50f71a7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | """Post-episode reflection generator.
After each episode the agent calls `generate_reflection()`.
Qwen3 analyses the action log and outcome, then writes a concrete lesson
that gets stored in AgentMemory and injected into future prompts.
This is the core **learning mechanism**:
Episode N fails → reflection generated → stored in memory
Episode N+1 → reflection in prompt → agent avoids past mistake
...
Win rate rises measurably over episodes.
"""
from __future__ import annotations
import sys
from pathlib import Path
from typing import Any, Dict, List
_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(_ROOT))
from bedrock_model import invoke_qwen
# ---------------------------------------------------------------------------
# Reflection prompt
# ---------------------------------------------------------------------------
# System prompt passed as `system=` to every invoke_qwen() call in this module.
# It pins the model to the fake-account domain (signal names, thresholds and
# available actions) and restricts the answer to 2-3 plain sentences.
# The bullets, arrows and dashes are real Unicode characters; mis-decoded
# UTF-8 residue must not appear here because the text is sent to the model
# verbatim.
_REFLECTION_SYSTEM = """\
You are a senior detective reviewing a FAKE INSTAGRAM ACCOUNT detection case debrief.
This environment detects coordinated fake social media accounts — NOT financial fraud.
Signals available in this environment (use ONLY these):
• comment_repeat_score > 0.6 → copy-paste spam comments (gang: 0.6-0.9, real: 0.0-0.08)
• shared_ip_count > 5 → shares IP subnet (all 10 gang members have count=9)
• photo_reuse_score > 0.5 → stolen profile photos
• bio_template_score > 0.4 → copy-paste bio text
• fake_risk_score > 0.75 → high-confidence gang member (composite score)
• hub_legitimacy_score > 0.70 → celebrity account, do NOT flag
• After FLAG: visible neighbors auto-become SUSPECT (priority targets)
Available actions: INSPECT (1 step, reveals profile), INVESTIGATE_NETWORK (2 steps, 2-hop expand),
FLAG, UNFLAG, SUBMIT.
CRITICAL: Write lessons about fake social media signals and INSPECT/INVESTIGATE_NETWORK strategy
ONLY. Do NOT mention transactions, financial transfers, banking, or any concepts not listed above.
Output only the lesson text — no headers, no bullet points, just 2-3 plain sentences.\
"""
def generate_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    won: bool,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Ask Qwen3 to generate a concrete lesson from one completed episode.

    Args:
        task: Task difficulty label; upper-cased before it goes into the prompt.
        action_log: Per-step action descriptions. Only the first 20 entries are
            included in the prompt; the rest are summarised as a count.
        final_message: Result text of the episode, inserted into the prompt and
            also used to pick the fallback lesson.
        won: Whether the episode was a success.
        steps_used: Number of steps the agent consumed.
        max_steps: Step budget of the episode.
        episode_num: Episode index shown in the prompt header.

    Returns:
        A short, non-empty lesson (2-3 sentences) suitable for memory storage.
        If the model call raises or yields only whitespace, the rule-based
        fallback lesson is returned instead.
    """
    outcome = "SUCCESS" if won else "FAILURE"

    shown = action_log[:20]
    log_preview = "\n".join(f" {i}. {s}" for i, s in enumerate(shown, start=1))
    hidden = len(action_log) - len(shown)
    if hidden > 0:
        log_preview += f"\n … [{hidden} more steps]"
    if not log_preview:
        # Keep the log section meaningful when the episode recorded no actions.
        log_preview = " (no actions recorded)"

    prompt = f"""\
FAKE INSTAGRAM ACCOUNT DETECTION — Episode {episode_num}
Task difficulty: {task.upper()}
Outcome: {outcome}
Steps used: {steps_used}/{max_steps}
Result: {final_message}
AVAILABLE SIGNALS (reference for your lesson):
comment_repeat_score > 0.6 | shared_ip_count > 5 | photo_reuse_score > 0.5
fake_risk_score > 0.75 | hub_legitimacy_score > 0.70 (celebrity, skip)
After FLAG → neighbors become SUSPECT (inspect them immediately)
INVESTIGATE_NETWORK on a flagged account reveals their 2-hop gang cluster
INVESTIGATION LOG:
{log_preview}
Write a 2-3 sentence lesson for your future self based on this case.
Focus on: which of the above signals were most diagnostic, whether using
INVESTIGATE_NETWORK after the first FLAG would have helped, and how to
better allocate the step budget. Be concrete and actionable.\
"""
    try:
        reflection = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=180,
            temperature=0.6,
        ).strip()
    except Exception:
        # Deliberately broad: reflection is best-effort and must never abort
        # the episode loop. Record the failure instead of hiding it.
        import logging

        logging.getLogger(__name__).warning(
            "Reflection model call failed; using rule-based fallback",
            exc_info=True,
        )
        reflection = ""

    if not reflection:
        # A blank lesson would be stored and injected into future prompts.
        return _rule_based_reflection(won, steps_used, max_steps, final_message)
    return reflection
def _rule_based_reflection(
    won: bool, steps_used: int, max_steps: int, final_message: str
) -> str:
    """Return a canned fallback lesson when the model cannot provide one.

    The lesson is selected from four fixed texts:

    * win using fewer than 60% of the step budget,
    * win using 60% or more of the budget,
    * loss whose ``final_message`` contains ``"Recall=0.00"`` or ``"TP=0"``,
    * any other loss.

    Args:
        won: Whether the episode was a success.
        steps_used: Number of steps the agent consumed.
        max_steps: Step budget of the episode.
        final_message: Result text of the episode; only inspected for losses.

    Returns:
        A three-sentence lesson string.
    """
    if won and steps_used < max_steps * 0.6:
        return (
            "Early INVESTIGATE_NETWORK calls efficiently expanded the graph to all gang members. "
            "Flagging accounts with both high photo_reuse AND bio_template scores maintained precision. "
            "Submitting with budget remaining earned an efficiency bonus."
        )
    if won:
        return (
            "Found all gang members but used most of the step budget. "
            "Look for intra-gang follow density earlier — once you find one member, INVESTIGATE_NETWORK immediately. "
            "Flag faster to leave budget for verification."
        )
    # Substring markers looked up in the environment's result message.
    if "Recall=0.00" in final_message or "TP=0" in final_message:
        return (
            "Zero gang members found — the starting accounts were all real. "
            "After inspecting 3-4 low-signal accounts, use INVESTIGATE_NETWORK to jump to a different part of the graph. "
            "Gang members have photo_reuse > 0.5 and bio_template > 0.4 simultaneously."
        )
    return (
        "Partial recall — found some gang members but missed others. "
        "After flagging the first gang member, immediately use INVESTIGATE_NETWORK: gang members follow each other heavily. "
        "Don't waste steps inspecting low-signal accounts one-by-one."
    )
# ---------------------------------------------------------------------------
# Post-win reflection (reinforces what worked)
# ---------------------------------------------------------------------------
def generate_success_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Generate a reinforcement reflection after a WIN to capture what worked.

    Args:
        task: Task label; upper-cased before it goes into the prompt.
        action_log: Per-step action descriptions. Only the first 15 entries are
            included in the prompt.
        final_message: Result text of the episode, inserted into the prompt.
        steps_used: Number of steps the agent consumed.
        max_steps: Step budget of the episode.
        episode_num: Episode index shown in the prompt header.

    Returns:
        A short, non-empty description of the winning strategy. If the model
        call raises or yields only whitespace, the rule-based lesson for a
        won episode is returned instead.
    """
    log_preview = "\n".join(
        f" {i}. {s}" for i, s in enumerate(action_log[:15], start=1)
    )
    if not log_preview:
        # Keep the log section meaningful when the episode recorded no actions.
        log_preview = " (no actions recorded)"

    prompt = f"""\
SUCCESSFUL CASE — Episode {episode_num}
Task: {task.upper()} | Steps used: {steps_used}/{max_steps}
Result: {final_message}
INVESTIGATION LOG (first 15 steps):
{log_preview}
In 2-3 sentences, describe the specific strategy that led to success.
What did you do right? What should you repeat in future cases?\
"""
    try:
        reflection = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=150,
            temperature=0.5,
        ).strip()
    except Exception:
        # Deliberately broad: reflection is best-effort and must never abort
        # the episode loop. Record the failure instead of hiding it.
        import logging

        logging.getLogger(__name__).warning(
            "Success-reflection model call failed; using rule-based fallback",
            exc_info=True,
        )
        reflection = ""

    if not reflection:
        # A blank lesson would be stored and injected into future prompts.
        return _rule_based_reflection(True, steps_used, max_steps, final_message)
    return reflection
|