# graphstrike/agent/reflection.py
# Pandago's picture
# Upload folder using huggingface_hub
# 50f71a7 verified
"""Post-episode reflection generator.
After each episode the agent calls `generate_reflection()`.
Qwen3 analyses the action log and outcome, then writes a concrete lesson
that gets stored in AgentMemory and injected into future prompts.
This is the core **learning mechanism**:
Episode N fails → reflection generated → stored in memory
Episode N+1 → reflection in prompt → agent avoids past mistake
...
Win rate rises measurably over episodes.
"""
from __future__ import annotations

import logging
import sys
from pathlib import Path
from typing import Any, Dict, List
_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(_ROOT))
from bedrock_model import invoke_qwen
# ---------------------------------------------------------------------------
# Reflection prompt
# ---------------------------------------------------------------------------
# System prompt passed as `system=` by both reflection generators in this
# module. It pins the model to the fake-account domain and to the signal and
# action names listed here, and asks for 2-3 plain sentences of output.
# The trailing backslash keeps the string from ending in a newline.
_REFLECTION_SYSTEM = """\
You are a senior detective reviewing a FAKE INSTAGRAM ACCOUNT detection case debrief.
This environment detects coordinated fake social media accounts — NOT financial fraud.
Signals available in this environment (use ONLY these):
‒ comment_repeat_score > 0.6 → copy-paste spam comments (gang: 0.6-0.9, real: 0.0-0.08)
‒ shared_ip_count > 5 → shares IP subnet (all 10 gang members have count=9)
‒ photo_reuse_score > 0.5 → stolen profile photos
‒ bio_template_score > 0.4 → copy-paste bio text
‒ fake_risk_score > 0.75 → high-confidence gang member (composite score)
‒ hub_legitimacy_score > 0.70 → celebrity account, do NOT flag
‒ After FLAG: visible neighbors auto-become SUSPECT (priority targets)
Available actions: INSPECT (1 step, reveals profile), INVESTIGATE_NETWORK (2 steps, 2-hop expand),
FLAG, UNFLAG, SUBMIT.
CRITICAL: Write lessons about fake social media signals and INSPECT/INVESTIGATE_NETWORK strategy
ONLY. Do NOT mention transactions, financial transfers, banking, or any concepts not listed above.
Output only the lesson text — no headers, no bullet points, just 2-3 plain sentences.\
"""
def generate_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    won: bool,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Ask Qwen3 for a concrete lesson learned from one completed episode.

    Builds a debrief prompt from the episode outcome and the first 20 entries
    of ``action_log``, sends it to ``invoke_qwen`` with ``_REFLECTION_SYSTEM``
    as the system prompt, and returns the stripped reply.

    Args:
        task: Task difficulty label; upper-cased in the prompt.
        action_log: One string per action taken; only the first 20 are shown
            to the model, followed by a count of the omitted ones.
        final_message: Result text of the episode, embedded verbatim in the
            prompt and also used by the rule-based fallback.
        won: Whether the episode counted as a success.
        steps_used: Number of steps consumed.
        max_steps: Step budget of the episode.
        episode_num: Episode number shown in the prompt header.

    Returns:
        A short lesson (the prompt asks for 2-3 sentences) suitable for
        memory storage. If the model call raises or yields an empty reply,
        the canned text from ``_rule_based_reflection`` is returned instead.
    """
    outcome = "SUCCESS" if won else "FAILURE"

    # Cap the log so the prompt stays small; note how many steps were dropped.
    shown = action_log[:20]
    log_preview = "\n".join(f" {i}. {s}" for i, s in enumerate(shown, start=1))
    omitted = len(action_log) - len(shown)
    if omitted > 0:
        log_preview += f"\n … [{omitted} more steps]"
    if not log_preview:
        # Avoid a blank log section when the episode recorded no actions.
        log_preview = " (no actions recorded)"

    prompt = f"""\
FAKE INSTAGRAM ACCOUNT DETECTION — Episode {episode_num}
Task difficulty: {task.upper()}
Outcome: {outcome}
Steps used: {steps_used}/{max_steps}
Result: {final_message}
AVAILABLE SIGNALS (reference for your lesson):
comment_repeat_score > 0.6 | shared_ip_count > 5 | photo_reuse_score > 0.5
fake_risk_score > 0.75 | hub_legitimacy_score > 0.70 (celebrity, skip)
After FLAG → neighbors become SUSPECT (inspect them immediately)
INVESTIGATE_NETWORK on a flagged account reveals their 2-hop gang cluster
INVESTIGATION LOG:
{log_preview}
Write a 2-3 sentence lesson for your future self based on this case.
Focus on: which of the above signals were most diagnostic, whether using
INVESTIGATE_NETWORK after the first FLAG would have helped, and how to
better allocate the step budget. Be concrete and actionable.\
"""

    try:
        reflection = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=180,
            temperature=0.6,
        ).strip()
    except Exception as exc:
        # Reflection is best-effort: any model/transport failure degrades to
        # the rule-based lesson, but is logged so outages are not invisible.
        logging.getLogger(__name__).warning(
            "Reflection generation failed for episode %s; "
            "using rule-based fallback: %s",
            episode_num,
            exc,
        )
        reflection = ""

    # An empty reply carries no lesson, so it is treated like a failed call.
    return reflection or _rule_based_reflection(
        won, steps_used, max_steps, final_message
    )
def _rule_based_reflection(
    won: bool, steps_used: int, max_steps: int, final_message: str
) -> str:
    """Return a canned lesson for when the model cannot supply one.

    The lesson is chosen from four fixed texts:

    * win using less than 60% of the step budget,
    * any other win,
    * loss whose ``final_message`` contains ``"Recall=0.00"`` or ``"TP=0"``,
    * any other loss (treated as partial recall).

    Args:
        won: Whether the episode counted as a success.
        steps_used: Number of steps consumed.
        max_steps: Step budget of the episode.
        final_message: Result text of the episode; only inspected for losses.
            A missing (``None``) message is treated as empty.

    Returns:
        One of the four fixed lesson strings.
    """
    if won:
        if steps_used < max_steps * 0.6:
            return (
                "Early INVESTIGATE_NETWORK calls efficiently expanded the graph to all gang members. "
                "Flagging accounts with both high photo_reuse AND bio_template scores maintained precision. "
                "Submitting with budget remaining earned an efficiency bonus."
            )
        return (
            "Found all gang members but used most of the step budget. "
            "Look for intra-gang follow density earlier — once you find one member, INVESTIGATE_NETWORK immediately. "
            "Flag faster to leave budget for verification."
        )

    # This is the last-resort path, so it must not raise on a missing message.
    message = final_message or ""
    if "Recall=0.00" in message or "TP=0" in message:
        return (
            "Zero gang members found — the starting accounts were all real. "
            "After inspecting 3-4 low-signal accounts, use INVESTIGATE_NETWORK to jump to a different part of the graph. "
            "Gang members have photo_reuse > 0.5 and bio_template > 0.4 simultaneously."
        )
    return (
        "Partial recall — found some gang members but missed others. "
        "After flagging the first gang member, immediately use INVESTIGATE_NETWORK: gang members follow each other heavily. "
        "Don't waste steps inspecting low-signal accounts one-by-one."
    )
# ---------------------------------------------------------------------------
# Post-win reflection (reinforces what worked)
# ---------------------------------------------------------------------------
def generate_success_reflection(
    task: str,
    action_log: List[str],
    final_message: str,
    steps_used: int,
    max_steps: int,
    episode_num: int,
) -> str:
    """Generate a reinforcement reflection after a WIN to capture what worked.

    Builds a prompt from the episode result and the first 15 entries of
    ``action_log``, sends it to ``invoke_qwen`` with ``_REFLECTION_SYSTEM``
    as the system prompt, and returns the stripped reply.

    Args:
        task: Task difficulty label; upper-cased in the prompt.
        action_log: One string per action taken; only the first 15 are shown.
        final_message: Result text of the episode, embedded verbatim.
        steps_used: Number of steps consumed.
        max_steps: Step budget of the episode.
        episode_num: Episode number shown in the prompt header.

    Returns:
        The model's description of the winning strategy. If the model call
        raises or yields an empty reply, the canned "won" text from
        ``_rule_based_reflection`` is returned instead.
    """
    log_preview = "\n".join(
        f" {i}. {s}" for i, s in enumerate(action_log[:15], start=1)
    )
    if not log_preview:
        # Avoid a blank log section when the episode recorded no actions.
        log_preview = " (no actions recorded)"

    prompt = f"""\
SUCCESSFUL CASE — Episode {episode_num}
Task: {task.upper()} | Steps used: {steps_used}/{max_steps}
Result: {final_message}
INVESTIGATION LOG (first 15 steps):
{log_preview}
In 2-3 sentences, describe the specific strategy that led to success.
What did you do right? What should you repeat in future cases?\
"""

    try:
        reflection = invoke_qwen(
            prompt=prompt,
            system=_REFLECTION_SYSTEM,
            max_tokens=150,
            temperature=0.5,
        ).strip()
    except Exception as exc:
        # Best-effort, like generate_reflection: log the failure and degrade
        # to the rule-based text instead of raising.
        logging.getLogger(__name__).warning(
            "Success reflection failed for episode %s; "
            "using rule-based fallback: %s",
            episode_num,
            exc,
        )
        reflection = ""

    # An empty reply carries no lesson, so it is treated like a failed call.
    return reflection or _rule_based_reflection(
        True, steps_used, max_steps, final_message
    )