Spaces:

parth-1
/

MetaGuard

Sleeping

App Files Files Community

Kartik Goyal committed on Apr 25

Commit

abcf7c3

1 Parent(s): cedcbb4

updated src/models.py · src/environment.py · src/generator.py · apps/crm_api.py · apps/regulatory_api.py · apps/audit_api.py

Browse files

Files changed (6) hide show

apps/audit_api.py +7 -4
apps/crm_api.py +13 -4
apps/regulatory_api.py +28 -22
src/environment.py +234 -81
src/generator.py +64 -6
src/models.py +11 -5

apps/audit_api.py CHANGED Viewed

@@ -1,7 +1,8 @@
-#audit_api
 from fastapi import FastAPI
 from pydantic import BaseModel
 import uvicorn
 app = FastAPI(title="Compliance Audit API")
 logs = []
@@ -13,9 +14,11 @@ class AuditRecord(BaseModel):
 @app.post("/log")
 def log_audit(record: AuditRecord):
-    logs.append(record.dict())
-    return {"status": "success", "audit_id": f"AUD-{len(logs)}"}
 @app.get("/health")
 def health():
     return {"status": "ok", "service": "compliance-audit"}

 from fastapi import FastAPI
 from pydantic import BaseModel
 import uvicorn
+import random
+import uuid
 app = FastAPI(title="Compliance Audit API")
 logs = []
 @app.post("/log")
 def log_audit(record: AuditRecord):
+    if random.random() < 0.1:
+        return {"error": "service_unavailable", "retryable": True}
+    audit_id = f"AUD-{uuid.uuid4().hex[:8].upper()}"
+    logs.append({**record.dict(), "audit_id": audit_id})
+    return {"status": "logged", "audit_id": audit_id}
 @app.get("/health")
 def health():
     return {"status": "ok", "service": "compliance-audit"}

apps/crm_api.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from fastapi import FastAPI
 import uvicorn
 app = FastAPI(title="Advertiser CRM API")
@@ -38,14 +39,22 @@ ADVERTISERS = {
 @app.get("/advertiser/{advertiser_id}")
 def get_advertiser(advertiser_id: str):
-    if advertiser_id in ADVERTISERS:
-        return ADVERTISERS[advertiser_id]
-    return {
         "name": "Unknown Advertiser",
         "prior_violations": 0,
         "account_age_days": 7,
         "summary": "New unverified advertiser. No history. Treat with caution."
-    }
 @app.get("/health")
 def health():

 from fastapi import FastAPI
 import uvicorn
+import random
 app = FastAPI(title="Advertiser CRM API")
 @app.get("/advertiser/{advertiser_id}")
 def get_advertiser(advertiser_id: str):
+    if random.random() < 0.1:
+        return {"error": "service_unavailable", "retryable": True}
+    data = ADVERTISERS.get(advertiser_id, {
         "name": "Unknown Advertiser",
         "prior_violations": 0,
         "account_age_days": 7,
         "summary": "New unverified advertiser. No history. Treat with caution."
+    })
+    risk_score = min(1.0,
+        0.15 * data["prior_violations"] +
+        0.5 * (1 / (1 + data["account_age_days"] / 30))
+    )
+    return {**data, "risk_score": round(risk_score, 2)}
 @app.get("/health")
 def health():

apps/regulatory_api.py CHANGED Viewed

@@ -1,45 +1,51 @@
-# regulatory_api.py
 from fastapi import FastAPI
 import uvicorn
 app = FastAPI(title="Regulatory DB API")
 REGULATIONS = {
     "healthcare": {
-        "policy_summary": (
-            "Health claims require FDA approval. "
-            "Prohibited: unverified cure claims, 'guaranteed results', "
-            "prescription drug sales without authorization."
-        ),
-        "risk_level": "high"
     },
     "financial": {
-        "policy_summary": (
-            "Financial ads require SEC registration. "
-            "Prohibited: guaranteed returns, predatory APR above 36%, "
-            "high-pressure investment tactics."
-        ),
-        "risk_level": "high"
     },
     "targeting": {
-        "policy_summary": (
-            "Age-restricted products cannot target minors. "
-            "Financial and healthcare products require age_min >= 18."
-        ),
-        "risk_level": "high"
     },
     "general": {
-        "policy_summary": "Standard advertising standards apply. No deceptive claims.",
-        "risk_level": "low"
     },
     "none": {
-        "policy_summary": "Standard advertising standards apply. No deceptive claims.",
-        "risk_level": "low"
     }
 }
 @app.get("/regulations/{category}")
 def get_regulations(category: str):
     return REGULATIONS.get(category.lower(), REGULATIONS["general"])
 @app.get("/health")

 from fastapi import FastAPI
 import uvicorn
+import random
 app = FastAPI(title="Regulatory DB API")
 REGULATIONS = {
     "healthcare": {
+        "policy_hint": "Health claims require FDA verification. No unverified cures or prescription bypasses.",
+        "violations": [
+            {"type": "unverified_cure_claim", "confidence": 0.9},
+            {"type": "prescription_bypass", "confidence": 0.4}
+        ]
     },
     "financial": {
+        "policy_hint": "SEC registration required. No guaranteed returns or predatory lending.",
+        "violations": [
+            {"type": "guaranteed_returns", "confidence": 0.85},
+            {"type": "predatory_lending", "confidence": 0.5}
+        ]
     },
     "targeting": {
+        "policy_hint": "Age-restricted products cannot target minors. age_min must be >= 18.",
+        "violations": [
+            {"type": "minor_targeting", "confidence": 0.95}
+        ]
+    },
+    "ambiguous": {
+        "policy_hint": "Policy applicability is unclear. Gather additional signals before deciding.",
+        "violations": [
+            {"type": "possible_misleading_claim", "confidence": 0.45},
+            {"type": "unverified_endorsement", "confidence": 0.5}
+        ]
     },
     "general": {
+        "policy_hint": "Standard advertising standards apply. No deceptive claims.",
+        "violations": []
     },
     "none": {
+        "policy_hint": "Standard advertising standards apply. No deceptive claims.",
+        "violations": []
     }
 }
 @app.get("/regulations/{category}")
 def get_regulations(category: str):
+    if random.random() < 0.1:
+        return {"error": "service_unavailable", "retryable": True}
     return REGULATIONS.get(category.lower(), REGULATIONS["general"])
 @app.get("/health")

src/environment.py CHANGED Viewed

@@ -7,16 +7,44 @@ REGULATORY_API = "http://localhost:8001"
 CRM_API        = "http://localhost:8002"
 AUDIT_API      = "http://localhost:8003"
 class AdPolicyEnvironment(Environment):
     def __init__(self):
         super().__init__()
         self.generator = AdGenerator()
         self.current_ad = None
-        self.image_analyzed = False
-        self.regulations_queried = False
-        self.audit_submitted = False
         self.step_count = 0
         self.total_reward = 0.0
     def _ensure_ad(self, task_id=None):
         if self.current_ad is None:
@@ -33,106 +61,226 @@ class AdPolicyEnvironment(Environment):
     def reset(self, task_id: str = None) -> AdObservation:
         self.current_ad = self.generator.generate_random_ad(task_id)
-        self.current_ad["task_id"] = task_id or "task_1_healthcare"
-        self.image_analyzed = False
-        self.regulations_queried = False
-        self.audit_submitted = False
         self.step_count = 0
         self.total_reward = 0.0
         return self._get_obs(f"Ad loaded for {self.current_ad['task_id']}. Begin with query_regulations.")
     def step(self, action: AdAction) -> AdObservation:
         self._ensure_ad()
-        self.step_count += 1
-        reward = 0.0
-        done = False
         if not action or not hasattr(action, 'action_type'):
-            return self._get_obs("Invalid action format.", -0.1, False)
         act_type = str(action.action_type).lower()
-        task_id  = self.current_ad.get("task_id", "")
-        # ── TOOL ACTIONS ──────────────────────────────────────────────────────
-        if act_type == "query_regulations":
-            self.regulations_queried = True
-            reward = -0.05
-            category = self.current_ad.get("category", "general")
-            try:
                 resp = requests.get(f"{REGULATORY_API}/regulations/{category}", timeout=2)
-                message = resp.json().get("policy_summary", "Standard policy applies.")
-            except Exception:
-                message = "API Error: Default standard policy applies."
-        elif act_type == "analyze_image":
-            self.image_analyzed = True
-            reward = -0.05
-            message = self.current_ad.get("vlm_desc", "No visual anomalies detected.")
-        elif act_type == "check_advertiser_history":
-            reward = -0.05
-            advertiser_id = self.current_ad.get("advertiser_id", "adv_003")
-            try:
                 resp = requests.get(f"{CRM_API}/advertiser/{advertiser_id}", timeout=2)
-                message = f"CRM Summary: {resp.json().get('summary', 'No data')}"
-            except Exception:
-                message = "CRM offline. Cannot verify history."
-        elif act_type == "request_landing_page":
-            reward = -0.05
-            domain_age = self.current_ad.get("domain_age_days", 365)
-            risk_keywords = self.current_ad.get("landing_risk_keywords", [])
-            message = f"Domain age: {domain_age} days. Flagged terms: {risk_keywords or 'none'}."
-        elif act_type == "request_id_verification":
-            reward = -0.05
-            age_min = self.current_ad.get("targeting_data", {}).get("age_min", 18)
-            message = f"Target age {age_min}+." if age_min >= 18 else f"ALERT: Minor targeting detected (Age {age_min}+)."
-        elif act_type == "submit_audit":
-            self.audit_submitted = True
-            reward = 0.0
-            try:
                 payload = {
                     "ad_id": self.current_ad.get("ad_id", "test"),
                     "action_taken": "pending",
                     "reasoning": "audit requested"
                 }
                 resp = requests.post(f"{AUDIT_API}/log", json=payload, timeout=2)
-                message = f"Audit logged: {resp.json().get('audit_id', 'Local-1')}"
-            except Exception:
-                message = "Audit recorded locally."
-        # ── TERMINAL ACTIONS (Phase Gates) ────────────────────────────────────
-        elif act_type in ["approve", "reject"]:
-            # Gate 1: Must query rules
-            if not self.regulations_queried:
-                self.total_reward += -0.2
-                return self._get_obs("Policy Gate: Run query_regulations first.", -0.2, False)
-            # Gate 2: Multimodal tasks require image analysis
-            if "multimodal" in task_id and not self.image_analyzed:
-                self.total_reward += -0.3
-                return self._get_obs("Visual Gate: Image analysis required.", -0.3, False)
-            # Gate 3: Must audit
-            if not self.audit_submitted:
-                self.total_reward += -0.2
-                return self._get_obs("Compliance Gate: Run submit_audit before decision.", -0.2, False)
-            done = True
-            is_violation = self.current_ad.get("ground_truth", False)
-            is_correct = (act_type == "reject" and is_violation) or (act_type == "approve" and not is_violation)
-            reward = 1.0 if is_correct else -1.0
-            message = f"Decision: {act_type.upper()}. {'Correct!' if is_correct else 'Incorrect.'}"
         else:
-            reward = -0.05
-            message = f"Unknown action: {act_type}."
-        self.total_reward += reward
-        return self._get_obs(message, reward, done)
     def _get_obs(self, message, reward=0.0, done=False) -> AdObservation:
         self._ensure_ad()
         return AdObservation(
@@ -144,5 +292,10 @@ class AdPolicyEnvironment(Environment):
             image_url=str(self.current_ad.get("image_url", "N/A")),
             status_message=str(message),
             reward=reward,
-            done=done
         )

 CRM_API        = "http://localhost:8002"
 AUDIT_API      = "http://localhost:8003"
+VALID_ACTIONS = {
+    "query_regulations",
+    "analyze_image",
+    "check_advertiser_history",
+    "request_landing_page",
+    "request_id_verification",
+    "submit_audit",
+    "approve",
+    "reject"
+}
+TERMINAL_ACTIONS = {"approve", "reject"}
+REQUIRED_BEFORE_TERMINAL = {
+    "query_regulations",
+    "submit_audit"
+}
+MAX_STEPS = 8
 class AdPolicyEnvironment(Environment):
     def __init__(self):
         super().__init__()
         self.generator = AdGenerator()
         self.current_ad = None
         self.step_count = 0
         self.total_reward = 0.0
+        self.actions_taken = set()
+        self.api_failed = False
+        self.api_recovered = False
+        self.last_failed_action = None
+        self.last_error = None
+        self.trace = []
+        self.signals = {
+            "risk_score": None,
+            "policy_confidence": None,
+            "image_flag": None,
+            "landing_flag": None
+        }
     def _ensure_ad(self, task_id=None):
         if self.current_ad is None:
     def reset(self, task_id: str = None) -> AdObservation:
         self.current_ad = self.generator.generate_random_ad(task_id)
+        self.current_ad["task_id"] = task_id or "task_1_healthcare"
         self.step_count = 0
         self.total_reward = 0.0
+        self.actions_taken = set()
+        self.api_failed = False
+        self.api_recovered = False
+        self.last_failed_action = None
+        self.last_error = None
+        self.trace = []
+        self.signals = {
+            "risk_score": None,
+            "policy_confidence": None,
+            "image_flag": None,
+            "landing_flag": None
+        }
         return self._get_obs(f"Ad loaded for {self.current_ad['task_id']}. Begin with query_regulations.")
     def step(self, action: AdAction) -> AdObservation:
         self._ensure_ad()
         if not action or not hasattr(action, 'action_type'):
+            return self._get_obs("Invalid action format.", -0.5, True)
         act_type = str(action.action_type).lower()
+        # 1. Validate action
+        if act_type not in VALID_ACTIONS:
+            return self._get_obs(f"Invalid action: {act_type}.", -0.5, True)
+        # 2. Start constraint — state based
+        if "query_regulations" not in self.actions_taken:
+            if act_type != "query_regulations":
+                return self._get_obs("Must call query_regulations first.", -0.2, False)
+        self.step_count += 1
+        self.actions_taken.add(act_type)
+        # 3. Execute action
+        response = self._execute_action(act_type)
+        # 4. Update state
+        if "error" in response:
+            self.api_failed = True
+            self.last_failed_action = act_type
+            self.last_error = response["error"]
+        else:
+            if act_type == self.last_failed_action:
+                self.api_recovered = True
+            self.last_error = None
+            self._extract_signals(act_type, response)
+        # 5. Append trace
+        self.trace.append({
+            "step": self.step_count,
+            "action": act_type,
+            "result": self._summarize_response(act_type, response)
+        })
+        # 6. Compute reward
+        reward = -0.05  # step penalty
+        # 7. Handle terminal
+        done = False
+        if act_type in TERMINAL_ACTIONS:
+            reward += self._terminal_reward(act_type)
+            done = True
+        elif self.step_count >= MAX_STEPS:
+            reward -= 0.5
+            done = True
+        self.total_reward += reward
+        summary = self._summarize_response(act_type, response)["summary"]
+        return self._get_obs(summary, reward, done)
+    def _execute_action(self, act_type: str) -> dict:
+        task_id = self.current_ad.get("task_id", "")
+        # Deterministic failure for task_10_failure on step 1
+        if task_id == "task_10_failure" and self.step_count == 1:
+            return {"error": "service_unavailable", "retryable": True}
+        try:
+            if act_type == "query_regulations":
+                category = self.current_ad.get("category", "general")
                 resp = requests.get(f"{REGULATORY_API}/regulations/{category}", timeout=2)
+                return resp.json()
+            elif act_type == "analyze_image":
+                vlm_desc = self.current_ad.get("vlm_desc", "")
+                violation = any(kw in vlm_desc.lower() for kw in [
+                    "violation", "banned", "prescription", "fake", "flagged",
+                    "semaglutide", "adderall", "no rx", "no prescription"
+                ])
+                return {"violation_detected": violation, "description": vlm_desc}
+            elif act_type == "check_advertiser_history":
+                advertiser_id = self.current_ad.get("advertiser_id", "adv_003")
                 resp = requests.get(f"{CRM_API}/advertiser/{advertiser_id}", timeout=2)
+                return resp.json()
+            elif act_type == "request_landing_page":
+                domain_age = self.current_ad.get("domain_age_days", 365)
+                risk_keywords = self.current_ad.get("landing_risk_keywords", [])
+                suspicious = domain_age < 30 or len(risk_keywords) > 0
+                return {"suspicious": suspicious, "domain_age": domain_age, "risk_keywords": risk_keywords}
+            elif act_type == "request_id_verification":
+                age_min = self.current_ad.get("targeting_data", {}).get("age_min", 18)
+                return {"age_min": age_min, "minor_targeted": age_min < 18}
+            elif act_type == "submit_audit":
                 payload = {
                     "ad_id": self.current_ad.get("ad_id", "test"),
                     "action_taken": "pending",
                     "reasoning": "audit requested"
                 }
                 resp = requests.post(f"{AUDIT_API}/log", json=payload, timeout=2)
+                return resp.json()
+            else:
+                return {"status": "ok"}
+        except Exception as e:
+            return {"error": f"service_unavailable", "retryable": True}
+    def _extract_signals(self, action: str, response: dict):
+        if action == "check_advertiser_history":
+            self.signals["risk_score"] = response.get("risk_score")
+        elif action == "query_regulations":
+            violations = response.get("violations", [])
+            confs = [v["confidence"] for v in violations]
+            self.signals["policy_confidence"] = max(confs, default=0.0)
+        elif action == "analyze_image":
+            self.signals["image_flag"] = response.get("violation_detected", False)
+        elif action == "request_landing_page":
+            self.signals["landing_flag"] = response.get("suspicious", False)
+    def _summarize_response(self, action: str, response: dict) -> dict:
+        if "error" in response:
+            return {"summary": "API failure — retryable", "flag": False}
+        if action == "check_advertiser_history":
+            rs = response.get("risk_score", 0.0)
+            return {"summary": f"risk_score={rs:.2f}", "flag": rs > 0.7}
+        if action == "query_regulations":
+            violations = response.get("violations", [])
+            conf = max((v["confidence"] for v in violations), default=0.0)
+            return {"summary": f"policy_confidence={conf:.2f}", "flag": conf > 0.7}
+        if action == "analyze_image":
+            flagged = response.get("violation_detected", False)
+            return {
+                "summary": "image_violation_detected" if flagged else "image_clean",
+                "flag": flagged
+            }
+        if action == "request_landing_page":
+            sus = response.get("suspicious", False)
+            return {"summary": "landing_suspicious" if sus else "landing_clean", "flag": sus}
+        if action == "request_id_verification":
+            minor = response.get("minor_targeted", False)
+            age = response.get("age_min", 18)
+            return {
+                "summary": f"ALERT: minor targeting age={age}" if minor else f"age_min={age} OK",
+                "flag": minor
+            }
+        if action == "submit_audit":
+            audit_id = response.get("audit_id", "LOCAL")
+            return {"summary": f"audit_logged id={audit_id}", "flag": False}
+        return {"summary": "ok", "flag": False}
+    def _terminal_reward(self, act_type: str) -> float:
+        reward = 0.0
+        is_violation = self.current_ad.get("ground_truth", False)
+        is_correct = (act_type == "reject" and is_violation) or \
+                     (act_type == "approve" and not is_violation)
+        # Dominant signal
+        reward += 1.0 if is_correct else -1.0
+        # Sequence correctness
+        if REQUIRED_BEFORE_TERMINAL.issubset(self.actions_taken):
+            reward += 0.2
         else:
+            reward -= 0.2
+        # API failure handling
+        if self.api_failed and self.api_recovered:
+            reward += 0.3
+        elif self.api_failed and not self.api_recovered:
+            reward -= 0.3
+        # Risk-aware penalty
+        if act_type == "approve":
+            rs = self.signals["risk_score"]
+            pc = self.signals["policy_confidence"]
+            if rs is not None and pc is not None:
+                if rs > 0.7 and pc > 0.6:
+                    reward -= 0.5
+        # Ambiguity enforcement
+        pc = self.signals["policy_confidence"]
+        if pc is not None and pc < 0.6:
+            if "check_advertiser_history" not in self.actions_taken \
+               and "request_landing_page" not in self.actions_taken:
+                reward -= 0.4
+        # Landing page bonus under ambiguity
+        if self.signals["landing_flag"] and pc is not None and pc < 0.6:
+            reward += 0.2
+        return reward
     def _get_obs(self, message, reward=0.0, done=False) -> AdObservation:
         self._ensure_ad()
         return AdObservation(
             image_url=str(self.current_ad.get("image_url", "N/A")),
             status_message=str(message),
             reward=reward,
+            done=done,
+            risk_score=self.signals["risk_score"],
+            policy_confidence=self.signals["policy_confidence"],
+            image_flag=self.signals["image_flag"],
+            landing_flag=self.signals["landing_flag"],
+            last_error=self.last_error
         )

src/generator.py CHANGED Viewed

@@ -69,7 +69,40 @@ class AdGenerator:
          "A bottle of standard FDA-approved daily vitamins.",
          False, 400, []),
     ]
     @staticmethod
     def generate_random_ad(task_id: str = None):
         # Task overrides
@@ -105,6 +138,31 @@ class AdGenerator:
             domain_age  = 30
             risk_kw     = []
         else:
             scenario_type = random.choices(
                 ["obvious", "multimodal", "subtle", "safe"],
@@ -112,23 +170,23 @@ class AdGenerator:
                 k=1
             )[0]
             pool_map = {
-                "obvious":   AdGenerator.OBVIOUS_VIOLATIONS,
                 "multimodal": AdGenerator.MULTIMODAL_TRAPS,
-                "subtle":    AdGenerator.SUBTLE_VIOLATIONS,
-                "safe":      AdGenerator.SAFE_ADS,
             }
             category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
                 pool_map[scenario_type]
             )
         # Trust score — KEY FIX: multimodal traps get HIGH trust to defeat shortcuts
-        if task_id == "task_3_multimodal":
-            trust_score = round(random.uniform(0.82, 0.97), 2)  # high trust = stealth trap
         elif is_violation:
             trust_score = round(random.uniform(0.10, 0.50), 2)
         else:
             trust_score = round(random.uniform(0.70, 0.99), 2)
         # task_4 targeting age — FIX: use "age_min" consistently
         age_min = 15 if task_id == "task_4_targeting" else random.randint(18, 55)

          "A bottle of standard FDA-approved daily vitamins.",
          False, 400, []),
     ]
+    CONFLICT_ADS = [
+        ("FINANCIAL", "adv_002", "Smart Investment Tools",
+         "Our platform helps you manage your portfolio with ease.",
+         "A clean dashboard showing portfolio analytics.",
+         True, 20, ["guaranteed-edge"]),
+    ]
+    AMBIGUOUS_ADS = [
+        ("ambiguous", "adv_004", "Feel Better Naturally",
+         "Our wellness program supports your health journey.",
+         "A person meditating in a park.",
+         True, 45, ["ships-direct"]),
+    ]
+    ADVERSARIAL_ADS = [
+        ("HEALTHCARE", "adv_004", "Natural Energy Boost",
+         "Feel amazing every day with our daily supplement.",
+         "Fine print reads: contains semaglutide analog, ships without Rx needed.",
+         True, 60, []),
+    ]
+    DEPENDENCY_TRAP_ADS = [
+        ("HEALTHCARE", "adv_003", "Wellness Support Pack",
+         "Certified vitamins to support your daily routine.",
+         "Unlabeled pill bottle with small text: contains unapproved compound.",
+         True, 400, []),
+    ]
+    FAILURE_RECOVERY_ADS = [
+        ("HEALTHCARE", "adv_001", "Miracle Weight Loss Patch",
+         "Lose 10kg in 2 weeks. Guaranteed results or money back.",
+         "A before/after image with extreme weight loss claims.",
+         True, 30, ["guaranteed", "no-prescription"]),
+    ]
     @staticmethod
     def generate_random_ad(task_id: str = None):
         # Task overrides
             domain_age  = 30
             risk_kw     = []
+        elif task_id == "task_6_conflict":
+            category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
+                AdGenerator.CONFLICT_ADS
+            )
+        elif task_id == "task_7_ambiguous":
+            category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
+                AdGenerator.AMBIGUOUS_ADS
+            )
+        elif task_id == "task_8_adversarial":
+            category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
+                AdGenerator.ADVERSARIAL_ADS
+            )
+        elif task_id == "task_9_dependency_trap":
+            category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
+                AdGenerator.DEPENDENCY_TRAP_ADS
+            )
+        elif task_id == "task_10_failure":
+            category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
+                AdGenerator.FAILURE_RECOVERY_ADS
+            )
         else:
             scenario_type = random.choices(
                 ["obvious", "multimodal", "subtle", "safe"],
                 k=1
             )[0]
             pool_map = {
+                "obvious":    AdGenerator.OBVIOUS_VIOLATIONS,
                 "multimodal": AdGenerator.MULTIMODAL_TRAPS,
+                "subtle":     AdGenerator.SUBTLE_VIOLATIONS,
+                "safe":       AdGenerator.SAFE_ADS,
             }
             category, adv_id, headline, body, vlm_desc, is_violation, domain_age, risk_kw = random.choice(
                 pool_map[scenario_type]
             )
         # Trust score — KEY FIX: multimodal traps get HIGH trust to defeat shortcuts
+        # task_3 + task_6: high trust = stealth trap, forces CRM check
+        if task_id in ("task_3_multimodal", "task_6_conflict"):
+            trust_score = round(random.uniform(0.82, 0.97), 2)
         elif is_violation:
             trust_score = round(random.uniform(0.10, 0.50), 2)
         else:
             trust_score = round(random.uniform(0.70, 0.99), 2)
         # task_4 targeting age — FIX: use "age_min" consistently
         age_min = 15 if task_id == "task_4_targeting" else random.randint(18, 55)

src/models.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Literal, Optional, Dict, Any
 from openenv.core.env_server import Action, Observation, State
 class AdObservation(Observation):
@@ -9,15 +9,21 @@ class AdObservation(Observation):
     targeting_data: Dict[str, Any]
     image_url: str
     status_message: str
-    # 🚨 NEW: OpenEnv requires these to be part of the Observation!
     reward: float = 0.0
     done: bool = False
 class AdAction(Action):
     action_type: Literal[
-        "approve", "reject", "analyze_image",
-        "request_landing_page", "request_id_verification"
     ]
     reasoning: str
     violation_category: Optional[Literal["HEALTHCARE", "FINANCIAL", "NONE"]] = None

+from typing import Literal, Optional, Dict, Any, List
 from openenv.core.env_server import Action, Observation, State
 class AdObservation(Observation):
     targeting_data: Dict[str, Any]
     image_url: str
     status_message: str
     reward: float = 0.0
     done: bool = False
+    # signals exposed to agent
+    risk_score: Optional[float] = None
+    policy_confidence: Optional[float] = None
+    image_flag: Optional[bool] = None
+    landing_flag: Optional[bool] = None
+    last_error: Optional[str] = None
 class AdAction(Action):
     action_type: Literal[
+        "query_regulations", "analyze_image", "check_advertiser_history",
+        "request_landing_page", "request_id_verification",
+        "submit_audit", "approve", "reject"
     ]
     reasoning: str
     violation_category: Optional[Literal["HEALTHCARE", "FINANCIAL", "NONE"]] = None