File size: 11,445 Bytes
8a685c0
350500c
 
 
 
8a685c0
 
 
 
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350500c
 
 
 
 
 
 
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
350500c
8a685c0
350500c
8a685c0
 
350500c
 
 
 
 
 
8a685c0
350500c
 
 
 
abcf7c3
350500c
 
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
8a685c0
350500c
8a685c0
350500c
 
 
abcf7c3
350500c
 
abcf7c3
 
 
 
 
 
 
 
 
 
 
d550096
abcf7c3
d550096
abcf7c3
 
 
 
 
 
 
d550096
abcf7c3
d550096
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a685c0
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
8a685c0
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
8a685c0
 
 
 
 
 
abcf7c3
8a685c0
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350500c
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a685c0
abcf7c3
350500c
abcf7c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350500c
abcf7c3
 
8a685c0
350500c
 
 
 
 
 
 
 
 
8a685c0
abcf7c3
 
 
 
 
 
350500c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import requests
from openenv.core.env_server import Environment
from src.models import AdAction, AdObservation, AdState
from src.generator import AdGenerator

# Base URLs of the regulatory, CRM and audit HTTP services (all on localhost).
REGULATORY_API = "http://localhost:8001"
CRM_API = "http://localhost:8002"
AUDIT_API = "http://localhost:8003"

# Verdict actions: taking one of these ends the episode.
TERMINAL_ACTIONS = {"approve", "reject"}

# Actions that must already have been taken for a verdict to earn the
# sequence bonus instead of the sequence penalty.
REQUIRED_BEFORE_TERMINAL = {"query_regulations", "submit_audit"}

# Every action type the environment accepts: the optional investigative
# tools plus the required and terminal actions declared above.
VALID_ACTIONS = {
    "analyze_image",
    "check_advertiser_history",
    "request_landing_page",
    "request_id_verification",
} | REQUIRED_BEFORE_TERMINAL | TERMINAL_ACTIONS

# Number of counted steps after which a non-terminal step ends the episode
# with an extra penalty.
MAX_STEPS = 8
class AdPolicyEnvironment(Environment):
    """Multi-step ad review environment.

    Each episode holds one generated ad.  The agent issues investigative
    actions (regulation lookup, image analysis, advertiser history, landing
    page check, ID verification, audit logging) and finishes with ``approve``
    or ``reject``.  Every executed step costs a small penalty; the verdict is
    scored against the ad's ``ground_truth`` flag plus several shaping terms
    (see ``_terminal_reward``).

    NOTE(review): actions rejected before execution (malformed, unknown, or
    issued before ``query_regulations`` succeeded) do not advance
    ``step_count``, so they never trigger the ``MAX_STEPS`` cut-off.
    """

    def __init__(self):
        super().__init__()
        self.generator = AdGenerator()
        self.current_ad = None
        self._reset_episode_state()

    def _reset_episode_state(self):
        """Reset all per-episode counters, flags, trace and signals."""
        self.step_count = 0
        self.total_reward = 0.0
        # Only actions whose execution returned no error are recorded here.
        self.actions_taken = set()
        self.api_failed = False
        self.api_recovered = False
        self.last_failed_action = None
        self.last_error = None
        self.trace = []
        # Signals stay None until the corresponding action has succeeded.
        self.signals = {
            "risk_score": None,
            "policy_confidence": None,
            "image_flag": None,
            "landing_flag": None,
        }

    def _load_ad(self, task_id=None):
        """Generate a fresh ad and tag it with its task id."""
        self.current_ad = self.generator.generate_random_ad(task_id)
        self.current_ad["task_id"] = task_id or "task_1_healthcare"

    def _ensure_ad(self, task_id=None):
        """Lazily create an ad so the other methods work before ``reset``."""
        if self.current_ad is None:
            self._load_ad(task_id)

    def state(self) -> AdState:
        """Return the step count, cumulative reward and current ad id."""
        self._ensure_ad()
        return AdState(
            step_count=self.step_count,
            total_reward=self.total_reward,
            current_ad_id=self.current_ad.get("ad_id", "N/A")
        )

    def reset(self, task_id: str = None) -> AdObservation:
        """Start a new episode with a freshly generated ad.

        Args:
            task_id: Passed to the ad generator; when falsy the ad is tagged
                ``"task_1_healthcare"``.

        Returns:
            The initial observation for the new ad.
        """
        self._load_ad(task_id)
        self._reset_episode_state()
        return self._get_obs(f"Ad loaded for {self.current_ad['task_id']}. Begin with query_regulations.")

    def _reject_action(self, message: str, penalty: float, done: bool) -> AdObservation:
        """Return an observation for an action refused before execution.

        The penalty is added to ``total_reward`` so the cumulative total
        reported by ``state()`` matches the per-step rewards handed out.
        """
        self.total_reward += penalty
        return self._get_obs(message, penalty, done)

    def step(self, action: AdAction) -> AdObservation:
        """Validate and execute one agent action.

        Malformed or unknown actions end the episode with a -0.5 penalty.
        Any action other than ``query_regulations`` issued before
        ``query_regulations`` has succeeded is refused with -0.2 and does not
        count as a step.  Executed actions cost -0.05; a terminal action adds
        the verdict reward, and reaching ``MAX_STEPS`` without a verdict adds
        a further -0.5 and ends the episode.

        Returns:
            The observation carrying the step's summary, reward and done flag.
        """
        self._ensure_ad()

        if not action or not hasattr(action, "action_type"):
            return self._reject_action("Invalid action format.", -0.5, True)

        act_type = str(action.action_type).lower()

        # 1. Validate action
        if act_type not in VALID_ACTIONS:
            return self._reject_action(f"Invalid action: {act_type}.", -0.5, True)

        # 2. Start constraint: regulations must have been queried successfully
        #    before anything else is allowed.
        if "query_regulations" not in self.actions_taken and act_type != "query_regulations":
            return self._reject_action("Must call query_regulations first.", -0.2, False)

        self.step_count += 1

        # 3. Execute action
        response = self._execute_action(act_type)

        # 4. Update state
        if "error" in response:
            # A failed call is deliberately not registered in actions_taken,
            # so the agent has to retry it.
            self.api_failed = True
            self.last_failed_action = act_type
            self.last_error = response["error"]
        else:
            self.actions_taken.add(act_type)
            if act_type == self.last_failed_action:
                self.api_recovered = True
            self.last_error = None
            self._extract_signals(act_type, response)

        # 5. Append trace (the summary is computed once and reused below)
        result = self._summarize_response(act_type, response)
        self.trace.append({
            "step": self.step_count,
            "action": act_type,
            "result": result,
        })

        # 6. Compute reward, starting from the per-step penalty
        reward = -0.05

        # 7. Handle terminal
        done = False
        if act_type in TERMINAL_ACTIONS:
            reward += self._terminal_reward(act_type)
            done = True
        elif self.step_count >= MAX_STEPS:
            reward -= 0.5
            done = True

        self.total_reward += reward
        return self._get_obs(result["summary"], reward, done)

    @staticmethod
    def _json_object(resp) -> dict:
        """Decode a service response, insisting on a JSON object.

        Raises:
            ValueError: if the body is not valid JSON or decodes to anything
                other than a dict (later processing relies on ``dict.get``).
        """
        payload = resp.json()
        if not isinstance(payload, dict):
            raise ValueError("service response is not a JSON object")
        return payload

    def _execute_action(self, act_type: str) -> dict:
        """Run the tool behind ``act_type`` and return its raw result.

        Returns:
            The tool's result dict, or
            ``{"error": "service_unavailable", "retryable": True}`` when the
            call fails for any reason.
        """
        task_id = self.current_ad.get("task_id", "")

        # Deterministic failure for task_10_failure on step 1
        if task_id == "task_10_failure" and self.step_count == 1:
            return {"error": "service_unavailable", "retryable": True}

        try:
            if act_type == "query_regulations":
                category = self.current_ad.get("category", "general")
                resp = requests.get(f"{REGULATORY_API}/regulations/{category}", timeout=2)
                return self._json_object(resp)

            elif act_type == "analyze_image":
                # Keyword match on the ad's image description; no service call.
                vlm_desc = self.current_ad.get("vlm_desc", "")
                violation = any(kw in vlm_desc.lower() for kw in [
                    "violation", "banned", "prescription", "fake", "flagged",
                    "semaglutide", "adderall", "no rx", "no prescription"
                ])
                return {"violation_detected": violation, "description": vlm_desc}

            elif act_type == "check_advertiser_history":
                advertiser_id = self.current_ad.get("advertiser_id", "adv_003")
                resp = requests.get(f"{CRM_API}/advertiser/{advertiser_id}", timeout=2)
                return self._json_object(resp)

            elif act_type == "request_landing_page":
                domain_age = self.current_ad.get("domain_age_days", 365)
                risk_keywords = self.current_ad.get("landing_risk_keywords", [])
                suspicious = domain_age < 30 or len(risk_keywords) > 0
                return {"suspicious": suspicious, "domain_age": domain_age, "risk_keywords": risk_keywords}

            elif act_type == "request_id_verification":
                age_min = self.current_ad.get("targeting_data", {}).get("age_min", 18)
                return {"age_min": age_min, "minor_targeted": age_min < 18}

            elif act_type == "submit_audit":
                payload = {
                    "ad_id": self.current_ad.get("ad_id", "test"),
                    "action_taken": "pending",
                    "reasoning": "audit requested"
                }
                resp = requests.post(f"{AUDIT_API}/log", json=payload, timeout=2)
                return self._json_object(resp)

            else:
                # approve / reject need no tool call.
                return {"status": "ok"}

        except Exception:
            # Deliberate best-effort: any failure (network, timeout, bad
            # payload, malformed ad data) is surfaced to the agent as a
            # retryable outage rather than crashing the episode.
            return {"error": "service_unavailable", "retryable": True}

    @staticmethod
    def _max_confidence(response: dict) -> float:
        """Return the highest violation confidence in a regulations response.

        Missing or null ``violations`` and entries without a ``confidence``
        value count as 0.0.
        """
        violations = response.get("violations") or []
        return max((v.get("confidence") or 0.0 for v in violations), default=0.0)

    def _extract_signals(self, action: str, response: dict):
        """Store the signal produced by a successful action in ``self.signals``."""
        if action == "check_advertiser_history":
            self.signals["risk_score"] = response.get("risk_score")

        elif action == "query_regulations":
            self.signals["policy_confidence"] = self._max_confidence(response)

        elif action == "analyze_image":
            self.signals["image_flag"] = response.get("violation_detected", False)

        elif action == "request_landing_page":
            self.signals["landing_flag"] = response.get("suspicious", False)

    def _summarize_response(self, action: str, response: dict) -> dict:
        """Condense a raw tool result into ``{"summary": str, "flag": bool}``."""
        if "error" in response:
            return {"summary": "API failure — retryable", "flag": False}

        if action == "check_advertiser_history":
            # A missing or null score is displayed as 0.00.
            rs = response.get("risk_score") or 0.0
            return {"summary": f"risk_score={rs:.2f}", "flag": rs > 0.7}

        if action == "query_regulations":
            conf = self._max_confidence(response)
            return {"summary": f"policy_confidence={conf:.2f}", "flag": conf > 0.7}

        if action == "analyze_image":
            flagged = response.get("violation_detected", False)
            return {
                "summary": "image_violation_detected" if flagged else "image_clean",
                "flag": flagged
            }

        if action == "request_landing_page":
            sus = response.get("suspicious", False)
            return {"summary": "landing_suspicious" if sus else "landing_clean", "flag": sus}

        if action == "request_id_verification":
            minor = response.get("minor_targeted", False)
            age = response.get("age_min", 18)
            return {
                "summary": f"ALERT: minor targeting age={age}" if minor else f"age_min={age} OK",
                "flag": minor
            }

        if action == "submit_audit":
            audit_id = response.get("audit_id", "LOCAL")
            return {"summary": f"audit_logged id={audit_id}", "flag": False}

        return {"summary": "ok", "flag": False}

    def _terminal_reward(self, act_type: str) -> float:
        """Score an ``approve``/``reject`` verdict.

        Components, summed:
          * +1.0 / -1.0 for a verdict that matches / contradicts the ad's
            ``ground_truth`` flag (truthy means the ad is a violation);
          * +0.2 / -0.2 depending on whether every action in
            ``REQUIRED_BEFORE_TERMINAL`` has been taken;
          * +0.3 if an API failure occurred and the failed action later
            succeeded, -0.3 if it occurred and was never recovered;
          * -0.5 for approving with risk_score > 0.7 and
            policy_confidence > 0.6;
          * -0.4 when policy_confidence < 0.6 and neither advertiser history
            nor the landing page was checked;
          * +0.2 when policy_confidence < 0.6 and the landing page was
            flagged.
        """
        reward = 0.0
        is_violation = self.current_ad.get("ground_truth", False)
        is_correct = (act_type == "reject" and is_violation) or \
                     (act_type == "approve" and not is_violation)

        # Dominant signal
        reward += 1.0 if is_correct else -1.0

        # Sequence correctness
        if REQUIRED_BEFORE_TERMINAL.issubset(self.actions_taken):
            reward += 0.2
        else:
            reward -= 0.2

        # API failure handling
        if self.api_failed:
            reward += 0.3 if self.api_recovered else -0.3

        rs = self.signals["risk_score"]
        pc = self.signals["policy_confidence"]

        # Risk-aware penalty: approving despite both signals pointing to risk
        if act_type == "approve" and rs is not None and pc is not None:
            if rs > 0.7 and pc > 0.6:
                reward -= 0.5

        # Ambiguity enforcement: low confidence requires a second source
        ambiguous = pc is not None and pc < 0.6
        if ambiguous:
            if "check_advertiser_history" not in self.actions_taken \
               and "request_landing_page" not in self.actions_taken:
                reward -= 0.4

        # Landing page bonus under ambiguity
        if self.signals["landing_flag"] and ambiguous:
            reward += 0.2

        return reward

    def _get_obs(self, message, reward=0.0, done=False) -> AdObservation:
        """Build the observation for the current ad, signals and last error."""
        self._ensure_ad()
        return AdObservation(
            ad_id=str(self.current_ad.get("ad_id", "N/A")),
            headline=str(self.current_ad.get("headline", "N/A")),
            body_text=str(self.current_ad.get("body_text", "N/A")),
            advertiser_trust_score=float(self.current_ad.get("advertiser_trust_score", 0.0)),
            targeting_data=dict(self.current_ad.get("targeting_data", {})),
            image_url=str(self.current_ad.get("image_url", "N/A")),
            status_message=str(message),
            reward=reward,
            done=done,
            risk_score=self.signals["risk_score"],
            policy_confidence=self.signals["policy_confidence"],
            image_flag=self.signals["image_flag"],
            landing_flag=self.signals["landing_flag"],
            last_error=self.last_error
        )