Spaces:

parth-1
/

MetaGuard

Sleeping

App Files Files Community

3v324v23 committed on Apr 1

Commit

350500c

0 Parent(s):

Initial commit: Meta Ad-Policy Sandbox

Browse files

Files changed (19) hide show

.dockerignore +3 -0
README.md +29 -0
dockerfile +20 -0
inference.py +105 -0
main.py +16 -0
openenv.yaml +20 -0
pyproject.toml +15 -0
requirements.txt +5 -0
src/.gitignore +0 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-313.pyc +0 -0
src/__pycache__/environment.cpython-313.pyc +0 -0
src/__pycache__/generator.cpython-313.pyc +0 -0
src/__pycache__/models.cpython-313.pyc +0 -0
src/environment.py +99 -0
src/generator.py +97 -0
src/models.py +28 -0
test_env.py +57 -0
validate.sh +48 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,3 @@

+venv/
+__pycache__/
+*.pyc

README.md ADDED Viewed

	@@ -0,0 +1,29 @@

+# 🛡️ Meta Ad-Policy RL Sandbox
+A custom, bleeding-edge Reinforcement Learning environment built for the Meta Ad-Policy Hackathon. This sandbox evaluates the ability of Vision-Language Models (VLMs) and LLMs to act as autonomous ad moderators, navigating complex policy violations, multimodal traps, and illegal targeting.
+## 🚀 Core Features
+* **OpenEnv 0.2.3 Compliant:** Fully implements the latest Meta OpenEnv specifications, including Pydantic `StepResult` state serialization and `/step` & `/reset` API endpoints.
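+* **Reward Shaping:** Charges a `-0.05` penalty for every tool call (`analyze_image`, `request_landing_page`, `request_id_verification`) to push the agent toward efficient tool usage and discourage endless analysis loops. The terminal `approve` / `reject` decision scores `+1.0` when correct and `-1.0` when wrong.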
+* **Multimodal Traps:** Tests the limits of VLMs by presenting ads where the text is benign, but the visual elements contain severe policy violations.
+* **Containerized Infrastructure:** Fully Dockerized and highly lightweight, easily running under the 2 vCPU / 8GB RAM hackathon constraints.
+## 📋 Evaluation Tasks
+The environment natively supports 4 distinct adversarial tasks, loadable via the `task_id` parameter:
+1. `task_1_healthcare`: Evaluates ads for unapproved medical claims, pharmaceuticals, and subtle dog whistles.
+2. `task_2_financial`: Evaluates ads for predatory financial services, scams, and high-pressure tactics.
+3. `task_3_multimodal`: Detects policy violations hidden entirely within visual elements that bypass standard NLP text filters.
+4. `task_4_targeting`: Identifies illegal demographic targeting (e.g., adult financial services targeting minors).
+## 🛠️ Available Agent Tools
+The environment exposes the following action space to the evaluating LLM:
+* `analyze_image`: Request VLM context for visual elements.
+* `request_landing_page`: Extract simulated URL endpoints.
+* `request_id_verification`: Check advertiser trust scores.
+* `approve` / `reject`: Terminal actions.
+## 🚦 Quick Start (Local)
+**1. Build the Docker Image**
+```bash
+docker build -t meta-ad-sandbox .

dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+# Base image: slim Python 3.11 to keep the container small
+FROM python:3.11-slim
+# All following paths are relative to /app inside the image
+WORKDIR /app
+# Copy only requirements.txt first so the dependency layer below stays cached
+# until requirements.txt itself changes
+COPY requirements.txt .
+# Install dependencies (--no-cache-dir keeps pip's download cache out of the image)
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the project (entries in .dockerignore are skipped)
+COPY . .
+# Document the port the server listens on (main.py binds Uvicorn to 8000)
+EXPOSE 8000
+# Start the server through main.py's __main__ entry point
+CMD ["python", "main.py"]

inference.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import os
+import json
+import requests
+from openai import OpenAI
+# 1. 🚨 MANDATORY VARIABLES EXACTLY AS REQUESTED BY SCALAR
+API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "dummy_local_token")
+MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3-70b-chat-hf")
+ENV_URL = "http://localhost:8000"
+MAX_STEPS = 10
+# 2. MANDATORY: Use OpenAI Client pointed at the HF Router
+client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+# The exact tasks defined in your openenv.yaml
+TASKS = [
+    "task_1_healthcare",
+    "task_2_financial",
+    "task_3_multimodal",
+    "task_4_targeting"
+]
+def get_llm_action(observation_data):
+    """Asks the LLM what action to take based on the ad observation."""
+    system_prompt = """You are an expert Meta Ad-Policy Moderator AI.
+    Evaluate the ad and output a decision. Using tools costs -0.05 points, so be efficient.
+    AVAILABLE ACTIONS:
+    - analyze_image
+    - request_landing_page
+    - request_id_verification
+    - approve
+    - reject
+    You MUST respond in valid JSON format containing "action_type" and "reasoning".
+    """
+    user_prompt = f"Current Ad Observation:\n{json.dumps(observation_data, indent=2)}\n\nWhat is your next action?"
+    try:
+        response = client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ],
+            response_format={"type": "json_object"},
+            temperature=0.1
+        )
+        result = json.loads(response.choices[0].message.content)
+        return {
+            "action_type": result.get("action_type", "analyze_image"),
+            "reasoning": result.get("reasoning", "Fallback reasoning")
+        }
+    except Exception as e:
+        print(f"⚠️ LLM Call Failed: {e}. Defaulting to safe fallback.")
+        return {"action_type": "analyze_image", "reasoning": "Error recovery."}
+def main() -> None:
+    print("🚀 Starting Meta Ad-Policy Automated Inference...")
+    total_score = 0.0
+    for task_id in TASKS:
+        print(f"\n--- 🎬 Starting Episode: {task_id} ---")
+        try:
+            res = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id})
+            if res.status_code != 200:
+                print(f"❌ Env connection failed. Check if Docker is running on port 8000.")
+                return
+        except requests.exceptions.ConnectionError:
+            print(f"❌ Env connection refused. Is your OpenEnv Docker container running?")
+            return
+        observation = res.json()
+        done = False
+        step_count = 0
+        while not done and step_count < MAX_STEPS:
+            step_count += 1
+            print(f"  Step {step_count} | Status: {observation.get('status_message', 'No status')}")
+            action_payload = get_llm_action(observation)
+            print(f"  🤖 Agent Action: {action_payload['action_type'].upper()}")
+            step_res = requests.post(f"{ENV_URL}/step", json=action_payload)
+            step_data = step_res.json()
+            # Extract from the OpenEnv schema
+            observation = step_data.get("observation", step_data)
+            done = observation.get("done", False)
+            reward = observation.get("reward", 0.0)
+            if done:
+                print(f"  ✅ Episode Finished! Final Step Reward: {reward}")
+                total_score += reward
+    print(f"\n🎉 Evaluation Complete! Total Agent Score: {total_score} / {len(TASKS)}")
+if __name__ == "__main__":
+    main()

main.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import uvicorn
+from openenv.core.env_server import create_fastapi_app
+from src.environment import AdPolicyEnvironment
+from src.models import AdAction, AdObservation
+# Build the FastAPI application served by this module.
+# create_fastapi_app receives the environment CLASS (AdPolicyEnvironment), not an
+# instance such as 'env' or 'AdPolicyEnvironment()', followed by the action and
+# observation model classes.
+app = create_fastapi_app(
+    AdPolicyEnvironment,
+    AdAction,
+    AdObservation
+)
+# Running this file directly starts Uvicorn on all interfaces, port 8000.
+if __name__ == "__main__":
+    print("🚀 Starting Meta Ad-Policy Sandbox on http://localhost:8000")
+    uvicorn.run(app, host="0.0.0.0", port=8000)

openenv.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: meta-ad-policy-env
+version: "0.2.3"
+description: "A Reinforcement Learning sandbox for multimodal ad-policy moderation."
+server:
+  host: "0.0.0.0"
+  port: 8000
+tasks:
+  - task_id: "task_1_healthcare"
+    description: "Evaluate ads for unapproved medical claims, pharmaceuticals, and subtle dog whistles."
+  - task_id: "task_2_financial"
+    description: "Evaluate ads for predatory financial services, scams, and high-pressure tactics."
+  - task_id: "task_3_multimodal"
+    description: "Detect policy violations hidden entirely within visual elements that bypass text filters."
+  - task_id: "task_4_targeting"
+    description: "Identify illegal demographic targeting (e.g., adult financial services targeting minors)."

pyproject.toml ADDED Viewed

	@@ -0,0 +1,15 @@

+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "meta-ad-policy-sandbox"
+version = "0.2.3"
+description = "Meta Ad-Policy RL Sandbox"
+# openenv-core provides the openenv.core.env_server package imported by
+# main.py, src/environment.py and src/models.py; same pin as requirements.txt.
+dependencies = [
+    "openenv-core>=0.2.1",
+    "fastapi",
+    "uvicorn",
+    "pydantic",
+    "requests",
+    "openai"
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+openenv-core>=0.2.1
+fastapi
+uvicorn
+pydantic
+requests
+# Needed by inference.py (from openai import OpenAI); also listed in pyproject.toml
+openai

src/.gitignore ADDED Viewed

File without changes

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (147 Bytes). View file

src/__pycache__/environment.cpython-313.pyc ADDED Viewed

Binary file (5.19 kB). View file

src/__pycache__/generator.cpython-313.pyc ADDED Viewed

Binary file (4.29 kB). View file

src/__pycache__/models.cpython-313.pyc ADDED Viewed

Binary file (1.74 kB). View file

src/environment.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import uuid
+from openenv.core.env_server import Environment
+from src.models import AdAction, AdObservation, AdState
+from src.generator import AdGenerator
+class AdPolicyEnvironment(Environment):
+    def __init__(self):
+        super().__init__()
+        self.generator = AdGenerator()
+        self.current_ad = None
+        self.image_analyzed = False
+        self.step_count = 0
+        self.total_reward = 0.0
+    def _ensure_ad(self):
+        if self.current_ad is None:
+            self.current_ad = self.generator.generate_random_ad()
+    def state(self) -> AdState:
+        self._ensure_ad()
+        return AdState(
+            step_count=self.step_count,
+            total_reward=self.total_reward,
+            current_ad_id=self.current_ad.get("ad_id")
+        )
+    # Add task_id as an optional parameter
+    def reset(self, task_id: str = None) -> AdObservation:
+        # Pass the task_id down to the generator
+        self.current_ad = self.generator.generate_random_ad(task_id)
+        self.image_analyzed = False
+        self.step_count = 0
+        self.total_reward = 0.0
+        # Add the task_id to the welcome message so the bot knows it worked
+        msg = f"Ad loaded for {task_id}. Awaiting review." if task_id else "Random ad loaded. Awaiting review."
+        return self._get_obs(msg)
+    def step(self, action: AdAction) -> AdObservation:
+        self._ensure_ad()
+        self.step_count += 1
+        reward = 0.0
+        done = False
+        message = "Action processed."
+        if not action or not hasattr(action, 'action_type'):
+            # Heavy penalty for invalid formatting to train the agent faster
+            reward = -0.1
+            self.total_reward += reward
+            return self._get_obs("Invalid action.", reward, False)
+        act_type = str(action.action_type).lower()
+        # 🧠 REWARD SHAPING: Tool Usage Penalties
+        # We charge the agent -0.05 for using tools to force efficiency
+        if act_type in ["analyze_image", "request_landing_page", "request_id_verification"]:
+            reward = -0.05
+            if act_type == "analyze_image":
+                self.image_analyzed = True
+                vlm_text = self.current_ad.get('vlm_desc', 'No visual description.')
+                message = vlm_text  # Cleaned up the double "VLM Output:" prefix here!
+            elif act_type == "request_landing_page":
+                is_bad = self.current_ad.get("ground_truth", False)
+                message = "Landing page is high-risk." if is_bad else "Landing page is clean."
+            elif act_type == "request_id_verification":
+                message = "ID matches advertiser profile."
+        # 🎯 FINAL DECISION: Big Rewards / Big Penalties
+        elif act_type in ["approve", "reject"]:
+            done = True
+            is_violation = self.current_ad.get("ground_truth", False)
+            is_correct = (act_type == "reject" and is_violation) or \
+                         (act_type == "approve" and not is_violation)
+            reward = 1.0 if is_correct else -1.0
+            message = f"Decision: {act_type.upper()}. " + ("Correct!" if is_correct else "Incorrect.")
+        # Accumulate the total reward for the episode state
+        self.total_reward += reward
+        return self._get_obs(message, reward, done)
+    def _get_obs(self, message: str, reward: float = 0.0, done: bool = False) -> AdObservation:
+        self._ensure_ad()
+        return AdObservation(
+            ad_id=str(self.current_ad.get("ad_id", "N/A")),
+            headline=str(self.current_ad.get("headline", "N/A")),
+            body_text=str(self.current_ad.get("body_text", "N/A")),
+            advertiser_trust_score=float(self.current_ad.get("advertiser_trust_score", 0.0)),
+            targeting_data=dict(self.current_ad.get("targeting_data", {})),
+            image_url=str(self.current_ad.get("image_url", "N/A")),
+            status_message=str(message),
+            reward=reward,
+            done=done
+        )

src/generator.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import random
+import uuid
+class AdGenerator:
+    # 1. Obvious Violations (Easy mode for the agent)
+    OBVIOUS_VIOLATIONS = [
+        ("HEALTHCARE", "Secret D1abet3s Cure", "Reverse it today with this root extract!", "A bottle of pills with no label.", True),
+        ("FINANCIAL", "Garanteed 500% Returns", "Invest in LlamaCoin now. Zero risk.", "A stack of cash and a rocket emoji.", True)
+    ]
+    # 2. Multimodal Traps (Text is safe, but the image violates policy)
+    # The agent MUST use `analyze_image` to catch these!
+    MULTIMODAL_TRAPS = [
+        ("HEALTHCARE", "Achieve Your Dream Body", "Hard work and our new program will get you there.", "Before and after weight loss comparison photos. (Meta Policy Violation)", True),
+        ("FINANCIAL", "Get Approved in Minutes", "We help you navigate the complex world of finance.", "Fake government ID and a 'Guaranteed Loan Approval' stamp.", True),
+        ("HEALTHCARE", "Stay Focused All Day", "Our study supplement helps you crush your exams.", "Close up of an Adderall prescription pill.", True)
+    ]
+    # 3. Subtle "Dog Whistle" Violations (Hard mode)
+    SUBTLE_VIOLATIONS = [
+        ("HEALTHCARE", "Optimize Metabolic Markers", "Our proprietary GLP-1 analog ships directly to your door without a visit.", "A sleek, branded injection pen.", True), # Selling prescription drugs
+        ("FINANCIAL", "Beat the System", "Leverage our algorithmic trading bot to exploit market inefficiencies.", "A complex trading chart with green arrows pointing up.", True) # High-pressure/deceptive claims
+    ]
+    # 4. Perfectly Safe Ads (To prevent the agent from just rejecting everything)
+    SAFE_ADS = [
+        ("NONE", "Spring Collection 2026", "Check out our latest seasonal apparel.", "A model wearing a floral dress.", False),
+        ("NONE", "Learn Python Online", "Master coding with our 10-week interactive bootcamp.", "A laptop screen showing clean Python code.", False),
+        ("NONE", "Local Bakery Opening", "Come get fresh sourdough bread this Saturday!", "A warm, freshly baked loaf of bread.", False),
+        ("FINANCIAL", "Plan For Retirement", "Speak with our certified fiduciaries to secure your future.", "A happy older couple walking on a beach.", False), # Safe financial ad
+        ("HEALTHCARE", "Daily Multivitamin", "Support your immune system with Vitamin C and Zinc.", "A bottle of standard, FDA-approved daily vitamins.", False) # Safe health ad
+    ]
+    @staticmethod
+    def generate_random_ad(task_id: str = None):
+        # Default to completely random if no task is specified
+        scenario_type = random.choices(
+            ["obvious", "multimodal", "subtle", "safe", "targeting_trap"],
+            weights=[0.15, 0.25, 0.20, 0.30, 0.10],
+            k=1
+        )[0]
+        # 🧠 TASK OVERRIDES: Force specific scenarios based on the Meta Grader's request
+        if task_id == "task_1_healthcare":
+            # Force a healthcare ad (either obvious or subtle)
+            pool = [ad for ad in AdGenerator.OBVIOUS_VIOLATIONS + AdGenerator.SUBTLE_VIOLATIONS if ad[0] == "HEALTHCARE"]
+            category, headline, body, vlm_desc, is_violation = random.choice(pool)
+            scenario_type = "forced_health"
+        elif task_id == "task_2_financial":
+            # Force a financial ad
+            pool = [ad for ad in AdGenerator.OBVIOUS_VIOLATIONS + AdGenerator.SUBTLE_VIOLATIONS if ad[0] == "FINANCIAL"]
+            category, headline, body, vlm_desc, is_violation = random.choice(pool)
+            scenario_type = "forced_finance"
+        elif task_id == "task_3_multimodal":
+            scenario_type = "multimodal"
+        elif task_id == "task_4_targeting":
+            scenario_type = "targeting_trap"
+        # --- Base Logic Execution ---
+        age_limit = random.randint(18, 55)
+        category = category if 'category' in locals() else "NONE"
+        is_violation = is_violation if 'is_violation' in locals() else False
+        if scenario_type == "obvious":
+            category, headline, body, vlm_desc, is_violation = random.choice(AdGenerator.OBVIOUS_VIOLATIONS)
+        elif scenario_type == "multimodal":
+            category, headline, body, vlm_desc, is_violation = random.choice(AdGenerator.MULTIMODAL_TRAPS)
+        elif scenario_type == "subtle":
+            category, headline, body, vlm_desc, is_violation = random.choice(AdGenerator.SUBTLE_VIOLATIONS)
+        elif scenario_type == "safe":
+            category, headline, body, vlm_desc, is_violation = random.choice(AdGenerator.SAFE_ADS)
+        elif scenario_type == "targeting_trap":
+            category = "FINANCIAL"
+            headline = "Start Your First Investment Portfolio"
+            body = "Learn the basics of stock trading with our beginner-friendly app."
+            vlm_desc = "A smartphone showing a simple stock graph."
+            age_limit = 15 # The trap!
+            is_violation = True
+        trust_score = round(random.uniform(0.1, 0.5), 2) if is_violation else round(random.uniform(0.7, 0.99), 2)
+        if random.random() > 0.85:
+            trust_score = 0.95
+        return {
+            "ad_id": str(uuid.uuid4())[:8],
+            "headline": headline,
+            "body_text": body,
+            "advertiser_trust_score": trust_score,
+            "targeting_data": {"min_age": age_limit, "geo": "US"},
+            "image_url": f"https://mock-meta.com/img/{uuid.uuid4()}.jpg",
+            "ground_truth": is_violation,
+            "category": category,
+            "vlm_desc": vlm_desc
+        }

src/models.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from typing import Literal, Optional, Dict, Any
+from openenv.core.env_server import Action, Observation, State
+class AdObservation(Observation):
+    """What the agent sees about the ad under review, plus per-step feedback."""
+    # Short identifier of the ad
+    ad_id: str
+    # Ad copy shown to the agent
+    headline: str
+    body_text: str
+    # Trust score of the advertiser as produced by the ad generator
+    advertiser_trust_score: float
+    # Audience targeting; the generator fills "min_age" and "geo"
+    targeting_data: Dict[str, Any]
+    image_url: str
+    # Feedback text for the latest reset or action (tool output or decision result)
+    status_message: str
+    # 🚨 NEW: OpenEnv requires these to be part of the Observation!
+    # Reward earned by the latest step; 0.0 on reset
+    reward: float = 0.0
+    # True once the episode has ended with approve/reject
+    done: bool = False
+class AdAction(Action):
+    """An agent action: one of three investigation tools or a final decision."""
+    # "approve" / "reject" end the episode; the other three are tool calls
+    action_type: Literal[
+        "approve", "reject", "analyze_image",
+        "request_landing_page", "request_id_verification"
+    ]
+    # Free-text justification supplied by the agent (required)
+    reasoning: str
+    # Optional policy category the agent believes is violated
+    violation_category: Optional[Literal["HEALTHCARE", "FINANCIAL", "NONE"]] = None
+class AdState(State):
+    """Episode bookkeeping: steps taken, cumulative reward and current ad id."""
+    # Number of steps taken in the current episode
+    step_count: int = 0
+    # Sum of all step rewards in the current episode
+    total_reward: float = 0.0
+    # Identifier of the ad being reviewed, if any
+    current_ad_id: Optional[str] = None

test_env.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import requests
+import json
+BASE_URL = "http://localhost:8000"
+def safe_post(endpoint, data=None):
+    """Helper to catch errors before they crash the script."""
+    try:
+        url = f"{BASE_URL}/{endpoint}"
+        response = requests.post(url, json=data)
+        # If the server sent an error code (4xx or 5xx), print the text
+        if response.status_code != 200:
+            print(f"❌ Server Error {response.status_code}: {response.text}")
+            return None
+        return response.json()
+    except Exception as e:
+        print(f"⚠️ Request Failed: {e}")
+        return None
+def run_test():
+    print("--- 🔄 Testing /reset ---")
+    reset_data = safe_post("reset")
+    if not reset_data: return
+    obs = reset_data.get('observation', reset_data)
+    print(f"Ad Loaded: {obs.get('headline', 'N/A')}\n")
+    print("--- 🔍 Testing 'analyze_image' Tool ---")
+    # Payload must be wrapped in 'action' for OpenEnv 2026
+    step1_payload = {
+        "action": {
+            "action_type": "analyze_image",
+            "reasoning": "Standard adversarial check."
+        }
+    }
+    s1_data = safe_post("step", step1_payload)
+    if s1_data:
+        s1_obs = s1_data.get('observation', s1_data)
+        print(f" {s1_obs.get('status_message', 'N/A')}\n")
+    print("--- ✅ Testing Final Decision ---")
+    step2_payload = {
+        "action": {
+            "action_type": "reject",
+            "reasoning": "Detected policy violation."
+        }
+    }
+    s2_data = safe_post("step", step2_payload)
+    if s2_data:
+        reward = s2_data.get('reward', 0.0)
+        done = s2_data.get('done', s2_data.get('terminal', False))
+        print(f"Final Reward: {reward} | Done: {done}")
+if __name__ == "__main__":
+    run_test()

validate.sh ADDED Viewed

	@@ -0,0 +1,48 @@

+#!/usr/bin/env bash
+set -uo pipefail
+DOCKER_BUILD_TIMEOUT=600
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BOLD='\033[1m'
+NC='\033[0m'
+PING_URL="${1:-}"
+REPO_DIR="${2:-.}"
+if [ -z "$PING_URL" ]; then
+  printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
+  exit 1
+fi
+log()  { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
+pass() { log "${GREEN}PASSED${NC} -- $1"; }
+fail() { log "${RED}FAILED${NC} -- $1"; exit 1; }
+printf "\n${BOLD}=== OpenEnv Validator ===${NC}\n"
+log "Step 1/3: Pinging HF Space ($PING_URL/reset) ..."
+HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H "Content-Type: application/json" -d '{}' "$PING_URL/reset" --max-time 30 || echo "000")
+if [ "$HTTP_CODE" = "200" ]; then
+  pass "HF Space is live!"
+else
+  fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200). Is your Space running?"
+fi
+log "Step 2/3: Running docker build ..."
+if docker build "$REPO_DIR" > /dev/null 2>&1; then
+  pass "Docker build succeeded"
+else
+  fail "Docker build failed"
+fi
+log "Step 3/3: Running openenv validate ..."
+if cd "$REPO_DIR" && openenv validate > /dev/null 2>&1; then
+  pass "openenv validate passed"
+else
+  fail "openenv validate failed. Check openenv.yaml"
+fi
+printf "\n${GREEN}${BOLD} All 3/3 checks passed! Ready to submit.${NC}\n"