File size: 7,434 Bytes
350500c
 
 
 
 
929006e
350500c
 
8a685c0
350500c
47fa380
350500c
 
 
 
 
 
 
 
 
 
 
 
 
929006e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350500c
 
8a685c0
 
 
 
 
 
 
 
 
 
350500c
8a685c0
350500c
8a685c0
350500c
 
8a685c0
350500c
 
8a685c0
47fa380
 
 
 
 
 
8a685c0
 
350500c
 
 
 
 
 
 
 
 
 
 
8a685c0
350500c
 
 
8a685c0
 
 
 
 
 
 
 
350500c
8a685c0
350500c
 
 
8a685c0
 
350500c
 
 
929006e
 
 
 
 
47fa380
 
350500c
8a685c0
350500c
 
929006e
030cdd8
929006e
 
8a685c0
 
 
929006e
350500c
8a685c0
929006e
 
 
8a685c0
 
 
 
 
47fa380
 
8a685c0
 
929006e
8a685c0
929006e
8a685c0
47fa380
 
 
8a685c0
 
 
929006e
8a685c0
929006e
 
 
47fa380
929006e
47fa380
 
 
 
 
 
 
 
 
 
 
 
 
929006e
 
8a685c0
030cdd8
8a685c0
 
350500c
929006e
 
030cdd8
350500c
 
030cdd8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import os
import json
import requests
from openai import OpenAI

# 1. MANDATORY VARIABLES EXACTLY AS REQUESTED BY SCALAR
# OpenAI-compatible chat-completions endpoint (defaults to the HF router).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
# Prefer a real HF token; otherwise API_KEY, with a dummy fallback for local runs.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "dummy_local_token")
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")

# Base URL of the environment's HTTP API (serves /reset and /step).
ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
# Hard cap on agent steps per task episode.
MAX_STEPS = 10

# 2. MANDATORY: Use OpenAI Client pointed at the HF Router
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

# The exact tasks defined in your openenv.yaml
TASKS = [
    "task_1_healthcare",
    "task_2_financial",
    "task_3_multimodal",
    "task_4_targeting"
]

# --- STRICT GRADING LOGGERS ---
def log_start(task: str, env: str, model: str) -> None:
    """Emit the grader's [START] marker for one task run."""
    fields = f"task={task} env={env} model={model}"
    print(f"[START] {fields}", flush=True)

def log_step(step: int, action: str, reward: float, done: bool, error: str = None) -> None:
    error_val = error if error else "null"
    done_val = str(done).lower()
    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)

def log_end(success: bool, steps: int, score: float, rewards: list) -> None:
    """Emit the grader's [END] summary line for a finished task."""
    joined_rewards = ",".join(format(r, ".2f") for r in rewards)
    flag = "true" if success else "false"
    print(f"[END] success={flag} steps={steps} score={score:.3f} rewards={joined_rewards}", flush=True)
# ------------------------------

def get_llm_action(observation_data):
    """Ask the LLM for the next compliance action given the current observation.

    Args:
        observation_data: JSON-serializable dict describing the current ad,
            prior feedback, and the actions already taken.

    Returns:
        dict with keys "action_type" and "reasoning". On any failure
        (API error, unparseable response) falls back to the safe default
        action "query_regulations" and embeds the error in "reasoning".
    """
    system_prompt = """You are an enterprise Ad Policy Compliance Agent.
    You navigate a multi-system compliance workflow. Always respond with ONLY valid JSON.

    REQUIRED PHASE ORDER:
    1. query_regulations   — always first
    2. analyze_image       — required for visual/multimodal tasks
    3. check_advertiser_history or request_landing_page — as needed
    4. submit_audit        — always before final decision
    5. approve or reject   — final decision only after audit

    AVAILABLE ACTIONS:
    - query_regulations
    - analyze_image
    - check_advertiser_history
    - request_landing_page
    - request_id_verification
    - submit_audit
    - approve
    - reject

    HARD RULES:
    - NEVER repeat an action listed in `actions_already_taken`.
    - You MUST progress through the phase order. Do NOT call submit_audit or approve/reject
      before the prerequisite phases are complete.
    - Choose your action_type ONLY from the AVAILABLE ACTIONS list above. Any other value is invalid.

    Response format:
    {"action_type": "<action>", "reasoning": "<brief reason>"}
    """

    user_prompt = f"Current Ad Observation:\n{json.dumps(observation_data, indent=2)}\n\nWhat is your next action?"

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            # Removed response_format={"type": "json_object"} as HF router often rejects it
            temperature=0.1
        )

        # Clean the response in case the LLM wrapped it in markdown code blocks.
        # Fix: the old slicing (content[7:-3] / content[3:-3]) assumed a closing
        # ``` fence and silently chopped 3 characters of real JSON when the
        # model omitted it. Strip prefix and suffix independently instead.
        content = response.choices[0].message.content.strip()
        if content.startswith("```json"):
            content = content[len("```json"):]
        elif content.startswith("```"):
            content = content[3:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()

        result = json.loads(content)
        return {
            "action_type": result.get("action_type", "query_regulations"),
            "reasoning": result.get("reasoning", "Fallback reasoning")
        }
    except Exception as e:
        # Surface the failure loudly, then recover with the safe first-phase action.
        print(f"\n[CRITICAL LLM ERROR]: {str(e)}\n", flush=True) # THIS WILL REVEAL THE BUG
        return {"action_type": "query_regulations", "reasoning": f"Error recovery: {str(e)}"}

def main() -> None:
    """Run every task in TASKS against the compliance environment.

    For each task: reset the env over HTTP, then loop up to MAX_STEPS asking
    the LLM for an action and forwarding it to the env's /step endpoint,
    emitting the grader's [START]/[STEP]/[END] log lines throughout.
    """
    for task_id in TASKS:
        log_start(task=task_id, env="meta_ad_policy_sandbox", model=MODEL_NAME)

        rewards = []
        steps_taken = 0
        success = False
        actions_taken_list: list = []

        try:
            # 1. Reset the environment. Fix: added a timeout so an unreachable
            # env server cannot hang the whole run indefinitely; a Timeout is
            # caught by the outer except and still yields well-formed logs.
            res = requests.post(f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=60)
            if res.status_code != 200:
                log_step(step=1, action="reset_failed", reward=0.0, done=True, error=f"HTTP {res.status_code}")
                log_end(success=False, steps=0, score=0.01, rewards=[])
                continue

            # 2. Initialize data from the reset response.
            step_data = res.json()
            observation = step_data.get("observation", step_data)
            done = False

            # 3. THE SINGLE LOOP (Fixed): observe -> ask LLM -> act.
            while not done and steps_taken < MAX_STEPS:
                steps_taken += 1

                # Feedback memory for the LLM (what it has already tried).
                llm_observation = {
                    "task_id": task_id,
                    "last_feedback": step_data.get("status_message", "No feedback yet."),
                    "step_count": steps_taken,
                    "actions_already_taken": actions_taken_list,
                    "ad_details": observation
                }

                # Get action from LLM.
                action_payload = get_llm_action(llm_observation)
                action_str = action_payload["action_type"]
                # A 402 from the router means credits are exhausted; abort the
                # task early instead of burning the remaining steps on errors.
                if "Error code: 402" in action_payload.get("reasoning", ""):
                    done = True
                    log_step(step=steps_taken, action=action_str, reward=0.0, done=True, error="API credits depleted")
                    break

                # Execute action in environment (timeout added, see above).
                step_res = requests.post(f"{ENV_URL}/step", json={"action": action_payload}, timeout=60)
                step_data = step_res.json()

                # Update loop variables from the env response.
                observation = step_data.get("observation", {})
                done = step_data.get("done", False)
                reward = step_data.get("reward", 0.0)

                rewards.append(reward)

                # Track only actions that actually advanced state. Skip API-failure
                # / invalid-action / wrong-order cases so the agent is free to retry.
                status_msg = (step_data.get("status_message") or "").lower()
                action_failed = (
                    "api failure" in status_msg
                    or "retryable" in status_msg
                    or "invalid action" in status_msg
                    or "must call" in status_msg
                )
                if not action_failed and action_str not in actions_taken_list:
                    actions_taken_list.append(action_str)

                log_step(step=steps_taken, action=action_str, reward=reward, done=done, error=None)

            # 4. Final Scoring (Single Log): success means any positive reward.
            raw_score = sum(rewards)
            success = raw_score > 0
            log_end(success=success, steps=steps_taken, score=raw_score, rewards=rewards)

        except Exception as e:
            # Any unexpected failure still produces well-formed grader output.
            log_step(step=steps_taken + 1, action="exception", reward=0.0, done=True, error=str(e).replace("\n", " "))
            log_end(success=False, steps=steps_taken, score=0.01, rewards=rewards)

# Script entry point: run the full task suite when executed directly.
if __name__ == "__main__":
    main()