""" inference.py — LLM agent for ContentModerationEnv (Groq / OpenAI compatible) ============================================================================= Hackathon-compliant inference script for the OpenEnv Content Moderation benchmark. Uses the OpenAI-compatible client to drive an LLM agent through all 128 scenarios, then emits the exact stdout format required for automated evaluation scoring. Credentials (read from environment variables — first non-empty wins): GROQ_API_KEY — Groq API key (https://console.groq.com) HF_TOKEN — HuggingFace API key OPENAI_API_KEY — OpenAI API key API_BASE_URL — LLM endpoint (default: https://api.groq.com/openai/v1) MODEL_NAME — model identifier (default: llama-3.3-70b-versatile) Stdout format (zero deviation allowed): [START] task= env=content_moderation model= [STEP] step= action= reward=<0.00> done= error= [END] success= steps= score= rewards= """ import json import os import sys from pathlib import Path from typing import Dict, List, Optional from openai import OpenAI # ── local import ────────────────────────────────────────────────────────────── SCRIPT_DIR = Path(__file__).parent sys.path.insert(0, str(SCRIPT_DIR)) from content_moderation_env import ContentModerationEnv # ── Credentials ─────────────────────────────────────────────────────────────── # ── Credentials ─────────────────────────────────────────────────────────────── API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1") MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.3-70b-versatile") HF_TOKEN = os.getenv("HF_TOKEN") LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") # ── Constants ───────────────────────────────────────────────────────────────── SCENARIOS_PATH = SCRIPT_DIR / "moderation_benchmark.json" ENV_NAME = "content_moderation" # Tasks — built dynamically from the JSON so all 128 scenarios are included # regardless of ID format (scen_easy_*, camp_*, scen_adv_*, etc.) def _build_tasks(scenarios_path: Path) -> List[Dict]: data = json.loads(scenarios_path.read_text(encoding="utf-8")) tiers: Dict[str, List[str]] = {"easy": [], "medium": [], "hard": []} for s in data: t = s.get("tier", "") if t in tiers: tiers[t].append(s["id"]) return [ {"name": "easy_moderation", "tier": "easy", "scenario_ids": sorted(tiers["easy"])}, {"name": "medium_moderation", "tier": "medium", "scenario_ids": sorted(tiers["medium"])}, {"name": "hard_moderation", "tier": "hard", "scenario_ids": sorted(tiers["hard"])}, ] TASKS = _build_tasks(SCENARIOS_PATH) # Fallback action used when the LLM response cannot be parsed FALLBACK_ACTION: Dict = {"label": "safe", "action": "allow"} # ── Logging helpers (exact stdout format) ───────────────────────────────────── def log_start(task_name: str, model: str) -> None: """Emit [START] line.""" print(f"[START] task={task_name} env={ENV_NAME} model={model}", flush=True) def log_step( step: int, action: Dict, reward: float, done: bool, error: Optional[str], ) -> None: """Emit [STEP] line. Rules: - action serialised as compact JSON (no spaces) - reward: exactly 2 decimal places - done: lowercase string 'true' or 'false' - error: raw string or the literal word null (not quoted 'null', not None) """ action_str = json.dumps(action, separators=(",", ":")) reward_str = f"{reward:.2f}" done_str = "true" if done else "false" error_str = error if error is not None else "null" print( f"[STEP] step={step} action={action_str} reward={reward_str} " f"done={done_str} error={error_str}", flush=True, ) def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None: """Emit [END] line. Rules: - success: lowercase string 'true' or 'false' - score: exactly 3 decimal places - rewards: comma-separated, each exactly 2 decimal places """ success_str = "true" if success else "false" rewards_str = ",".join(f"{r:.2f}" for r in rewards) print( f"[END] success={success_str} steps={steps} score={score:.3f} rewards={rewards_str}", flush=True, ) # ── Prompt builder ──────────────────────────────────────────────────────────── _SYSTEM_PROMPT = """\ You are an expert content moderation agent for a social media platform. You will be shown a piece of content and must make a moderation decision. You must respond with ONLY a valid JSON object, nothing else. No explanation, no markdown, no code blocks. Just the raw JSON. For easy and medium scenarios respond with: {"label": "