#!/usr/bin/env python3
"""Competition baseline inference for FraudShield.

This module provides the main entry point for evaluation:

1. Initialize environment with frozen data snapshot
2. Load agent (heuristic or LLM-powered)
3. Run all 3 task difficulties
4. Grade predictions against ground truth
5. Save results to fraudshield_baseline_results.json

Execution Modes:
    - Heuristic (offline): No external API, deterministic fraud rules
        Command: python inference.py
        Result: Baseline score (easy=1.0, medium=0.877, hard=0.721, final=0.866)
    - LLM (online): Calls OpenAI-compatible API with reasoning prompt
        Command: API_BASE_URL=... MODEL_NAME=... python inference.py
        Result: LLM reasoning + baseline grading

Output:
    - fraudshield_baseline_results.json: Complete grading report with:
        - Per-task scores (easy, medium, hard)
        - Final weighted score
        - Metadata (agent, model, seed, data snapshot)
        - Prediction traces (for replay/audit)

Logging:
    - INFO: Task progress, scores, file paths
    - ERROR: Data load failures, agent exceptions
    - EXCEPTION: Full traceback if inference fails

Usage Examples:
    # Heuristic baseline (no API needed)
    python inference.py

    # With LLM (requires API credentials)
    export API_BASE_URL=https://router.huggingface.co/v1
    export MODEL_NAME=meta-llama/Llama-2-7b-chat-hf
    python inference.py

    # In Docker (PATH already set)
    docker run -e API_BASE_URL=... -e MODEL_NAME=... fraudshield:v0.2.0
"""

from __future__ import annotations

import json
import logging
import os
import sys
from typing import Dict, List, Tuple

from fraudshield_env import FraudShieldEnvironment
from graders import FraudShieldGrader
from llm_agent import build_default_agent

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

RESULTS_FILE = "fraudshield_baseline_results.json"


def get_env(*names: str, default: str = "") -> str:
    """Return the first non-empty environment variable from a list of aliases.

    Tries multiple variable names in order (useful for supporting different
    naming conventions).

    Args:
        *names: Environment variable names to check (in order of preference).
        default: Fallback value if none of the names are set.

    Returns:
        The first non-empty value found, or default if none matched.

    Example:
        api_url = get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1")
        model = get_env("MODEL_NAME", "MODELNAME", default="meta-llama/Llama-2-7b")
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return default
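

# ---------------------------------------------------------------------------
# Illustration only: a minimal sketch of the duck-typed agent contract that
# run_task() below relies on. This stub is hypothetical and NOT wired into
# the baseline (build_default_agent() supplies the real heuristic/LLM agent),
# and the real environment may require its own action type for env.step();
# it only documents the attribute surface run_task() actually reads.
# ---------------------------------------------------------------------------
from types import SimpleNamespace


class _ExampleConstantAgent:
    """Toy agent that marks every transaction legitimate with confidence 0.5."""

    name = "example-constant"

    def decide(self, observation: object) -> SimpleNamespace:
        # run_task() reads only action.decision.value (str) and
        # action.confidence (float), so a plain namespace suffices here.
        return SimpleNamespace(
            decision=SimpleNamespace(value="legitimate"),
            confidence=0.5,
        )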


def run_task(
    env: FraudShieldEnvironment, agent: object, task_name: str
) -> Tuple[List[str], List[str], List[float]]:
    """Run one task episode and capture the full prediction trace.

    This function executes a complete episode for a single task difficulty,
    collecting all predictions, confidences, and ground truth labels.

    Args:
        env: FraudShieldEnvironment instance (with data already loaded).
        agent: Agent object with decide(observation) method.
        task_name: Task difficulty ("easy", "medium", or "hard").

    Returns:
        Tuple of 3 lists:
            - predictions: List[str] of decisions ("fraud" or "legitimate")
            - ground_truth: List[str] of true labels
            - confidences: List[float] of confidence values [0.0, 1.0]

    Workflow:
        1. Call env.reset(task_name) to initialize episode
        2. Loop: agent.decide(obs) → env.step(action) → next obs
        3. Log progress each step
        4. Collect all decisions and ground truth
        5. Return predictions for grading

    Logging:
        - Task header with agent name
        - Progress every 10 steps (or at first/last)
        - Final accuracy and cumulative reward

    Example:
        preds, labels, confs = run_task(env, agent, "easy")
        print(f"Accuracy: {sum(p == l for p, l in zip(preds, labels)) / len(preds)}")
    """
    logger.info("%s", "=" * 72)
    logger.info(
        "Running %s task with %s",
        task_name.upper(),
        getattr(agent, "name", agent.__class__.__name__),
    )
    logger.info("%s", "=" * 72)

    reset_result = env.reset(task_name)
    logger.info(
        "Episode %s contains %s transactions",
        env.episode_id,
        reset_result.info["num_transactions"],
    )

    observation = reset_result.observation
    predictions: List[str] = []
    confidences: List[float] = []

    while not env.is_done:
        action = agent.decide(observation)
        predictions.append(action.decision.value)
        confidences.append(action.confidence)
        step_result = env.step(action)
        if env.step_count in {1, len(env.current_cases)} or env.step_count % 10 == 0:
            logger.info(
                "Step %02d | decision=%s | confidence=%.2f | reward=%+.2f",
                env.step_count,
                action.decision.value,
                action.confidence,
                step_result.reward.value,
            )
        observation = step_result.observation

    logger.info(
        "Finished %s: accuracy_so_far=%.3f cumulative_reward=%.3f",
        task_name.upper(),
        env.correct_predictions / max(1, env.step_count),
        env.cumulative_reward,
    )
    return predictions, list(env.ground_truth_labels), confidences


def main() -> Dict[str, object]:
    """Run the baseline across all tasks and persist the report.

    This is the main entry point. It orchestrates the complete evaluation:

    1. Create environment and load frozen data snapshot
    2. Build agent (heuristic or LLM-powered)
    3. Run easy/medium/hard tasks sequentially
    4. Grade all predictions
    5. Save results to fraudshield_baseline_results.json

    Returns:
        Grading report dict with keys:
            - easy: {score, predictions, ground_truth, confidences}
            - medium: {...}
            - hard: {...}
            - final_score: Weighted average across all tasks
            - metadata: {agent_name, model_name, seed, data_snapshot, tasks}

    Error Handling:
        - Exits with code 1 if data fails to load
        - Exits with code 1 if inference crashes
        - Logs full exception traceback

    Side Effects:
        - Writes fraudshield_baseline_results.json to cwd
        - Logs task progress and scores

    Environment Variables:
        - API_BASE_URL: OpenAI-compatible API endpoint (for LLM mode)
        - MODEL_NAME: Model to use (for LLM mode)
        - (Both optional; heuristic mode runs offline if not set)

    Example:
        result = main()
        print(f"Final score: {result['final_score']:.4f}")
        print(f"Easy: {result['easy']['score']:.4f}")
    """
    logger.info("%s", "=" * 72)
    logger.info("FraudShield baseline inference")
    logger.info("%s", "=" * 72)

    env = FraudShieldEnvironment(data_path="data", seed=42)
    if not env.load_data():
        logger.error("FraudShield data could not be loaded from ./data")
        sys.exit(1)

    agent = build_default_agent()
    logger.info(
        "Agent mode: %s | API_BASE_URL=%s | MODEL_NAME=%s",
        getattr(agent, "name", agent.__class__.__name__),
        get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        get_env("MODEL_NAME", "MODELNAME", default=""),
    )

    easy_predictions, easy_ground_truth, easy_confidences = run_task(env, agent, "easy")
    medium_predictions, medium_ground_truth, medium_confidences = run_task(env, agent, "medium")
    hard_predictions, hard_ground_truth, hard_confidences = run_task(env, agent, "hard")

    grading_result = FraudShieldGrader.grade_all_tasks(
        easy_predictions, easy_ground_truth, easy_confidences,
        medium_predictions, medium_ground_truth, medium_confidences,
        hard_predictions, hard_ground_truth, hard_confidences,
    )
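
    # Sanity check on the headline numbers in the module docstring: with the
    # documented heuristic scores, (1.0 + 0.877 + 0.721) / 3 ≈ 0.866, so the
    # published final score is consistent with an equal-weight average across
    # tasks. The authoritative weighting lives in FraudShieldGrader, not here.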
    grading_result["metadata"] = {
        "agent_name": getattr(agent, "name", agent.__class__.__name__),
        "api_base_url": get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        "model_name": get_env("MODEL_NAME", "MODELNAME"),
        "seed": 42,
        "data_snapshot": env.data_loader.get_bundle_summary(),
        "tasks": {
            "easy": len(easy_ground_truth),
            "medium": len(medium_ground_truth),
            "hard": len(hard_ground_truth),
        },
    }

    logger.info("Easy score: %.4f", grading_result["easy"]["score"])
    logger.info("Medium score: %.4f", grading_result["medium"]["score"])
    logger.info("Hard score: %.4f", grading_result["hard"]["score"])
    logger.info("Final score: %.4f", grading_result["final_score"])

    with open(RESULTS_FILE, "w", encoding="utf-8") as handle:
        json.dump(grading_result, handle, indent=2)
    logger.info("Saved baseline report to %s", RESULTS_FILE)

    return grading_result


if __name__ == "__main__":  # pragma: no cover
    try:
        main()
    except Exception as exc:
        logger.exception("Baseline inference failed: %s", exc)
        sys.exit(1)
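
# Example (illustrative, kept as a comment so nothing runs on import): how a
# downstream script might consume the saved report for replay/audit. Key names
# follow the report structure documented in main()'s docstring.
#
#     import json
#     with open("fraudshield_baseline_results.json", encoding="utf-8") as fh:
#         report = json.load(fh)
#     print(report["final_score"], report["metadata"]["agent_name"])
#     print(len(report["easy"]["predictions"]), "easy-task predictions")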