#!/usr/bin/env python3
"""Competition baseline inference for FraudShield.

This module provides the main entry point for evaluation:

1. Initialize environment with frozen data snapshot
2. Load agent (heuristic or LLM-powered)
3. Run all 3 task difficulties
4. Grade predictions against ground truth
5. Save results to fraudshield_baseline_results.json

Execution Modes:
    - Heuristic (offline): No external API, deterministic fraud rules
        Command: python inference.py
        Result: Baseline score (easy=1.0, medium=0.877, hard=0.721, final=0.866)
    - LLM (online): Calls OpenAI-compatible API with reasoning prompt
        Command: API_BASE_URL=... MODEL_NAME=... python inference.py
        Result: LLM reasoning + baseline grading

Output:
    - fraudshield_baseline_results.json: Complete grading report with:
        - Per-task scores (easy, medium, hard)
        - Final weighted score
        - Metadata (agent, model, seed, data snapshot)
        - Prediction traces (for replay/audit)

Logging:
    - INFO: Task progress, scores, file paths
    - ERROR: Data load failures, agent exceptions
    - EXCEPTION: Full traceback if inference fails

Usage Examples:
    # Heuristic baseline (no API needed)
    python inference.py

    # With LLM (requires API credentials)
    export API_BASE_URL=https://router.huggingface.co/v1
    export MODEL_NAME=meta-llama/Llama-2-7b-chat-hf
    python inference.py

    # In Docker (PATH already set)
    docker run -e API_BASE_URL=... -e MODEL_NAME=... fraudshield:v0.2.0
"""

from __future__ import annotations

import json
import logging
import os
import sys
from typing import Dict, List, Tuple

from fraudshield_env import FraudShieldEnvironment
from graders import FraudShieldGrader
from llm_agent import build_default_agent

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

RESULTS_FILE = "fraudshield_baseline_results.json"


def get_env(*names: str, default: str = "") -> str:
    """Return the first non-empty environment variable from a list of aliases.

    Tries multiple variable names in order (useful for supporting different
    naming conventions).

    Args:
        *names: Environment variable names to check (in order of preference).
        default: Fallback value if none of the names are set.

    Returns:
        The first non-empty value found, or default if none matched.

    Example:
        api_url = get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1")
        model = get_env("MODEL_NAME", "MODELNAME", default="meta-llama/Llama-2-7b")
    """
    for name in names:
        value = os.getenv(name)
        if value:
            return value
    return default
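

# ---------------------------------------------------------------------------
# Illustration only: a minimal sketch of the duck-typed agent contract that
# run_task() below relies on. This stub is hypothetical and NOT wired into
# the baseline (build_default_agent() supplies the real heuristic/LLM agent),
# and the real environment may require its own action type for env.step();
# it only documents the attribute surface run_task() actually reads.
# ---------------------------------------------------------------------------
from types import SimpleNamespace


class _ExampleConstantAgent:
    """Toy agent that marks every transaction legitimate with confidence 0.5."""

    name = "example-constant"

    def decide(self, observation: object) -> SimpleNamespace:
        # run_task() reads only action.decision.value (str) and
        # action.confidence (float), so a plain namespace suffices here.
        return SimpleNamespace(
            decision=SimpleNamespace(value="legitimate"),
            confidence=0.5,
        )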


def run_task(
    env: FraudShieldEnvironment, agent: object, task_name: str
) -> Tuple[List[str], List[str], List[float]]:
    """Run one task episode and capture the full prediction trace.

    This function executes a complete episode for a single task difficulty,
    collecting all predictions, confidences, and ground truth labels.

    Args:
        env: FraudShieldEnvironment instance (with data already loaded).
        agent: Agent object with decide(observation) method.
        task_name: Task difficulty ("easy", "medium", or "hard").

    Returns:
        Tuple of 3 lists:
            - predictions: List[str] of decisions ("fraud" or "legitimate")
            - ground_truth: List[str] of true labels
            - confidences: List[float] of confidence values [0.0, 1.0]

    Workflow:
        1. Call env.reset(task_name) to initialize episode
        2. Loop: agent.decide(obs) → env.step(action) → next obs
        3. Log progress each step
        4. Collect all decisions and ground truth
        5. Return predictions for grading

    Logging:
        - Task header with agent name
        - Progress every 10 steps (or at first/last)
        - Final accuracy and cumulative reward

    Example:
        preds, labels, confs = run_task(env, agent, "easy")
        print(f"Accuracy: {sum(p == l for p, l in zip(preds, labels)) / len(preds)}")
    """
    logger.info("%s", "=" * 72)
    logger.info(
        "Running %s task with %s",
        task_name.upper(),
        getattr(agent, "name", agent.__class__.__name__),
    )
    logger.info("%s", "=" * 72)

    reset_result = env.reset(task_name)
    logger.info(
        "Episode %s contains %s transactions",
        env.episode_id,
        reset_result.info["num_transactions"],
    )

    observation = reset_result.observation
    predictions: List[str] = []
    confidences: List[float] = []

    while not env.is_done:
        action = agent.decide(observation)
        predictions.append(action.decision.value)
        confidences.append(action.confidence)
        step_result = env.step(action)
        if env.step_count in {1, len(env.current_cases)} or env.step_count % 10 == 0:
            logger.info(
                "Step %02d | decision=%s | confidence=%.2f | reward=%+.2f",
                env.step_count,
                action.decision.value,
                action.confidence,
                step_result.reward.value,
            )
        observation = step_result.observation

    logger.info(
        "Finished %s: accuracy_so_far=%.3f cumulative_reward=%.3f",
        task_name.upper(),
        env.correct_predictions / max(1, env.step_count),
        env.cumulative_reward,
    )
    return predictions, list(env.ground_truth_labels), confidences


def main() -> Dict[str, object]:
    """Run the baseline across all tasks and persist the report.

    This is the main entry point. It orchestrates the complete evaluation:

    1. Create environment and load frozen data snapshot
    2. Build agent (heuristic or LLM-powered)
    3. Run easy/medium/hard tasks sequentially
    4. Grade all predictions
    5. Save results to fraudshield_baseline_results.json

    Returns:
        Grading report dict with keys:
            - easy: {score, predictions, ground_truth, confidences}
            - medium: {...}
            - hard: {...}
            - final_score: Weighted average across all tasks
            - metadata: {agent_name, model_name, seed, data_snapshot, tasks}

    Error Handling:
        - Exits with code 1 if data fails to load
        - Exits with code 1 if inference crashes
        - Logs full exception traceback

    Side Effects:
        - Writes fraudshield_baseline_results.json to cwd
        - Logs task progress and scores

    Environment Variables:
        - API_BASE_URL: OpenAI-compatible API endpoint (for LLM mode)
        - MODEL_NAME: Model to use (for LLM mode)
        - (Both optional; heuristic mode runs offline if not set)

    Example:
        result = main()
        print(f"Final score: {result['final_score']:.4f}")
        print(f"Easy: {result['easy']['score']:.4f}")
    """
    logger.info("%s", "=" * 72)
    logger.info("FraudShield baseline inference")
    logger.info("%s", "=" * 72)

    env = FraudShieldEnvironment(data_path="data", seed=42)
    if not env.load_data():
        logger.error("FraudShield data could not be loaded from ./data")
        sys.exit(1)

    agent = build_default_agent()
    logger.info(
        "Agent mode: %s | API_BASE_URL=%s | MODEL_NAME=%s",
        getattr(agent, "name", agent.__class__.__name__),
        get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        get_env("MODEL_NAME", "MODELNAME", default=""),
    )

    easy_predictions, easy_ground_truth, easy_confidences = run_task(env, agent, "easy")
    medium_predictions, medium_ground_truth, medium_confidences = run_task(env, agent, "medium")
    hard_predictions, hard_ground_truth, hard_confidences = run_task(env, agent, "hard")

    grading_result = FraudShieldGrader.grade_all_tasks(
        easy_predictions, easy_ground_truth, easy_confidences,
        medium_predictions, medium_ground_truth, medium_confidences,
        hard_predictions, hard_ground_truth, hard_confidences,
    )
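
    # Sanity check on the headline numbers in the module docstring: with the
    # documented heuristic scores, (1.0 + 0.877 + 0.721) / 3 ≈ 0.866, so the
    # published final score is consistent with an equal-weight average across
    # tasks. The authoritative weighting lives in FraudShieldGrader, not here.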
    grading_result["metadata"] = {
        "agent_name": getattr(agent, "name", agent.__class__.__name__),
        "api_base_url": get_env("API_BASE_URL", "APIBASEURL", default="https://router.huggingface.co/v1"),
        "model_name": get_env("MODEL_NAME", "MODELNAME"),
        "seed": 42,
        "data_snapshot": env.data_loader.get_bundle_summary(),
        "tasks": {
            "easy": len(easy_ground_truth),
            "medium": len(medium_ground_truth),
            "hard": len(hard_ground_truth),
        },
    }

    logger.info("Easy score: %.4f", grading_result["easy"]["score"])
    logger.info("Medium score: %.4f", grading_result["medium"]["score"])
    logger.info("Hard score: %.4f", grading_result["hard"]["score"])
    logger.info("Final score: %.4f", grading_result["final_score"])

    with open(RESULTS_FILE, "w", encoding="utf-8") as handle:
        json.dump(grading_result, handle, indent=2)
    logger.info("Saved baseline report to %s", RESULTS_FILE)

    return grading_result


if __name__ == "__main__":  # pragma: no cover
    try:
        main()
    except Exception as exc:
        logger.exception("Baseline inference failed: %s", exc)
        sys.exit(1)
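
# Example (illustrative, kept as a comment so nothing runs on import): how a
# downstream script might consume the saved report for replay/audit. Key names
# follow the report structure documented in main()'s docstring.
#
#     import json
#     with open("fraudshield_baseline_results.json", encoding="utf-8") as fh:
#         report = json.load(fh)
#     print(report["final_score"], report["metadata"]["agent_name"])
#     print(len(report["easy"]["predictions"]), "easy-task predictions")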