"""Keyword-based evaluation for BPO benchmark."""

from typing import List, Dict, Any


def check_keywords(response: str, expected_keywords: List[str]) -> Dict[str, Any]:
    """
    Check if response contains expected keywords (supports OR with |).

    Args:
        response: The agent's response text
        expected_keywords: List of keywords to check. Each keyword can contain
                          alternatives separated by | (e.g., "67%|67 %|67")

    Returns:
        Dictionary with found/missing keywords, match rate, and pass status
    """
    found = []
    missing = []

    for keyword in expected_keywords:
        alternatives = keyword.split("|")
        if any(alt.lower() in response.lower() for alt in alternatives):
            found.append(keyword)
        else:
            missing.append(keyword)

    match_rate = len(found) / len(expected_keywords) if expected_keywords else 1.0
    return {
        "found": found,
        "missing": missing,
        "match_rate": match_rate,
        "passed": len(missing) == 0
    }


def evaluate_task(task: Dict[str, Any], response: str, tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Evaluate a single task.

    Args:
        task: Task definition from tasks.json
        response: The agent's response text
        tool_calls: List of tool calls made by the agent

    Returns:
        Evaluation result dictionary
    """
    expected_output = task.get("expected_output", {})
    keywords = expected_output.get("keywords", [])

    result = check_keywords(response, keywords)

    # Extract tool names from tool calls
    tool_names = []
    for tc in tool_calls:
        if isinstance(tc, dict):
            name = tc.get("name") or tc.get("function", {}).get("name", "")
            if name:
                tool_names.append(name)
        elif isinstance(tc, str):
            tool_names.append(tc)

    # Check expected tool calls
    expected_tools = expected_output.get("tool_calls", [])
    expected_tool_names = [t.get("name", "") for t in expected_tools if isinstance(t, dict)]

    # Calculate tool call accuracy
    if expected_tool_names:
        matched_tools = sum(1 for t in expected_tool_names if any(t in tn for tn in tool_names))
        tool_accuracy = matched_tools / len(expected_tool_names)
    else:
        # No tools expected - check that none were called or that's acceptable
        tool_accuracy = 1.0 if not tool_names else 0.5

    # Calculate API count accuracy (lenient: correct if actual >= expected)
    api_call_count = len(tool_names)
    expected_api_count = len(expected_tool_names)
    api_count_correct = 1 if api_call_count >= expected_api_count else 0

    return {
        "task_id": task.get("name", "unknown"),
        "difficulty": task.get("difficulty", "unknown"),
        "intent": task.get("intent", ""),
        "response": response,
        "expected_keywords": keywords,
        "found_keywords": result["found"],
        "missing_keywords": result["missing"],
        "match_rate": result["match_rate"],
        "passed": result["passed"],
        "tool_calls": tool_names,
        "expected_tool_calls": expected_tool_names,
        "tool_accuracy": tool_accuracy,
        "api_call_count": api_call_count,
        "expected_api_count": expected_api_count,
        "api_count_correct": api_count_correct,
    }


def calculate_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Calculate summary statistics from evaluation results.

    Args:
        results: List of evaluation results from evaluate_task

    Returns:
        Summary dictionary with pass rates and averages
    """
    if not results:
        return {
            "total_tasks": 0,
            "passed": 0,
            "pass_rate": 0.0,
            "avg_match_rate": 0.0,
            "avg_tool_accuracy": 0.0,
            "api_count_accuracy": 0.0,
            "by_difficulty": {},
        }

    total = len(results)
    passed = sum(1 for r in results if r.get("passed", False))
    avg_match = sum(r.get("match_rate", 0) for r in results) / total
    avg_tool = sum(r.get("tool_accuracy", 0) for r in results) / total
    api_count_correct = sum(r.get("api_count_correct", 0) for r in results)

    # Group by difficulty
    by_difficulty = {}
    for r in results:
        diff = r.get("difficulty", "unknown")
        if diff not in by_difficulty:
            by_difficulty[diff] = {"total": 0, "passed": 0}
        by_difficulty[diff]["total"] += 1
        if r.get("passed", False):
            by_difficulty[diff]["passed"] += 1

    for diff in by_difficulty:
        by_difficulty[diff]["pass_rate"] = (
            by_difficulty[diff]["passed"] / by_difficulty[diff]["total"]
        )

    return {
        "total_tasks": total,
        "passed": passed,
        "pass_rate": passed / total,
        "avg_match_rate": avg_match,
        "avg_tool_accuracy": avg_tool,
        "api_count_accuracy": api_count_correct / total,
        "by_difficulty": by_difficulty,
    }