"""analyze_profile — per-column data-quality profile (KM-608).

An analytical "family" tool: in ONE call it profiles each column's health —
dtype, inferred type, completeness (null count/rate), cardinality (distinct
count/rate, constant flag), and — for numeric columns — min/max/mean plus an
IQR-based outlier count; for non-numeric columns the most frequent value.
Answers "is this data clean enough to analyze?" and surfaces issues (lots of
nulls, a constant column, outliers) before deeper analysis.

STATUS: compute layer only — the function takes an already-materialized
DataFrame. The wrapper layer (fetching data from the catalog via source_id,
the ToolOutput envelope, ToolSpec registration) is added once the Planner
seam (KM-418) is settled. Keeping compute separate from data-fetching makes
this function easy to unit-test in isolation and stable when wrapped.
"""

from __future__ import annotations

import pandas as pd

from src.tools.analytics.descriptive import ColumnNotFoundError


def _clean(value: object) -> object:
    """Convert numpy/pandas scalars to plain Python so the output is JSON-clean.

    `top_value` (most frequent value) can be a `pandas.Timestamp` when profiling
    a datetime column — neither `Timestamp` nor numpy scalars are JSON-safe.
    """
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if hasattr(value, "item"):
        return value.item()
    return value


def _profile_one(series: pd.Series) -> dict[str, object]:
    """Build the quality profile for a single column."""
    total = len(series)
    non_null = series.dropna()
    nn = len(non_null)
    distinct = int(series.nunique(dropna=True))

    is_bool = pd.api.types.is_bool_dtype(series)
    is_datetime = pd.api.types.is_datetime64_any_dtype(series)
    # bool is technically numeric in pandas; treat it as its own type.
    is_numeric = pd.api.types.is_numeric_dtype(series) and not is_bool

    if is_bool:
        inferred = "boolean"
    elif is_datetime:
        inferred = "datetime"
    elif is_numeric:
        inferred = "numeric"
    else:
        inferred = "categorical"

    out: dict[str, object] = {
        "dtype": str(series.dtype),
        "inferred_type": inferred,
        "count": int(total),
        "null_count": int(series.isna().sum()),
        "null_rate": float(series.isna().mean()) if total else 0.0,
        "distinct_count": distinct,
        "distinct_rate": (distinct / nn) if nn else 0.0,  # over non-null values
        "is_constant": distinct <= 1,
    }

    if is_numeric and nn > 0:
        out["min"] = _clean(non_null.min())
        out["max"] = _clean(non_null.max())
        out["mean"] = _clean(non_null.mean())
        # IQR rule: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are outliers.
        # Needs enough points for stable quartiles.
        if nn >= 4:
            q1 = non_null.quantile(0.25)
            q3 = non_null.quantile(0.75)
            iqr = q3 - q1
            lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            out["outlier_count"] = int(((non_null < lower) | (non_null > upper)).sum())
        else:
            out["outlier_count"] = None
    elif not is_numeric and nn > 0:
        counts = non_null.value_counts()
        out["top_value"] = _clean(counts.index[0])
        out["top_freq"] = int(counts.iloc[0])

    return out


# Prompt-style description read by the Planner to decide WHEN to pick this tool.
# Final destination is ToolSpec.description once the wrapper layer is built.
DESCRIPTION = """\
Summary: Per-column data-quality profile. For each column reports dtype, \
inferred type, completeness (null count/rate), cardinality (distinct count/rate, \
constant flag), and — for numeric columns — min/max/mean plus an IQR-based \
outlier count; for non-numeric columns the most frequent value.

USE WHEN the question is about the HEALTH of the data, not its statistics: \
missing values, duplicates, data types, outliers, "is this clean enough to \
analyze". Trigger words: "quality" (kualitas), "missing/nulls" (data kosong), \
"data type" (tipe data), "duplicates/unique" (duplikat/unik), "outliers".

DON'T USE WHEN:
  - the user wants statistics like mean/median/std/skew -> analyze_descriptive
  - it groups or compares -> analyze_aggregate / analyze_comparison

Example questions:
  - "is this dataset clean enough to analyze?"
  - "which columns have a lot of missing values?"
  - "what are the data types and unique counts per column?"
  - "are there outliers in the amount column?"
"""


def analyze_profile(
    df: pd.DataFrame,
    column_ids: list[str] | None = None,
) -> dict[str, dict[str, object]]:
    """Per-column data-quality profile.

    Args:
        df: already-materialized data (in the real system the wrapper fetches
            this from a source_id).
        column_ids: columns to profile. If None, every column is profiled.

    Returns:
        dict: { column_id: { profile fields, ... }, ... }

    Raises:
        ColumnNotFoundError: if any column_id is absent from df.
    """
    cols = list(column_ids) if column_ids is not None else list(df.columns)

    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ColumnNotFoundError(f"columns not found: {missing}")

    return {col: _profile_one(df[col]) for col in cols}