"""analyze_descriptive — single/multi-column EDA (KM-608). An analytical "family" tool: in ONE call it computes a column's center, spread, shape, and completeness (mean, median, mode, std, variance, quartiles, min/max, skew, null_rate). STATUS: compute layer only — the function takes an already-materialized DataFrame. The wrapper layer (fetching data from the catalog via source_id, the ToolOutput envelope, ToolSpec registration) is added once the Planner seam (KM-418) is settled. Keeping compute separate from data-fetching makes this function easy to unit-test in isolation and stable when wrapped. """ from __future__ import annotations import pandas as pd # Default metrics used when the caller does not narrow them via `metrics`. DEFAULT_METRICS = ( "count", "mean", "median", "mode", "std", "var", "q1", "q3", "min", "max", "skew", "null_count", "null_rate", ) class ColumnNotFoundError(ValueError): """A requested column is absent from the DataFrame (maps to error_code COLUMN_NOT_FOUND).""" def _clean(value: object) -> object: """Coerce a scalar to a JSON-clean Python value. `mode` can be any dtype: an integer column yields `numpy.int64` (NOT JSON-serializable), a datetime column yields `pandas.Timestamp`. The other metrics are already wrapped in `float(...)`; mode is the one that needs this. """ if value is None: return None if isinstance(value, pd.Timestamp): return value.isoformat() if hasattr(value, "item"): return value.item() return value def _describe_one(series: pd.Series, metrics: tuple[str, ...]) -> dict[str, object]: """Compute descriptive metrics for a single column. Numeric metrics are computed over non-null values. `null_rate` & `count` are computed over all rows (nulls included) so they reflect completeness as-is. Undefined cases (e.g. std of a single value) return None — degrade gracefully instead of raising. """ total = len(series) non_null = series.dropna() is_numeric = pd.api.types.is_numeric_dtype(series) out: dict[str, object] = {} for m in metrics: if m == "count": out["count"] = int(total) elif m == "null_count": out["null_count"] = int(series.isna().sum()) elif m == "null_rate": out["null_rate"] = float(series.isna().mean()) if total else 0.0 elif m == "mode": modes = non_null.mode() out["mode"] = _clean(modes.iloc[0]) if not modes.empty else None elif not is_numeric: out[m] = None elif m == "mean": out["mean"] = float(non_null.mean()) if not non_null.empty else None elif m == "median": out["median"] = float(non_null.median()) if not non_null.empty else None elif m == "std": out["std"] = float(non_null.std()) if non_null.shape[0] > 1 else None elif m == "var": out["var"] = float(non_null.var()) if non_null.shape[0] > 1 else None elif m == "q1": out["q1"] = float(non_null.quantile(0.25)) if not non_null.empty else None elif m == "q3": out["q3"] = float(non_null.quantile(0.75)) if not non_null.empty else None elif m == "min": out["min"] = float(non_null.min()) if not non_null.empty else None elif m == "max": out["max"] = float(non_null.max()) if not non_null.empty else None elif m == "skew": out["skew"] = float(non_null.skew()) if non_null.shape[0] > 2 else None return out # Prompt-style description read by the Planner to decide WHEN to pick this tool. # Final destination is ToolSpec.description once the wrapper layer is built. DESCRIPTION = """\ Summary: Descriptive statistics (EDA) for one or several columns in a single \ call — center (mean, median, mode), spread (std, variance, min, max, Q1/Q3 \ quartiles), distribution shape (skew), and completeness (null count & rate). USE WHEN the user asks for an overview, summary, or single-column statistics \ of ONE or SEVERAL columns as a whole, with NO grouping and NO comparison \ between groups. Trigger words: "overview/summary" (ringkasan), "average" \ (rata-rata), "median", "spread/distribution" (sebaran), "how many nulls" \ (berapa nilai kosong). DON'T USE WHEN: - the question groups by something ("per"/"each"/"by") -> analyze_aggregate - it compares two specific groups (A vs B) -> analyze_comparison - it tracks a metric over time -> analyze_trend - it checks data type, quality, duplicates, outliers, constants -> analyze_profile Example questions: - "what's the average and median customer age?" - "summarize the income column" - "how is product price distributed?" - "how many nulls are in the email column?" """ def analyze_descriptive( df: pd.DataFrame, column_ids: list[str], metrics: list[str] | None = None, ) -> dict[str, dict[str, object]]: """Descriptive EDA for one or many columns. Args: df: already-materialized data (in the real system the wrapper fetches this from a source_id). column_ids: columns to analyze. metrics: subset of metrics; defaults to all of DEFAULT_METRICS. Returns: dict: { column_id: { metric: value, ... }, ... } Raises: ColumnNotFoundError: if any column_id is absent from df. """ chosen = tuple(metrics) if metrics else DEFAULT_METRICS missing = [c for c in column_ids if c not in df.columns] if missing: raise ColumnNotFoundError(f"columns not found: {missing}") return {col: _describe_one(df[col], chosen) for col in column_ids}