| """analyze_descriptive — single/multi-column EDA (KM-608). |
| |
| An analytical "family" tool: in ONE call it computes a column's center, |
| spread, shape, and completeness (mean, median, mode, std, variance, |
| quartiles, min/max, skew, null_rate). |
| |
| STATUS: compute layer only — the function takes an already-materialized |
| DataFrame. The wrapper layer (fetching data from the catalog via source_id, |
| the ToolOutput envelope, ToolSpec registration) is added once the Planner |
| seam (KM-418) is settled. Keeping compute separate from data-fetching makes |
| this function easy to unit-test in isolation and stable when wrapped. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pandas as pd |
|
|
| |
| DEFAULT_METRICS = ( |
| "count", |
| "mean", |
| "median", |
| "mode", |
| "std", |
| "var", |
| "q1", |
| "q3", |
| "min", |
| "max", |
| "skew", |
| "null_count", |
| "null_rate", |
| ) |
|
|
|
|
| class ColumnNotFoundError(ValueError): |
| """A requested column is absent from the DataFrame (maps to error_code COLUMN_NOT_FOUND).""" |
|
|
|
|
| def _clean(value: object) -> object: |
| """Coerce a scalar to a JSON-clean Python value. |
| |
| `mode` can be any dtype: an integer column yields `numpy.int64` (NOT |
| JSON-serializable), a datetime column yields `pandas.Timestamp`. The other |
| metrics are already wrapped in `float(...)`; mode is the one that needs this. |
| """ |
| if value is None: |
| return None |
| if isinstance(value, pd.Timestamp): |
| return value.isoformat() |
| if hasattr(value, "item"): |
| return value.item() |
| return value |
|
|
|
|
| def _describe_one(series: pd.Series, metrics: tuple[str, ...]) -> dict[str, object]: |
| """Compute descriptive metrics for a single column. |
| |
| Numeric metrics are computed over non-null values. `null_rate` & `count` |
| are computed over all rows (nulls included) so they reflect completeness |
| as-is. Undefined cases (e.g. std of a single value) return None — degrade |
| gracefully instead of raising. |
| """ |
| total = len(series) |
| non_null = series.dropna() |
| is_numeric = pd.api.types.is_numeric_dtype(series) |
|
|
| out: dict[str, object] = {} |
| for m in metrics: |
| if m == "count": |
| out["count"] = int(total) |
| elif m == "null_count": |
| out["null_count"] = int(series.isna().sum()) |
| elif m == "null_rate": |
| out["null_rate"] = float(series.isna().mean()) if total else 0.0 |
| elif m == "mode": |
| modes = non_null.mode() |
| out["mode"] = _clean(modes.iloc[0]) if not modes.empty else None |
| elif not is_numeric: |
| out[m] = None |
| elif m == "mean": |
| out["mean"] = float(non_null.mean()) if not non_null.empty else None |
| elif m == "median": |
| out["median"] = float(non_null.median()) if not non_null.empty else None |
| elif m == "std": |
| out["std"] = float(non_null.std()) if non_null.shape[0] > 1 else None |
| elif m == "var": |
| out["var"] = float(non_null.var()) if non_null.shape[0] > 1 else None |
| elif m == "q1": |
| out["q1"] = float(non_null.quantile(0.25)) if not non_null.empty else None |
| elif m == "q3": |
| out["q3"] = float(non_null.quantile(0.75)) if not non_null.empty else None |
| elif m == "min": |
| out["min"] = float(non_null.min()) if not non_null.empty else None |
| elif m == "max": |
| out["max"] = float(non_null.max()) if not non_null.empty else None |
| elif m == "skew": |
| out["skew"] = float(non_null.skew()) if non_null.shape[0] > 2 else None |
| return out |
|
|
|
|
| |
| |
| DESCRIPTION = """\ |
| Summary: Descriptive statistics (EDA) for one or several columns in a single \ |
| call — center (mean, median, mode), spread (std, variance, min, max, Q1/Q3 \ |
| quartiles), distribution shape (skew), and completeness (null count & rate). |
| |
| USE WHEN the user asks for an overview, summary, or single-column statistics \ |
| of ONE or SEVERAL columns as a whole, with NO grouping and NO comparison \ |
| between groups. Trigger words: "overview/summary" (ringkasan), "average" \ |
| (rata-rata), "median", "spread/distribution" (sebaran), "how many nulls" \ |
| (berapa nilai kosong). |
| |
| DON'T USE WHEN: |
| - the question groups by something ("per"/"each"/"by") -> analyze_aggregate |
| - it compares two specific groups (A vs B) -> analyze_comparison |
| - it tracks a metric over time -> analyze_trend |
| - it checks data type, quality, duplicates, outliers, constants -> analyze_profile |
| |
| Example questions: |
| - "what's the average and median customer age?" |
| - "summarize the income column" |
| - "how is product price distributed?" |
| - "how many nulls are in the email column?" |
| """ |
|
|
|
|
| def analyze_descriptive( |
| df: pd.DataFrame, |
| column_ids: list[str], |
| metrics: list[str] | None = None, |
| ) -> dict[str, dict[str, object]]: |
| """Descriptive EDA for one or many columns. |
| |
| Args: |
| df: already-materialized data (in the real system the wrapper fetches |
| this from a source_id). |
| column_ids: columns to analyze. |
| metrics: subset of metrics; defaults to all of DEFAULT_METRICS. |
| |
| Returns: |
| dict: { column_id: { metric: value, ... }, ... } |
| |
| Raises: |
| ColumnNotFoundError: if any column_id is absent from df. |
| """ |
| chosen = tuple(metrics) if metrics else DEFAULT_METRICS |
|
|
| missing = [c for c in column_ids if c not in df.columns] |
| if missing: |
| raise ColumnNotFoundError(f"columns not found: {missing}") |
|
|
| return {col: _describe_one(df[col], chosen) for col in column_ids} |
|
|