# BID / insights.py — commit a48d292 ("Add project files with LFS for large CSV")
"""Insight generation utilities for the BI dashboard."""
from __future__ import annotations
from typing import Dict, Iterable, Optional, Tuple
import numpy as np
import pandas as pd
from utils import ColumnTypes
def top_bottom_performers(df: pd.DataFrame, column: str, n: int = 5) -> Dict[str, pd.DataFrame]:
    """Return the top and bottom ``n`` performers for a numeric column.

    Args:
        df: Source dataset.
        column: Name of the column to rank; values are coerced to numeric and
            non-numeric entries are dropped.
        n: Number of rows to include in each group; must be at least 1.

    Returns:
        Mapping with keys ``"top"`` and ``"bottom"``; each value is a
        DataFrame holding the original row index alongside the ranked values
        (the result of ``Series.reset_index()``).

    Raises:
        ValueError: If the column is missing, ``n`` is not positive, or the
            column contains no numeric data.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in dataset.")
    if n < 1:
        # A non-positive n would silently return empty frames; fail loudly instead.
        raise ValueError("Parameter 'n' must be a positive integer.")
    numeric_series = pd.to_numeric(df[column], errors="coerce").dropna()
    if numeric_series.empty:
        raise ValueError(f"Column '{column}' does not contain numeric data.")
    return {
        "top": numeric_series.nlargest(n).reset_index(),
        "bottom": numeric_series.nsmallest(n).reset_index(),
    }
def detect_trend(df: pd.DataFrame, date_column: str, value_column: str) -> str:
    """Summarize the basic trend between the first and last data points.

    The rows are sorted chronologically after coercing ``date_column`` to
    datetimes and ``value_column`` to numerics; unparseable rows are dropped.

    Args:
        df: Source dataset.
        date_column: Column holding (parseable) dates.
        value_column: Column holding (parseable) numeric values.

    Returns:
        A human-readable sentence describing the change between the earliest
        and latest points, or a "not enough data" message when fewer than two
        distinct dates survive coercion.

    Raises:
        ValueError: If either column is missing from the dataset.
    """
    if date_column not in df.columns or value_column not in df.columns:
        raise ValueError("Selected columns are not present in the dataset.")
    # Copy before coercion: assigning into a slice of `df` would trigger
    # SettingWithCopyWarning and could mutate the caller's frame.
    working = df[[date_column, value_column]].copy()
    working[date_column] = pd.to_datetime(working[date_column], errors="coerce")
    # Coerce values too — otherwise string-valued columns crash when the
    # change is computed or formatted below.
    working[value_column] = pd.to_numeric(working[value_column], errors="coerce")
    working = working.dropna()
    if working.empty or working[date_column].nunique() < 2:
        return "Not enough data to evaluate a trend."
    working = working.sort_values(by=date_column)
    first_date = working[date_column].iloc[0]
    last_date = working[date_column].iloc[-1]
    first_value = working[value_column].iloc[0]
    last_value = working[value_column].iloc[-1]
    change = last_value - first_value
    # Percentage change is undefined when starting from zero.
    pct_change = (change / first_value * 100) if first_value != 0 else np.nan
    if np.isnan(pct_change):
        direction = "changed"
    elif pct_change > 0:
        direction = "increased"
    elif pct_change < 0:
        direction = "decreased"
    else:
        direction = "remained stable"
    pct_text = f" ({pct_change:.2f}%)" if not np.isnan(pct_change) else ""
    return (
        f"Between {first_date.date()} and {last_date.date()}, "
        f"{value_column} {direction} by {change:.2f}{pct_text}."
    )
def detect_anomalies(df: pd.DataFrame, column: str, z_threshold: float = 3.0, limit: int = 5) -> pd.DataFrame:
    """Flag rows whose values deviate strongly from the column mean.

    Computes absolute z-scores for ``column`` (after numeric coercion) and
    returns up to ``limit`` rows exceeding ``z_threshold``, most extreme
    first, with the score in a ``z_score`` column.

    Raises:
        ValueError: If ``column`` is not present in ``df``.
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in dataset.")
    values = pd.to_numeric(df[column], errors="coerce")
    # NaN scores (non-numeric rows, zero/undefined std) compare False and drop out.
    deviation = (values - values.mean()).abs() / values.std()
    mask = deviation > z_threshold
    flagged = df.loc[mask, [column]].copy()
    flagged["z_score"] = deviation[mask]
    return flagged.sort_values(by="z_score", ascending=False).head(limit)
def get_default_insight_columns(column_types: ColumnTypes) -> Dict[str, Optional[str]]:
    """Pick default columns for auto-generated insights.

    Returns the first detected numeric and datetime column names under the
    keys ``"numeric"`` and ``"datetime"``; a key maps to ``None`` when its
    category has no columns.
    """
    return {
        "numeric": next(iter(column_types.numeric), None),
        "datetime": next(iter(column_types.datetime), None),
    }