import pandas as pd import numpy as np def top_bottom_groups(df, group_col, value_col, top_n=3): """ Group by `group_col`, sum `value_col`, and return NON-overlapping top and bottom groups. This is used for things like: - Top / bottom models by Estimated_Deliveries Returns a dict: { "top": DataFrame, "bottom": DataFrame } """ grouped = ( df.groupby(group_col)[value_col] .sum() .reset_index() .rename(columns={value_col: f"total_{value_col}"}) ) if grouped.empty: return {"top": pd.DataFrame(), "bottom": pd.DataFrame()} # Sort descending for potential "top" list grouped_desc = grouped.sort_values(by=f"total_{value_col}", ascending=False) # We only want up to half of the unique groups in each list to avoid overlap max_pairs = max(1, len(grouped_desc) // 2) n = min(top_n, max_pairs) # Top n top = grouped_desc.head(n).reset_index(drop=True) # Bottom n from the remaining (no overlap with top) grouped_asc = grouped.sort_values(by=f"total_{value_col}", ascending=True) bottom = grouped_asc[~grouped_asc[group_col].isin(top[group_col])].head(n) bottom = bottom.reset_index(drop=True) return {"top": top, "bottom": bottom} def region_ranking(df, value_col="Estimated_Deliveries"): """ Rank regions by total value_col (for this project: Estimated_Deliveries). Returns a DataFrame with columns like: Region, total_Estimated_Deliveries, rank """ if "Region" not in df.columns or value_col not in df.columns: return pd.DataFrame() grouped = ( df.groupby("Region")[value_col] .sum() .reset_index() .rename(columns={value_col: f"total_{value_col}"}) ) if grouped.empty: return grouped grouped = grouped.sort_values(by=f"total_{value_col}", ascending=False) grouped["rank"] = range(1, len(grouped) + 1) return grouped.reset_index(drop=True) def model_production_vs_delivery( df, model_col="Model", deliveries_col="Estimated_Deliveries", prod_col="Production_Units", ): """ Compare total production vs total estimated deliveries by model. Returns a DataFrame with: Model, total_estimated_deliveries, total_production_units, delivery_rate_percent, inventory_gap """ needed_cols = [model_col, deliveries_col, prod_col] for c in needed_cols: if c not in df.columns: return pd.DataFrame() tmp = df[[model_col, deliveries_col, prod_col]].copy() grouped = ( tmp.groupby(model_col)[[deliveries_col, prod_col]] .sum() .reset_index() .rename( columns={ deliveries_col: "total_estimated_deliveries", prod_col: "total_production_units", } ) ) if grouped.empty: return grouped # Delivery rate = deliveries / production * 100 grouped["delivery_rate_percent"] = grouped.apply( lambda row: (row["total_estimated_deliveries"] / row["total_production_units"] * 100.0) if row["total_production_units"] != 0 else None, axis=1, ) # Inventory gap = produced but not (yet) delivered grouped["inventory_gap"] = ( grouped["total_production_units"] - grouped["total_estimated_deliveries"] ) # Round for nicer display grouped["delivery_rate_percent"] = grouped["delivery_rate_percent"].round(2) return grouped def overall_trend_summary(df, date_col, value_col, freq="Q"): """ Build a simple trend summary using time resampling. For this project we use it for: - Estimated_Deliveries over time (quarterly) Returns: summary_dict, quarterly_series The summary_dict is already written in human-friendly sentences so that utils.dict_to_text() will show something nice like: start: On 2015-03-31, estimated deliveries were 9,883,795. end: On 2025-12-31, estimated deliveries were 11,087,134. ... quarterly_series is the resampled pandas Series (for debugging or extension). """ summary = {} if date_col not in df.columns or value_col not in df.columns: summary["info"] = "Trend summary unavailable — missing date or value column." return summary, pd.Series(dtype="float64") tmp = df[[date_col, value_col]].dropna().copy() if tmp.empty: summary["info"] = "Trend summary unavailable — no valid rows after dropping missing values." return summary, pd.Series(dtype="float64") tmp[date_col] = pd.to_datetime(tmp[date_col]) tmp = tmp.sort_values(by=date_col) # Resample (e.g. quarterly) and sum series = tmp.set_index(date_col)[value_col].resample(freq).sum() if series.empty: summary["info"] = "Trend summary unavailable — no data after resampling." return summary, series start_period = series.index[0] end_period = series.index[-1] start_value = float(series.iloc[0]) end_value = float(series.iloc[-1]) absolute_change = end_value - start_value percent_change = (absolute_change / start_value * 100.0) if start_value != 0 else None best_period = series.idxmax() best_value = float(series.max()) # These strings already contain date + number as you requested summary["start"] = ( f"On {start_period.date()}, estimated deliveries were {start_value:,.0f}." ) summary["end"] = ( f"On {end_period.date()}, estimated deliveries were {end_value:,.0f}." ) summary["change"] = ( f"From {start_period.date()} to {end_period.date()}, deliveries changed by " f"{absolute_change:,.0f} units." ) if percent_change is not None: summary["growth"] = ( f"Overall growth between the first and last period is {percent_change:.2f}%." ) else: summary["growth"] = "Overall growth percentage could not be computed (start value is 0)." summary["best_quarter"] = ( f"The highest quarter in this dataset is {best_period.date()} " f"with {best_value:,.0f} estimated deliveries." ) return summary, series # Optional: kept here in case you want to experiment later. # Not used by app.py in the current design. def simple_anomaly_detection(df, date_col, value_col, freq="Q", z_threshold=2.0): """ Simple anomaly detection based on z-scores of resampled values. NOT used in the current dashboard, but left here for potential extensions. """ summary, series = overall_trend_summary(df, date_col, value_col, freq=freq) if series is None or series.empty: return pd.DataFrame() values = series.values.astype(float) mean = values.mean() std = values.std() if std == 0: return pd.DataFrame() z_scores = (values - mean) / std mask = np.abs(z_scores) >= z_threshold if not mask.any(): return pd.DataFrame() out = pd.DataFrame( { "period": series.index[mask].astype(str), "value": values[mask], "z_score": z_scores[mask], } ) return out