Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
def top_bottom_groups(df, group_col, value_col, top_n=3):
    """
    Group by `group_col`, sum `value_col`, and return NON-overlapping
    top and bottom groups.

    This is used for things like:
    - Top / bottom models by Estimated_Deliveries

    Returns a dict:
        {
            "top": DataFrame,
            "bottom": DataFrame
        }
    """
    total_col = f"total_{value_col}"
    grouped = (
        df.groupby(group_col)[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: total_col})
    )
    if grouped.empty:
        return {"top": pd.DataFrame(), "bottom": pd.DataFrame()}

    # Cap each list at half of the unique groups so "top" and "bottom"
    # can never share a member.
    n = min(top_n, max(1, len(grouped) // 2))

    # Top n by descending total.
    top = (
        grouped.sort_values(by=total_col, ascending=False)
        .head(n)
        .reset_index(drop=True)
    )

    # Bottom n by ascending total, skipping anything already in "top".
    ascending = grouped.sort_values(by=total_col, ascending=True)
    bottom = (
        ascending[~ascending[group_col].isin(top[group_col])]
        .head(n)
        .reset_index(drop=True)
    )

    return {"top": top, "bottom": bottom}
def region_ranking(df, value_col="Estimated_Deliveries"):
    """
    Rank regions by total `value_col` (for this project: Estimated_Deliveries).

    Returns a DataFrame with columns like:
        Region, total_Estimated_Deliveries, rank
    Returns an empty DataFrame when either required column is missing.
    """
    # Guard: both "Region" and the value column must exist.
    if not {"Region", value_col}.issubset(df.columns):
        return pd.DataFrame()

    total_col = f"total_{value_col}"
    totals = (
        df.groupby("Region")[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: total_col})
    )
    if totals.empty:
        return totals

    # Rank 1 = largest total.
    totals = totals.sort_values(by=total_col, ascending=False).reset_index(drop=True)
    totals["rank"] = range(1, len(totals) + 1)
    return totals
def model_production_vs_delivery(
    df,
    model_col="Model",
    deliveries_col="Estimated_Deliveries",
    prod_col="Production_Units",
):
    """
    Compare total production vs total estimated deliveries by model.

    Returns a DataFrame with:
        Model,
        total_estimated_deliveries,
        total_production_units,
        delivery_rate_percent   (NaN when total production is 0),
        inventory_gap
    Returns an empty DataFrame when any required column is missing.
    """
    needed_cols = [model_col, deliveries_col, prod_col]
    if any(c not in df.columns for c in needed_cols):
        return pd.DataFrame()

    grouped = (
        df[needed_cols]
        .groupby(model_col)[[deliveries_col, prod_col]]
        .sum()
        .reset_index()
        .rename(
            columns={
                deliveries_col: "total_estimated_deliveries",
                prod_col: "total_production_units",
            }
        )
    )
    if grouped.empty:
        return grouped

    # Delivery rate = deliveries / production * 100, vectorized.
    # BUG FIX: the previous row-wise apply produced None for zero-production
    # models, which made the column object-dtype and the later .round(2)
    # raise. Masking zero-production rows to NaN keeps the column float
    # and round-safe while preserving "undefined rate" semantics.
    prod = grouped["total_production_units"]
    rate = grouped["total_estimated_deliveries"].div(prod).mul(100.0)
    # Round for nicer display.
    grouped["delivery_rate_percent"] = rate.where(prod != 0).round(2)

    # Inventory gap = produced but not (yet) delivered.
    grouped["inventory_gap"] = (
        grouped["total_production_units"] - grouped["total_estimated_deliveries"]
    )
    return grouped
def overall_trend_summary(df, date_col, value_col, freq="Q"):
    """
    Build a simple trend summary using time resampling.

    For this project we use it for:
    - Estimated_Deliveries over time (quarterly)

    Returns:
        summary_dict, quarterly_series

    The summary_dict is already written in human-friendly sentences so that
    utils.dict_to_text() will show something nice like:
        start: On 2015-03-31, estimated deliveries were 9,883,795.
        end:   On 2025-12-31, estimated deliveries were 11,087,134.
        ...
    quarterly_series is the resampled pandas Series (for debugging or extension).

    BUG FIX: the "info" fallback messages previously contained a mojibake
    character (a mis-encoded dash rendered as a stray Greek letter); they
    now use a proper em dash.
    """
    summary = {}
    if date_col not in df.columns or value_col not in df.columns:
        summary["info"] = "Trend summary unavailable — missing date or value column."
        return summary, pd.Series(dtype="float64")

    tmp = df[[date_col, value_col]].dropna().copy()
    if tmp.empty:
        summary["info"] = "Trend summary unavailable — no valid rows after dropping missing values."
        return summary, pd.Series(dtype="float64")

    tmp[date_col] = pd.to_datetime(tmp[date_col])
    tmp = tmp.sort_values(by=date_col)

    # Resample (e.g. quarterly) and sum.
    series = tmp.set_index(date_col)[value_col].resample(freq).sum()
    if series.empty:
        summary["info"] = "Trend summary unavailable — no data after resampling."
        return summary, series

    start_period = series.index[0]
    end_period = series.index[-1]
    start_value = float(series.iloc[0])
    end_value = float(series.iloc[-1])

    absolute_change = end_value - start_value
    # Percent change is undefined when the first period is 0.
    percent_change = (absolute_change / start_value * 100.0) if start_value != 0 else None

    best_period = series.idxmax()
    best_value = float(series.max())

    # These strings already contain date + number as you requested.
    summary["start"] = (
        f"On {start_period.date()}, estimated deliveries were {start_value:,.0f}."
    )
    summary["end"] = (
        f"On {end_period.date()}, estimated deliveries were {end_value:,.0f}."
    )
    summary["change"] = (
        f"From {start_period.date()} to {end_period.date()}, deliveries changed by "
        f"{absolute_change:,.0f} units."
    )
    if percent_change is not None:
        summary["growth"] = (
            f"Overall growth between the first and last period is {percent_change:.2f}%."
        )
    else:
        summary["growth"] = "Overall growth percentage could not be computed (start value is 0)."
    summary["best_quarter"] = (
        f"The highest quarter in this dataset is {best_period.date()} "
        f"with {best_value:,.0f} estimated deliveries."
    )
    return summary, series
| # Optional: kept here in case you want to experiment later. | |
| # Not used by app.py in the current design. | |
def simple_anomaly_detection(df, date_col, value_col, freq="Q", z_threshold=2.0):
    """
    Simple anomaly detection based on z-scores of resampled values.

    NOT used in the current dashboard, but left here for potential extensions.

    Returns a DataFrame with columns period / value / z_score for every
    resampled period whose |z-score| >= `z_threshold`, or an empty
    DataFrame when there is nothing to flag (or the series is flat/empty).
    """
    _, series = overall_trend_summary(df, date_col, value_col, freq=freq)
    if series is None or series.empty:
        return pd.DataFrame()

    vals = series.values.astype(float)
    std = vals.std()
    if std == 0:
        # Flat series: z-scores are undefined, nothing to report.
        return pd.DataFrame()

    z_scores = (vals - vals.mean()) / std
    flagged = np.abs(z_scores) >= z_threshold
    if not flagged.any():
        return pd.DataFrame()

    return pd.DataFrame(
        {
            "period": series.index[flagged].astype(str),
            "value": vals[flagged],
            "z_score": z_scores[flagged],
        }
    )