CS5130_finalProject / insights.py
Khang Nguyen
inital commit
aa893a9
import pandas as pd
import numpy as np
def top_bottom_groups(df, group_col, value_col, top_n=3):
"""
Group by `group_col`, sum `value_col`, and return NON-overlapping
top and bottom groups.
This is used for things like:
- Top / bottom models by Estimated_Deliveries
Returns a dict:
{
"top": DataFrame,
"bottom": DataFrame
}
"""
grouped = (
df.groupby(group_col)[value_col]
.sum()
.reset_index()
.rename(columns={value_col: f"total_{value_col}"})
)
if grouped.empty:
return {"top": pd.DataFrame(), "bottom": pd.DataFrame()}
# Sort descending for potential "top" list
grouped_desc = grouped.sort_values(by=f"total_{value_col}", ascending=False)
# We only want up to half of the unique groups in each list to avoid overlap
max_pairs = max(1, len(grouped_desc) // 2)
n = min(top_n, max_pairs)
# Top n
top = grouped_desc.head(n).reset_index(drop=True)
# Bottom n from the remaining (no overlap with top)
grouped_asc = grouped.sort_values(by=f"total_{value_col}", ascending=True)
bottom = grouped_asc[~grouped_asc[group_col].isin(top[group_col])].head(n)
bottom = bottom.reset_index(drop=True)
return {"top": top, "bottom": bottom}
def region_ranking(df, value_col="Estimated_Deliveries"):
"""
Rank regions by total value_col (for this project: Estimated_Deliveries).
Returns a DataFrame with columns like:
Region, total_Estimated_Deliveries, rank
"""
if "Region" not in df.columns or value_col not in df.columns:
return pd.DataFrame()
grouped = (
df.groupby("Region")[value_col]
.sum()
.reset_index()
.rename(columns={value_col: f"total_{value_col}"})
)
if grouped.empty:
return grouped
grouped = grouped.sort_values(by=f"total_{value_col}", ascending=False)
grouped["rank"] = range(1, len(grouped) + 1)
return grouped.reset_index(drop=True)
def model_production_vs_delivery(
df,
model_col="Model",
deliveries_col="Estimated_Deliveries",
prod_col="Production_Units",
):
"""
Compare total production vs total estimated deliveries by model.
Returns a DataFrame with:
Model,
total_estimated_deliveries,
total_production_units,
delivery_rate_percent,
inventory_gap
"""
needed_cols = [model_col, deliveries_col, prod_col]
for c in needed_cols:
if c not in df.columns:
return pd.DataFrame()
tmp = df[[model_col, deliveries_col, prod_col]].copy()
grouped = (
tmp.groupby(model_col)[[deliveries_col, prod_col]]
.sum()
.reset_index()
.rename(
columns={
deliveries_col: "total_estimated_deliveries",
prod_col: "total_production_units",
}
)
)
if grouped.empty:
return grouped
# Delivery rate = deliveries / production * 100
grouped["delivery_rate_percent"] = grouped.apply(
lambda row: (row["total_estimated_deliveries"] / row["total_production_units"] * 100.0)
if row["total_production_units"] != 0
else None,
axis=1,
)
# Inventory gap = produced but not (yet) delivered
grouped["inventory_gap"] = (
grouped["total_production_units"] - grouped["total_estimated_deliveries"]
)
# Round for nicer display
grouped["delivery_rate_percent"] = grouped["delivery_rate_percent"].round(2)
return grouped
def overall_trend_summary(df, date_col, value_col, freq="Q"):
"""
Build a simple trend summary using time resampling.
For this project we use it for:
- Estimated_Deliveries over time (quarterly)
Returns:
summary_dict, quarterly_series
The summary_dict is already written in human-friendly sentences so that
utils.dict_to_text() will show something nice like:
start: On 2015-03-31, estimated deliveries were 9,883,795.
end: On 2025-12-31, estimated deliveries were 11,087,134.
...
quarterly_series is the resampled pandas Series (for debugging or extension).
"""
summary = {}
if date_col not in df.columns or value_col not in df.columns:
summary["info"] = "Trend summary unavailable β€” missing date or value column."
return summary, pd.Series(dtype="float64")
tmp = df[[date_col, value_col]].dropna().copy()
if tmp.empty:
summary["info"] = "Trend summary unavailable β€” no valid rows after dropping missing values."
return summary, pd.Series(dtype="float64")
tmp[date_col] = pd.to_datetime(tmp[date_col])
tmp = tmp.sort_values(by=date_col)
# Resample (e.g. quarterly) and sum
series = tmp.set_index(date_col)[value_col].resample(freq).sum()
if series.empty:
summary["info"] = "Trend summary unavailable β€” no data after resampling."
return summary, series
start_period = series.index[0]
end_period = series.index[-1]
start_value = float(series.iloc[0])
end_value = float(series.iloc[-1])
absolute_change = end_value - start_value
percent_change = (absolute_change / start_value * 100.0) if start_value != 0 else None
best_period = series.idxmax()
best_value = float(series.max())
# These strings already contain date + number as you requested
summary["start"] = (
f"On {start_period.date()}, estimated deliveries were {start_value:,.0f}."
)
summary["end"] = (
f"On {end_period.date()}, estimated deliveries were {end_value:,.0f}."
)
summary["change"] = (
f"From {start_period.date()} to {end_period.date()}, deliveries changed by "
f"{absolute_change:,.0f} units."
)
if percent_change is not None:
summary["growth"] = (
f"Overall growth between the first and last period is {percent_change:.2f}%."
)
else:
summary["growth"] = "Overall growth percentage could not be computed (start value is 0)."
summary["best_quarter"] = (
f"The highest quarter in this dataset is {best_period.date()} "
f"with {best_value:,.0f} estimated deliveries."
)
return summary, series
# Optional: kept here in case you want to experiment later.
# Not used by app.py in the current design.
def simple_anomaly_detection(df, date_col, value_col, freq="Q", z_threshold=2.0):
"""
Simple anomaly detection based on z-scores of resampled values.
NOT used in the current dashboard, but left here for potential extensions.
"""
summary, series = overall_trend_summary(df, date_col, value_col, freq=freq)
if series is None or series.empty:
return pd.DataFrame()
values = series.values.astype(float)
mean = values.mean()
std = values.std()
if std == 0:
return pd.DataFrame()
z_scores = (values - mean) / std
mask = np.abs(z_scores) >= z_threshold
if not mask.any():
return pd.DataFrame()
out = pd.DataFrame(
{
"period": series.index[mask].astype(str),
"value": values[mask],
"z_score": z_scores[mask],
}
)
return out