Spaces:

dkhangn
/

CS5130_finalProject

Sleeping

CS5130_finalProject / insights.py

Khang Nguyen

inital commit

aa893a9 about 2 months ago

7.25 kB

	import pandas as pd
	import numpy as np


	def top_bottom_groups(df, group_col, value_col, top_n=3):
	"""
	Group by `group_col`, sum `value_col`, and return NON-overlapping
	top and bottom groups.

	This is used for things like:
	- Top / bottom models by Estimated_Deliveries

	Returns a dict:
	{
	"top": DataFrame,
	"bottom": DataFrame
	}
	"""
	grouped = (
	df.groupby(group_col)[value_col]
	.sum()
	.reset_index()
	.rename(columns={value_col: f"total_{value_col}"})
	)

	if grouped.empty:
	return {"top": pd.DataFrame(), "bottom": pd.DataFrame()}

	# Sort descending for potential "top" list
	grouped_desc = grouped.sort_values(by=f"total_{value_col}", ascending=False)

	# We only want up to half of the unique groups in each list to avoid overlap
	max_pairs = max(1, len(grouped_desc) // 2)
	n = min(top_n, max_pairs)

	# Top n
	top = grouped_desc.head(n).reset_index(drop=True)

	# Bottom n from the remaining (no overlap with top)
	grouped_asc = grouped.sort_values(by=f"total_{value_col}", ascending=True)
	bottom = grouped_asc[~grouped_asc[group_col].isin(top[group_col])].head(n)
	bottom = bottom.reset_index(drop=True)

	return {"top": top, "bottom": bottom}


	def region_ranking(df, value_col="Estimated_Deliveries"):
	"""
	Rank regions by total value_col (for this project: Estimated_Deliveries).

	Returns a DataFrame with columns like:
	Region, total_Estimated_Deliveries, rank
	"""
	if "Region" not in df.columns or value_col not in df.columns:
	return pd.DataFrame()

	grouped = (
	df.groupby("Region")[value_col]
	.sum()
	.reset_index()
	.rename(columns={value_col: f"total_{value_col}"})
	)

	if grouped.empty:
	return grouped

	grouped = grouped.sort_values(by=f"total_{value_col}", ascending=False)
	grouped["rank"] = range(1, len(grouped) + 1)
	return grouped.reset_index(drop=True)


	def model_production_vs_delivery(
	df,
	model_col="Model",
	deliveries_col="Estimated_Deliveries",
	prod_col="Production_Units",
	):
	"""
	Compare total production vs total estimated deliveries by model.

	Returns a DataFrame with:
	Model,
	total_estimated_deliveries,
	total_production_units,
	delivery_rate_percent,
	inventory_gap
	"""
	needed_cols = [model_col, deliveries_col, prod_col]
	for c in needed_cols:
	if c not in df.columns:
	return pd.DataFrame()

	tmp = df[[model_col, deliveries_col, prod_col]].copy()

	grouped = (
	tmp.groupby(model_col)[[deliveries_col, prod_col]]
	.sum()
	.reset_index()
	.rename(
	columns={
	deliveries_col: "total_estimated_deliveries",
	prod_col: "total_production_units",
	}
	)
	)

	if grouped.empty:
	return grouped

	# Delivery rate = deliveries / production * 100
	grouped["delivery_rate_percent"] = grouped.apply(
	lambda row: (row["total_estimated_deliveries"] / row["total_production_units"] * 100.0)
	if row["total_production_units"] != 0
	else None,
	axis=1,
	)

	# Inventory gap = produced but not (yet) delivered
	grouped["inventory_gap"] = (
	grouped["total_production_units"] - grouped["total_estimated_deliveries"]
	)

	# Round for nicer display
	grouped["delivery_rate_percent"] = grouped["delivery_rate_percent"].round(2)

	return grouped


	def overall_trend_summary(df, date_col, value_col, freq="Q"):
	"""
	Build a simple trend summary using time resampling.

	For this project we use it for:
	- Estimated_Deliveries over time (quarterly)

	Returns:
	summary_dict, quarterly_series

	The summary_dict is already written in human-friendly sentences so that
	utils.dict_to_text() will show something nice like:

	start: On 2015-03-31, estimated deliveries were 9,883,795.
	end: On 2025-12-31, estimated deliveries were 11,087,134.
	...

	quarterly_series is the resampled pandas Series (for debugging or extension).
	"""
	summary = {}

	if date_col not in df.columns or value_col not in df.columns:
	summary["info"] = "Trend summary unavailable — missing date or value column."
	return summary, pd.Series(dtype="float64")

	tmp = df[[date_col, value_col]].dropna().copy()
	if tmp.empty:
	summary["info"] = "Trend summary unavailable — no valid rows after dropping missing values."
	return summary, pd.Series(dtype="float64")

	tmp[date_col] = pd.to_datetime(tmp[date_col])
	tmp = tmp.sort_values(by=date_col)

	# Resample (e.g. quarterly) and sum
	series = tmp.set_index(date_col)[value_col].resample(freq).sum()

	if series.empty:
	summary["info"] = "Trend summary unavailable — no data after resampling."
	return summary, series

	start_period = series.index[0]
	end_period = series.index[-1]

	start_value = float(series.iloc[0])
	end_value = float(series.iloc[-1])

	absolute_change = end_value - start_value
	percent_change = (absolute_change / start_value * 100.0) if start_value != 0 else None

	best_period = series.idxmax()
	best_value = float(series.max())

	# These strings already contain date + number as you requested
	summary["start"] = (
	f"On {start_period.date()}, estimated deliveries were {start_value:,.0f}."
	)
	summary["end"] = (
	f"On {end_period.date()}, estimated deliveries were {end_value:,.0f}."
	)
	summary["change"] = (
	f"From {start_period.date()} to {end_period.date()}, deliveries changed by "
	f"{absolute_change:,.0f} units."
	)
	if percent_change is not None:
	summary["growth"] = (
	f"Overall growth between the first and last period is {percent_change:.2f}%."
	)
	else:
	summary["growth"] = "Overall growth percentage could not be computed (start value is 0)."

	summary["best_quarter"] = (
	f"The highest quarter in this dataset is {best_period.date()} "
	f"with {best_value:,.0f} estimated deliveries."
	)

	return summary, series


	# Optional: kept here in case you want to experiment later.
	# Not used by app.py in the current design.
	def simple_anomaly_detection(df, date_col, value_col, freq="Q", z_threshold=2.0):
	"""
	Simple anomaly detection based on z-scores of resampled values.

	NOT used in the current dashboard, but left here for potential extensions.
	"""
	summary, series = overall_trend_summary(df, date_col, value_col, freq=freq)

	if series is None or series.empty:
	return pd.DataFrame()

	values = series.values.astype(float)
	mean = values.mean()
	std = values.std()

	if std == 0:
	return pd.DataFrame()

	z_scores = (values - mean) / std
	mask = np.abs(z_scores) >= z_threshold

	if not mask.any():
	return pd.DataFrame()

	out = pd.DataFrame(
	{
	"period": series.index[mask].astype(str),
	"value": values[mask],
	"z_score": z_scores[mask],
	}
	)
	return out