Spaces:

dkhangn
/

CS5130_finalProject

Sleeping

File size: 7,247 Bytes

aa893a9

import pandas as pd
import numpy as np


def top_bottom_groups(df, group_col, value_col, top_n=3):
    """
    Group by `group_col`, sum `value_col`, and return NON-overlapping
    top and bottom groups.

    This is used for things like:
        - Top / bottom models by Estimated_Deliveries

    Returns a dict:
        {
            "top":    DataFrame,
            "bottom": DataFrame
        }
    """
    grouped = (
        df.groupby(group_col)[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: f"total_{value_col}"})
    )

    if grouped.empty:
        return {"top": pd.DataFrame(), "bottom": pd.DataFrame()}

    # Sort descending for potential "top" list
    grouped_desc = grouped.sort_values(by=f"total_{value_col}", ascending=False)

    # We only want up to half of the unique groups in each list to avoid overlap
    max_pairs = max(1, len(grouped_desc) // 2)
    n = min(top_n, max_pairs)

    # Top n
    top = grouped_desc.head(n).reset_index(drop=True)

    # Bottom n from the remaining (no overlap with top)
    grouped_asc = grouped.sort_values(by=f"total_{value_col}", ascending=True)
    bottom = grouped_asc[~grouped_asc[group_col].isin(top[group_col])].head(n)
    bottom = bottom.reset_index(drop=True)

    return {"top": top, "bottom": bottom}


def region_ranking(df, value_col="Estimated_Deliveries"):
    """
    Rank regions by total value_col (for this project: Estimated_Deliveries).

    Returns a DataFrame with columns like:
        Region, total_Estimated_Deliveries, rank
    """
    if "Region" not in df.columns or value_col not in df.columns:
        return pd.DataFrame()

    grouped = (
        df.groupby("Region")[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: f"total_{value_col}"})
    )

    if grouped.empty:
        return grouped

    grouped = grouped.sort_values(by=f"total_{value_col}", ascending=False)
    grouped["rank"] = range(1, len(grouped) + 1)
    return grouped.reset_index(drop=True)


def model_production_vs_delivery(
    df,
    model_col="Model",
    deliveries_col="Estimated_Deliveries",
    prod_col="Production_Units",
):
    """
    Compare total production vs total estimated deliveries by model.

    Returns a DataFrame with:
        Model,
        total_estimated_deliveries,
        total_production_units,
        delivery_rate_percent,
        inventory_gap
    """
    needed_cols = [model_col, deliveries_col, prod_col]
    for c in needed_cols:
        if c not in df.columns:
            return pd.DataFrame()

    tmp = df[[model_col, deliveries_col, prod_col]].copy()

    grouped = (
        tmp.groupby(model_col)[[deliveries_col, prod_col]]
        .sum()
        .reset_index()
        .rename(
            columns={
                deliveries_col: "total_estimated_deliveries",
                prod_col: "total_production_units",
            }
        )
    )

    if grouped.empty:
        return grouped

    # Delivery rate = deliveries / production * 100
    grouped["delivery_rate_percent"] = grouped.apply(
        lambda row: (row["total_estimated_deliveries"] / row["total_production_units"] * 100.0)
        if row["total_production_units"] != 0
        else None,
        axis=1,
    )

    # Inventory gap = produced but not (yet) delivered
    grouped["inventory_gap"] = (
        grouped["total_production_units"] - grouped["total_estimated_deliveries"]
    )

    # Round for nicer display
    grouped["delivery_rate_percent"] = grouped["delivery_rate_percent"].round(2)

    return grouped


def overall_trend_summary(df, date_col, value_col, freq="Q"):
    """
    Build a simple trend summary using time resampling.

    For this project we use it for:
        - Estimated_Deliveries over time (quarterly)

    Returns:
        summary_dict, quarterly_series

    The summary_dict is already written in human-friendly sentences so that
    utils.dict_to_text() will show something nice like:

        start: On 2015-03-31, estimated deliveries were 9,883,795.
        end: On 2025-12-31, estimated deliveries were 11,087,134.
        ...

    quarterly_series is the resampled pandas Series (for debugging or extension).
    """
    summary = {}

    if date_col not in df.columns or value_col not in df.columns:
        summary["info"] = "Trend summary unavailable — missing date or value column."
        return summary, pd.Series(dtype="float64")

    tmp = df[[date_col, value_col]].dropna().copy()
    if tmp.empty:
        summary["info"] = "Trend summary unavailable — no valid rows after dropping missing values."
        return summary, pd.Series(dtype="float64")

    tmp[date_col] = pd.to_datetime(tmp[date_col])
    tmp = tmp.sort_values(by=date_col)

    # Resample (e.g. quarterly) and sum
    series = tmp.set_index(date_col)[value_col].resample(freq).sum()

    if series.empty:
        summary["info"] = "Trend summary unavailable — no data after resampling."
        return summary, series

    start_period = series.index[0]
    end_period = series.index[-1]

    start_value = float(series.iloc[0])
    end_value = float(series.iloc[-1])

    absolute_change = end_value - start_value
    percent_change = (absolute_change / start_value * 100.0) if start_value != 0 else None

    best_period = series.idxmax()
    best_value = float(series.max())

    # These strings already contain date + number as you requested
    summary["start"] = (
        f"On {start_period.date()}, estimated deliveries were {start_value:,.0f}."
    )
    summary["end"] = (
        f"On {end_period.date()}, estimated deliveries were {end_value:,.0f}."
    )
    summary["change"] = (
        f"From {start_period.date()} to {end_period.date()}, deliveries changed by "
        f"{absolute_change:,.0f} units."
    )
    if percent_change is not None:
        summary["growth"] = (
            f"Overall growth between the first and last period is {percent_change:.2f}%."
        )
    else:
        summary["growth"] = "Overall growth percentage could not be computed (start value is 0)."

    summary["best_quarter"] = (
        f"The highest quarter in this dataset is {best_period.date()} "
        f"with {best_value:,.0f} estimated deliveries."
    )

    return summary, series


# Optional: kept here in case you want to experiment later.
# Not used by app.py in the current design.
def simple_anomaly_detection(df, date_col, value_col, freq="Q", z_threshold=2.0):
    """
    Simple anomaly detection based on z-scores of resampled values.

    NOT used in the current dashboard, but left here for potential extensions.
    """
    summary, series = overall_trend_summary(df, date_col, value_col, freq=freq)

    if series is None or series.empty:
        return pd.DataFrame()

    values = series.values.astype(float)
    mean = values.mean()
    std = values.std()

    if std == 0:
        return pd.DataFrame()

    z_scores = (values - mean) / std
    mask = np.abs(z_scores) >= z_threshold

    if not mask.any():
        return pd.DataFrame()

    out = pd.DataFrame(
        {
            "period": series.index[mask].astype(str),
            "value": values[mask],
            "z_score": z_scores[mask],
        }
    )
    return out