waterdb

Sleeping

File size: 5,321 Bytes

007b0fc

import pandas as pd
from great_tables import GT, html

from utils.timing import timer


@timer(include_params=False)
def create_station_stats(
    pivoted: pd.DataFrame, station: str | float | int
) -> pd.DataFrame:
    """
    Create statistics for a specific station from pivoted data.

    Args:
        pivoted: Pivoted DataFrame containing water quality measurements
        station: Station identifier

    Returns:
        DataFrame with statistics for various water quality parameters
    """
    PARAMETERS = {
        "Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]),
        "Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]),
        "Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]),
        "Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]),
        "Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]),
        "pH": ("pH", ["Surface", "Bottom"]),
    }
    STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"}
    data = {"Station": station, "Statistic": list(STATS.keys())}
    for param_name, (param_code, positions) in PARAMETERS.items():
        for position in positions:
            col_name = f"{param_name} {position}" if len(positions) > 1 else param_name
            data[col_name] = [
                pivoted[stat][position][station, param_code] for stat in STATS.values()
            ]
    return pd.DataFrame(data)


def create_summary_by_station_and_position(
    df: pd.DataFrame, exclude_analytes: list[str] | None = None
) -> pd.DataFrame:
    """
    Create a summary statistics table from water quality measurements.

    Args:
        df (pd.DataFrame): Processed dataframe from get_data function

    Returns:
        pd.DataFrame: Summary statistics table with surface/bottom measurements
    """
    if exclude_analytes is None:
        exclude_analytes = []

    summary = (
        df.query("Org_Analyte_Name not in @exclude_analytes")
        .groupby(
            ["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False
        )["Org_Result_Value"]
        .agg(["mean", "max", "min", "count"])
        .round(2)
    )
    pivoted = summary.reset_index().pivot_table(
        index=["Station_Number", "Org_Analyte_Name"],
        columns=["Sample_Position"],
        values=["mean", "max", "min", "count"],
        observed=False,
    )
    stations = sorted(df["Station_Number"].unique())
    return pd.concat(
        [create_station_stats(pivoted, station) for station in stations]
    ).set_index(["Station", "Statistic"])


@timer(include_params=False)
def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Create overall summary statistics for the dataset"""
    summary = (
        df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"]
        .agg(["mean", "max", "min", "count"])
        .round(2)
        .rename(
            columns={
                "count": "Count",
                "mean": "Mean",
                "max": "Maximum",
                "min": "Minimum",
            }
        )
    )
    summary.index.name = None
    transposed = summary.T
    return transposed.rename(
        columns={
            "Depth, Secchi Disk Depth": "Secchi Depth (feet)",
            "Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
            "Salinity": "Salinity (ppt)",
            "Turbidity": "Turbidity (NTU)",
            "Temperature, Water": "Temperature (°C)",
        }
    ).loc[
        :,
        [
            "Secchi Depth (feet)",
            "Temperature (°C)",
            "Dissolved Oxygen (mg/L)",
            "Turbidity (NTU)",
            "Salinity (ppt)",
            "pH",
        ],
    ]


@timer(include_params=False)
def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Create multi-index columns for the summary dataframe"""
    new_df = df.copy()
    new_df.columns = pd.MultiIndex.from_tuples(
        [
            (col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1])
            if col != "Secchi Depth (feet)"
            else ("", col)
            for col in df.columns
        ],
        names=["Analyte", "Position"],
    )
    return new_df


@timer(include_params=False)
def create_overall_summary_table(df: pd.DataFrame) -> GT:
    df.index.name = "Statistic"
    df = df.reset_index()

    return (
        GT(df, rowname_col="Statistic")
        .tab_header(
            title="Overall Water Quality",
            subtitle="Summary statistics for all data analyzed during study period",
        )
        .fmt_number(
            columns=[
                "Secchi Depth (feet)",
                "Temperature (°C)",
                "Dissolved Oxygen (mg/L)",
            ],
            decimals=1,
        )
        .fmt_integer(
            columns=list(df.columns[1:]),
            rows=lambda x: x["Statistic"] == "Count",  # type: ignore
            use_seps=True,
        )
        .cols_label(
            **{
                col: html(f"{col.rpartition(' ')[0]}<br>{col.rpartition(' ')[-1]}")
                if col != "pH"
                else html(f"{col}<br>&nbsp;")
                for col in df.columns[1:]
            }  # type: ignore
        )
        .cols_width(cases={col: "14%" for col in df.columns[1:]})
        .opt_align_table_header(align="center")
    )