import pandas as pd
from great_tables import GT, html
from utils.timing import timer
@timer(include_params=False)
def create_station_stats(
pivoted: pd.DataFrame, station: str | float | int
) -> pd.DataFrame:
"""
Create statistics for a specific station from pivoted data.
Args:
pivoted: Pivoted DataFrame containing water quality measurements
station: Station identifier
Returns:
DataFrame with statistics for various water quality parameters
"""
PARAMETERS = {
"Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]),
"Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]),
"Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]),
"Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]),
"Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]),
"pH": ("pH", ["Surface", "Bottom"]),
}
STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"}
data = {"Station": station, "Statistic": list(STATS.keys())}
for param_name, (param_code, positions) in PARAMETERS.items():
for position in positions:
col_name = f"{param_name} {position}" if len(positions) > 1 else param_name
data[col_name] = [
pivoted[stat][position][station, param_code] for stat in STATS.values()
]
return pd.DataFrame(data)
def create_summary_by_station_and_position(
df: pd.DataFrame, exclude_analytes: list[str] | None = None
) -> pd.DataFrame:
"""
Create a summary statistics table from water quality measurements.
Args:
df (pd.DataFrame): Processed dataframe from get_data function
Returns:
pd.DataFrame: Summary statistics table with surface/bottom measurements
"""
if exclude_analytes is None:
exclude_analytes = []
summary = (
df.query("Org_Analyte_Name not in @exclude_analytes")
.groupby(
["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False
)["Org_Result_Value"]
.agg(["mean", "max", "min", "count"])
.round(2)
)
pivoted = summary.reset_index().pivot_table(
index=["Station_Number", "Org_Analyte_Name"],
columns=["Sample_Position"],
values=["mean", "max", "min", "count"],
observed=False,
)
stations = sorted(df["Station_Number"].unique())
return pd.concat(
[create_station_stats(pivoted, station) for station in stations]
).set_index(["Station", "Statistic"])
@timer(include_params=False)
def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Create overall summary statistics for the dataset"""
summary = (
df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"]
.agg(["mean", "max", "min", "count"])
.round(2)
.rename(
columns={
"count": "Count",
"mean": "Mean",
"max": "Maximum",
"min": "Minimum",
}
)
)
summary.index.name = None
transposed = summary.T
return transposed.rename(
columns={
"Depth, Secchi Disk Depth": "Secchi Depth (feet)",
"Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
"Salinity": "Salinity (ppt)",
"Turbidity": "Turbidity (NTU)",
"Temperature, Water": "Temperature (°C)",
}
).loc[
:,
[
"Secchi Depth (feet)",
"Temperature (°C)",
"Dissolved Oxygen (mg/L)",
"Turbidity (NTU)",
"Salinity (ppt)",
"pH",
],
]
@timer(include_params=False)
def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame:
"""Create multi-index columns for the summary dataframe"""
new_df = df.copy()
new_df.columns = pd.MultiIndex.from_tuples(
[
(col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1])
if col != "Secchi Depth (feet)"
else ("", col)
for col in df.columns
],
names=["Analyte", "Position"],
)
return new_df
@timer(include_params=False)
def create_overall_summary_table(df: pd.DataFrame) -> GT:
df.index.name = "Statistic"
df = df.reset_index()
return (
GT(df, rowname_col="Statistic")
.tab_header(
title="Overall Water Quality",
subtitle="Summary statistics for all data analyzed during study period",
)
.fmt_number(
columns=[
"Secchi Depth (feet)",
"Temperature (°C)",
"Dissolved Oxygen (mg/L)",
],
decimals=1,
)
.fmt_integer(
columns=list(df.columns[1:]),
rows=lambda x: x["Statistic"] == "Count", # type: ignore
use_seps=True,
)
.cols_label(
**{
col: html(f"{col.rpartition(' ')[0]}
{col.rpartition(' ')[-1]}")
if col != "pH"
else html(f"{col}
")
for col in df.columns[1:]
} # type: ignore
)
.cols_width(cases={col: "14%" for col in df.columns[1:]})
.opt_align_table_header(align="center")
)