import pandas as pd from great_tables import GT, html from utils.timing import timer @timer(include_params=False) def create_station_stats( pivoted: pd.DataFrame, station: str | float | int ) -> pd.DataFrame: """ Create statistics for a specific station from pivoted data. Args: pivoted: Pivoted DataFrame containing water quality measurements station: Station identifier Returns: DataFrame with statistics for various water quality parameters """ PARAMETERS = { "Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]), "Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]), "Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]), "Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]), "Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]), "pH": ("pH", ["Surface", "Bottom"]), } STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"} data = {"Station": station, "Statistic": list(STATS.keys())} for param_name, (param_code, positions) in PARAMETERS.items(): for position in positions: col_name = f"{param_name} {position}" if len(positions) > 1 else param_name data[col_name] = [ pivoted[stat][position][station, param_code] for stat in STATS.values() ] return pd.DataFrame(data) def create_summary_by_station_and_position( df: pd.DataFrame, exclude_analytes: list[str] | None = None ) -> pd.DataFrame: """ Create a summary statistics table from water quality measurements. Args: df (pd.DataFrame): Processed dataframe from get_data function Returns: pd.DataFrame: Summary statistics table with surface/bottom measurements """ if exclude_analytes is None: exclude_analytes = [] summary = ( df.query("Org_Analyte_Name not in @exclude_analytes") .groupby( ["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False )["Org_Result_Value"] .agg(["mean", "max", "min", "count"]) .round(2) ) pivoted = summary.reset_index().pivot_table( index=["Station_Number", "Org_Analyte_Name"], columns=["Sample_Position"], values=["mean", "max", "min", "count"], observed=False, ) stations = sorted(df["Station_Number"].unique()) return pd.concat( [create_station_stats(pivoted, station) for station in stations] ).set_index(["Station", "Statistic"]) @timer(include_params=False) def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame: """Create overall summary statistics for the dataset""" summary = ( df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"] .agg(["mean", "max", "min", "count"]) .round(2) .rename( columns={ "count": "Count", "mean": "Mean", "max": "Maximum", "min": "Minimum", } ) ) summary.index.name = None transposed = summary.T return transposed.rename( columns={ "Depth, Secchi Disk Depth": "Secchi Depth (feet)", "Dissolved Oxygen": "Dissolved Oxygen (mg/L)", "Salinity": "Salinity (ppt)", "Turbidity": "Turbidity (NTU)", "Temperature, Water": "Temperature (°C)", } ).loc[ :, [ "Secchi Depth (feet)", "Temperature (°C)", "Dissolved Oxygen (mg/L)", "Turbidity (NTU)", "Salinity (ppt)", "pH", ], ] @timer(include_params=False) def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame: """Create multi-index columns for the summary dataframe""" new_df = df.copy() new_df.columns = pd.MultiIndex.from_tuples( [ (col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1]) if col != "Secchi Depth (feet)" else ("", col) for col in df.columns ], names=["Analyte", "Position"], ) return new_df @timer(include_params=False) def create_overall_summary_table(df: pd.DataFrame) -> GT: df.index.name = "Statistic" df = df.reset_index() return ( GT(df, rowname_col="Statistic") .tab_header( title="Overall Water Quality", subtitle="Summary statistics for all data analyzed during study period", ) .fmt_number( columns=[ "Secchi Depth (feet)", "Temperature (°C)", "Dissolved Oxygen (mg/L)", ], decimals=1, ) .fmt_integer( columns=list(df.columns[1:]), rows=lambda x: x["Statistic"] == "Count", # type: ignore use_seps=True, ) .cols_label( **{ col: html(f"{col.rpartition(' ')[0]}
{col.rpartition(' ')[-1]}") if col != "pH" else html(f"{col}
 ") for col in df.columns[1:] } # type: ignore ) .cols_width(cases={col: "14%" for col in df.columns[1:]}) .opt_align_table_header(align="center") )