| import pandas as pd |
| from great_tables import GT, html |
|
|
| from utils.timing import timer |
|
|
|
|
| @timer(include_params=False) |
| def create_station_stats( |
| pivoted: pd.DataFrame, station: str | float | int |
| ) -> pd.DataFrame: |
| """ |
| Create statistics for a specific station from pivoted data. |
| |
| Args: |
| pivoted: Pivoted DataFrame containing water quality measurements |
| station: Station identifier |
| |
| Returns: |
| DataFrame with statistics for various water quality parameters |
| """ |
| PARAMETERS = { |
| "Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]), |
| "Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]), |
| "Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]), |
| "Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]), |
| "Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]), |
| "pH": ("pH", ["Surface", "Bottom"]), |
| } |
| STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"} |
| data = {"Station": station, "Statistic": list(STATS.keys())} |
| for param_name, (param_code, positions) in PARAMETERS.items(): |
| for position in positions: |
| col_name = f"{param_name} {position}" if len(positions) > 1 else param_name |
| data[col_name] = [ |
| pivoted[stat][position][station, param_code] for stat in STATS.values() |
| ] |
| return pd.DataFrame(data) |
|
|
|
|
| def create_summary_by_station_and_position( |
| df: pd.DataFrame, exclude_analytes: list[str] | None = None |
| ) -> pd.DataFrame: |
| """ |
| Create a summary statistics table from water quality measurements. |
| |
| Args: |
| df (pd.DataFrame): Processed dataframe from get_data function |
| |
| Returns: |
| pd.DataFrame: Summary statistics table with surface/bottom measurements |
| """ |
| if exclude_analytes is None: |
| exclude_analytes = [] |
|
|
| summary = ( |
| df.query("Org_Analyte_Name not in @exclude_analytes") |
| .groupby( |
| ["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False |
| )["Org_Result_Value"] |
| .agg(["mean", "max", "min", "count"]) |
| .round(2) |
| ) |
| pivoted = summary.reset_index().pivot_table( |
| index=["Station_Number", "Org_Analyte_Name"], |
| columns=["Sample_Position"], |
| values=["mean", "max", "min", "count"], |
| observed=False, |
| ) |
| stations = sorted(df["Station_Number"].unique()) |
| return pd.concat( |
| [create_station_stats(pivoted, station) for station in stations] |
| ).set_index(["Station", "Statistic"]) |
|
|
|
|
| @timer(include_params=False) |
| def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame: |
| """Create overall summary statistics for the dataset""" |
| summary = ( |
| df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"] |
| .agg(["mean", "max", "min", "count"]) |
| .round(2) |
| .rename( |
| columns={ |
| "count": "Count", |
| "mean": "Mean", |
| "max": "Maximum", |
| "min": "Minimum", |
| } |
| ) |
| ) |
| summary.index.name = None |
| transposed = summary.T |
| return transposed.rename( |
| columns={ |
| "Depth, Secchi Disk Depth": "Secchi Depth (feet)", |
| "Dissolved Oxygen": "Dissolved Oxygen (mg/L)", |
| "Salinity": "Salinity (ppt)", |
| "Turbidity": "Turbidity (NTU)", |
| "Temperature, Water": "Temperature (°C)", |
| } |
| ).loc[ |
| :, |
| [ |
| "Secchi Depth (feet)", |
| "Temperature (°C)", |
| "Dissolved Oxygen (mg/L)", |
| "Turbidity (NTU)", |
| "Salinity (ppt)", |
| "pH", |
| ], |
| ] |
|
|
|
|
| @timer(include_params=False) |
| def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame: |
| """Create multi-index columns for the summary dataframe""" |
| new_df = df.copy() |
| new_df.columns = pd.MultiIndex.from_tuples( |
| [ |
| (col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1]) |
| if col != "Secchi Depth (feet)" |
| else ("", col) |
| for col in df.columns |
| ], |
| names=["Analyte", "Position"], |
| ) |
| return new_df |
|
|
|
|
| @timer(include_params=False) |
| def create_overall_summary_table(df: pd.DataFrame) -> GT: |
| df.index.name = "Statistic" |
| df = df.reset_index() |
|
|
| return ( |
| GT(df, rowname_col="Statistic") |
| .tab_header( |
| title="Overall Water Quality", |
| subtitle="Summary statistics for all data analyzed during study period", |
| ) |
| .fmt_number( |
| columns=[ |
| "Secchi Depth (feet)", |
| "Temperature (°C)", |
| "Dissolved Oxygen (mg/L)", |
| ], |
| decimals=1, |
| ) |
| .fmt_integer( |
| columns=list(df.columns[1:]), |
| rows=lambda x: x["Statistic"] == "Count", |
| use_seps=True, |
| ) |
| .cols_label( |
| **{ |
| col: html(f"{col.rpartition(' ')[0]}<br>{col.rpartition(' ')[-1]}") |
| if col != "pH" |
| else html(f"{col}<br> ") |
| for col in df.columns[1:] |
| } |
| ) |
| .cols_width(cases={col: "14%" for col in df.columns[1:]}) |
| .opt_align_table_header(align="center") |
| ) |
|
|