Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from great_tables import GT, html | |
| from utils.timing import timer | |
| def create_station_stats( | |
| pivoted: pd.DataFrame, station: str | float | int | |
| ) -> pd.DataFrame: | |
| """ | |
| Create statistics for a specific station from pivoted data. | |
| Args: | |
| pivoted: Pivoted DataFrame containing water quality measurements | |
| station: Station identifier | |
| Returns: | |
| DataFrame with statistics for various water quality parameters | |
| """ | |
| PARAMETERS = { | |
| "Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]), | |
| "Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]), | |
| "Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]), | |
| "Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]), | |
| "Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]), | |
| "pH": ("pH", ["Surface", "Bottom"]), | |
| } | |
| STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"} | |
| data = {"Station": station, "Statistic": list(STATS.keys())} | |
| for param_name, (param_code, positions) in PARAMETERS.items(): | |
| for position in positions: | |
| col_name = f"{param_name} {position}" if len(positions) > 1 else param_name | |
| data[col_name] = [ | |
| pivoted[stat][position][station, param_code] for stat in STATS.values() | |
| ] | |
| return pd.DataFrame(data) | |
| def create_summary_by_station_and_position( | |
| df: pd.DataFrame, exclude_analytes: list[str] | None = None | |
| ) -> pd.DataFrame: | |
| """ | |
| Create a summary statistics table from water quality measurements. | |
| Args: | |
| df (pd.DataFrame): Processed dataframe from get_data function | |
| Returns: | |
| pd.DataFrame: Summary statistics table with surface/bottom measurements | |
| """ | |
| if exclude_analytes is None: | |
| exclude_analytes = [] | |
| summary = ( | |
| df.query("Org_Analyte_Name not in @exclude_analytes") | |
| .groupby( | |
| ["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False | |
| )["Org_Result_Value"] | |
| .agg(["mean", "max", "min", "count"]) | |
| .round(2) | |
| ) | |
| pivoted = summary.reset_index().pivot_table( | |
| index=["Station_Number", "Org_Analyte_Name"], | |
| columns=["Sample_Position"], | |
| values=["mean", "max", "min", "count"], | |
| observed=False, | |
| ) | |
| stations = sorted(df["Station_Number"].unique()) | |
| return pd.concat( | |
| [create_station_stats(pivoted, station) for station in stations] | |
| ).set_index(["Station", "Statistic"]) | |
| def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame: | |
| """Create overall summary statistics for the dataset""" | |
| summary = ( | |
| df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"] | |
| .agg(["mean", "max", "min", "count"]) | |
| .round(2) | |
| .rename( | |
| columns={ | |
| "count": "Count", | |
| "mean": "Mean", | |
| "max": "Maximum", | |
| "min": "Minimum", | |
| } | |
| ) | |
| ) | |
| summary.index.name = None | |
| transposed = summary.T | |
| return transposed.rename( | |
| columns={ | |
| "Depth, Secchi Disk Depth": "Secchi Depth (feet)", | |
| "Dissolved Oxygen": "Dissolved Oxygen (mg/L)", | |
| "Salinity": "Salinity (ppt)", | |
| "Turbidity": "Turbidity (NTU)", | |
| "Temperature, Water": "Temperature (°C)", | |
| } | |
| ).loc[ | |
| :, | |
| [ | |
| "Secchi Depth (feet)", | |
| "Temperature (°C)", | |
| "Dissolved Oxygen (mg/L)", | |
| "Turbidity (NTU)", | |
| "Salinity (ppt)", | |
| "pH", | |
| ], | |
| ] | |
| def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame: | |
| """Create multi-index columns for the summary dataframe""" | |
| new_df = df.copy() | |
| new_df.columns = pd.MultiIndex.from_tuples( | |
| [ | |
| (col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1]) | |
| if col != "Secchi Depth (feet)" | |
| else ("", col) | |
| for col in df.columns | |
| ], | |
| names=["Analyte", "Position"], | |
| ) | |
| return new_df | |
| def create_overall_summary_table(df: pd.DataFrame) -> GT: | |
| df.index.name = "Statistic" | |
| df = df.reset_index() | |
| return ( | |
| GT(df, rowname_col="Statistic") | |
| .tab_header( | |
| title="Overall Water Quality", | |
| subtitle="Summary statistics for all data analyzed during study period", | |
| ) | |
| .fmt_number( | |
| columns=[ | |
| "Secchi Depth (feet)", | |
| "Temperature (°C)", | |
| "Dissolved Oxygen (mg/L)", | |
| ], | |
| decimals=1, | |
| ) | |
| .fmt_integer( | |
| columns=list(df.columns[1:]), | |
| rows=lambda x: x["Statistic"] == "Count", # type: ignore | |
| use_seps=True, | |
| ) | |
| .cols_label( | |
| **{ | |
| col: html(f"{col.rpartition(' ')[0]}<br>{col.rpartition(' ')[-1]}") | |
| if col != "pH" | |
| else html(f"{col}<br> ") | |
| for col in df.columns[1:] | |
| } # type: ignore | |
| ) | |
| .cols_width(cases={col: "14%" for col in df.columns[1:]}) | |
| .opt_align_table_header(align="center") | |
| ) | |