waterdb

Sleeping

github-actions[bot]

Add all files with LFS support

007b0fc over 1 year ago

5.32 kB

	import pandas as pd
	from great_tables import GT, html

	from utils.timing import timer


	@timer(include_params=False)
	def create_station_stats(
	pivoted: pd.DataFrame, station: str \| float \| int
	) -> pd.DataFrame:
	"""
	Create statistics for a specific station from pivoted data.

	Args:
	pivoted: Pivoted DataFrame containing water quality measurements
	station: Station identifier

	Returns:
	DataFrame with statistics for various water quality parameters
	"""
	PARAMETERS = {
	"Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]),
	"Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]),
	"Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]),
	"Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]),
	"Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]),
	"pH": ("pH", ["Surface", "Bottom"]),
	}
	STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"}
	data = {"Station": station, "Statistic": list(STATS.keys())}
	for param_name, (param_code, positions) in PARAMETERS.items():
	for position in positions:
	col_name = f"{param_name} {position}" if len(positions) > 1 else param_name
	data[col_name] = [
	pivoted[stat][position][station, param_code] for stat in STATS.values()
	]
	return pd.DataFrame(data)


	def create_summary_by_station_and_position(
	df: pd.DataFrame, exclude_analytes: list[str] \| None = None
	) -> pd.DataFrame:
	"""
	Create a summary statistics table from water quality measurements.

	Args:
	df (pd.DataFrame): Processed dataframe from get_data function

	Returns:
	pd.DataFrame: Summary statistics table with surface/bottom measurements
	"""
	if exclude_analytes is None:
	exclude_analytes = []

	summary = (
	df.query("Org_Analyte_Name not in @exclude_analytes")
	.groupby(
	["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False
	)["Org_Result_Value"]
	.agg(["mean", "max", "min", "count"])
	.round(2)
	)
	pivoted = summary.reset_index().pivot_table(
	index=["Station_Number", "Org_Analyte_Name"],
	columns=["Sample_Position"],
	values=["mean", "max", "min", "count"],
	observed=False,
	)
	stations = sorted(df["Station_Number"].unique())
	return pd.concat(
	[create_station_stats(pivoted, station) for station in stations]
	).set_index(["Station", "Statistic"])


	@timer(include_params=False)
	def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame:
	"""Create overall summary statistics for the dataset"""
	summary = (
	df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"]
	.agg(["mean", "max", "min", "count"])
	.round(2)
	.rename(
	columns={
	"count": "Count",
	"mean": "Mean",
	"max": "Maximum",
	"min": "Minimum",
	}
	)
	)
	summary.index.name = None
	transposed = summary.T
	return transposed.rename(
	columns={
	"Depth, Secchi Disk Depth": "Secchi Depth (feet)",
	"Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
	"Salinity": "Salinity (ppt)",
	"Turbidity": "Turbidity (NTU)",
	"Temperature, Water": "Temperature (°C)",
	}
	).loc[
	:,
	[
	"Secchi Depth (feet)",
	"Temperature (°C)",
	"Dissolved Oxygen (mg/L)",
	"Turbidity (NTU)",
	"Salinity (ppt)",
	"pH",
	],
	]


	@timer(include_params=False)
	def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame:
	"""Create multi-index columns for the summary dataframe"""
	new_df = df.copy()
	new_df.columns = pd.MultiIndex.from_tuples(
	[
	(col.rsplit(" ", 1)[0], col.rsplit(" ", 1)[1])
	if col != "Secchi Depth (feet)"
	else ("", col)
	for col in df.columns
	],
	names=["Analyte", "Position"],
	)
	return new_df


	@timer(include_params=False)
	def create_overall_summary_table(df: pd.DataFrame) -> GT:
	df.index.name = "Statistic"
	df = df.reset_index()

	return (
	GT(df, rowname_col="Statistic")
	.tab_header(
	title="Overall Water Quality",
	subtitle="Summary statistics for all data analyzed during study period",
	)
	.fmt_number(
	columns=[
	"Secchi Depth (feet)",
	"Temperature (°C)",
	"Dissolved Oxygen (mg/L)",
	],
	decimals=1,
	)
	.fmt_integer(
	columns=list(df.columns[1:]),
	rows=lambda x: x["Statistic"] == "Count", # type: ignore
	use_seps=True,
	)
	.cols_label(
	**{
	col: html(f"{col.rpartition(' ')[0]}<br>{col.rpartition(' ')[-1]}")
	if col != "pH"
	else html(f"{col}<br> ")
	for col in df.columns[1:]
	} # type: ignore
	)
	.cols_width(cases={col: "14%" for col in df.columns[1:]})
	.opt_align_table_header(align="center")
	)