"""Plotting helpers for water-quality analyte data (matplotlib, Altair, Plotly)."""

from typing import Literal

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from matplotlib.figure import Figure

from utils.data_loading import timer

# Style-guide greys used by the matplotlib figures.
_GREY30 = "#4d4d4d"  # Dark grey for titles
_GREY40 = "#666666"  # Medium grey for axes and labels

# Short display names used on the correlation heatmap.
_ANALYTE_ABBREVIATIONS = {
    "Depth, Secchi Disk Depth": "Secchi Depth",
    "Dissolved Oxygen": "DO",
    "Fecal Coliform (MPN)": "Fecal Coliform",
    "Total Nitrogen": "TN",
    "Total Phosphorus": "TP",
}

# Reference values drawn on the plots; shared so every backend agrees.
_REDFIELD_RATIO = 16
_DO_THRESHOLD_MG_L = 4.8
_DO_THRESHOLD_COLOR = "#FF8C00"

_NUTRIENT_ANALYTES = ["Total Nitrogen", "Total Phosphorus"]
_DO_TEMP_ANALYTES = ["Dissolved Oxygen", "Temperature, Water"]


def _pivot_analytes(
    df: pd.DataFrame, analytes: list[str], index: list[str]
) -> pd.DataFrame:
    """Return a wide frame with one column per analyte, keyed by ``index``."""
    return (
        df[df["Org_Analyte_Name"].isin(analytes)]
        .pivot_table(
            index=index,
            columns="Org_Analyte_Name",
            values="Org_Result_Value",
            observed=True,
        )
        .reset_index()
    )


def _np_ratio_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return nitrogen/phosphorus per timestamp and sector with an "N:P Ratio" column."""
    nutrients_df = _pivot_analytes(
        df, _NUTRIENT_ANALYTES, ["Activity_Start_Date_Time", "Sector"]
    )
    # If one nutrient was never measured the pivot has no column for it;
    # add it as all-NaN so the ratio is NaN instead of raising KeyError.
    for analyte in _NUTRIENT_ANALYTES:
        if analyte not in nutrients_df.columns:
            nutrients_df[analyte] = np.nan

    ratio = nutrients_df["Total Nitrogen"] / nutrients_df["Total Phosphorus"]
    # A zero phosphorus value yields +/-inf, which dropna() does not remove
    # and which cannot be binned by the histogram; treat it as missing.
    nutrients_df["N:P Ratio"] = ratio.replace([np.inf, -np.inf], np.nan)
    return nutrients_df


def _do_temp_frame(df: pd.DataFrame, index: list[str]) -> pd.DataFrame:
    """Return rows that have both a dissolved-oxygen and a water-temperature value."""
    return _pivot_analytes(df, _DO_TEMP_ANALYTES, index).dropna(
        subset=_DO_TEMP_ANALYTES
    )


@st.cache_data
@timer(include_params=True)
def plot_parameter_correlations(
    df: pd.DataFrame,
    analyte_names: list[str],
    subset_by: str,
    subset: str,
    filter_by: str,
    threshold: float = 0.2,
    corr_method: Literal["pearson", "kendall", "spearman"] = "kendall",
) -> tuple[Figure, pd.DataFrame]:
    """
    Create a correlation heatmap of water quality parameters.

    Parameters
    ----------
    df : pd.DataFrame
        Water quality measurements. Columns read by this function:
        - Org_Analyte_Name: Name of the analyte
        - Org_Result_Value: Measurement value
        - Activity_Start_Date_Time: Timestamp of measurement
        - Reporting_Year: Year of measurement
    analyte_names : list[str]
        Analyte names to include in the correlation analysis.
    subset_by : str
        Label of the subsetting dimension (e.g., "Sector"); only used in
        the title. The data is not filtered here.
    subset : str
        Value of the subset (e.g., a sector name); only used in the title.
    filter_by : str
        Sample position label ("Surface", "Bottom", or "All"); only used in
        the subtitle, where "All" is shown as "Surface and Bottom".
    threshold : float, default=0.2
        Minimum data completeness (0-1). Completeness is the share of
        sampling timestamps at which a parameter has a value. Parameters
        below the threshold are left out of the correlation and listed in
        the footnote.
    corr_method : {"pearson", "kendall", "spearman"}, default="kendall"
        Correlation method passed to ``DataFrame.corr``.

    Returns
    -------
    tuple[Figure, pd.DataFrame]
        - Figure with the title, the lower-triangle heatmap and the
          footnote of excluded parameters.
        - Pivot table (timestamps x abbreviated parameter names) restricted
          to the parameters that met the threshold.

    Notes
    -----
    - Parameter names are abbreviated for display (e.g., "DO").
    - The upper triangle and the diagonal of the matrix are masked.
    - The sample size in the subtitle is ``len(df)``, i.e. every row of the
      input, not only the requested analytes.
    - Results are cached through ``st.cache_data``.
    """
    requested = df[df["Org_Analyte_Name"].isin(analyte_names)]
    pivot_df = requested.pivot_table(
        index="Activity_Start_Date_Time",
        columns="Org_Analyte_Name",
        values="Org_Result_Value",
        observed=False,
    ).rename(columns=_ANALYTE_ABBREVIATIONS)

    # Completeness: fraction of timestamps with a value for each parameter
    completeness = pivot_df.notna().mean()
    valid_params = completeness[completeness >= threshold].index
    excluded_params = completeness[completeness < threshold]

    # Keep only the parameters that meet the threshold
    pivot_df = pivot_df[valid_params]
    corr = pivot_df.corr(method=corr_method)
    n_samples = len(df)

    fig = plt.figure(figsize=(3, 3.5))
    gs = fig.add_gridspec(
        3,
        1,
        height_ratios=[
            1,  # Title space
            4,  # Heatmap
            1.5,  # Footnote
        ],
        hspace=0.4,
    )
    title_ax = fig.add_subplot(gs[0])
    heatmap_ax = fig.add_subplot(gs[1])
    footnote_ax = fig.add_subplot(gs[2])

    # Heatmap of the lower triangle only
    mask = np.triu(np.ones_like(corr, dtype=bool))
    heatmap = sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        cmap="RdBu_r",
        center=0,
        vmin=-1,
        vmax=1,
        ax=heatmap_ax,
        yticklabels=1,
        cbar=True,
        xticklabels=1,
        annot_kws={"size": 5},
        fmt=".2f",
    )

    for spine in heatmap_ax.spines.values():
        spine.set_visible(False)

    # Rotate x-axis labels and anchor them at their right end
    heatmap_ax.set_xticklabels(
        heatmap_ax.get_xticklabels(),
        rotation=45,
        ha="right",
        rotation_mode="anchor",
        color=_GREY40,
    )

    # Remove tick marks but keep labels
    for axis in ("x", "y"):
        heatmap_ax.tick_params(
            axis=axis,
            pad=5,
            labelsize=5,
            length=0,
            colors=_GREY40,
        )

    # Pin the colorbar ticks before relabelling them; setting labels on
    # unpinned ticks triggers a matplotlib warning.
    colorbar = heatmap.figure.axes[-1]  # type: ignore
    ticks = colorbar.get_yticks()
    colorbar.set_yticks(ticks)
    colorbar.set_yticklabels(
        [f"{x:>8.2f}" for x in ticks],
        size=5,
        color=_GREY40,
    )

    # Horizontal y-axis labels, no axis titles
    heatmap_ax.set_yticklabels(
        heatmap_ax.get_yticklabels(),
        rotation=0,
        color=_GREY40,
    )
    heatmap_ax.set_xlabel("")
    heatmap_ax.set_ylabel("")

    # The footnote and title axes are blank canvases for text only
    for text_ax in (footnote_ax, title_ax):
        text_ax.set_frame_on(False)
        text_ax.set_xticks([])
        text_ax.set_yticks([])

    if not excluded_params.empty:
        footnote_lines = [
            f"Excluded parameters (<{threshold:.0%} data completeness):"
        ]
        footnote_lines.extend(
            f" - {param}: {completeness_val:.1%} complete"
            for param, completeness_val in excluded_params.items()
        )
        footnote_ax.text(
            0.01,
            0.40,
            "\n".join(footnote_lines),
            ha="left",
            va="center",
            fontsize=5,
            fontstyle="italic",
            transform=footnote_ax.transAxes,
            color=_GREY40,
            bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"),
        )

    display_filter = "Surface and Bottom" if filter_by == "All" else filter_by

    # Name the year only when the data covers exactly one reporting year
    year_info = (
        f"Reporting Year {df['Reporting_Year'].iloc[0]}"
        if len(df["Reporting_Year"].unique()) == 1
        else "All Years"
    )

    # Titles are positioned in figure coordinates, not title-axes coordinates
    title_ax.text(
        0.45,
        0.8,
        f"{subset_by}: {subset}",
        ha="center",
        va="center",
        fontsize=8,
        fontweight="bold",
        transform=fig.transFigure,
        color=_GREY30,
    )
    title_ax.text(
        0.45,
        0.75,
        f"{display_filter}, {year_info} (n={n_samples:,})",
        ha="center",
        va="bottom",
        fontsize=6,
        fontstyle="italic",
        transform=fig.transFigure,
        color=_GREY40,
    )

    # Explicit margins instead of tight_layout; the render pass is kept
    # from the original layout code.
    fig.canvas.draw()
    fig.subplots_adjust(left=0.15, right=0.95, bottom=0.02, top=0.85, hspace=0.4)

    return fig, pivot_df


def plot_np_ratios(df: pd.DataFrame) -> Figure:
    """
    Create a visualization of N:P ratios over time and their distribution.

    Parameters
    ----------
    df : pd.DataFrame
        Nutrient measurements with columns Org_Analyte_Name,
        Org_Result_Value, Activity_Start_Date_Time and Sector.

    Returns
    -------
    Figure
        Matplotlib figure with a time-series scatter (top) and a histogram
        (bottom), each marked with the 16:1 Redfield ratio. Ratios that are
        undefined or infinite (zero phosphorus) are treated as missing.
    """
    nutrients_df = _np_ratio_frame(df)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Time series, coloured by sector
    sns.scatterplot(
        data=nutrients_df,
        x="Activity_Start_Date_Time",
        y="N:P Ratio",
        hue="Sector",
        ax=ax1,
        alpha=0.6,
        s=20,  # Smaller point size for better clarity
    )
    ax1.axhline(
        y=_REDFIELD_RATIO,
        color="red",
        linestyle="--",
        alpha=0.7,
        linewidth=1.5,
        label="Redfield Ratio (16:1)",
    )

    # Only the bottom spine is shown; ticks are hidden but labels kept
    for ax in [ax1, ax2]:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["bottom"].set_color(_GREY40)
        ax.spines["bottom"].set_linewidth(0.5)
        ax.spines["left"].set_visible(False)
        ax.tick_params(axis="both", which="both", length=0, colors=_GREY40)
        ax.grid(True, axis="y", alpha=0.15, linestyle="-", color="gray")

    ax1.set_ylabel("N:P Ratio", color=_GREY40, fontsize=10)
    ax1.set_xlabel("Date", color=_GREY40, fontsize=10)
    ax1.set_title("N:P Ratio Over Time", color=_GREY30, fontsize=10, pad=10)

    # Legend sits outside the axes, to the right
    legend = ax1.legend(
        bbox_to_anchor=(1.05, 1),
        loc="upper left",
        frameon=False,
        title="Sector",
    )
    legend.get_title().set_color(_GREY40)
    for text in legend.get_texts():
        text.set_color(_GREY40)

    # Distribution of the ratios
    valid_ratios = nutrients_df["N:P Ratio"].dropna()
    sns.histplot(
        x=valid_ratios,
        ax=ax2,
        alpha=0.6,
    )
    ax2.axvline(
        x=_REDFIELD_RATIO,
        color="red",
        linestyle="--",
        alpha=0.7,
        linewidth=1.5,
        label="Redfield Ratio (16:1)",
    )
    ax2.set_xlabel("N:P Ratio", color=_GREY40, fontsize=10)
    ax2.set_ylabel("Count", color=_GREY40, fontsize=10)
    ax2.set_title("Distribution of N:P Ratios", color=_GREY30, fontsize=10, pad=10)
    ax2.legend(frameon=False)

    # Sample size annotation
    n_samples = len(valid_ratios)
    ax2.text(
        0.02,
        0.98,
        f"n = {n_samples:,}",
        transform=ax2.transAxes,
        verticalalignment="top",
        fontsize=8,
        bbox=dict(facecolor="white", alpha=0.8, edgecolor="none"),
        color=_GREY40,
    )

    # Act on this figure explicitly rather than pyplot's current figure,
    # then reserve room on the right for the outside legend.
    fig.tight_layout()
    fig.subplots_adjust(right=0.85)

    return fig


def altair_plot_np_ratios(df: pd.DataFrame) -> alt.VConcatChart:
    """
    Create an interactive Altair version of the N:P ratio plots.

    Parameters
    ----------
    df : pd.DataFrame
        Nutrient measurements with columns Org_Analyte_Name,
        Org_Result_Value, Activity_Start_Date_Time and Sector.

    Returns
    -------
    alt.VConcatChart
        Time-series scatter stacked above a histogram, each layered with a
        rule at the 16:1 Redfield ratio and with independent y scales.
    """
    nutrients_df = _np_ratio_frame(df)

    # The colon in the field name is escaped so Altair does not read it as
    # a type suffix.
    ratio_field = r"N\:P Ratio:Q"

    time_series = (
        alt.Chart(nutrients_df)
        .mark_circle(size=60)
        .encode(
            x=alt.X(
                "Activity_Start_Date_Time:T",
                axis=alt.Axis(format="%Y", tickCount="year"),
                title="Date",
            ),
            y=alt.Y(ratio_field, title="N:P Ratio"),
            color="Sector:N",
            tooltip=[
                alt.Tooltip("Activity_Start_Date_Time:T", title="Date"),
                alt.Tooltip(ratio_field, format=".0f", title="N:P Ratio"),
                alt.Tooltip("Sector:N", title="Sector"),
            ],
        )
        .properties(title="N:P Ratio Over Time", width=600, height=300)
        .interactive()
    )
    redfield_line = (
        alt.Chart(pd.DataFrame({"y": [_REDFIELD_RATIO]}))
        .mark_rule(color="red")
        .encode(y="y:Q")
    )

    histogram = (
        alt.Chart(nutrients_df)
        .mark_bar()
        .encode(
            x=alt.X(ratio_field, bin=alt.Bin(maxbins=30), title="N:P Ratio"),
            y="count()",
            tooltip=["count()"],
        )
        .properties(title="Distribution of N:P Ratios", width=600, height=300)
        .interactive()
    )
    redfield_hist_line = (
        alt.Chart(pd.DataFrame({"x": [_REDFIELD_RATIO]}))
        .mark_rule(color="red")
        .encode(x="x:Q")
    )

    combined_chart = alt.vconcat(
        time_series + redfield_line, histogram + redfield_hist_line
    ).resolve_scale(y="independent")

    return combined_chart


def plot_do_temp_relationship(df: pd.DataFrame) -> Figure:
    """
    Create a scatter plot of DO vs temperature with regression lines using seaborn.

    Parameters
    ----------
    df : pd.DataFrame
        DO and temperature measurements with columns Org_Analyte_Name,
        Org_Result_Value, Activity_Start_Date_Time, Station_Number and
        Sample_Position.

    Returns
    -------
    Figure
        Matplotlib figure with one regression per sample position and a
        dashed line at the 4.8 mg/L DO threshold.
    """
    do_temp_data = _do_temp_frame(
        df, ["Activity_Start_Date_Time", "Station_Number", "Sample_Position"]
    )

    # Custom color palette matching DO timeseries
    custom_palette = {"Surface": "#5AA4D8", "Bottom": "#1B4B8A"}

    # 'Bottom' is drawn first so 'Surface' points end up on top
    g = sns.lmplot(
        data=do_temp_data,
        x="Temperature, Water",
        y="Dissolved Oxygen",
        hue="Sample_Position",
        hue_order=["Bottom", "Surface"],
        palette=custom_palette,
        scatter_kws={"alpha": 0.5, "zorder": 2, "s": 20},  # Points at zorder=2
        line_kws={"zorder": 3, "linewidth": 1},  # Trend lines at zorder=3
        height=8,
        aspect=1.5,
        legend=False,
    )

    # Threshold line sits behind the data (zorder=1)
    ax = g.axes[0, 0]
    ax.axhline(
        y=_DO_THRESHOLD_MG_L,
        color=_DO_THRESHOLD_COLOR,
        linestyle="--",
        alpha=0.9,
        zorder=1,
        linewidth=1,
    )
    ax.text(
        ax.get_xlim()[0],
        4.9,
        " 4.8 mg/L DO threshold",
        ha="left",
        va="bottom",
        color=_DO_THRESHOLD_COLOR,
        alpha=0.9,
    )

    # Only the bottom spine is shown
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.spines["bottom"].set_color("black")
    ax.spines["bottom"].set_linewidth(0.5)

    g.set_axis_labels("Water Temperature (°C)", "Dissolved Oxygen (mg/L)")
    ax.set_title("Dissolved Oxygen vs Water Temperature", pad=20, fontsize=16)

    # Reverse the plotting order so the legend lists 'Surface' first
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(
        handles[::-1],
        labels[::-1],
        bbox_to_anchor=(1.0, 1.0),
        loc="upper right",
        frameon=False,
        handletextpad=0.5,
    )

    ax.grid(True, axis="y", alpha=0.15, linestyle="-", color="gray")
    ax.tick_params(axis="y", which="both", length=0)

    # y-limits: padded below the smaller of the data minimum and the
    # threshold (never negative), and 10% above the data maximum.
    lowest = min(do_temp_data["Dissolved Oxygen"].min(), _DO_THRESHOLD_MG_L)
    ymin = max(int(lowest * 0.9) - 1, 0)
    ymax = do_temp_data["Dissolved Oxygen"].max() * 1.1
    ax.set_ylim(ymin, ymax)
    ax.set_yticks(np.arange(ymin, ymax, 2))

    return g.figure


def plotly_plot_do_temp_relationship(df: pd.DataFrame) -> go.Figure:
    """
    Create an interactive scatter plot of DO vs temperature using Plotly.

    Parameters
    ----------
    df : pd.DataFrame
        DO and temperature measurements with columns Org_Analyte_Name,
        Org_Result_Value, Activity_Start_Date_Time, Station_Number,
        Sample_Position and Sector.

    Returns
    -------
    go.Figure
        Figure with one marker trace per sample position, a dashed linear
        trend line for each position that has at least two distinct
        temperatures, and a dashed line at the 4.8 mg/L DO threshold.
    """
    do_temp_data = _do_temp_frame(
        df,
        [
            "Activity_Start_Date_Time",
            "Station_Number",
            "Sample_Position",
            "Sector",  # Added for tooltip
        ],
    )

    fig = go.Figure()

    colors = {"Surface": "#8da0cb", "Bottom": "#fc8d62"}

    for position in ("Surface", "Bottom"):
        pos_data = do_temp_data[do_temp_data["Sample_Position"] == position]
        temperature = pos_data["Temperature, Water"]
        oxygen = pos_data["Dissolved Oxygen"]

        fig.add_trace(
            go.Scatter(
                x=temperature,
                y=oxygen,
                mode="markers",
                name=position,
                marker=dict(color=colors[position], size=8, opacity=0.6),
                # <extra></extra> hides Plotly's secondary hover box
                hovertemplate=(
                    "Temperature: %{x:.1f}°C<br>"
                    "DO: %{y:.1f} mg/L<br>"
                    "Position: " + position + "<br>"
                    "Station: %{customdata[0]}<br>"
                    "Sector: %{customdata[1]}<br>"
                    "<extra></extra>"
                ),
                customdata=pos_data[["Station_Number", "Sector"]],
            )
        )

        # A straight-line fit needs two distinct x values; np.polyfit
        # raises on an empty position and is degenerate on a single one.
        if temperature.nunique() < 2:
            continue

        trend = np.poly1d(np.polyfit(temperature, oxygen, 1))
        x_range = np.linspace(temperature.min(), temperature.max(), 100)
        fig.add_trace(
            go.Scatter(
                x=x_range,
                y=trend(x_range),
                mode="lines",
                line=dict(color=colors[position], dash="dash"),
                name=f"{position} Trend",
                hovertemplate=None,
                hoverinfo="skip",
                showlegend=False,
            )
        )

    # DO threshold line
    fig.add_hline(
        y=_DO_THRESHOLD_MG_L,
        line=dict(color=_DO_THRESHOLD_COLOR, width=1, dash="dash"),
        opacity=0.5,
        annotation_text="4.8 mg/L DO threshold",
        annotation_position="left",
        annotation=dict(
            font=dict(color=_DO_THRESHOLD_COLOR, size=12),
            xanchor="left",
            yanchor="bottom",
            opacity=0.8,
        ),
    )

    fig.update_layout(
        title=dict(
            text="Dissolved Oxygen vs Water Temperature",
            x=0.5,
            y=0.95,
            xanchor="center",
            yanchor="top",
            font=dict(size=16),
        ),
        xaxis_title="Water Temperature (°C)",
        yaxis_title="Dissolved Oxygen (mg/L)",
        legend_title="Sample Position",
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=1.05,
        ),
        template="plotly_white",
        width=800,
        height=600,
        showlegend=True,
    )

    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor="rgba(128, 128, 128, 0.2)")
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor="rgba(128, 128, 128, 0.2)")

    return fig


def altair_plot_do_temp_relationship(df: pd.DataFrame) -> alt.LayerChart:
    """
    Create an interactive scatter plot of DO vs temperature using Altair.

    Parameters
    ----------
    df : pd.DataFrame
        DO and temperature measurements with columns Org_Analyte_Name,
        Org_Result_Value, Activity_Start_Date_Time, Station_Number,
        Sample_Position and Sector.

    Returns
    -------
    alt.LayerChart
        Layered chart with the scatter, one regression line per sample
        position, and a labelled dashed rule at the 4.8 mg/L DO threshold
        (the same value the seaborn and Plotly versions draw).
    """
    do_temp_data = _do_temp_frame(
        df,
        [
            "Activity_Start_Date_Time",
            "Station_Number",
            "Sample_Position",
            "Sector",
        ],
    )

    # One colour scale shared by the points and the regression lines
    position_scale = alt.Scale(
        domain=["Surface", "Bottom"],
        range=["#8da0cb", "#fc8d62"],  # Muted blue and orange
    )

    scatter = (
        alt.Chart(do_temp_data)
        .mark_circle(size=60, opacity=0.6)
        .encode(
            x=alt.X(
                "Temperature, Water:Q",
                title="Water Temperature (°C)",
                scale=alt.Scale(zero=False),
            ),
            y=alt.Y(
                "Dissolved Oxygen:Q",
                title="Dissolved Oxygen (mg/L)",
                scale=alt.Scale(zero=False),
            ),
            color=alt.Color(
                "Sample_Position:N",
                scale=position_scale,
                legend=alt.Legend(title="Sample Position"),
            ),
            tooltip=[
                alt.Tooltip("Temperature, Water:Q", title="Temperature", format=".1f"),
                alt.Tooltip("Dissolved Oxygen:Q", title="DO", format=".1f"),
                alt.Tooltip("Sample_Position:N", title="Position"),
                alt.Tooltip("Sector:N", title="Sector"),
                alt.Tooltip("Station_Number:N", title="Station"),
            ],
        )
    )

    # Regression line per Sample_Position
    regression = (
        scatter.transform_regression(
            "Temperature, Water", "Dissolved Oxygen", groupby=["Sample_Position"]
        )
        .mark_line(size=2)
        .encode(color=alt.Color("Sample_Position:N", scale=position_scale))
    )

    # DO threshold rule and its label, anchored at the lowest temperature
    threshold_line = (
        alt.Chart(pd.DataFrame({"y": [_DO_THRESHOLD_MG_L]}))
        .mark_rule(strokeDash=[4, 4], color=_DO_THRESHOLD_COLOR, opacity=0.5)
        .encode(y="y:Q")
    )
    threshold_label = (
        alt.Chart(
            pd.DataFrame(
                {"x": [do_temp_data["Temperature, Water"].min()], "y": [4.9]}
            )
        )
        .mark_text(
            align="left",
            baseline="bottom",
            color=_DO_THRESHOLD_COLOR,
            opacity=0.5,
            text=" 4.8 mg/L DO threshold",
        )
        .encode(x="x:Q", y="y:Q")
    )

    final_chart = (
        alt.layer(scatter, regression, threshold_line, threshold_label)
        .properties(
            width=800,
            height=750,
        )
        .configure_axis(grid=True, gridOpacity=0.3)
        .interactive()
    )

    return final_chart