github-actions[bot] committed on
Commit
5d4e96b
·
1 Parent(s): 948797f

Fresh start without shapefiles

Browse files
.github/workflows/deploy-to-huggingface.yml CHANGED
@@ -25,19 +25,32 @@ jobs:
25
  # Clone the HF space
26
  git clone https://philmaxwell:$HF_TOKEN@huggingface.co/spaces/philmaxwell/sabw-wq-data hf_space
27
 
28
- # Copy files, excluding large files and unnecessary directories
 
 
 
 
 
 
 
 
 
29
  rsync -av \
30
  --exclude 'hf_space' \
31
  --exclude '.git' \
 
32
  --exclude '*.ipynb' \
33
  --exclude 'Ideas.md' \
34
  --exclude 'data/*.parquet' \
35
  --exclude '*.csv' \
36
  --exclude '*.xlsx' \
 
 
 
37
  ./ hf_space/
38
 
39
  # Commit and push changes
40
  cd hf_space
41
  git add .
42
- git commit -m "Update application files"
43
- git push origin main
 
25
  # Clone the HF space
26
  git clone https://philmaxwell:$HF_TOKEN@huggingface.co/spaces/philmaxwell/sabw-wq-data hf_space
27
 
28
+ # Clean up everything except .git directory
29
+ cd hf_space
30
+ find . -mindepth 1 -not -path './.git*' -delete
31
+
32
+ # Remove LFS configuration
33
+ git rm .gitattributes || true
34
+ git commit -am "Clean repository"
35
+
36
+ # Copy new files, excluding shapefiles
37
+ cd ..
38
  rsync -av \
39
  --exclude 'hf_space' \
40
  --exclude '.git' \
41
+ --exclude '.gitattributes' \
42
  --exclude '*.ipynb' \
43
  --exclude 'Ideas.md' \
44
  --exclude 'data/*.parquet' \
45
  --exclude '*.csv' \
46
  --exclude '*.xlsx' \
47
+ --exclude 'data/SAB/*.shp' \
48
+ --exclude 'data/SAB/*.dbf' \
49
+ --exclude 'data/SAB/*.shx' \
50
  ./ hf_space/
51
 
52
  # Commit and push changes
53
  cd hf_space
54
  git add .
55
+ git commit -m "Fresh start without shapefiles"
56
+ git push -f origin main
.gitignore CHANGED
@@ -22,3 +22,7 @@ wheels/
22
  /.quarto/
23
 
24
  .cache/
 
 
 
 
 
22
  /.quarto/
23
 
24
  .cache/
25
+ cache/
26
+ data/KOR.zip
27
+ *.ipynb
28
+ *.json
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Water Quality Report
3
+ emoji: 💧
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.40.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
analysis.py ADDED
@@ -0,0 +1,1234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import altair as alt
2
+ import contextily as ctx
3
+ import geopandas as gpd
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+ import plotly.graph_objects as go
8
+ import scipy.stats as stats
9
+ import seaborn as sns
10
+ from matplotlib.colors import LinearSegmentedColormap
11
+ from matplotlib.figure import Figure
12
+ from plotly.subplots import make_subplots
13
+
14
+
15
def load_data(filename: str) -> pd.DataFrame:
    """
    Read a water-quality CSV file and normalise its key columns.

    Parameters:
    -----------
    filename : str
        Location of the CSV file; handed straight to ``pd.read_csv``

    Returns:
    --------
    pd.DataFrame
        The CSV contents with ``Station_Number`` kept as text,
        ``Org_Result_Value`` converted to numbers (non-numeric entries such
        as "Not Reported" become missing values) and
        ``Activity_Start_Date_Time`` parsed as datetimes
    """
    # Station numbers are read as strings so they are not turned into numbers
    raw = pd.read_csv(filename, dtype={"Station_Number": str})

    # Blank out the "Not Reported" placeholder, then coerce whatever is left
    # that still is not numeric to NaN.
    numeric_results = pd.to_numeric(
        raw["Org_Result_Value"].replace("Not Reported", pd.NA), errors="coerce"
    )
    timestamps = pd.to_datetime(raw["Activity_Start_Date_Time"])

    return raw.assign(
        Org_Result_Value=numeric_results,
        Activity_Start_Date_Time=timestamps,
    )
24
+
25
+
26
def plot_analyte_trends(
    df: pd.DataFrame, analyte_names: list[str], sample_position: str, figsize=(15, 12)
) -> Figure:
    """
    Draw a two-column grid of yearly box plots, one panel per analyte.

    Each panel shows the per-year distribution of ``Org_Result_Value``, the
    annual means, a least-squares trend line through those means (when more
    than one year is present) and the sample count of every year.

    Parameters:
    -----------
    df : pandas DataFrame
        Data to plot; the first row supplies the station number and name
        used in the figure title
    analyte_names : list[str]
        Analytes to plot, matched against ``Org_Analyte_Name``
    sample_position : str
        Label for the figure title; "All" is shown as "Surface and Bottom"
    figsize : tuple
        Figure size in inches (width, height)

    Returns:
    --------
    Figure
        The matplotlib figure holding all panels
    """
    # Analytes drawn on a logarithmic y-axis
    log_scaled = (
        "Turbidity",
        "Fecal Coliform (MPN)",
        "Total Nitrogen",
        "Total Phosphorus",
    )

    # Two panels per row, rounding up for an odd number of analytes
    row_count = (len(analyte_names) + 1) // 2
    fig, axes = plt.subplots(row_count, 2, figsize=figsize)
    panels = axes.flatten()

    station_number = df["Station_Number"].iloc[0]
    station_name = df["Name"].iloc[0]
    sample_position_label = (
        "Surface and Bottom" if sample_position == "All" else sample_position
    )

    for panel, analyte_name in zip(panels, analyte_names):
        matching = df[df["Org_Analyte_Name"] == analyte_name]
        samples = matching.assign(
            Year=matching["Activity_Start_Date_Time"].dt.year
        ).dropna(subset=["Org_Result_Value"])

        if samples.empty:
            panel.text(
                0.5,
                0.5,
                f"No data available for {analyte_name}",
                ha="center",
                va="center",
            )
            continue

        if analyte_name in log_scaled:
            panel.set_yscale("log")
            # Plain numbers instead of powers of ten on the log axis
            panel.yaxis.set_major_formatter(plt.ScalarFormatter())  # type: ignore

        # One box per year, positioned at the year itself
        by_year = samples.groupby("Year")
        year_positions = np.array(list(by_year.groups.keys()))
        yearly_values = [members["Org_Result_Value"] for _, members in by_year]

        panel.boxplot(
            yearly_values,
            positions=year_positions,
            widths=0.6,
            patch_artist=True,
            boxprops=dict(facecolor="lightblue", color="blue", alpha=0.5),
            medianprops=dict(color="blue"),
            whiskerprops=dict(color="blue"),
            capprops=dict(color="blue"),
            flierprops=dict(color="blue", markeredgecolor="blue", alpha=0.5),
        )

        # Annual means, overlaid on the boxes
        annual_mean = by_year["Org_Result_Value"].mean()
        mean_years = annual_mean.index.values
        mean_values = annual_mean.values
        panel.plot(
            mean_years, mean_values, "bo-", linewidth=1, markersize=4, label="Annual Mean"
        )

        # A regression needs at least two yearly means
        if len(mean_years) > 1:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                mean_years, mean_values
            )
            fitted = slope * mean_years + intercept
            panel.plot(
                mean_years, fitted, "r--", alpha=0.8, linewidth=1, label="Trend"
            )

            summary = f"R²={r_value**2:.3f}\np={p_value:.3f}"  # type: ignore
            panel.text(
                0.02,
                0.98,
                summary,
                transform=panel.transAxes,
                verticalalignment="top",
                bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
                parse_math=False,
            )

        panel.set_title(f"{analyte_name}", pad=15)
        panel.set_xlabel("Year")

        # Shortened y-axis labels for a few analytes; pH carries no label
        unit = samples["Org_Result_Unit"].iloc[0]
        if analyte_name == "pH":
            y_label = None
        elif analyte_name == "Depth, Secchi Disk Depth":
            y_label = f"Depth ({unit})"
        elif analyte_name.startswith("Dissolved"):
            y_label = f"DO ({unit})"
        elif analyte_name.startswith("Fecal Coliform"):
            y_label = f"Fecal Coliform ({unit})"
        else:
            y_label = f"{analyte_name} ({unit})"

        panel.set_ylabel(y_label)
        panel.grid(True, alpha=0.3)

        # Sample size of every year, written at the top edge of the panel
        for year, members in by_year:
            panel.text(
                year,
                panel.get_ylim()[1],
                f"n={len(members)}",
                ha="center",
                va="bottom",
                fontsize=8,
            )

    # Drop the spare panel left over by an odd number of analytes
    for spare in panels[len(analyte_names):]:
        fig.delaxes(spare)

    fig.suptitle(
        f"Water Quality Trends for {station_number} - {station_name} - {sample_position_label}",
        fontsize=14,
        y=0.95,
    )

    # Reserve the top 5% of the figure for the overall title
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    return fig
171
+
172
+
173
def altair_plot_sector_trends(
    df: pd.DataFrame, analyte_names: list[str]
) -> alt.VConcatChart:
    """
    Build stacked Altair charts of annual mean analyte values per sector.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    analyte_names : list[str]
        Analytes to plot, one chart each, matched against
        ``Org_Analyte_Name``

    Returns:
    --------
    alt.VConcatChart
        The per-analyte charts concatenated vertically; every chart layers a
        mean +/- standard-error band, a line and points
    """
    # Analytes drawn on a logarithmic y-axis
    log_scaled = (
        "Turbidity",
        "Fecal Coliform (MPN)",
        "Total Nitrogen",
        "Total Phosphorus",
    )

    # Fixed palette so every chart colours a given sector the same way
    palette = [
        "#1f77b4",  # blue
        "#ff7f0e",  # orange
        "#2ca02c",  # green
        "#d62728",  # red
        "#9467bd",  # purple
        "#8c564b",  # brown
        "#e377c2",  # pink
        "#7f7f7f",  # gray
    ]
    sector_colors = alt.Scale(domain=df["Sector"].unique().tolist(), range=palette)

    def _chart_for(analyte_name: str):
        """Return the layered band/line/point chart of a single analyte."""
        rows = df[df["Org_Analyte_Name"] == analyte_name].copy()

        # Salinity is not plotted for the fresh-water sector
        if analyte_name == "Salinity":
            rows = rows[rows["Sector"] != "Fresh Water Lakes"]

        # Mean and standard error per (year, sector)
        summary = (
            rows.assign(Year=rows["Activity_Start_Date_Time"].dt.year)
            .groupby(["Year", "Sector"])["Org_Result_Value"]
            .agg(["mean", "sem"])
            .reset_index()
            .rename(columns={"mean": "Mean", "sem": "SE"})
        )
        summary["Upper"] = summary["Mean"] + summary["SE"]
        summary["Lower"] = summary["Mean"] - summary["SE"]

        unit = "" if rows.empty else rows["Org_Result_Unit"].iloc[0]
        y_scale_type = "log" if analyte_name in log_scaled else "linear"

        shared = alt.Chart(summary).encode(
            x=alt.X("Year:O", axis=alt.Axis(title=None)),
            color=alt.Color("Sector:N", scale=sector_colors),
            tooltip=[
                alt.Tooltip("Year:O"),
                alt.Tooltip("Sector:N"),
                alt.Tooltip("Mean:Q", format=".2f"),
                alt.Tooltip("SE:Q", format=".2f"),
            ],
        )

        mean_line = shared.mark_line().encode(
            y=alt.Y(
                "Mean:Q",
                title=f"({unit})",
                scale=alt.Scale(type=y_scale_type),
            )
        )
        mean_points = shared.mark_point(size=50).encode(y=alt.Y("Mean:Q"))
        error_band = shared.mark_area(opacity=0.15).encode(
            y=alt.Y("Lower:Q"), y2=alt.Y2("Upper:Q")
        )

        return (
            (error_band + mean_line + mean_points)
            .properties(
                width=600,
                height=300,
                title=alt.TitleParams(text=analyte_name, anchor="middle", fontSize=14),
            )
            .interactive()
        )

    charts = [_chart_for(analyte_name) for analyte_name in analyte_names]

    # Stack the charts and apply the shared view/axis styling
    return alt.vconcat(*charts).configure(
        view={"strokeWidth": 0}, axis={"grid": True, "gridOpacity": 0.2}
    )
286
+
287
+
288
def plotly_plot_analyte_trends(df: pd.DataFrame, analyte_names: list[str]) -> go.Figure:
    """
    Create subplots of analyte trends using Plotly for the given dataframe and analytes.

    Each subplot shows per-year box plots of ``Org_Result_Value``, the annual
    means, a least-squares trend line through those means (when more than
    one year is present) and the sample count of every year.

    Parameters:
    -----------
    df : pandas DataFrame
        Data to plot; the first row supplies the station number and sample
        position used in the figure title
    analyte_names : list[str]
        List of analyte names to plot, matched against ``Org_Analyte_Name``

    Returns:
    --------
    go.Figure
        Plotly figure containing the subplots
    """
    # Same log-scaled analytes as the matplotlib and Altair trend plots
    log_scale_analytes = (
        "Turbidity",
        "Fecal Coliform (MPN)",
        "Total Nitrogen",
        "Total Phosphorus",
    )

    # Calculate number of rows needed (2 columns)
    n_rows = (len(analyte_names) + 1) // 2

    fig = make_subplots(
        rows=n_rows,
        cols=2,
        subplot_titles=analyte_names,
        vertical_spacing=0.12,
        horizontal_spacing=0.1,
    )

    station_number = df["Station_Number"].iloc[0]
    sample_position = df["Sample_Position"].iloc[0]

    for idx, analyte_name in enumerate(analyte_names):
        # Subplots are filled row by row, two per row (1-based indices)
        row = idx // 2 + 1
        col = idx % 2 + 1

        data = (
            df[df["Org_Analyte_Name"] == analyte_name]
            .assign(Year=lambda df: df["Activity_Start_Date_Time"].dt.year)
            .dropna(subset=["Org_Result_Value"])
        )

        if data.empty:
            fig.add_annotation(
                text=f"No data available for {analyte_name}",
                xref=f"x{idx+1}",
                yref=f"y{idx+1}",
                x=0.5,
                y=0.5,
                showarrow=False,
                row=row,
                col=col,
            )
            continue

        log_scale = analyte_name in log_scale_analytes

        groups = data.groupby("Year")
        years = list(groups.groups.keys())

        # Annotations sit at the largest observed value. On a log-typed axis
        # Plotly reads annotation coordinates as log10 of the position, so
        # the value is converted; non-positive maxima cannot be converted
        # and are passed through unchanged.
        y_max = max(data["Org_Result_Value"])
        annotation_y = np.log10(y_max) if log_scale and y_max > 0 else y_max

        # Per-year box plot
        fig.add_trace(
            go.Box(
                x=data["Year"],
                y=data["Org_Result_Value"],
                name="Box Plot",
                boxpoints="outliers",
                line=dict(color="blue"),
                fillcolor="lightblue",
                showlegend=False,
            ),
            row=row,
            col=col,
        )

        # Annual means
        yearly_means = data.groupby("Year")["Org_Result_Value"].mean()

        fig.add_trace(
            go.Scatter(
                x=years,
                y=yearly_means.values,
                mode="lines+markers",
                name="Annual Mean",
                line=dict(color="blue"),
                showlegend=False,
            ),
            row=row,
            col=col,
        )

        # A regression needs at least two yearly means
        if len(years) > 1:
            X = np.array(years)
            y = yearly_means.values
            slope, intercept, r_value, p_value, std_err = stats.linregress(X, y)
            trend_line = slope * X + intercept

            fig.add_trace(
                go.Scatter(
                    x=years,
                    y=trend_line,
                    mode="lines",
                    name="Trend",
                    line=dict(color="red", dash="dash"),
                    showlegend=False,
                ),
                row=row,
                col=col,
            )

            # Regression statistics, anchored at the first year
            stats_text = f"R² = {r_value**2:.3f}<br>p = {p_value:.3f}"  # type: ignore
            fig.add_annotation(
                text=stats_text,
                xref=f"x{idx+1}",
                yref=f"y{idx+1}",
                x=min(years),  # type: ignore
                y=annotation_y,
                showarrow=False,
                bgcolor="white",
                bordercolor="black",
                borderwidth=1,
                row=row,
                col=col,
            )

        # Sample size of every year
        for year, group in groups:
            fig.add_annotation(
                text=f"n={len(group)}",
                x=year,
                y=annotation_y,
                showarrow=False,
                font=dict(size=8),
                row=row,
                col=col,
            )

        if log_scale:
            fig.update_yaxes(type="log", row=row, col=col)

        fig.update_xaxes(title_text="Year", row=row, col=col)
        fig.update_yaxes(
            title_text=f'Value ({data["Org_Result_Unit"].iloc[0]})', row=row, col=col
        )

    fig.update_layout(
        title=f"Water Quality Trends<br>Station {station_number} - {sample_position}",
        title_x=0.5,
        showlegend=False,
        height=300 * n_rows + 100,
        width=1000,
        template="plotly_white",
    )

    return fig
449
+
450
+
451
def plot_sector_trends(
    df: pd.DataFrame, analyte_names: list[str], base_height: float = 4
) -> Figure:
    """
    Create plots of mean annual analyte trends by sector.

    One subplot is drawn per analyte, stacked vertically. Every sector gets
    a line of annual means with a shaded mean +/- standard-error band.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    analyte_names : list[str]
        List of analytes to plot, matched against ``Org_Analyte_Name``
    base_height : float
        Height per subplot in inches (default=4)

    Returns:
    --------
    Figure
        The matplotlib figure holding all subplots
    """
    # Function-scope import so this block does not depend on file-level imports
    from itertools import cycle

    # Analytes drawn on a logarithmic y-axis
    log_scale_analytes = (
        "Turbidity",
        "Fecal Coliform (MPN)",
        "Total Nitrogen",
        "Total Phosphorus",
    )

    custom_colors = [
        "#1f77b4",  # blue
        "#ff7f0e",  # orange
        "#2ca02c",  # green
        "#d62728",  # red
        "#9467bd",  # purple
        "#8c564b",  # brown
        "#e377c2",  # pink
        "#7f7f7f",  # gray
    ]

    # Figure height grows with the number of analytes
    n_rows = len(analyte_names)
    fig, axes = plt.subplots(n_rows, 1, figsize=(15, base_height * n_rows))
    if n_rows == 1:
        # plt.subplots returns a bare Axes for a single subplot
        axes = [axes]

    # Sector order is taken from the whole dataframe so that a sector keeps
    # the same colour in every subplot.
    sectors = df["Sector"].unique()

    for ax, analyte_name in zip(axes, analyte_names):
        analyte_data = df[df["Org_Analyte_Name"] == analyte_name]

        # For Salinity, exclude Fresh Water Lakes
        if analyte_name == "Salinity":
            analyte_data = analyte_data[analyte_data["Sector"] != "Fresh Water Lakes"]

        # The palette is cycled: a plain zip against the eight colours would
        # silently leave out every sector after the eighth.
        for sector, color in zip(sectors, cycle(custom_colors)):
            sector_data = (
                analyte_data[analyte_data["Sector"] == sector]
                .assign(Year=lambda df: df["Activity_Start_Date_Time"].dt.year)
                .groupby("Year")["Org_Result_Value"]
                .agg(["mean", "sem"])
                .reset_index()
            )

            if sector_data.empty:
                continue

            # Annual means
            ax.plot(
                sector_data["Year"],
                sector_data["mean"],
                "-o",
                color=color,
                label=sector,
                markersize=4,
                linewidth=2,
            )

            # Mean +/- standard error band
            ax.fill_between(
                sector_data["Year"],
                sector_data["mean"] - sector_data["sem"],
                sector_data["mean"] + sector_data["sem"],
                color=color,
                alpha=0.15,
            )

        # Set x-axis to show only whole years
        years = analyte_data["Activity_Start_Date_Time"].dt.year.unique()
        ax.set_xticks(years)
        ax.set_xticklabels(years.astype(int))

        ax.set_title(analyte_name, pad=10, fontsize=11, fontweight="normal")
        ax.set_xlabel("")

        if not analyte_data.empty:
            analyte_unit = analyte_data["Org_Result_Unit"].iloc[0]
            ax.set_ylabel(f"({analyte_unit})", fontsize=10)

        ax.grid(True, alpha=0.2, linestyle="--")
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

        # Legend sits outside the axes, to the right
        ax.legend(
            bbox_to_anchor=(1.05, 1),
            loc="upper left",
            borderaxespad=0.0,
            frameon=True,
            fancybox=False,
            shadow=False,
            fontsize=9,
        )

        if analyte_name in log_scale_analytes:
            ax.set_yscale("log")

    # Leave the right 15% of the figure free for the legends
    plt.tight_layout(rect=(0, 0, 0.85, 1), h_pad=2.0)
    return fig
567
+
568
+
569
def plot_parameter_correlations(
    df: pd.DataFrame,
    analyte_names: list[str],
    subset_by: str,
    subset: str,
    filter_by: str,
    threshold: float = 0.2,
) -> Figure:
    """
    Draw a lower-triangle correlation heatmap of analyte results.

    Results are pivoted to one column per analyte, indexed by
    ``Activity_Start_Date_Time``. Analytes whose share of non-missing values
    is below ``threshold`` are left out of the correlation and listed in a
    footnote instead.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe; must also provide a ``Year`` column, used for the
        subtitle
    analyte_names : list[str]
        Analytes to correlate, matched against ``Org_Analyte_Name``
    subset_by : str
        First part of the bold title line ("<subset_by>: <subset>")
    subset : str
        Second part of the bold title line
    filter_by : str
        Subtitle text; "All" is shown as "Surface and Bottom"
    threshold : float
        Minimum fraction of non-missing values an analyte needs to be
        included (default=0.2)

    Returns:
    --------
    Figure
        The matplotlib figure with title, heatmap and footnote areas
    """
    # Shorter display names for the heatmap tick labels
    short_names = {
        "Depth, Secchi Disk Depth": "Secchi Depth",
        "Dissolved Oxygen": "DO",
        "Fecal Coliform (MPN)": "Fecal Coliform",
        "Total Nitrogen": "TN",
        "Total Phosphorus": "TP",
    }

    selected = df[df["Org_Analyte_Name"].isin(analyte_names)]
    wide = selected.pivot_table(
        index="Activity_Start_Date_Time",
        columns="Org_Analyte_Name",
        values="Org_Result_Value",
        observed=False,
    ).rename(columns=short_names)

    # Fraction of timestamps at which each parameter has a value
    coverage = wide.notna().mean()
    too_sparse = coverage[coverage < threshold]
    kept_columns = coverage[coverage >= threshold].index

    corr = wide[kept_columns].corr()

    # The "n" in the subtitle is the row count of the input dataframe
    sample_count = len(df)

    fig = plt.figure(figsize=(6, 7))

    # Three stacked areas: title, heatmap, footnote
    grid = fig.add_gridspec(
        3,
        1,
        height_ratios=[
            1,  # Title space
            4,  # Heatmap
            1.5,  # Footnote
        ],
        hspace=0.4,
    )
    title_ax = fig.add_subplot(grid[0])
    heatmap_ax = fig.add_subplot(grid[1])
    footnote_ax = fig.add_subplot(grid[2])

    # Hide the diagonal and everything above it
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
    drawn = sns.heatmap(
        corr,
        mask=upper_triangle,
        annot=True,
        cmap="RdBu_r",
        center=0,
        vmin=-1,
        vmax=1,
        ax=heatmap_ax,
        yticklabels=1,
        cbar=True,
        xticklabels=1,
    )

    # Slanted x labels, pushed slightly away from the axis
    heatmap_ax.set_xticklabels(
        heatmap_ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor"
    )
    heatmap_ax.tick_params(axis="x", pad=10)

    # The colorbar is the most recently added axes of the figure; its ticks
    # are pinned before the labels are replaced to avoid a tick warning.
    cbar_ax = drawn.figure.axes[-1]  # type: ignore
    tick_positions = cbar_ax.get_yticks()
    cbar_ax.set_yticks(tick_positions)
    cbar_ax.set_yticklabels([f"{x:>8.2f}" for x in tick_positions])

    # Horizontal y labels, no axis titles
    heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), rotation=0)
    heatmap_ax.set_xlabel("")
    heatmap_ax.set_ylabel("")

    # The footnote and title areas only carry text: no frame, no ticks
    for text_only_ax in (footnote_ax, title_ax):
        text_only_ax.set_frame_on(False)
        text_only_ax.set_xticks([])
        text_only_ax.set_yticks([])

    # List the parameters that were dropped for lack of data
    if not too_sparse.empty:
        header = "Excluded parameters (<{:.0%} data completeness):\n".format(
            threshold
        )
        details = "".join(
            f" - {param}: {share:.1%} complete\n"
            for param, share in too_sparse.items()
        )
        footnote_ax.text(
            0.01,
            0.40,
            (header + details).rstrip(),
            ha="left",
            va="center",
            fontsize=9,
            fontstyle="italic",
            transform=footnote_ax.transAxes,
        )

    display_filter = "Surface and Bottom" if filter_by == "All" else filter_by

    # A single distinct year is named; anything else reads "All Years"
    if len(df["Year"].unique()) == 1:
        year_info = df["Year"].iloc[0]
    else:
        year_info = "All Years"

    # Title and subtitle are positioned in figure coordinates
    title_ax.text(
        0.45,
        0.8,
        f"{subset_by}: {subset}",
        ha="center",
        va="center",
        fontsize=12,
        fontweight="bold",
        transform=fig.transFigure,
    )
    title_ax.text(
        0.45,
        0.75,
        f"{display_filter}, {year_info} (n={sample_count:,})",
        ha="center",
        va="bottom",
        fontsize=10,
        fontstyle="italic",
        transform=fig.transFigure,
    )

    # Render once and query the tight bounding box (result unused), then set
    # the margins by hand instead of calling tight_layout.
    fig.canvas.draw()
    renderer = fig.canvas.get_renderer()  # type: ignore
    fig.get_tightbbox(renderer)
    fig.subplots_adjust(left=0.1, right=0.95, bottom=0.02, top=0.85, hspace=0.4)

    return fig
736
+
737
+
738
def plot_np_ratios(df: pd.DataFrame) -> Figure:
    """
    Plot the nitrogen-to-phosphorus ratio over time and as a histogram.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing "Total Nitrogen" and "Total Phosphorus"
        results

    Returns:
    --------
    Figure
        Figure with a scatter plot of the ratio by date (coloured by
        sector) above a histogram of the ratio; both mark the value 16
        with a dashed red line labelled "Redfield Ratio (16:1)"
    """
    ratio_column = "N:P Ratio"
    redfield_label = "Redfield Ratio (16:1)"

    # One row per (timestamp, sector) with a column for each nutrient
    is_nutrient = df["Org_Analyte_Name"].isin(["Total Nitrogen", "Total Phosphorus"])
    nutrients = (
        df[is_nutrient]
        .pivot_table(
            index=["Activity_Start_Date_Time", "Sector"],
            columns="Org_Analyte_Name",
            values="Org_Result_Value",
            observed=True,
        )
        .reset_index()
    )
    nutrients[ratio_column] = (
        nutrients["Total Nitrogen"] / nutrients["Total Phosphorus"]
    )

    fig, (series_ax, hist_ax) = plt.subplots(2, 1, figsize=(12, 10))

    # Upper panel: ratio over time, coloured by sector
    sns.scatterplot(
        data=nutrients,
        x="Activity_Start_Date_Time",
        y=ratio_column,
        hue="Sector",
        ax=series_ax,
        alpha=0.6,
    )
    series_ax.axhline(y=16, color="r", linestyle="--", label=redfield_label)
    series_ax.set_ylabel("N:P Ratio")
    series_ax.set_xlabel("Date")
    series_ax.set_title("N:P Ratio Over Time")

    # Legend goes outside the axes, to the right
    series_ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

    # Lower panel: distribution of the ratio
    sns.histplot(x=nutrients[ratio_column].dropna(), ax=hist_ax)
    hist_ax.axvline(x=16, color="r", linestyle="--", label=redfield_label)
    hist_ax.set_xlabel("N:P Ratio")
    hist_ax.set_title("Distribution of N:P Ratios")
    hist_ax.legend()

    # Keep the right 10% of the figure free for the outside legend
    plt.tight_layout(rect=(0, 0, 0.9, 1))
    return fig
786
+
787
+
788
def altair_plot_np_ratios(df: pd.DataFrame) -> alt.VConcatChart:
    """
    Build Altair charts of the nitrogen-to-phosphorus ratio.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing "Total Nitrogen" and "Total Phosphorus"
        results

    Returns:
    --------
    alt.VConcatChart
        A scatter chart of the ratio by date (coloured by sector) stacked
        on a histogram of the ratio; both carry a red rule at 16
    """
    # One row per (timestamp, sector) with a column for each nutrient
    is_nutrient = df["Org_Analyte_Name"].isin(["Total Nitrogen", "Total Phosphorus"])
    nutrients = (
        df[is_nutrient]
        .pivot_table(
            index=["Activity_Start_Date_Time", "Sector"],
            columns="Org_Analyte_Name",
            values="Org_Result_Value",
            observed=True,
        )
        .reset_index()
    )
    nutrients["N:P Ratio"] = (
        nutrients["Total Nitrogen"] / nutrients["Total Phosphorus"]
    )

    # The colon in the column name is escaped so Altair does not read it as
    # the start of a type shorthand.
    ratio_field = r"N\:P Ratio:Q"

    # Ratio over time, coloured by sector
    scatter_encoding = dict(
        x=alt.X(
            "Activity_Start_Date_Time:T",
            axis=alt.Axis(format="%Y", tickCount="year"),
            title="Date",
        ),
        y=alt.Y(ratio_field, title="N:P Ratio"),
        color="Sector:N",
        tooltip=[
            alt.Tooltip("Activity_Start_Date_Time:T", title="Date"),
            alt.Tooltip(ratio_field, format=".0f", title="N:P Ratio"),
            alt.Tooltip("Sector:N", title="Sector"),
        ],
    )
    time_series = (
        alt.Chart(nutrients)
        .mark_circle(size=60)
        .encode(**scatter_encoding)
        .properties(title="N:P Ratio Over Time", width=600, height=300)
        .interactive()
    )

    # Distribution of the ratio
    histogram = (
        alt.Chart(nutrients)
        .mark_bar()
        .encode(
            x=alt.X(ratio_field, bin=alt.Bin(maxbins=30), title="N:P Ratio"),
            y="count()",
            tooltip=["count()"],
        )
        .properties(title="Distribution of N:P Ratios", width=600, height=300)
        .interactive()
    )

    # Red reference rules at 16: horizontal for the scatter, vertical for
    # the histogram
    horizontal_rule = (
        alt.Chart(pd.DataFrame({"y": [16]})).mark_rule(color="red").encode(y="y:Q")
    )
    vertical_rule = (
        alt.Chart(pd.DataFrame({"x": [16]})).mark_rule(color="red").encode(x="x:Q")
    )

    # Stack both layered charts; each keeps its own y scale
    stacked = alt.vconcat(time_series + horizontal_rule, histogram + vertical_rule)
    return stacked.resolve_scale(y="independent")
857
+
858
+
859
def plot_calendar_heatmap(
    df: pd.DataFrame, analyte: str, colormap: str | None = None
) -> Figure:
    """
    Draw a year-by-month heatmap of mean results for one analyte.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    analyte : str
        Analyte to plot, matched against ``Org_Analyte_Name``
    colormap : str | None
        Matplotlib colormap name; when given it replaces the
        analyte-specific default

    Returns:
    --------
    Figure
        The matplotlib figure containing the heatmap
    """
    # Default colormap per analyte; anything not listed falls back to "Blues"
    default_colormaps = {
        "Fecal Coliform (MPN)": "viridis",  # Blue-green-yellow
        "Temperature, Water": "coolwarm",
        "Dissolved Oxygen": "RdYlBu",
        "Total Nitrogen": "GnBu",  # Green-Blue
        "Total Phosphorus": "GnBu",
        "Depth, Secchi Disk Depth": "Blues_r",
    }

    data = df[df["Org_Analyte_Name"] == analyte].copy()
    data["Year"] = data["Activity_Start_Date_Time"].dt.year
    data["Month"] = data["Activity_Start_Date_Time"].dt.month

    # Rows are years, columns are months, cells are monthly means
    pivot_data = data.pivot_table(
        values="Org_Result_Value", index="Year", columns="Month", aggfunc="mean"
    )

    # An explicitly requested colormap wins over the analyte default
    cmap = colormap or default_colormaps.get(analyte, "Blues")

    # Half an inch of height per year
    fig, ax = plt.subplots(figsize=(6, len(pivot_data) * 0.5))

    unit = data["Org_Result_Unit"].iloc[0]

    # The axes is passed explicitly so the heatmap cannot land on whatever
    # pyplot currently treats as the active axes.
    sns.heatmap(
        pivot_data,
        cmap=cmap,
        annot=True,
        fmt=".2f",
        cbar_kws={"label": unit},
        annot_kws={"size": 6},
        ax=ax,
    )

    ax.set_title(f"Monthly Averages Heatmap: {analyte}", fontsize=10, pad=5)
    ax.tick_params(axis="both", which="major", labelsize=7)

    # Shrink the colorbar ticks and label to match the axes
    colorbar = ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=7)  # type: ignore
    colorbar.set_label(unit, size=7)  # type: ignore

    return fig
909
+
910
+
911
def plot_seasonal_salinity(
    salinity_data: pd.DataFrame,
    year: str,
    basemap_provider,
    alpha=0.5,
    shapefile_path="data/SAB/SAB.shp",
):
    """
    Create a 2x2 grid of seasonal mean-salinity maps by WBID over a basemap.

    Args:
        salinity_data: DataFrame with at least the columns ``WBID``,
            ``Salinity`` and ``Activity_Start_Date_Time`` (datetime-like).
        year: Year to filter data for (converted with ``int``).
        basemap_provider: Tile source forwarded to ``ctx.add_basemap``.
        alpha: Opacity of the basemap tiles (the polygons use a fixed 0.7).
        shapefile_path: Path of the WBID polygon file read with geopandas.

    Returns:
        The matplotlib figure holding the four seasonal panels and a shared
        colorbar.
    """
    seasons = ["Winter", "Spring", "Summer", "Fall"]

    # Read the WBID polygons, keep only those that occur in the data, and
    # reproject to EPSG:3857 (the CRS the basemap is drawn in).
    wbids = gpd.read_file(shapefile_path)
    wbids = wbids[wbids["WBID"].isin(salinity_data["WBID"].unique())]
    wbids = wbids.to_crs(epsg=3857)

    # Work on a copy so adding the season column cannot touch the caller's
    # frame (and does not trigger SettingWithCopyWarning).
    year_data = salinity_data[
        salinity_data["Activity_Start_Date_Time"].dt.year == int(year)
    ].copy()

    # Months 1-3 -> Winter, 4-6 -> Spring, 7-9 -> Summer, 10-12 -> Fall.
    year_data.loc[:, "season"] = pd.cut(
        year_data["Activity_Start_Date_Time"].dt.month,
        bins=[0, 3, 6, 9, 12],
        labels=seasons,
    )

    seasonal_means = (
        year_data.groupby(["WBID", "season"], observed=True)["Salinity"]
        .mean()
        .reset_index()
    )

    fig = plt.figure(figsize=(20, 14))

    # Custom blue -> yellow -> red colormap.
    colors = ["#08519c", "#73a9cf", "#fee090", "#fc8d59", "#d73027"]
    cmap = LinearSegmentedColormap.from_list("custom", colors, N=100)

    # One colour scale for all four panels. With no data for the requested
    # year the minimum is NaN, which would poison the normalisation, so fall
    # back to 0 in that case.
    vmin = seasonal_means["Salinity"].min()
    if pd.isna(vmin):
        vmin = 0.0
    vmax = 40

    # Map extent: polygon bounds padded by 5% on each side.
    bounds = wbids.total_bounds
    x_buffer = (bounds[2] - bounds[0]) * 0.05
    y_buffer = (bounds[3] - bounds[1]) * 0.05
    extent = [
        bounds[0] - x_buffer,
        bounds[2] + x_buffer,
        bounds[1] - y_buffer,
        bounds[3] + y_buffer,
    ]

    # Tight 2x2 layout.
    gs = fig.add_gridspec(
        2,
        2,
        width_ratios=[1, 1],
        wspace=0.05,  # Minimal horizontal space between plots
        hspace=-0.15,  # Negative value pulls the two rows together
        left=0.02,
        right=0.98,
        top=0.95,
        bottom=0.05,
    )

    for idx, season in enumerate(seasons):
        ax = fig.add_subplot(gs[idx // 2, idx % 2])

        season_data = seasonal_means[seasonal_means["season"] == season]
        # Left merge keeps WBIDs without a value; they are drawn in grey.
        merged = wbids.merge(season_data, on="WBID", how="left")

        merged.plot(
            column="Salinity",
            ax=ax,
            cmap=cmap,
            vmin=vmin,
            vmax=vmax,
            alpha=0.7,
            missing_kwds={"color": "lightgrey", "alpha": 0.5},
        )

        # Apply the padded extent BEFORE requesting tiles: add_basemap fetches
        # tiles for the axes' current limits, so doing it afterwards would
        # request tiles for the unpadded extent only.
        ax.set_xlim(extent[0], extent[1])
        ax.set_ylim(extent[2], extent[3])
        ctx.add_basemap(ax, source=basemap_provider, zoom=11, alpha=alpha)  # type: ignore

        # Top row gets more title padding than the bottom row.
        title_pad = 15 if idx < 2 else 5
        ax.set_title(f"{season} {year} Mean Salinity", pad=title_pad)
        ax.set_axis_off()

    # Shared colorbar for all panels.
    norm = plt.Normalize(vmin=vmin, vmax=vmax)  # type: ignore
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    fig.colorbar(
        sm,
        ax=fig.axes,
        orientation="vertical",
        label="Salinity (ppt)",
        pad=0.01,
        fraction=0.015,
        ticks=np.arange(0, 45, 5),  # Ticks every 5 units
    )

    return fig
1031
+
1032
+
1033
def plot_seasonal_salinity_for_bays(
    salinity_data: pd.DataFrame,
    year: str,
    basemap_provider=ctx.providers.USGS.USTopo,  # type: ignore
    alpha=0.5,
    shapefile_path="data/SAB/SAB.shp",
):
    """
    Plot seasonal mean salinity for a fixed, hard-coded set of bay WBIDs.

    The data is restricted to the WBIDs listed in the query below (per the
    project's naming: N, E, W, SAB, GL and Lake Powell) and then handed to
    ``plot_seasonal_salinity`` with all other arguments passed through.

    Returns:
        Whatever ``plot_seasonal_salinity`` returns for the filtered data.
    """
    bay_salinity = salinity_data.query(
        "WBID.isin(['1061A', '1061B', '1061C', '1061D', '1061E', '1061F', '1061G', '1061H', '1055A'])"
    )
    return plot_seasonal_salinity(
        bay_salinity,
        year=year,
        basemap_provider=basemap_provider,
        alpha=alpha,
        shapefile_path=shapefile_path,
    )
1053
+
1054
+
1055
def plot_do_temp_relationship(df: pd.DataFrame) -> Figure:
    """
    Scatter dissolved oxygen against water temperature with seaborn
    regression lines, one per sample position.

    Parameters:
    -----------
    df : pd.DataFrame
        Long-format measurements; the columns ``Org_Analyte_Name``,
        ``Org_Result_Value``, ``Activity_Start_Date_Time``,
        ``Station_Number`` and ``Sample_Position`` are read.

    Returns:
    --------
    Figure
        The matplotlib figure behind the seaborn grid.
    """
    analytes = ["Dissolved Oxygen", "Temperature, Water"]

    # Pivot to one row per (time, station, position) with DO and temperature
    # side by side, then keep only rows where both readings exist.
    paired = df[df["Org_Analyte_Name"].isin(analytes)].pivot_table(
        index=["Activity_Start_Date_Time", "Station_Number", "Sample_Position"],
        columns="Org_Analyte_Name",
        values="Org_Result_Value",
        observed=False,
    )
    paired = paired.reset_index().dropna(subset=analytes)

    # NOTE: this changes seaborn's palette globally, not just for this plot.
    sns.set_palette("muted")

    grid = sns.lmplot(
        data=paired,
        x="Temperature, Water",
        y="Dissolved Oxygen",
        hue="Sample_Position",
        hue_order=["Surface", "Bottom"],
        scatter_kws={"alpha": 0.6},
        height=8,
        aspect=1.5,
        legend=False,  # the legend is drawn manually on the axes below
    )
    ax = grid.axes[0, 0]

    # Dotted reference line at 5 mg/L, labelled at the left edge of the axes.
    ax.axhline(y=5, color="red", linestyle=":", alpha=0.5)
    left_edge = ax.get_xlim()[0]
    ax.text(
        left_edge,
        5.1,
        " 5 mg/L DO threshold",
        ha="left",
        va="bottom",
        color="red",
        alpha=0.5,
    )

    grid.set_axis_labels("Water Temperature (°C)", "Dissolved Oxygen (mg/L)")
    ax.set_title("Dissolved Oxygen vs Water Temperature", pad=20, fontsize=16)
    ax.legend(title="Sample Position", bbox_to_anchor=(1.05, 1), loc="upper left")
    ax.grid(True, alpha=0.3)

    return grid.figure
1116
+
1117
+
1118
def altair_plot_do_temp_relationship(df: pd.DataFrame) -> alt.LayerChart:
    """
    Interactive Altair version of the DO-vs-temperature plot.

    Layers a scatter of the paired readings, a regression line per sample
    position, a dashed 5 mg/L rule and its text label.

    Parameters:
    -----------
    df : pd.DataFrame
        Long-format measurements; the columns ``Org_Analyte_Name``,
        ``Org_Result_Value``, ``Activity_Start_Date_Time``,
        ``Station_Number``, ``Sample_Position`` and ``Sector`` are read.

    Returns:
    --------
    alt.LayerChart
        The configured, interactive layered chart.
    """
    analytes = ["Dissolved Oxygen", "Temperature, Water"]

    # One row per (time, station, position, sector) with both readings.
    paired = df[df["Org_Analyte_Name"].isin(analytes)].pivot_table(
        index=[
            "Activity_Start_Date_Time",
            "Station_Number",
            "Sample_Position",
            "Sector",
        ],
        columns="Org_Analyte_Name",
        values="Org_Result_Value",
        observed=False,
    )
    paired = paired.reset_index().dropna(subset=analytes)

    # Same colour assignment for the points and the regression lines.
    position_scale = alt.Scale(
        domain=["Surface", "Bottom"],
        range=["#8da0cb", "#fc8d62"],  # Muted blue and orange
    )

    x_encoding = alt.X(
        "Temperature, Water:Q",
        title="Water Temperature (°C)",
        scale=alt.Scale(zero=False),
    )
    y_encoding = alt.Y(
        "Dissolved Oxygen:Q",
        title="Dissolved Oxygen (mg/L)",
        scale=alt.Scale(zero=False),
    )
    tooltips = [
        alt.Tooltip("Temperature, Water:Q", title="Temperature", format=".1f"),
        alt.Tooltip("Dissolved Oxygen:Q", title="DO", format=".1f"),
        alt.Tooltip("Sample_Position:N", title="Position"),
        alt.Tooltip("Sector:N", title="Sector"),
        alt.Tooltip("Station_Number:N", title="Station"),
    ]

    scatter = (
        alt.Chart(paired)
        .mark_circle(size=60, opacity=0.6)
        .encode(
            x=x_encoding,
            y=y_encoding,
            color=alt.Color(
                "Sample_Position:N",
                scale=position_scale,
                legend=alt.Legend(title="Sample Position"),
            ),
            tooltip=tooltips,
        )
    )

    # Regression line per Sample_Position, derived from the scatter layer.
    fitted = scatter.transform_regression(
        "Temperature, Water", "Dissolved Oxygen", groupby=["Sample_Position"]
    )
    regression = fitted.mark_line(size=2).encode(
        color=alt.Color("Sample_Position:N", scale=position_scale)
    )

    # Dashed horizontal rule at 5 mg/L.
    threshold_line = (
        alt.Chart(pd.DataFrame({"y": [5]}))
        .mark_rule(strokeDash=[4, 4], color="red", opacity=0.5)
        .encode(y="y:Q")
    )

    # Label for the rule, anchored at the lowest observed temperature.
    label_position = pd.DataFrame(
        {"x": [paired["Temperature, Water"].min()], "y": [5.1]}
    )
    threshold_label = (
        alt.Chart(label_position)
        .mark_text(
            align="left",
            baseline="bottom",
            color="red",
            opacity=0.5,
            text=" 5 mg/L DO threshold",
        )
        .encode(x="x:Q", y="y:Q")
    )

    layered = alt.layer(scatter, regression, threshold_line, threshold_label)
    return (
        layered.properties(width=800, height=750)
        .configure_axis(grid=True, gridOpacity=0.3)
        .interactive()
    )
app.py ADDED
@@ -0,0 +1,1070 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import os
4
+ import textwrap
5
+ import time
6
+ import uuid
7
+ from datetime import datetime
8
+ from functools import wraps
9
+ from pathlib import Path
10
+
11
+ import pandas as pd
12
+ import plotly.express as px
13
+ import streamlit as st
14
+ from great_tables import GT, html
15
+ from matplotlib import pyplot as plt
16
+
17
+ from analysis import (
18
+ altair_plot_do_temp_relationship,
19
+ altair_plot_np_ratios,
20
+ altair_plot_sector_trends,
21
+ plot_analyte_trends,
22
+ plot_calendar_heatmap,
23
+ plot_do_temp_relationship,
24
+ plot_np_ratios,
25
+ plot_parameter_correlations,
26
+ plot_seasonal_salinity_for_bays,
27
+ plot_sector_trends,
28
+ )
29
+ from main import (
30
+ create_multiindex_columns,
31
+ create_overall_summary,
32
+ create_summary_by_station_and_position,
33
+ get_analyte_data_with_lat_long,
34
+ get_raw_data,
35
+ get_stations_data,
36
+ )
37
+
38
+
39
def log_visit():
    """Append one visit record to ``analytics.json`` and update its counters.

    Nothing is logged while the session is flagged ``admin_authenticated``.
    Each record holds the timestamp, date, user agent, a per-session visitor
    id (created on first use), the current section, whether this is the
    session's first logged visit, and the query parameters. The file also
    tracks unique visitors per day and a hit count per section.

    An unreadable or malformed analytics file is treated as empty so that a
    damaged log cannot take the whole page down.
    """
    if st.session_state.get("admin_authenticated", False):
        return

    log_file = Path("analytics.json")
    now = datetime.now()
    today = now.strftime("%Y-%m-%d")

    if "visitor_id" not in st.session_state:
        st.session_state.visitor_id = str(uuid.uuid4())
    visitor_id = st.session_state.visitor_id

    # Header access is best-effort; fall back to "Unknown" on any failure.
    try:
        user_agent = st.context.headers.get("User-Agent", "Unknown")
    except Exception:
        user_agent = "Unknown"

    visit_type = (
        "initial" if not st.session_state.get("logged_visit") else "section_change"
    )

    visit_data = {
        "timestamp": now.isoformat(),
        "date": today,
        "user_agent": user_agent,
        "visitor_id": visitor_id,
        "page_section": st.session_state.get("current_section", "Overall Summary"),
        "visit_type": visit_type,
        "query_params": dict(st.query_params),
    }

    # Load the existing log. A missing, unreadable, corrupt (ValueError covers
    # JSONDecodeError and decoding errors) or non-dict file starts a new log.
    try:
        with open(log_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        data = {}
    if not isinstance(data, dict):
        data = {}

    data.setdefault("visits", [])
    data.setdefault("daily_counts", {})
    data.setdefault("section_counts", {})
    data.setdefault("daily_visitors", {})

    # Unique visitors per day; daily_counts mirrors the size of that list.
    todays_visitors = data["daily_visitors"].setdefault(today, [])
    if visitor_id not in todays_visitors:
        todays_visitors.append(visitor_id)
        data["daily_counts"][today] = len(todays_visitors)

    data["visits"].append(visit_data)
    current_section = visit_data["page_section"]
    data["section_counts"][current_section] = (
        data["section_counts"].get(current_section, 0) + 1
    )

    with open(log_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
102
+
103
+
104
# Log the first page load of a session once; the flag lives in session state
# so later reruns of this script skip it.
if not st.session_state.get("logged_visit"):
    log_visit()
    st.session_state["logged_visit"] = True

# Admin password comes from the environment; empty string when unset.
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "")
# Defaults for the sidebar toggles further down.
ENABLE_TIMING = False
ENABLE_ALTAIR = False
111
+
112
+
113
def check_admin_access():
    """Return whether this session is logged in as admin.

    Always False when no ``ADMIN_PASSWORD`` is configured (in that case the
    session flag is not touched). Otherwise the ``admin_authenticated``
    session flag is created as False if missing and its value returned.
    """
    if ADMIN_PASSWORD:
        return st.session_state.setdefault("admin_authenticated", False)
    return False
122
+
123
+
124
def render_admin_panel():
    """Render the admin login/logout controls in the sidebar.

    Shows a "Logout" button when the session is authenticated, otherwise a
    password field with a "Login" button. A successful login or a logout
    flips ``st.session_state.admin_authenticated`` and reruns the script.
    """
    import hmac

    with st.sidebar:
        st.markdown("---")
        with st.expander("🔒 Admin", expanded=False):
            # Use .get(): check_admin_access() does not create the flag when
            # no ADMIN_PASSWORD is configured, and attribute access on a
            # missing session key raises.
            if st.session_state.get("admin_authenticated", False):
                if st.button("Logout"):
                    st.session_state.admin_authenticated = False
                    st.rerun()
                return

            password_input = st.text_input("Password", type="password")
            if st.button("Login"):
                # Never accept a login when no password is configured (an
                # empty input would otherwise equal the empty default), and
                # compare in constant time. Bytes are used because
                # compare_digest rejects non-ASCII str.
                password_ok = bool(ADMIN_PASSWORD) and hmac.compare_digest(
                    password_input.encode("utf-8"),
                    ADMIN_PASSWORD.encode("utf-8"),
                )
                if password_ok:
                    st.session_state.admin_authenticated = True
                    st.rerun()
                else:
                    st.error("Incorrect password")
141
+
142
+
143
st.set_page_config(
    page_title="Water Quality Summary",
    page_icon="💧",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={"Get Help": None, "Report a bug": None, "About": None},
)

st.sidebar.title("Navigation")

sections = [
    "Overall Summary",
    "Summary by Station",
    "Nutrient Ratios",
    "Sector Trends",
    "Trends by Station",
    "Parameter Correlations",
    "DO/Temp Relationship",
    "Calendar Heatmaps",
    "Seasonal Trends",
    "Raw Data",
]

# Admin-only extras: the Analytics section and the timing toggle.
is_admin = check_admin_access()
if is_admin:
    sections.append("Analytics")
    ENABLE_TIMING = st.sidebar.toggle("Enable Timing", value=ENABLE_TIMING)

section = st.sidebar.radio(
    "Go to",
    sections,
)

# Track the active section for non-admin sessions and log real section
# changes. The session's first page load has already been logged as an
# "initial" visit earlier in this script, so the first time the section is
# recorded here it is stored without logging a second, duplicate entry.
previous_section = st.session_state.get("current_section")
if (
    not st.session_state.get("admin_authenticated", False)
    and previous_section != section
):
    st.session_state.current_section = section
    if previous_section is not None:
        log_visit()

# The admin login panel is only offered on the landing section.
if section == "Overall Summary":
    render_admin_panel()
191
+
192
+
193
def summarize_parameter_value(value: str, max_length: int = 100) -> str:
    """Shorten the string form of a parameter for display in timing logs.

    Rules, applied in order:

    * falsy ``value`` -> ``""``;
    * text mentioning ``DataFrame`` with a ``[...]`` part ->
      ``"DataFrame[<text between the first '[' and first ']'>]"``;
    * text starting with ``[``, ``(`` or ``{`` -> its first 20 characters
      plus an item count. The count is a heuristic (commas + 1, so nested
      or trailing commas are over-counted); an empty container counts as 0;
    * anything longer than ``max_length`` is truncated with ``...``;
    * otherwise the value is returned unchanged.

    Args:
        value: String representation of the parameter.
        max_length: Truncation limit for plain strings only.

    Returns:
        The summarized string.
    """
    if not value:
        return ""

    # DataFrame-like reprs such as "DataFrame[1000x20]": keep the dimensions.
    if "DataFrame" in value and "[" in value and "]" in value:
        dims = value[value.find("[") + 1 : value.find("]")]
        return f"DataFrame[{dims}]"

    # Lists, tuples, dicts and sets.
    if value.startswith(("[", "(", "{")):
        # Whatever remains after dropping the opener and trailing closers;
        # nothing left means an empty container, which holds no items.
        inner = value[1:].rstrip().rstrip("])}").strip()
        item_count = value.count(",") + 1 if inner else 0
        return f"{value[:20]}... ({item_count} items)"

    if len(value) > max_length:
        return f"{value[:max_length]}..."

    return value
221
+
222
+
223
def timer(include_params=False):
    """Decorator factory that records call durations in Streamlit session state.

    When the module-level ``ENABLE_TIMING`` flag is falsy at call time the
    wrapped function is called directly with no bookkeeping. Otherwise each
    call appends its duration to ``st.session_state.timing_stats[<name>]``
    and a log entry (timestamp, function name, duration) to
    ``st.session_state.timing_logs``. Calls that raise are not recorded.

    Args:
        include_params: When true, the log entry also carries a
            ``"parameters"`` dict of summarized argument values keyed by
            parameter name.

    Returns:
        A decorator preserving the wrapped function's metadata.
    """

    def decorator(func):
        import inspect

        # Parameter names are fixed per function, so resolve them once at
        # decoration time instead of on every timed call.
        param_names = (
            list(inspect.signature(func).parameters) if include_params else []
        )

        @wraps(func)
        def wrapper(*args, **kwargs):
            if not ENABLE_TIMING:
                return func(*args, **kwargs)

            start = time.perf_counter()
            result = func(*args, **kwargs)
            duration = time.perf_counter() - start

            # Initialize each store independently so that one being present
            # without the other cannot break the appends below.
            if "timing_stats" not in st.session_state:
                st.session_state.timing_stats = {}
            if "timing_logs" not in st.session_state:
                st.session_state.timing_logs = []

            st.session_state.timing_stats.setdefault(func.__name__, []).append(
                duration
            )

            log_entry = {
                "timestamp": datetime.now().isoformat(),
                "function": func.__name__,
                "duration": duration,
            }

            if include_params:
                # Positional args are matched to parameter names by position
                # (extras beyond the signature are dropped); keyword args are
                # taken as given.
                param_values = {
                    name: summarize_parameter_value(str(arg), max_length=40)
                    for name, arg in zip(param_names, args)
                }
                param_values.update(
                    {
                        k: summarize_parameter_value(str(v), max_length=40)
                        for k, v in kwargs.items()
                    }
                )
                log_entry["parameters"] = param_values

            st.session_state.timing_logs.append(log_entry)

            return result

        return wrapper

    return decorator
285
+
286
+
287
@timer(include_params=False)
def load_raw_data():
    """Load the raw measurements from the bundled parquet file via ``get_raw_data``."""
    parquet_path = "data/master_data_file_2019-01-01_-_2024-10-31.parquet"
    return get_raw_data(parquet_path)
290
+
291
+
292
@timer(include_params=False)
def create_summaries(raw_df):
    """Build the summary views derived from the raw data.

    Returns:
        tuple: ``(summary_by_station, overall_summary, multiindex_df)`` where
        the last item is the per-station summary passed through
        ``create_multiindex_columns``.
    """
    by_station = create_summary_by_station_and_position(raw_df)
    overall = create_overall_summary(raw_df)
    return by_station, overall, create_multiindex_columns(by_station)
298
+
299
+
300
@timer(include_params=False)
def prepare_downloads(summary_by_station, multiindex_df, raw_df):
    """Pre-render the downloadable files offered by the app.

    Args:
        summary_by_station: Per-station summary; exported as CSV with its
            index reset into columns.
        multiindex_df: Summary with multi-level columns; exported as an
            ``.xlsx`` workbook with a single "Water Quality Summary" sheet.
        raw_df: Raw measurements; exported as CSV without the index.

    Returns:
        dict: ``{"summary": {"CSV": ..., "Excel": ...}, "raw": {"CSV": ...}}``
        where every leaf is a ``(payload, file_extension, mime_type)`` tuple.
    """
    summary_csv = summary_by_station.reset_index().to_csv(index=False)

    excel_buffer = io.BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="xlsxwriter") as writer:
        multiindex_df.to_excel(writer, sheet_name="Water Quality Summary")
    summary_excel = excel_buffer.getvalue()

    raw_csv = raw_df.to_csv(index=False)

    # The workbook is written in the .xlsx (OOXML) format, so advertise the
    # OOXML spreadsheet type rather than the legacy .xls one.
    xlsx_mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    return {
        "summary": {
            "CSV": (summary_csv, "csv", "text/csv"),
            "Excel": (summary_excel, "xlsx", xlsx_mime),
        },
        "raw": {
            "CSV": (raw_csv, "csv", "text/csv"),
        },
    }
317
+
318
+
319
@timer(include_params=False)
def load_seasonal_data(raw_df, analyte):
    """Return the rows for ``analyte`` from ``raw_df`` via ``get_analyte_data_with_lat_long``."""
    seasonal_df = get_analyte_data_with_lat_long(raw_df, analyte)
    return seasonal_df
323
+
324
+
325
@timer(include_params=True)
def generate_seasonal_plot(data, year, shapefile_path):
    """Build the seasonal salinity figure for the bays (timed, with parameters logged)."""
    return plot_seasonal_salinity_for_bays(
        data,
        year,
        shapefile_path=shapefile_path,
    )
329
+
330
+
331
+ # if ENABLE_TIMING:
332
+
333
+ # def load_data():
334
+ # """
335
+ # Load all data views needed by the application.
336
+
337
+ # Returns:
338
+ # dict: Contains different views of the data
339
+ # """
340
+ # raw_df = load_raw_data()
341
+ # summary_by_station, overall_summary, multiindex_df = create_summaries(raw_df)
342
+ # downloads = prepare_downloads(summary_by_station, multiindex_df, raw_df)
343
+ # return {
344
+ # "raw_df": raw_df,
345
+ # "summary_by_station": summary_by_station,
346
+ # "overall_summary": overall_summary,
347
+ # "multiindex_df": multiindex_df,
348
+ # "downloads": downloads,
349
+ # }
350
+ # else:
351
+
352
+
353
@st.cache_data
def load_data():
    """
    Load every data view the app needs; the result is cached by Streamlit.

    Returns:
        dict: Keys ``raw_df``, ``summary_by_station``, ``overall_summary``,
        ``multiindex_df`` and ``downloads`` (the pre-rendered download
        payloads).
    """
    raw = load_raw_data()
    by_station, overall, multiindex = create_summaries(raw)
    return {
        "raw_df": raw,
        "summary_by_station": by_station,
        "overall_summary": overall,
        "multiindex_df": multiindex,
        "downloads": prepare_downloads(by_station, multiindex, raw),
    }
371
+
372
+
373
@st.cache_data
def generate_correlation_plot(
    subset_df, analyte_names, subset_by, subset, position_filter
):
    """Cached pass-through to ``plot_parameter_correlations`` with the same arguments."""
    return plot_parameter_correlations(
        subset_df,
        analyte_names,
        subset_by,
        subset,
        position_filter,
    )
381
+
382
+
383
def create_overall_summary_table(df: pd.DataFrame) -> GT:
    """Render the overall summary statistics as a formatted ``GT`` table.

    Args:
        df: Summary frame whose index holds the statistic names (one of
            which is expected to be ``"Count"``) and whose columns are the
            measured parameters. The frame is not modified.

    Returns:
        GT: Table with the statistics as row names, one-decimal formatting
        for Secchi depth, temperature and dissolved oxygen, integer
        formatting for the ``Count`` row, two-line column labels and equal
        column widths.
    """
    # rename_axis returns a new frame, so the caller's index name is left
    # untouched (assigning df.index.name would change it in place).
    df = df.rename_axis("Statistic").reset_index()
    value_columns = list(df.columns[1:])

    # Put the trailing word of each header (text after the last space) on
    # its own line; "pH" has none and gets a blank second line to keep the
    # header height uniform.
    column_labels = {
        col: html(f"{col.rpartition(' ')[0]}<br>{col.rpartition(' ')[-1]}")
        if col != "pH"
        else html(f"{col}<br>&nbsp;")
        for col in value_columns
    }

    return (
        GT(df, rowname_col="Statistic")
        .tab_header(
            title="Overall Water Quality",
            subtitle="Summary statistics for all data analyzed during study period",
        )
        .fmt_number(
            columns=[
                "Secchi Depth (feet)",
                "Temperature (°C)",
                "Dissolved Oxygen (mg/L)",
            ],
            decimals=1,
        )
        .fmt_integer(
            columns=value_columns,
            rows=lambda x: x["Statistic"] == "Count",  # type: ignore
            use_seps=True,
        )
        .cols_label(**column_labels)  # type: ignore
        .cols_width(cases={col: "14%" for col in value_columns})
        .opt_align_table_header(align="center")
    )
417
+
418
+
419
+ data = load_data()
420
+
421
+ if section == "Overall Summary":
422
+ st.title("Overall Summary")
423
+ st.html(create_overall_summary_table(data["overall_summary"]).as_raw_html())
424
+
425
+ st.markdown("### Sampling Stations Map")
426
+ stations_df = get_stations_data()
427
+ fig = px.scatter_mapbox(
428
+ stations_df,
429
+ lat="Latitude",
430
+ lon="Longitude",
431
+ hover_data={
432
+ "Number": True,
433
+ "U_of_F": True,
434
+ "Sector": True,
435
+ "WBID": True,
436
+ "Latitude": False,
437
+ "Longitude": False,
438
+ },
439
+ hover_name="Name",
440
+ zoom=10,
441
+ height=700,
442
+ labels={
443
+ "Number": "Station Number",
444
+ "U_of_F": "ID",
445
+ "Sector": "Sector",
446
+ "WBID": "WBID",
447
+ },
448
+ )
449
+ fig.update_layout(
450
+ mapbox_style="carto-positron",
451
+ margin={"r": 0, "t": 0, "l": 0, "b": 0},
452
+ )
453
+ st.plotly_chart(fig, use_container_width=True)
454
+
455
+
456
+ elif section == "Summary by Station":
457
+ st.title("Summary by Station")
458
+ download_format = st.radio(
459
+ "Select download format:",
460
+ ["CSV", "Excel"],
461
+ key="summary_download",
462
+ horizontal=True,
463
+ )
464
+ download_data = data["downloads"]["summary"][download_format]
465
+ st.download_button(
466
+ label=f"Download Summary Data ({download_format})",
467
+ data=download_data[0],
468
+ file_name=f"water_quality_summary.{download_data[1]}",
469
+ mime=download_data[2],
470
+ )
471
+
472
+ st.markdown("""
473
+ This table shows summary statistics for various water quality measurements across different stations.
474
+ Each station's measurements are broken down into surface and bottom readings where applicable.
475
+ """)
476
+ st.dataframe(
477
+ data["multiindex_df"]
478
+ .style.format(precision=2)
479
+ .highlight_null(props="background-color: lightgray"),
480
+ use_container_width=True,
481
+ height=600,
482
+ )
483
+
484
+ st.markdown("---")
485
+ total_stations = len(data["summary_by_station"].index.get_level_values(0).unique())
486
+ st.markdown(f"Total number of stations: **{total_stations}**")
487
+
488
+ elif section == "Trends by Station":
489
+ st.title("Trends by Station")
490
+ analyte_names = [
491
+ "Dissolved Oxygen",
492
+ "Salinity",
493
+ "pH",
494
+ "Depth, Secchi Disk Depth",
495
+ "Turbidity",
496
+ "Fecal Coliform (MPN)",
497
+ "Total Nitrogen",
498
+ "Total Phosphorus",
499
+ ]
500
+ st.sidebar.markdown("### Filter Options")
501
+
502
+ selected_station = st.sidebar.selectbox(
503
+ "Station:",
504
+ sorted(data["raw_df"]["Station_Number"].unique()),
505
+ index=sorted(data["raw_df"]["Station_Number"].unique()).index("3.20"),
506
+ )
507
+ selected_position = st.sidebar.segmented_control(
508
+ "Sample Position:",
509
+ ("All", "Surface", "Bottom"),
510
+ default="All",
511
+ selection_mode="single",
512
+ )
513
+ selected_position = selected_position or "All"
514
+ filtered_df = data["raw_df"].query("Station_Number == @selected_station")
515
+ if selected_position != "All":
516
+ filtered_df = filtered_df.query("Sample_Position == @selected_position")
517
+
518
+ csv_buffer = io.StringIO()
519
+ filtered_df.to_csv(csv_buffer, index=False)
520
+ st.sidebar.download_button(
521
+ label="Download Filtered Data (CSV)",
522
+ data=csv_buffer.getvalue(),
523
+ file_name=f"station_{selected_station}_{selected_position.lower()}_data.csv",
524
+ mime="text/csv",
525
+ )
526
+
527
+ with st.sidebar.expander("Preview Filtered Data"):
528
+ st.markdown(f"**{len(filtered_df):,}** records")
529
+ display_columns = [
530
+ "Activity_Start_Date_Time",
531
+ "Sample_Position",
532
+ "Org_Analyte_Name",
533
+ "Org_Result_Value",
534
+ "Org_Result_Unit",
535
+ ]
536
+ preview_df = filtered_df[["Station_Number"] + display_columns].copy()
537
+ preview_df.set_index("Station_Number", inplace=True)
538
+ st.dataframe(
539
+ preview_df.style.format(precision=2),
540
+ use_container_width=True,
541
+ height=300,
542
+ )
543
+
544
+ if not filtered_df.empty:
545
+ fig = plot_analyte_trends(filtered_df, analyte_names, selected_position)
546
+ st.pyplot(fig)
547
+ else:
548
+ st.warning(
549
+ "No data available for the selected station and position combination."
550
+ )
551
+
552
+ elif section == "Sector Trends":
553
+ st.title("Sector Trends")
554
+ ENABLE_ALTAIR = st.sidebar.toggle("Interactive Plots", value=ENABLE_ALTAIR)
555
+ default_analytes = [
556
+ "Dissolved Oxygen",
557
+ "Salinity",
558
+ "Depth, Secchi Disk Depth",
559
+ "Total Nitrogen",
560
+ "Total Phosphorus",
561
+ ]
562
+ all_analytes = default_analytes + [
563
+ x
564
+ for x in sorted(data["raw_df"]["Org_Analyte_Name"].unique())
565
+ if x not in default_analytes
566
+ ]
567
+
568
+ selected_analytes = st.sidebar.multiselect(
569
+ "Select Analytes:",
570
+ options=all_analytes,
571
+ default=default_analytes,
572
+ key="sector_analyte_select",
573
+ help="Choose one or more analytes to plot.",
574
+ )
575
+ if selected_analytes and not data["raw_df"].empty:
576
+ if ENABLE_ALTAIR:
577
+ charts = altair_plot_sector_trends(data["raw_df"], selected_analytes)
578
+ st.altair_chart(charts, use_container_width=True) # type: ignore
579
+ else:
580
+ fig = plot_sector_trends(data["raw_df"], selected_analytes, base_height=3.5)
581
+ st.pyplot(fig)
582
+ elif not selected_analytes:
583
+ st.warning("No analytes selected.")
584
+ else:
585
+ st.warning("No data available for the selected analytes.")
586
+
587
+ elif section == "Parameter Correlations":
588
+ st.title("Parameter Correlations")
589
+ subset_by = "Sector"
590
+ st.sidebar.markdown("### Filter Options")
591
+ position_filter = st.sidebar.selectbox(
592
+ "Sample Position:", ["All", "Surface", "Bottom"], index=0
593
+ )
594
+ with st.spinner("Loading data for correlation plots..."):
595
+ analyte_names = [
596
+ "Dissolved Oxygen",
597
+ "Salinity",
598
+ "pH",
599
+ "Depth, Secchi Disk Depth",
600
+ "Turbidity",
601
+ "Fecal Coliform (MPN)",
602
+ "Total Nitrogen",
603
+ "Total Phosphorus",
604
+ ]
605
+ raw_df = data["raw_df"]
606
+ raw_df["Year"] = raw_df["Activity_Start_Date_Time"].dt.year
607
+ years = ["All"] + sorted(raw_df["Year"].unique().tolist(), reverse=True)
608
+ year_filter = st.sidebar.selectbox("Year:", years, index=0)
609
+ plot_df = raw_df.copy()
610
+ if position_filter != "All":
611
+ plot_df = plot_df[plot_df["Sample_Position"] == position_filter]
612
+ if year_filter != "All":
613
+ plot_df = plot_df[plot_df["Year"] == year_filter]
614
+ plot_df_download = plot_df.copy()
615
+ csv_buffer = io.StringIO()
616
+ plot_df_download.to_csv(csv_buffer, index=False)
617
+ st.sidebar.download_button(
618
+ label="Download Filtered Data (CSV)",
619
+ data=csv_buffer.getvalue(),
620
+ file_name=f"correlation_data_{subset_by}_{position_filter}_{year_filter}.csv",
621
+ mime="text/csv",
622
+ )
623
+ st.sidebar.markdown("### Group By")
624
+ subset_by = st.sidebar.selectbox(
625
+ "Group correlations by:", ["Sector", "Waterbody_Class"], index=0
626
+ )
627
+ unique_subsets = sorted(plot_df[subset_by].unique())
628
+
629
+ selected_groups = st.sidebar.multiselect(
630
+ "Select groups to display:",
631
+ options=unique_subsets,
632
+ default=unique_subsets,
633
+ key="group_selector",
634
+ )
635
+ # Add ordering control
636
+ order_by = st.sidebar.radio(
637
+ "Order groups by:", ["Number of Records", "Alphabetical"], key="group_order"
638
+ )
639
+
640
+ ### FIX THIS
641
+ # Add download button for grouped correlation data
642
+ if selected_groups:
643
+ grouped_data = []
644
+ for group in selected_groups:
645
+ subset_df = plot_df[plot_df[subset_by] == group]
646
+ if not subset_df.empty:
647
+ # Filter for just the analytes we want to correlate
648
+ analyte_df = subset_df[
649
+ subset_df["Org_Analyte_Name"].isin(analyte_names)
650
+ ].copy()
651
+ analyte_df["Group"] = group
652
+ grouped_data.append(analyte_df)
653
+
654
+ if grouped_data:
655
+ combined_data = pd.concat(grouped_data)
656
+ csv_buffer = io.StringIO()
657
+ combined_data.to_csv(csv_buffer, index=False)
658
+ st.sidebar.download_button(
659
+ label="Download Grouped Correlation Data (CSV)",
660
+ data=csv_buffer.getvalue(),
661
+ file_name=f"grouped_correlation_data_{subset_by}_{position_filter}_{year_filter}.csv",
662
+ mime="text/csv",
663
+ )
664
+
665
+ # Order the selected groups
666
+ if order_by == "Number of Records":
667
+ group_counts = {
668
+ group: len(plot_df[plot_df[subset_by] == group])
669
+ for group in selected_groups
670
+ }
671
+ selected_groups = sorted(
672
+ selected_groups, key=lambda x: group_counts[x], reverse=True
673
+ )
674
+ else:
675
+ selected_groups = sorted(selected_groups)
676
+
677
+ # Loop with filtered groups
678
+ cols = st.columns(2)
679
+ for idx, subset in enumerate(selected_groups):
680
+ subset_df = plot_df[plot_df[subset_by] == subset]
681
+ if not subset_df.empty:
682
+ fig = generate_correlation_plot(
683
+ subset_df, analyte_names, subset_by, subset, position_filter
684
+ )
685
+ cols[idx % 2].pyplot(fig)
686
+ plt.close()
687
+ with cols[idx % 2].expander(f"View {subset} Data"):
688
+ st.markdown(f"**{len(subset_df):,}** records")
689
+ display_columns = [
690
+ "Activity_Start_Date_Time",
691
+ "Station_Number",
692
+ "Sample_Position",
693
+ "Org_Analyte_Name",
694
+ "Org_Result_Value",
695
+ "Org_Result_Unit",
696
+ ]
697
+ st.dataframe(
698
+ subset_df[display_columns].style.format(precision=2),
699
+ use_container_width=True,
700
+ height=300,
701
+ )
702
+ csv_buffer = io.StringIO()
703
+ subset_df.to_csv(csv_buffer, index=False)
704
+ st.download_button(
705
+ label=f"Download {subset} Data (CSV)",
706
+ data=csv_buffer.getvalue(),
707
+ file_name=f"correlation_data_{subset}_{position_filter}_{year_filter}.csv",
708
+ mime="text/csv",
709
+ )
710
+
711
+ elif section == "DO/Temp Relationship":
712
+ ENABLE_ALTAIR = st.sidebar.toggle("Interactive Plot", value=ENABLE_ALTAIR)
713
+ st.title("DO/Temp Relationship")
714
+ st.markdown(
715
+ "This plot shows the relationship between dissolved oxygen and water temperature for all data."
716
+ )
717
+ if ENABLE_ALTAIR:
718
+ fig = altair_plot_do_temp_relationship(data["raw_df"])
719
+ st.altair_chart(fig, use_container_width=True) # type: ignore
720
+ else:
721
+ fig = plot_do_temp_relationship(data["raw_df"])
722
+ st.pyplot(fig)
723
+
724
+ elif section == "Calendar Heatmaps":
725
+ st.title("Calendar Heatmaps")
726
+ st.info(
727
+ "💡 You can customize the colormaps using the 'Plot Settings' expander in the sidebar."
728
+ )
729
+ raw_df = data["raw_df"]
730
+ raw_df["Date"] = pd.to_datetime(raw_df["Activity_Start_Date_Time"]).dt.date
731
+
732
+ default_analytes = [
733
+ "Temperature, Water",
734
+ "Dissolved Oxygen",
735
+ "Salinity",
736
+ "pH",
737
+ "Turbidity",
738
+ "Depth, Secchi Disk Depth",
739
+ "Fecal Coliform (MPN)",
740
+ "Total Nitrogen",
741
+ "Total Phosphorus",
742
+ "Chlorophyll-uncorrected",
743
+ ]
744
+
745
+ # Get all unique analytes and ensure defaults are at the start of the list
746
+ all_analytes = default_analytes + [
747
+ x
748
+ for x in sorted(raw_df["Org_Analyte_Name"].unique())
749
+ if x not in default_analytes
750
+ ]
751
+ selected_analytes = st.sidebar.multiselect(
752
+ "Select Analytes:",
753
+ options=all_analytes,
754
+ default=default_analytes,
755
+ key="calendar_analyte_select",
756
+ help="Choose one or more analytes to display in the heatmap.",
757
+ )
758
+
759
+ # Filter Options
760
+ st.sidebar.markdown("### Filter Options")
761
+ sector_filter = st.sidebar.selectbox(
762
+ "Sector:",
763
+ ["All"] + sorted(raw_df["Sector"].unique().tolist()),
764
+ index=0,
765
+ key="calendar_sector_select",
766
+ )
767
+ position_filter = st.sidebar.selectbox(
768
+ "Position:",
769
+ ["All", "Surface", "Bottom"],
770
+ index=0,
771
+ key="calendar_position_select",
772
+ )
773
+
774
def format_colormap_option(option):
    """Return the radio-button label for a colormap name.

    Known colormaps get a bracketed family tag appended; any other value
    (including "Default") is returned unchanged.

    Args:
        option: Colormap name as listed in the "Color Scheme" radio options.

    Returns:
        The option followed by " [Sequential]", " [Sequential (Multi-hue)]"
        or " [Diverging]" when the colormap is recognised, otherwise the
        option itself.
    """
    # "GnBu" is a multi-hue sequential map; it is grouped with the other
    # multi-hue maps so the label agrees with the grouping used in the
    # radio option list.
    families = {
        " [Sequential]": (
            "viridis",
            "plasma",
            "inferno",
            "magma",
            "Blues",
            "Blues_r",
        ),
        " [Sequential (Multi-hue)]": (
            "GnBu",
            "YlOrRd",
            "YlGnBu",
            "RdPu",
        ),
        " [Diverging]": (
            "RdYlBu",
            "RdBu",
            "coolwarm",
        ),
    }
    for suffix, members in families.items():
        if option in members:
            return option + suffix
    return option
799
+
800
+ colormap_help_text = """
801
+ Any selection here will override the default color scheme for all of the displayed
802
+ heatmaps. Selecting Default will revert to the analyte-specific default color schemes.
803
+
804
+ **The default color schemes are:**
805
+
806
+ `Fecal Coliform (MPN)` : `viridis` _(blue-green-yellow)_
807
+ `Temperature, Water` : `coolwarm` _(red-white-blue)_
808
+ `Dissolved Oxygen` : `RdYlBu` _(red-yellow-blue)_
809
+ `Total Nitrogen/Phosphorus` : `GnBu` _(green-blue)_
810
+ `Depth, Secchi Disk Depth` : `Blues_r` _(reversed blues)_
811
+ `All other analytes` : `Blues` _(blue)_
812
+ """
813
+ with st.sidebar.expander("Plot Settings", expanded=False):
814
+ colormap = st.radio(
815
+ "Color Scheme",
816
+ options=[
817
+ "Default",
818
+ # Sequential (Perceptually Uniform)
819
+ "viridis",
820
+ "plasma",
821
+ "inferno",
822
+ "magma",
823
+ # Sequential (Single-hue)
824
+ "Blues",
825
+ "Blues_r",
826
+ # Sequential (Multi-hue)
827
+ "GnBu",
828
+ "YlOrRd",
829
+ "YlGnBu",
830
+ "RdPu",
831
+ # Diverging
832
+ "RdYlBu",
833
+ "RdBu",
834
+ "coolwarm",
835
+ ],
836
+ index=0,
837
+ help=colormap_help_text,
838
+ format_func=format_colormap_option,
839
+ )
840
+
841
+ if colormap == "Default":
842
+ colormap = None
843
+
844
+ # Filter data
845
+ plot_df = raw_df.copy()
846
+ if sector_filter != "All":
847
+ plot_df = plot_df[plot_df["Sector"] == sector_filter]
848
+ if position_filter != "All":
849
+ plot_df = plot_df[plot_df["Sample_Position"] == position_filter]
850
+ if not plot_df.empty:
851
+ for analyte in selected_analytes:
852
+ fig = plot_calendar_heatmap(plot_df, analyte, colormap)
853
+ st.pyplot(fig)
854
+ else:
855
+ st.warning("No data available for the selected filters.")
856
+
857
+ elif section == "Seasonal Trends":
858
+ st.title("Seasonal Trends")
859
+ raw_df = data["raw_df"]
860
+ years = sorted(pd.to_datetime(raw_df["Activity_Start_Date_Time"]).dt.year.unique())
861
+ col1, col2 = st.columns(2)
862
+ with col1:
863
+ analyte = st.selectbox(
864
+ "Select Analyte:", ["Salinity"], index=0, key="seasonal_analyte_select"
865
+ )
866
+ with col2:
867
+ selected_year = st.selectbox(
868
+ "Select Year:",
869
+ sorted(years, reverse=True),
870
+ index=0,
871
+ key="seasonal_year_select",
872
+ )
873
+ if not raw_df.empty:
874
+ seasonal_data = load_seasonal_data(raw_df, analyte)
875
+ fig = generate_seasonal_plot(
876
+ seasonal_data,
877
+ str(selected_year),
878
+ shapefile_path="data/SAB/SAB.shp",
879
+ )
880
+ st.pyplot(fig)
881
+ else:
882
+ st.warning("No data available for seasonal analysis.")
883
+
884
+ elif section == "Nutrient Ratios":
885
+ ENABLE_ALTAIR = st.sidebar.toggle("Interactive Plots", value=ENABLE_ALTAIR)
886
+ st.title("Nutrient Ratios")
887
+ raw_df = data["raw_df"]
888
+ if not raw_df.empty:
889
+ if ENABLE_ALTAIR:
890
+ vconcat = altair_plot_np_ratios(raw_df)
891
+ st.altair_chart(vconcat, use_container_width=True) # type: ignore
892
+ else:
893
+ fig = plot_np_ratios(raw_df)
894
+ st.pyplot(fig)
895
+ else:
896
+ st.warning("No data available for nutrient ratio analysis.")
897
+
898
+ elif section == "Raw Data":
899
+ st.title("Raw Data")
900
+ raw_df = data["raw_df"]
901
+ raw_download_format = st.radio(
902
+ "Select download format:",
903
+ ["CSV", "Excel"],
904
+ key="raw_download",
905
+ horizontal=True,
906
+ )
907
+
908
+ if raw_download_format == "Excel":
909
+ excel_buffer = io.BytesIO()
910
+ with pd.ExcelWriter(excel_buffer, engine="xlsxwriter") as writer:
911
+ raw_df.to_excel(writer, sheet_name="Raw Water Quality Data", index=False)
912
+ raw_excel = excel_buffer.getvalue()
913
+ download_data = (raw_excel, "xlsx", "application/vnd.ms-excel")
914
+ else:
915
+ download_data = data["downloads"]["raw"]["CSV"]
916
+
917
+ st.download_button(
918
+ label=f"Download Raw Data ({raw_download_format})",
919
+ data=download_data[0],
920
+ file_name=f"water_quality_raw_2019-01-01_-_2024-10-31.{download_data[1]}",
921
+ mime=download_data[2],
922
+ )
923
+ st.markdown(f"""
924
+ Preview of the first 1,000 of {raw_df.shape[0]:,} records in the dataset.
925
+ """)
926
+ st.dataframe(
927
+ raw_df.head(1000).style.format(precision=2),
928
+ use_container_width=True,
929
+ height=600,
930
+ )
931
+
932
+ elif section == "Analytics":
933
+ st.title("Analytics")
934
+
935
+ log_file = Path("analytics.json")
936
+ if log_file.exists():
937
+ with open(log_file, "r") as f:
938
+ analytics_data = json.load(f)
939
+
940
+ col1, col2 = st.columns(2)
941
+
942
+ with col1:
943
+ visits_df = pd.DataFrame(analytics_data["visits"])
944
+ visits_df["timestamp"] = pd.to_datetime(visits_df["timestamp"])
945
+
946
+ daily_visits_df = (
947
+ visits_df.groupby("date")["visitor_id"]
948
+ .agg(["nunique", "count"])
949
+ .reset_index()
950
+ .rename(columns={"nunique": "Unique Visitors", "count": "Total Views"})
951
+ )
952
+ daily_visits_df["date"] = pd.to_datetime(daily_visits_df["date"])
953
+ daily_visits_df = daily_visits_df.sort_values("date")
954
+
955
+ total_unique_visitors = visits_df["visitor_id"].nunique()
956
+ total_views = len(visits_df)
957
+ avg_views_per_visitor = total_views / total_unique_visitors
958
+
959
+ st.subheader("Visitor Metrics")
960
+ metrics_col1, metrics_col2, metrics_col3 = st.columns(3)
961
+ metrics_col1.metric("Total Unique Visitors", total_unique_visitors)
962
+ metrics_col2.metric("Total Page Views", total_views)
963
+ metrics_col3.metric("Avg Views per Visitor", f"{avg_views_per_visitor:.1f}")
964
+
965
+ st.subheader("Daily Statistics")
966
+ st.dataframe(
967
+ daily_visits_df.style.format(
968
+ {"Unique Visitors": "{:,.0f}", "Total Views": "{:,.0f}"}
969
+ ),
970
+ hide_index=True,
971
+ )
972
+
973
+ with col2:
974
+ section_visits_df = pd.DataFrame(
975
+ {
976
+ "Section": analytics_data["section_counts"].keys(),
977
+ "Views": analytics_data["section_counts"].values(),
978
+ }
979
+ )
980
+ section_visits_df = section_visits_df.sort_values("Views", ascending=True)
981
+
982
+ st.subheader("Total Section Views")
983
+ st.bar_chart(section_visits_df.set_index("Section"))
984
+
985
+ with st.expander("Raw Visit Data"):
986
+ visits_df = pd.DataFrame(analytics_data["visits"])
987
+ visits_df["timestamp"] = pd.to_datetime(visits_df["timestamp"])
988
+ st.dataframe(visits_df)
989
+ else:
990
+ st.warning("No analytics data available.")
991
+
992
+ if ENABLE_TIMING:
993
+ st.markdown("---")
994
+ st.subheader("⚡ Performance Metrics")
995
+
996
+ if hasattr(st.session_state, "timing_stats"):
997
+ st.markdown("#### Summary Statistics")
998
+ # Create a summary dataframe with min, max, mean, and count
999
+ timing_summary = []
1000
+ for func_name, durations in st.session_state.timing_stats.items():
1001
+ timing_summary.append(
1002
+ {
1003
+ "Function": func_name,
1004
+ "Min (seconds)": min(durations),
1005
+ "Max (seconds)": max(durations),
1006
+ "Mean (seconds)": sum(durations) / len(durations),
1007
+ "Calls": len(durations),
1008
+ }
1009
+ )
1010
+
1011
+ timing_df = pd.DataFrame(timing_summary).sort_values(
1012
+ "Mean (seconds)", ascending=False
1013
+ )
1014
+
1015
+ st.dataframe(
1016
+ timing_df.style.format(
1017
+ {
1018
+ "Min (seconds)": "{:.2f}",
1019
+ "Max (seconds)": "{:.2f}",
1020
+ "Mean (seconds)": "{:.2f}",
1021
+ "Calls": "{:,.0f}",
1022
+ }
1023
+ ),
1024
+ use_container_width=True,
1025
+ )
1026
+
1027
+ st.markdown("#### Detailed Function Calls")
1028
+ if st.session_state.timing_logs:
1029
+ logs_df = pd.DataFrame(st.session_state.timing_logs)
1030
+ logs_df["timestamp"] = pd.to_datetime(logs_df["timestamp"])
1031
+
1032
+ # Format parameters column if it exists
1033
+ if "parameters" in logs_df.columns:
1034
+ # Option 1: Create wrapped text with newlines
1035
+ logs_df["parameters"] = logs_df["parameters"].apply(
1036
+ lambda x: (
1037
+ "\n".join(
1038
+ textwrap.wrap(
1039
+ "\n".join(f"{k}: {v}" for k, v in x.items()),
1040
+ width=50,
1041
+ break_long_words=False,
1042
+ replace_whitespace=False,
1043
+ )
1044
+ )
1045
+ if isinstance(x, dict)
1046
+ else str(x)
1047
+ )
1048
+ )
1049
+
1050
+ logs_df = logs_df.sort_values("timestamp", ascending=False)
1051
+
1052
+ st.dataframe(
1053
+ logs_df.style.format(
1054
+ {
1055
+ "duration": "{:.2f} seconds",
1056
+ "timestamp": lambda x: x.strftime("%H:%M:%S.%f")[:-3],
1057
+ }
1058
+ ),
1059
+ use_container_width=True,
1060
+ height=400,
1061
+ column_config={
1062
+ "parameters": st.column_config.TextColumn(
1063
+ "parameters",
1064
+ width="large",
1065
+ help="Function parameters and their values",
1066
+ )
1067
+ },
1068
+ )
1069
+ else:
1070
+ st.info("No timing statistics available yet. Try refreshing the page.")
data/SAB/SAB.cpg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad3031f5503a4404af825262ee8232cc04d4ea6683d42c5dd0a2f2a27ac9824
3
+ size 5
data/SAB/SAB.prj ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0360a15fcf4a096367d80e8c723d6dde12e82e4b05b906398443ddfc6a17b6cb
3
+ size 454
data/SAB/SAB.qmd ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE qgis PUBLIC 'http://mrcc.com/qgis.dtd' 'SYSTEM'>
2
+ <qgis version="3.40.0-Bratislava">
3
+ <identifier></identifier>
4
+ <parentidentifier></parentidentifier>
5
+ <language></language>
6
+ <type>dataset</type>
7
+ <title></title>
8
+ <abstract></abstract>
9
+ <links/>
10
+ <dates/>
11
+ <fees></fees>
12
+ <encoding></encoding>
13
+ <crs>
14
+ <spatialrefsys nativeFormat="Wkt">
15
+ <wkt>PROJCRS["NAD83(2011) / Florida GDL Albers",BASEGEOGCRS["NAD83(2011)",DATUM["NAD83 (National Spatial Reference System 2011)",ELLIPSOID["GRS 1980",6378137,298.257222101,LENGTHUNIT["metre",1]]],PRIMEM["Greenwich",0,ANGLEUNIT["degree",0.0174532925199433]],ID["EPSG",6318]],CONVERSION["Florida GDL Albers (meters)",METHOD["Albers Equal Area",ID["EPSG",9822]],PARAMETER["Latitude of false origin",24,ANGLEUNIT["degree",0.0174532925199433],ID["EPSG",8821]],PARAMETER["Longitude of false origin",-84,ANGLEUNIT["degree",0.0174532925199433],ID["EPSG",8822]],PARAMETER["Latitude of 1st standard parallel",24,ANGLEUNIT["degree",0.0174532925199433],ID["EPSG",8823]],PARAMETER["Latitude of 2nd standard parallel",31.5,ANGLEUNIT["degree",0.0174532925199433],ID["EPSG",8824]],PARAMETER["Easting at false origin",400000,LENGTHUNIT["metre",1],ID["EPSG",8826]],PARAMETER["Northing at false origin",0,LENGTHUNIT["metre",1],ID["EPSG",8827]]],CS[Cartesian,2],AXIS["easting (X)",east,ORDER[1],LENGTHUNIT["metre",1]],AXIS["northing (Y)",north,ORDER[2],LENGTHUNIT["metre",1]],USAGE[SCOPE["State-wide spatial data management."],AREA["United States (USA) - Florida."],BBOX[24.41,-87.63,31.01,-79.97]],ID["EPSG",6439]]</wkt>
16
+ <proj4>+proj=aea +lat_0=24 +lon_0=-84 +lat_1=24 +lat_2=31.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs</proj4>
17
+ <srsid>28506</srsid>
18
+ <srid>6439</srid>
19
+ <authid>EPSG:6439</authid>
20
+ <description>NAD83(2011) / Florida GDL Albers</description>
21
+ <projectionacronym>aea</projectionacronym>
22
+ <ellipsoidacronym>EPSG:7019</ellipsoidacronym>
23
+ <geographicflag>false</geographicflag>
24
+ </spatialrefsys>
25
+ </crs>
26
+ <extent/>
27
+ </qgis>
main.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def get_raw_data(file_path: str):
    """
    Load the raw water quality records from a Parquet or CSV file.

    Paths ending in ".parquet" are read as-is. Any other path is read as CSV:
    station numbers are kept as strings, the descriptive columns are loaded as
    categoricals, result values are converted to numbers ("Not Reported" and
    anything else non-numeric becomes missing) and the activity start column
    is parsed into datetimes.
    """
    if file_path.endswith(".parquet"):
        return pd.read_parquet(file_path)

    # Station numbers stay as text; every descriptive column is categorical.
    column_types = {"Station_Number": str}
    column_types.update(
        dict.fromkeys(
            (
                "Monitoring_Location_ID",
                "Activity_Depth_Unit",
                "Sample_Position",
                "Time_Zone",
                "Activity_Type",
                "Waterbody_Class",
                "WBID",
                "Name",
                "Sector",
                "Total_Depth_Unit",
                "Org_Analyte_Name",
            ),
            "category",
        )
    )

    frame = pd.read_csv(file_path, dtype=column_types)
    result_values = pd.to_numeric(
        frame["Org_Result_Value"].replace("Not Reported", pd.NA), errors="coerce"
    )
    start_times = pd.to_datetime(frame["Activity_Start_Date_Time"])
    return frame.assign(
        Org_Result_Value=result_values,
        Activity_Start_Date_Time=start_times,
    )
38
+
39
+
40
def get_stations_data() -> pd.DataFrame:
    """
    Load the station locations table from "data/Stations-Locations.csv".
    """
    stations = pd.read_csv("data/Stations-Locations.csv")
    return stations
45
+
46
+
47
def add_lat_long(raw_df: pd.DataFrame, stations_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add latitude and longitude to raw data based on station number.

    The station number is converted to float to form a temporary join key
    that is matched against the "Number" column of the stations table. The
    key is created on a copy, so the caller's ``raw_df`` is left unmodified.

    Args:
        raw_df: Raw records containing a "Station_Number" column whose values
            can be converted to float.
        stations_df: Stations table containing "Number", "Latitude" and
            "Longitude" columns.

    Returns:
        A new DataFrame with the columns of ``raw_df`` followed by "Latitude"
        and "Longitude". Rows without a matching station keep missing
        coordinates (left join).
    """
    coordinates = stations_df[["Number", "Latitude", "Longitude"]]
    return (
        # assign() builds the join key on a copy instead of writing a stray
        # "Number" column into the caller's DataFrame.
        raw_df.assign(Number=raw_df["Station_Number"].astype(float))
        .merge(coordinates, on="Number", how="left")
        .drop(columns="Number")
    )
59
+
60
+
61
def get_analyte_data_with_lat_long(df: pd.DataFrame, analyte: str) -> pd.DataFrame:
    """
    Extract and transform data for a specific analyte, adding geographical coordinates.

    This function processes raw water quality data by:
    1. Adding latitude/longitude coordinates from stations data
    2. Filtering for a specific analyte
    3. Removing rows with a missing result value
    4. Aggregating duplicate measurements using mean values

    Args:
        df (pd.DataFrame): Raw water quality data. The columns used here are
            Station_Number, Org_Analyte_Name, Org_Result_Value and the
            grouping columns listed under Returns.
        analyte (str): Name of the analyte to filter for (e.g., "Temperature, Water").
            The name is compared literally, so names containing quotes are safe.

    Returns:
        pd.DataFrame: Processed dataframe with columns:
            - Activity_Start_Date_Time
            - Station_Number
            - Sector
            - WBID
            - Sample_Position
            - Activity_Depth
            - Latitude
            - Longitude
            - {analyte}: Mean result value for the specified analyte

    Note:
        Rows sharing the same values in all grouping columns are averaged.
    """
    group_columns = [
        "Activity_Start_Date_Time",
        "Station_Number",
        "Sector",
        "WBID",
        "Sample_Position",
        "Activity_Depth",
        "Latitude",
        "Longitude",
    ]
    return (
        df.pipe(add_lat_long, get_stations_data())
        # A boolean mask avoids interpolating the analyte name into a query
        # string, which breaks on names that contain a single quote.
        .loc[lambda frame: frame["Org_Analyte_Name"] == analyte]
        .dropna(subset=["Org_Result_Value"])
        .pivot_table(
            index=group_columns,
            values="Org_Result_Value",
            aggfunc="mean",
            observed=True,
        )
        .reset_index()
        .rename(columns={"Org_Result_Value": analyte})
    )
115
+
116
+
117
+ def create_station_stats(
118
+ pivoted: pd.DataFrame, station: str | float | int
119
+ ) -> pd.DataFrame:
120
+ """
121
+ Create statistics for a specific station from pivoted data.
122
+
123
+ Args:
124
+ pivoted: Pivoted DataFrame containing water quality measurements
125
+ station: Station identifier
126
+
127
+ Returns:
128
+ DataFrame with statistics for various water quality parameters
129
+ """
130
+ PARAMETERS = {
131
+ "Secchi Depth (feet)": ("Depth, Secchi Disk Depth", ["Surface"]),
132
+ "Temperature (°C)": ("Temperature, Water", ["Surface", "Bottom"]),
133
+ "Dissolved Oxygen (mg/L)": ("Dissolved Oxygen", ["Surface", "Bottom"]),
134
+ "Turbidity (NTU)": ("Turbidity", ["Surface", "Bottom"]),
135
+ "Salinity (ppt)": ("Salinity", ["Surface", "Bottom"]),
136
+ "pH": ("pH", ["Surface", "Bottom"]),
137
+ }
138
+ STATS = {"Average": "mean", "Maximum": "max", "Minimum": "min", "n=": "count"}
139
+ data = {"Station": station, "Statistic": list(STATS.keys())}
140
+ for param_name, (param_code, positions) in PARAMETERS.items():
141
+ for position in positions:
142
+ col_name = f"{param_name} {position}" if len(positions) > 1 else param_name
143
+ data[col_name] = [
144
+ pivoted[stat][position][station, param_code] for stat in STATS.values()
145
+ ]
146
+ return pd.DataFrame(data)
147
+
148
+
149
def create_overall_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise result values per analyte across the whole dataset.

    Args:
        df: Records with "Org_Analyte_Name" and "Org_Result_Value" columns.

    Returns:
        DataFrame whose rows are Mean, Maximum, Minimum and Count (rounded to
        two decimals) and whose columns are the six headline parameters with
        display names.
    """
    stat_labels = {
        "count": "Count",
        "mean": "Mean",
        "max": "Maximum",
        "min": "Minimum",
    }
    display_names = {
        "Depth, Secchi Disk Depth": "Secchi Depth (feet)",
        "Dissolved Oxygen": "Dissolved Oxygen (mg/L)",
        "Salinity": "Salinity (ppt)",
        "Turbidity": "Turbidity (NTU)",
        "Temperature, Water": "Temperature (°C)",
    }
    column_order = [
        "Secchi Depth (feet)",
        "Temperature (°C)",
        "Dissolved Oxygen (mg/L)",
        "Turbidity (NTU)",
        "Salinity (ppt)",
        "pH",
    ]

    per_analyte = df.groupby(["Org_Analyte_Name"], observed=False)["Org_Result_Value"]
    stats = per_analyte.agg(["mean", "max", "min", "count"]).round(2)
    stats = stats.rename(columns=stat_labels)
    stats.index.name = None

    # Statistics become rows, analytes become columns.
    by_statistic = stats.T.rename(columns=display_names)
    return by_statistic.loc[:, column_order]
184
+
185
+
186
def create_summary_by_station_and_position(
    df: pd.DataFrame, exclude_analytes: list[str] | None = None
) -> pd.DataFrame:
    """
    Create a per-station summary statistics table from water quality measurements.

    Args:
        df (pd.DataFrame): Records with "Station_Number", "Sample_Position",
            "Org_Analyte_Name" and "Org_Result_Value" columns.
        exclude_analytes (list[str] | None): Analyte names to leave out of the
            summary. None means nothing is excluded.

    Returns:
        pd.DataFrame: Statistics for every station in ``df``, indexed by
        ("Station", "Statistic"), with surface/bottom measurement columns.
    """
    excluded = [] if exclude_analytes is None else exclude_analytes
    kept = df[~df["Org_Analyte_Name"].isin(excluded)]

    grouped = kept.groupby(
        ["Station_Number", "Sample_Position", "Org_Analyte_Name"], observed=False
    )["Org_Result_Value"]
    stats = grouped.agg(["mean", "max", "min", "count"]).round(2)

    # Sample positions move into the columns so each (station, analyte) row
    # holds its surface and bottom statistics side by side.
    pivoted = stats.reset_index().pivot_table(
        index=["Station_Number", "Org_Analyte_Name"],
        columns=["Sample_Position"],
        values=["mean", "max", "min", "count"],
        observed=False,
    )

    per_station = []
    for station in sorted(df["Station_Number"].unique()):
        per_station.append(create_station_stats(pivoted, station))
    return pd.concat(per_station).set_index(["Station", "Statistic"])
219
+
220
+
221
def create_multiindex_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with two-level (Analyte, Position) column labels.

    Each column name is split at its last space into analyte and position.
    "Secchi Depth (feet)" is kept whole under an empty analyte level. The
    input frame is not modified.
    """
    labels = []
    for column in df.columns:
        if column == "Secchi Depth (feet)":
            labels.append(("", column))
        else:
            pieces = column.rsplit(" ", 1)
            labels.append((pieces[0], pieces[1]))

    relabelled = df.copy()
    relabelled.columns = pd.MultiIndex.from_tuples(
        labels, names=["Analyte", "Position"]
    )
    return relabelled
pyproject.toml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "state-of-the-bay"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "contextily>=1.6.2",
9
+ "folium>=0.18.0",
10
+ "geopandas[all]>=1.0.1",
11
+ "great-tables>=0.13.0",
12
+ "ipykernel>=6.29.5",
13
+ "matplotlib>=3.9.2",
14
+ "nbformat>=5.10.4",
15
+ "osmnx>=1.9.3",
16
+ "pandas>=2.2.3",
17
+ "plotly>=5.24.1",
18
+ "plotnine>=0.14.1",
19
+ "polars>=1.12.0",
20
+ "pygwalker>=0.4.9.13",
21
+ "pytest>=8.3.3",
22
+ "scipy>=1.14.1",
23
+ "seaborn>=0.13.2",
24
+ "streamlit>=1.40.0",
25
+ "watchdog>=5.0.3",
26
+ "xlsxwriter>=3.2.0",
27
+ ]
28
+
29
+ [tool.uv]
30
+ dev-dependencies = [
31
+ "ipykernel>=6.29.5",
32
+ ]
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ xlsxwriter
4
+ numpy
5
+ scipy
6
+ matplotlib
7
+ plotly
8
+ great-tables
9
+ polars
10
+ seaborn
11
+ geopandas[all]
12
+ contextily
13
+ plotly-express
14
+ altair
tests/test_main.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import pytest
4
+
5
+ from main import create_station_stats
6
+
7
+
8
@pytest.fixture
def sample_pivoted_data():
    """Build a small pivoted frame shaped like the input of create_station_stats."""
    stations = ["1.00", "3.20"]
    analytes = [
        "Depth, Secchi Disk Depth",
        "Temperature, Water",
        "Dissolved Oxygen",
        "Turbidity",
        "Salinity",
        "pH",
    ]
    aggregations = ["count", "max", "mean", "min"]
    positions = ["Bottom", "Surface"]

    # Rows: every (station, analyte) pair.
    row_index = pd.MultiIndex.from_product(
        [stations, analytes],
        names=["Station_Number", "Org_Analyte_Name"],
    )
    # Columns: every (aggregation, position) pair.
    column_index = pd.MultiIndex.from_product([aggregations, positions])

    # Every statistic starts at 10.0 ...
    values = np.full((len(row_index), len(column_index)), 10.0)
    frame = pd.DataFrame(values, index=row_index, columns=column_index)

    # ... and the counts are then overwritten with 100.
    frame.loc[:, ("count", "Bottom")] = 100
    frame.loc[:, ("count", "Surface")] = 100

    return frame
46
+
47
+
48
def test_create_station_stats_basic(sample_pivoted_data):
    """The result is a four-row DataFrame carrying the identifying columns."""
    stats = create_station_stats(sample_pivoted_data, "3.20")

    assert isinstance(stats, pd.DataFrame)
    # One row each for Average, Maximum, Minimum and n=.
    assert len(stats) == 4
    for required in ("Station", "Statistic"):
        assert required in stats.columns
58
+
59
+
60
def test_create_station_stats_values(sample_pivoted_data):
    """Values are read from the matching cells of the pivoted data."""
    stats = create_station_stats(sample_pivoted_data, "3.20")

    # Order is mean, max, min, count for surface Dissolved Oxygen.
    expected = [10.0, 10.0, 10.0, 100]
    assert stats["Dissolved Oxygen (mg/L) Surface"].tolist() == expected
68
+
69
+
70
def test_create_station_stats_columns(sample_pivoted_data):
    """The result has exactly the expected set of columns."""
    stats = create_station_stats(sample_pivoted_data, "3.20")

    # Secchi depth is surface-only; the rest are reported at both positions.
    two_position_parameters = [
        "Temperature (°C)",
        "Dissolved Oxygen (mg/L)",
        "Turbidity (NTU)",
        "Salinity (ppt)",
        "pH",
    ]
    wanted = {"Station", "Statistic", "Secchi Depth (feet)"}
    for parameter in two_position_parameters:
        wanted.add(f"{parameter} Surface")
        wanted.add(f"{parameter} Bottom")

    assert set(stats.columns) == wanted
92
+
93
+
94
def test_create_station_stats_missing_data(sample_pivoted_data):
    """A NaN in the pivoted data is carried through to the result."""
    target_station = "3.20"
    # Blank out the surface pH mean for the station under test.
    sample_pivoted_data.loc[(target_station, "pH"), ("mean", "Surface")] = np.nan

    stats = create_station_stats(sample_pivoted_data, target_station)

    # Row 0 is the "Average" (mean) statistic.
    average_ph = stats["pH Surface"][0]
    assert pd.isna(average_ph)
102
+
103
+
104
def test_create_station_stats_statistics(sample_pivoted_data):
    """The Statistic column lists the labels in the expected order."""
    stats = create_station_stats(sample_pivoted_data, "3.20")

    labels = stats["Statistic"].tolist()
    assert labels == ["Average", "Maximum", "Minimum", "n="]
111
+
112
+
113
def test_create_station_stats_invalid_station(sample_pivoted_data):
    """An unknown station identifier raises KeyError."""
    unknown_station = "invalid_station"
    with pytest.raises(KeyError):
        create_station_stats(sample_pivoted_data, unknown_station)
uv.lock ADDED
The diff for this file is too large to render. See raw diff