# waterdb / main.py
# github-actions[bot]
# Deploy from GitHub Actions
# 66a2bd7
import pandas as pd
def _parquet_path_for(file_path: str) -> str:
    """Return the Parquet output path that corresponds to *file_path*."""
    csv_suffix = ".csv"
    # Only a trailing ".csv" (any letter case) is swapped. A blanket
    # str.replace would also rewrite ".csv" inside directory names, and would
    # leave paths without a lowercase ".csv" unchanged, so the Parquet output
    # would overwrite the input file.
    if file_path.lower().endswith(csv_suffix):
        return file_path[: -len(csv_suffix)] + ".parquet"
    return file_path + ".parquet"


def _with_default_time(value):
    """Append ``12:00:00`` to values whose text is at most 10 characters."""
    # Presumably a 10-character value is a bare date such as "2020-01-02".
    # Missing values also take this branch ("nan 12:00:00") and are turned
    # into NaT by the errors="coerce" datetime conversion in the caller.
    if len(str(value).strip()) <= 10:
        return f"{value} 12:00:00"
    return value


def _format_station_number(value):
    """Render a non-blank station number as a float with two decimals."""
    if pd.notna(value) and str(value).strip() != "":
        return f"{float(value):.2f}"
    # Missing or whitespace-only entries are passed through unchanged.
    return value


def master_data_csv_to_parquet(file_path: str) -> None:
    """
    Convert master data export from a CSV to a Parquet file.

    The CSV is read with ``Station_Number`` as text and a fixed set of
    columns as pandas categoricals, four columns are cleaned, and the
    result is written next to the input with a ``.parquet`` extension.

    Cleaning applied:
        * ``Org_Result_Value``: ``"Not Reported"`` and any other non-numeric
          value become missing; the rest is converted to numbers.
        * ``Activity_Start_Date_Time``: values of 10 characters or fewer get
          ``" 12:00:00"`` appended before datetime parsing; unparseable
          values become ``NaT``.
        * ``Org_Result_Unit``: every occurrence of the text ``"none"`` is
          removed.
        * ``Station_Number``: non-blank values are formatted as a float with
          two decimal places (``"12"`` -> ``"12.00"``).

    Args:
        file_path: Location of the CSV export. A trailing ``.csv`` (any
            letter case) is replaced by ``.parquet`` to form the output
            path; if there is no such suffix, ``.parquet`` is appended so
            the input is never overwritten.

    Raises:
        KeyError: If one of the four cleaned columns is absent from the CSV.
        ValueError: If a non-blank ``Station_Number`` is not a valid float.
    """
    save_path = _parquet_path_for(file_path)

    categorical_columns = [
        "Monitoring_Location_ID",
        "Activity_Depth_Unit",
        "Sample_Position",
        "Time_Zone",
        "Activity_Type",
        "Waterbody_Class",
        "WBID",
        "Name",
        "Sector",
        "Total_Depth_Unit",
        "Org_Analyte_Name",
    ]
    dtype_dict = {
        # Read as text so the original digits reach _format_station_number.
        "Station_Number": str,
        **{col: "category" for col in categorical_columns},
    }

    cleaned = pd.read_csv(file_path, dtype=dtype_dict, low_memory=False).assign(
        Org_Result_Value=lambda df: pd.to_numeric(
            df["Org_Result_Value"].replace("Not Reported", pd.NA), errors="coerce"
        ),
        Activity_Start_Date_Time=lambda df: pd.to_datetime(
            df["Activity_Start_Date_Time"].apply(_with_default_time),
            errors="coerce",
        ),
        Org_Result_Unit=lambda df: df["Org_Result_Unit"].str.replace("none", ""),
        Station_Number=lambda df: df["Station_Number"].apply(_format_station_number),
    )
    cleaned.to_parquet(save_path)