| import pandas as pd | |
# Columns with few distinct values; loading them as pandas categoricals keeps
# the in-memory frame (and the resulting Parquet file) compact.
_CATEGORICAL_COLUMNS = (
    "Monitoring_Location_ID",
    "Activity_Depth_Unit",
    "Sample_Position",
    "Time_Zone",
    "Activity_Type",
    "Waterbody_Class",
    "WBID",
    "Name",
    "Sector",
    "Total_Depth_Unit",
    "Org_Analyte_Name",
)

_CSV_SUFFIX = ".csv"
_PARQUET_SUFFIX = ".parquet"


def _parquet_path(file_path: str) -> str:
    """Return the output path: a trailing ``.csv`` becomes ``.parquet``.

    Only the final suffix is touched (case-insensitively), so a ``.csv``
    appearing in a directory name is preserved.  If the path does not end in
    ``.csv`` the Parquet suffix is appended, which guarantees the result never
    equals the input path and the source file is never overwritten.
    """
    if file_path.lower().endswith(_CSV_SUFFIX):
        return file_path[: -len(_CSV_SUFFIX)] + _PARQUET_SUFFIX
    return file_path + _PARQUET_SUFFIX


def _with_default_time(value):
    """Append ``12:00:00`` to values of at most 10 characters (date-only).

    ``str()`` is applied because missing values arrive as float NaN; they
    become ``"nan 12:00:00"``, which ``pd.to_datetime(errors="coerce")``
    subsequently turns into ``NaT``.
    """
    if len(str(value).strip()) <= 10:
        return f"{value} 12:00:00"
    return value


def _format_station_number(value):
    """Render a numeric station number with two decimals (``"12"`` -> ``"12.00"``).

    Missing, blank, and non-numeric values are returned unchanged instead of
    raising, so one malformed identifier cannot abort the whole conversion.
    """
    if pd.isna(value) or str(value).strip() == "":
        return value
    try:
        return f"{float(value):.2f}"
    except ValueError:
        return value


def master_data_csv_to_parquet(file_path: str) -> None:
    """Convert a master data export from a CSV to a Parquet file.

    The Parquet file is written next to the CSV, with the trailing ``.csv``
    replaced by ``.parquet`` (see ``_parquet_path``).

    Transformations applied before writing:

    * ``Org_Result_Value`` is converted to numeric; anything unparseable
      (including ``"Not Reported"``) becomes NaN.
    * ``Activity_Start_Date_Time`` is parsed to datetimes; date-only values
      get a ``12:00:00`` time, unparseable values become ``NaT``.
    * ``Org_Result_Unit`` has every literal occurrence of ``"none"`` removed.
    * ``Station_Number`` is read as text and numeric values are re-rendered
      with two decimal places.

    Args:
        file_path: Path of the CSV export to read.
    """
    column_dtypes = {
        "Station_Number": str,
        **dict.fromkeys(_CATEGORICAL_COLUMNS, "category"),
    }
    frame = pd.read_csv(file_path, dtype=column_dtypes, low_memory=False)

    converted = frame.assign(
        # errors="coerce" already maps "Not Reported" (and any other
        # non-numeric text) to NaN, so no separate replace step is needed.
        Org_Result_Value=lambda df: pd.to_numeric(
            df["Org_Result_Value"], errors="coerce"
        ),
        Activity_Start_Date_Time=lambda df: pd.to_datetime(
            df["Activity_Start_Date_Time"].apply(_with_default_time),
            errors="coerce",
        ),
        # regex=False: plain substring replacement on every pandas version
        # (the default for `regex` changed in pandas 2.0).
        Org_Result_Unit=lambda df: df["Org_Result_Unit"].str.replace(
            "none", "", regex=False
        ),
        Station_Number=lambda df: df["Station_Number"].apply(
            _format_station_number
        ),
    )
    converted.to_parquet(_parquet_path(file_path))