"""Cleaning helpers for an air-quality dataset with ``Date`` and ``AQI`` columns."""

import os

import pandas as pd


def get_season(month) -> str:
    """Return the season label for a calendar month number.

    Mapping: 12, 1, 2 -> "Winter"; 3-5 -> "Summer"; 6-9 -> "Monsoon".
    Every other value (10 and 11, but also anything unrecognised such as
    NaN) falls through to "Post-Monsoon".
    """
    if month in [12, 1, 2]:
        return "Winter"
    if month in [3, 4, 5]:
        return "Summer"
    if month in [6, 7, 8, 9]:
        return "Monsoon"
    return "Post-Monsoon"


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with date-derived feature columns.

    Steps, in order:

    1. Drop exact duplicate rows.
    2. Drop rows whose ``AQI`` is missing.
    3. Drop columns in which more than 50% of the remaining rows are missing.
    4. Fill remaining gaps in numeric columns with that column's median.
    5. Parse ``Date`` and add ``Month``, ``Year`` and ``Season`` columns.

    The input frame is not modified.

    Raises:
        KeyError: if ``AQI`` or ``Date`` is absent (``Date`` can also be
            removed by step 3 when it is mostly missing).
    """
    # Both calls return new frames, so the caller's frame is left untouched.
    cleaned = df.drop_duplicates().dropna(subset=["AQI"])

    # The threshold is measured against the row count *after* the row drops.
    threshold = 0.5 * len(cleaned)
    sparse_cols = cleaned.columns[cleaned.isnull().sum() > threshold]
    cleaned = cleaned.drop(columns=sparse_cols)

    # Median imputation is applied to numeric columns only.
    for col in cleaned.select_dtypes(include=["number"]).columns:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # Date features
    cleaned["Date"] = pd.to_datetime(cleaned["Date"])
    cleaned["Month"] = cleaned["Date"].dt.month
    cleaned["Year"] = cleaned["Date"].dt.year

    # Season feature
    cleaned["Season"] = cleaned["Month"].map(get_season)

    return cleaned


def save_clean_data(df: pd.DataFrame, path: str) -> None:
    """Write *df* as CSV (without the index) to *path*.

    A relative *path* is resolved against the directory two levels above
    the directory containing this file; an absolute *path* is used as is
    (``os.path.join`` discards the base in that case). Missing parent
    directories are created.
    """
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
    full_path = os.path.join(base_dir, path)

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(full_path), exist_ok=True)

    df.to_csv(full_path, index=False)