AQI_Predictor / src /features /preprocess.py
SparshSG's picture
Upload 18 files
3d6943b verified
import pandas as pd
import os
def get_season(month):
    """Map a month number to a season label.

    December-February map to "Winter", March-May to "Summer" and
    June-September to "Monsoon". Every other value (October and November
    included) maps to "Post-Monsoon".
    """
    season_months = (
        ("Winter", (12, 1, 2)),
        ("Summer", (3, 4, 5)),
        ("Monsoon", (6, 7, 8, 9)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Nothing matched above: fall through to the default label.
    return "Post-Monsoon"
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with date-derived feature columns added.

    Steps, in order:

    1. Drop exact duplicate rows.
    2. Drop rows whose "AQI" value is missing.
    3. Drop every column that is missing in more than half of the
       remaining rows.
    4. Fill the remaining gaps in numeric columns with that column's median.
    5. Parse "Date" to datetimes and derive "Month", "Year" and "Season".

    The input frame is not modified; all work happens on a copy.

    Rows whose "Date" is missing end up with a missing "Season" rather than
    being labelled by ``get_season``'s fall-through branch.

    Raises:
        KeyError: if the "AQI" or "Date" column is absent. For "Date" this
            includes the case where step 3 dropped it for being mostly
            missing.
    """
    df = df.copy()

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Drop rows where AQI missing
    df.dropna(subset=["AQI"], inplace=True)

    # Drop columns with >50% missing (measured after the row filters above)
    threshold = 0.5 * len(df)
    cols_to_drop = df.columns[df.isnull().sum() > threshold]
    df = df.drop(columns=cols_to_drop)

    # Fill numeric columns with median
    num_cols = df.select_dtypes(include=["number"]).columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    # Date features. These are added after the median fill on purpose, so a
    # missing date stays missing in Month/Year instead of being imputed.
    df["Date"] = pd.to_datetime(df["Date"])
    df["Month"] = df["Date"].dt.month
    df["Year"] = df["Date"].dt.year

    # Season feature. A missing date yields a NaN month; skip those values so
    # they are not passed to get_season, which would return its default
    # "Post-Monsoon" label for them.
    df["Season"] = df["Month"].map(get_season, na_action="ignore")

    return df
def save_clean_data(df: pd.DataFrame, path: str):
    """Write *df* to a CSV file at *path*, without the index column.

    *path* is joined onto the directory two levels above the directory
    containing this file. Any missing parent directories of the resulting
    location are created before writing.
    """
    # Directory two levels above the one holding this module.
    module_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(module_dir, "../../"))

    destination = os.path.join(project_root, path)

    # Make sure the target folder exists before pandas opens the file.
    parent_dir = os.path.dirname(destination)
    os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(destination, index=False)