File size: 1,466 Bytes
3d6943b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import os


def get_season(month):
    """Return the season label for a month number.

    12, 1, 2 map to "Winter"; 3-5 to "Summer"; 6-9 to "Monsoon".
    Every other value (including 10 and 11) maps to "Post-Monsoon".
    """
    season_table = (
        ("Winter", (12, 1, 2)),
        ("Summer", (3, 4, 5)),
        ("Monsoon", (6, 7, 8, 9)),
    )
    for label, months in season_table:
        if month in months:
            return label
    # Anything not listed above falls through to the catch-all season.
    return "Post-Monsoon"


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with date-derived feature columns.

    Steps, in order:

    1. Drop duplicate rows.
    2. Drop rows whose ``AQI`` value is missing.
    3. Drop columns in which more than half of the remaining rows are
       missing.
    4. Fill missing values in numeric columns with the column median.
    5. Parse ``Date`` and add ``Month``, ``Year`` and ``Season`` columns.

    The input frame is not modified; all work happens on a copy.

    Args:
        df: Frame that contains at least the ``AQI`` and ``Date`` columns.

    Returns:
        The cleaned frame. ``Season`` is left missing for rows whose
        ``Date`` is missing, rather than being assigned a season.

    Raises:
        KeyError: If ``AQI`` or ``Date`` is not a column of *df*, or if
            ``Date`` was removed in step 3 because it was mostly missing.
    """
    df = df.copy()

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Drop rows where AQI missing
    df.dropna(subset=["AQI"], inplace=True)

    # Drop columns with >50% missing (measured after the row filtering above)
    threshold = 0.5 * len(df)
    cols_to_drop = df.columns[df.isnull().sum() > threshold]
    df = df.drop(columns=cols_to_drop)

    # Fill numeric columns with median
    num_cols = df.select_dtypes(include=["number"]).columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    # Date features (added after the median fill, so they are never imputed)
    df["Date"] = pd.to_datetime(df["Date"])
    df["Month"] = df["Date"].dt.month
    df["Year"] = df["Date"].dt.year

    # Season feature. A missing date yields a NaN month; skip those rows so
    # they are not routed into get_season's catch-all "Post-Monsoon" branch.
    df["Season"] = df["Month"].map(get_season, na_action="ignore")

    return df


def save_clean_data(df: pd.DataFrame, path: str):
    """Write *df* as CSV (without the index) to *path*.

    *path* is joined onto the directory two levels above this module's
    directory. Any missing parent directories of the target are created
    before writing.
    """
    module_dir = os.path.dirname(__file__)
    project_root = os.path.abspath(os.path.join(module_dir, "../../"))
    destination = os.path.join(project_root, path)

    # Make sure the folder that will hold the file exists.
    parent_dir, _ = os.path.split(destination)
    os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(destination, index=False)