File size: 3,318 Bytes
9d27b5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import argparse
import os
import sys
import pandas as pd
import numpy as np


def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalized column names.

    Each column label is converted to a string, stripped of leading and
    trailing whitespace, lowercased, and has its spaces replaced with
    underscores.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` carrying the normalized column labels.
    """
    df = df.copy()
    # str() keeps non-string labels (e.g. integer labels) from raising
    # AttributeError on .strip().
    df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]
    return df


def drop_invalid_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows with a missing target or with no feature information.

    The first column is treated as the target and every remaining column as
    a feature. A row is removed when its target is missing, or when every
    feature value in it is either missing or equal to zero.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new frame holding only the surviving rows, keeping their original
        index labels.
    """
    target_col = df.columns[0]
    # Boolean indexing builds a new frame, so the caller's data is untouched.
    df = df[df[target_col].notna()]
    feature_df = df.iloc[:, 1:]
    # Test each cell instead of the row sum: a zero sum also occurs when
    # positive and negative values cancel out, and summing can fail on
    # non-numeric columns.
    empty_cells = feature_df.isna() | feature_df.eq(0)
    return df.loc[~empty_cells.all(axis=1)]


def remove_constant_and_sparse_features(df: pd.DataFrame, min_positive_frac: float = 0.0005):
    """Drop feature columns that are constant or almost never positive.

    The first column is the target and is always kept. A feature column is
    discarded when it has at most one distinct non-missing value, or when the
    share of its entries greater than zero (missing counted as zero) is below
    ``min_positive_frac``.

    Args:
        df: Frame whose first column is the target.
        min_positive_frac: Minimum fraction of positive entries a feature
            needs in order to be kept.

    Returns:
        A frame with the target column followed by the retained features.
    """
    label_name = df.columns[0]
    features = df.iloc[:, 1:]

    def _is_informative(values) -> bool:
        # A single distinct non-missing value (or none) carries no signal.
        if values.nunique(dropna=True) <= 1:
            return False
        try:
            positive_share = (values.fillna(0) > 0).mean()
        except Exception:
            # Columns that cannot be compared against 0 are kept as-is.
            positive_share = 1.0
        return not (positive_share < min_positive_frac)

    retained = [name for name in features.columns if _is_informative(features[name])]
    return pd.concat([df[[label_name]], features[retained]], axis=1)


def impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing feature values with 0 while leaving the target untouched.

    The first column is treated as the target; all other columns are features.

    Args:
        df: Frame whose first column is the target.

    Returns:
        A new frame with the target column followed by the filled features.
    """
    target_name = df.columns[0]
    target_frame = df[[target_name]]
    filled_features = df.iloc[:, 1:].fillna(0)
    return pd.concat([target_frame, filled_features], axis=1)


def limit_classes(df: pd.DataFrame, min_samples: int = 5) -> pd.DataFrame:
    """Discard rows whose class has fewer than ``min_samples`` rows.

    The class label is read from the first column.

    Args:
        df: Frame whose first column holds the class label.
        min_samples: Smallest row count a class needs in order to be kept.

    Returns:
        The rows of ``df`` belonging to sufficiently frequent classes.
    """
    label_col = df.columns[0]
    class_sizes = df[label_col].value_counts()
    frequent = [label for label, size in class_sizes.items() if size >= min_samples]
    row_mask = df[label_col].isin(frequent)
    return df[row_mask]


def main(argv=None):
    """Run the preprocessing pipeline from the command line.

    Reads the CSV given by ``--input``, standardizes column names, drops
    invalid rows, removes constant/sparse features, imputes missing feature
    values, filters out classes with fewer than 5 samples, and writes the
    result to ``--output`` without the index column. Progress is printed to
    stdout.

    Args:
        argv: Optional list of argument strings. When ``None``, argparse
            reads the process command line.

    Raises:
        SystemExit: With status 1 when the input path is not an existing
            file.
    """
    parser = argparse.ArgumentParser(description="Preprocess disease-symptom CSV for training.")
    parser.add_argument("--input", required=True, help="Path to raw CSV")
    parser.add_argument("--output", default="cleaned_dataset.csv", help="Path to save cleaned CSV")
    args = parser.parse_args(argv)

    # isfile rather than exists: a directory would pass an existence check
    # and only fail later inside read_csv with a traceback.
    if not os.path.isfile(args.input):
        print(f"❌ Input CSV not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    print("Loading CSV...")
    df = pd.read_csv(args.input)
    print(f"Raw shape: {df.shape}")

    print("Standardizing column names...")
    df = standardize_columns(df)

    print("Dropping invalid/empty rows...")
    df = drop_invalid_rows(df)
    print(f"After row cleanup: {df.shape}")

    print("Removing constant and sparse features...")
    df = remove_constant_and_sparse_features(df)
    print(f"After feature cleanup: {df.shape}")

    print("Imputing missing values (0 for symptoms)...")
    df = impute_missing(df)

    print("Limiting classes with very few samples...")
    df = limit_classes(df, min_samples=5)
    print(f"After class filtering: {df.shape}")

    print(f"Saving cleaned CSV to: {args.output}")
    df.to_csv(args.output, index=False)
    print("Done.")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()