TanU21 committed on
Commit
43f7806
·
verified ·
1 Parent(s): 460718f

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +39 -19
preprocessing.py CHANGED
@@ -16,49 +16,69 @@ def data_quality(df: pd.DataFrame):
16
 
17
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
18
  for col in df.columns:
19
- # Skip boolean columns
20
  if df[col].isin([True, False]).all():
 
 
 
 
 
 
 
 
21
  continue
22
 
23
- # Attempt numeric conversion
24
- if df[col].dtype == 'object':
25
- try:
26
- df[col] = pd.to_numeric(df[col], errors='coerce')
27
- except Exception:
28
- pass
 
 
29
 
30
- # Attempt datetime conversion
31
  try:
32
- temp_col = pd.to_datetime(df[col], errors='coerce')
33
- if temp_col.notna().sum() > 0:
34
- df[col] = temp_col
35
- except Exception:
 
36
  pass
37
 
38
- # JSON list/dict conversion
39
  try:
40
- if df[col].apply(lambda x: isinstance(x, str) and x.startswith("[") and x.endswith("]")).all():
41
  df[col] = df[col].apply(json.loads)
42
- except Exception:
 
43
  pass
44
 
45
- # Boolean string to actual bool
46
- if df[col].dropna().isin(["TRUE", "FALSE"]).all():
47
- df[col] = df[col].map({"TRUE": True, "FALSE": False})
48
 
49
  return df
50
 
 
51
  def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
 
 
 
52
  numeric_col = df.select_dtypes(include=['number']).columns
53
  if not numeric_col.empty:
54
  num_imputer = SimpleImputer(strategy='median')
55
  df[numeric_col] = num_imputer.fit_transform(df[numeric_col])
56
- categorical_col = df.select_dtypes(include=['object']).columns
 
57
  if not categorical_col.empty:
58
  cat_imputer = SimpleImputer(strategy='most_frequent')
59
  df[categorical_col] = cat_imputer.fit_transform(df[categorical_col])
 
 
 
 
60
  return df
61
 
 
62
  def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
63
  numeric_col = df.select_dtypes(include=['number','int64', 'float64']).columns
64
  if not numeric_col.empty:
 
16
 
17
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
18
  for col in df.columns:
 
19
  if df[col].isin([True, False]).all():
20
+ continue # already boolean
21
+
22
+ # Handle boolean strings
23
+ if df[col].dropna().astype(str).isin(["TRUE", "FALSE", "true", "false"]).all():
24
+ df[col] = df[col].map({
25
+ "TRUE": True, "FALSE": False,
26
+ "true": True, "false": False
27
+ })
28
  continue
29
 
30
+ # Try to parse as datetime, if at least 50% parse correctly
31
+ try:
32
+ temp = pd.to_datetime(df[col], errors='coerce')
33
+ if temp.notna().mean() > 0.5:
34
+ df[col] = temp
35
+ continue
36
+ except:
37
+ pass
38
 
39
+ # Try to parse numeric if at least 50% can be converted
40
  try:
41
+ temp = pd.to_numeric(df[col], errors='coerce')
42
+ if temp.notna().mean() > 0.5:
43
+ df[col] = temp
44
+ continue
45
+ except:
46
  pass
47
 
48
+ # Convert JSON-like strings
49
  try:
50
+ if df[col].dropna().apply(lambda x: isinstance(x, str) and x.strip().startswith("[") and x.strip().endswith("]")).all():
51
  df[col] = df[col].apply(json.loads)
52
+ continue
53
+ except:
54
  pass
55
 
56
+ # Default: make sure column is string
57
+ df[col] = df[col].astype(str)
 
58
 
59
  return df
60
 
61
+
62
  def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
63
+ print("Before Imputation (NA Counts):")
64
+ print(df.isnull().sum())
65
+
66
  numeric_col = df.select_dtypes(include=['number']).columns
67
  if not numeric_col.empty:
68
  num_imputer = SimpleImputer(strategy='median')
69
  df[numeric_col] = num_imputer.fit_transform(df[numeric_col])
70
+
71
+ categorical_col = df.select_dtypes(include=['object', 'category']).columns
72
  if not categorical_col.empty:
73
  cat_imputer = SimpleImputer(strategy='most_frequent')
74
  df[categorical_col] = cat_imputer.fit_transform(df[categorical_col])
75
+
76
+ print("After Imputation (NA Counts):")
77
+ print(df.isnull().sum())
78
+
79
  return df
80
 
81
+
82
  def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
83
  numeric_col = df.select_dtypes(include=['number','int64', 'float64']).columns
84
  if not numeric_col.empty: