TanU21 committed on
Commit
4dc59af
·
verified ·
1 Parent(s): 43f7806

Update app/services/preprocessing.py

Browse files
Files changed (1) hide show
  1. app/services/preprocessing.py +48 -16
app/services/preprocessing.py CHANGED
@@ -11,36 +11,68 @@ def data_quality(df: pd.DataFrame):
11
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
12
  for col in df.columns:
13
  if df[col].isin([True, False]).all():
 
 
 
 
 
 
 
 
14
  continue
15
- if df[col].dtype == 'object' and df[col].str.replace('.', '', 1).str.isnumeric().all():
16
- df[col] = pd.to_numeric(df[col], errors='ignore')
17
  try:
18
- df[col] = pd.to_datetime(df[col], errors='coerce')
19
- if df[col].notna().sum() == 0:
20
- df[col] = df[col].astype(str)
21
- except Exception:
 
22
  pass
 
 
23
  try:
24
- if df[col].apply(lambda x: isinstance(x, str) and x.startswith("[") and x.endswith("]")).all():
 
 
 
 
 
 
 
 
 
25
  df[col] = df[col].apply(json.loads)
26
- except Exception:
 
27
  pass
28
- if df[col].dtype == 'object' and df[col].dropna().isin(["TRUE", "FALSE"]).all():
29
- df[col] = df[col].map({"TRUE": True, "FALSE": False})
30
- if df[col].dtype == 'object':
31
- df[col] = df[col].astype(str)
32
- df.fillna("", inplace=True)
33
  return df
34
 
 
35
  def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
 
 
 
36
  numeric_col = df.select_dtypes(include=['number']).columns
37
  if not numeric_col.empty:
38
- df[numeric_col] = SimpleImputer(strategy='median').fit_transform(df[numeric_col])
39
- categorical_col = df.select_dtypes(include=['object']).columns
 
 
40
  if not categorical_col.empty:
41
- df[categorical_col] = SimpleImputer(strategy='most_frequent').fit_transform(df[categorical_col])
 
 
 
 
 
42
  return df
43
 
 
44
  def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
45
  numeric_col = df.select_dtypes(include=['number','int64', 'float64']).columns
46
  if not numeric_col.empty:
 
11
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
12
  for col in df.columns:
13
  if df[col].isin([True, False]).all():
14
+ continue # already boolean
15
+
16
+ # Handle boolean strings
17
+ if df[col].dropna().astype(str).isin(["TRUE", "FALSE", "true", "false"]).all():
18
+ df[col] = df[col].map({
19
+ "TRUE": True, "FALSE": False,
20
+ "true": True, "false": False
21
+ })
22
  continue
23
+
24
+ # Try to parse as datetime, if at least 50% parse correctly
25
  try:
26
+ temp = pd.to_datetime(df[col], errors='coerce')
27
+ if temp.notna().mean() > 0.5:
28
+ df[col] = temp
29
+ continue
30
+ except:
31
  pass
32
+
33
+ # Try to parse numeric if at least 50% can be converted
34
  try:
35
+ temp = pd.to_numeric(df[col], errors='coerce')
36
+ if temp.notna().mean() > 0.5:
37
+ df[col] = temp
38
+ continue
39
+ except:
40
+ pass
41
+
42
+ # Convert JSON-like strings
43
+ try:
44
+ if df[col].dropna().apply(lambda x: isinstance(x, str) and x.strip().startswith("[") and x.strip().endswith("]")).all():
45
  df[col] = df[col].apply(json.loads)
46
+ continue
47
+ except:
48
  pass
49
+
50
+ # Default: make sure column is string
51
+ df[col] = df[col].astype(str)
52
+
 
53
  return df
54
 
55
+
56
  def handle_missing_data(df: pd.DataFrame) -> pd.DataFrame:
57
+ print("Before Imputation (NA Counts):")
58
+ print(df.isnull().sum())
59
+
60
  numeric_col = df.select_dtypes(include=['number']).columns
61
  if not numeric_col.empty:
62
+ num_imputer = SimpleImputer(strategy='median')
63
+ df[numeric_col] = num_imputer.fit_transform(df[numeric_col])
64
+
65
+ categorical_col = df.select_dtypes(include=['object', 'category']).columns
66
  if not categorical_col.empty:
67
+ cat_imputer = SimpleImputer(strategy='most_frequent')
68
+ df[categorical_col] = cat_imputer.fit_transform(df[categorical_col])
69
+
70
+ print("After Imputation (NA Counts):")
71
+ print(df.isnull().sum())
72
+
73
  return df
74
 
75
+
76
  def handle_outliers(df: pd.DataFrame) -> pd.DataFrame:
77
  numeric_col = df.select_dtypes(include=['number','int64', 'float64']).columns
78
  if not numeric_col.empty: