TanU21 committed on
Commit
cdb0d1b
·
verified ·
1 Parent(s): 4dc59af

Update app/services/preprocessing.py

Browse files
Files changed (1) hide show
  1. app/services/preprocessing.py +11 -39
app/services/preprocessing.py CHANGED
@@ -9,46 +9,18 @@ def data_quality(df: pd.DataFrame):
9
  return df
10
 
11
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
 
12
  for col in df.columns:
13
- if df[col].isin([True, False]).all():
14
- continue # already boolean
15
-
16
- # Handle boolean strings
17
- if df[col].dropna().astype(str).isin(["TRUE", "FALSE", "true", "false"]).all():
18
- df[col] = df[col].map({
19
- "TRUE": True, "FALSE": False,
20
- "true": True, "false": False
21
- })
22
- continue
23
-
24
- # Try to parse as datetime, if at least 50% parse correctly
25
- try:
26
- temp = pd.to_datetime(df[col], errors='coerce')
27
- if temp.notna().mean() > 0.5:
28
- df[col] = temp
29
- continue
30
- except:
31
- pass
32
-
33
- # Try to parse numeric if at least 50% can be converted
34
- try:
35
- temp = pd.to_numeric(df[col], errors='coerce')
36
- if temp.notna().mean() > 0.5:
37
- df[col] = temp
38
- continue
39
- except:
40
- pass
41
-
42
- # Convert JSON-like strings
43
- try:
44
- if df[col].dropna().apply(lambda x: isinstance(x, str) and x.strip().startswith("[") and x.strip().endswith("]")).all():
45
- df[col] = df[col].apply(json.loads)
46
- continue
47
- except:
48
- pass
49
-
50
- # Default: make sure column is string
51
- df[col] = df[col].astype(str)
52
 
53
  return df
54
 
 
9
  return df
10
 
11
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
12
+ # Convert string-based dates to datetime, but ignore boolean values
13
  for col in df.columns:
14
+ if df[col].dtype == 'object' and not df[col].isin([True, False]).all():
15
+ try:
16
+ df[col] = pd.to_datetime(df[col], errors='coerce') # Invalid values become NaT
17
+ except Exception as e:
18
+ print(f"Skipping column {col}: {e}")
19
+
20
+ # Convert numeric strings to actual numbers
21
+ for col in df.select_dtypes(include=['object']).columns:
22
+ if df[col].str.replace('.', '', 1).str.isnumeric().all():
23
+ df[col] = pd.to_numeric(df[col])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  return df
26