TanU21 committed on
Commit
9b61c7d
·
verified ·
1 Parent(s): cdb0d1b

Update preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +11 -39
preprocessing.py CHANGED
@@ -15,46 +15,18 @@ def data_quality(df: pd.DataFrame):
15
  return df
16
 
17
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
 
18
  for col in df.columns:
19
- if df[col].isin([True, False]).all():
20
- continue # already boolean
21
-
22
- # Handle boolean strings
23
- if df[col].dropna().astype(str).isin(["TRUE", "FALSE", "true", "false"]).all():
24
- df[col] = df[col].map({
25
- "TRUE": True, "FALSE": False,
26
- "true": True, "false": False
27
- })
28
- continue
29
-
30
- # Try to parse as datetime, if at least 50% parse correctly
31
- try:
32
- temp = pd.to_datetime(df[col], errors='coerce')
33
- if temp.notna().mean() > 0.5:
34
- df[col] = temp
35
- continue
36
- except:
37
- pass
38
-
39
- # Try to parse numeric if at least 50% can be converted
40
- try:
41
- temp = pd.to_numeric(df[col], errors='coerce')
42
- if temp.notna().mean() > 0.5:
43
- df[col] = temp
44
- continue
45
- except:
46
- pass
47
-
48
- # Convert JSON-like strings
49
- try:
50
- if df[col].dropna().apply(lambda x: isinstance(x, str) and x.strip().startswith("[") and x.strip().endswith("]")).all():
51
- df[col] = df[col].apply(json.loads)
52
- continue
53
- except:
54
- pass
55
-
56
- # Default: make sure column is string
57
- df[col] = df[col].astype(str)
58
 
59
  return df
60
 
 
15
  return df
16
 
17
  def standardize_data_types(df: pd.DataFrame) -> pd.DataFrame:
18
+ # Convert string-based dates to datetime, but ignore boolean values
19
  for col in df.columns:
20
+ if df[col].dtype == 'object' and not df[col].isin([True, False]).all():
21
+ try:
22
+ df[col] = pd.to_datetime(df[col], errors='coerce') # Invalid values become NaT
23
+ except Exception as e:
24
+ print(f"Skipping column {col}: {e}")
25
+
26
+ # Convert numeric strings to actual numbers
27
+ for col in df.select_dtypes(include=['object']).columns:
28
+ if df[col].str.replace('.', '', 1).str.isnumeric().all():
29
+ df[col] = pd.to_numeric(df[col])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  return df
32