pavanmutha committed on
Commit
89c639a
·
verified ·
1 Parent(s): 55b0175

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -24
app.py CHANGED
@@ -58,36 +58,27 @@ def set_target_column(col_name):
58
  return f"✅ Target column set to: {col_name}"
59
 
60
  def clean_data(df):
61
- # Step 1: Drop columns and rows that are completely empty
62
  df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
63
-
64
- # Step 2: Handle categorical string columns (Label Encoding)
65
  for col in df.select_dtypes(include='object').columns:
66
- # Convert the column to string if it's not already
67
  df[col] = df[col].astype(str)
68
- try:
69
- # Apply Label Encoding to convert categories into numeric labels
 
 
 
 
 
 
70
  df[col] = LabelEncoder().fit_transform(df[col])
71
- except Exception as e:
72
- print(f"Error encoding column {col}: {e}")
73
- # You can log or handle this case if you want to skip the problematic column or take other actions
74
-
75
- # Step 3: Fill missing numeric data with the mean of the column (numeric_only=True ensures this works only for numeric columns)
76
  df = df.fillna(df.mean(numeric_only=True))
 
 
77
 
78
- # Step 4: Ensure all numeric columns are properly converted to numeric type (coerce errors to NaN)
79
- for col in df.select_dtypes(include=['float64', 'int64']).columns:
80
- df[col] = pd.to_numeric(df[col], errors='coerce')
81
-
82
- # Step 5: Special handling for 'Amount' column (if exists)
83
- if 'Amount' in df.columns:
84
- # Remove commas, dollar signs, and strip any leading/trailing spaces
85
- df['Amount'] = df['Amount'].str.replace(',', '').str.replace('$', '').str.strip()
86
- # Convert 'Amount' to numeric, coercing errors into NaN
87
- df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
88
-
89
- # Step 6: Drop rows with any NaN values after cleaning (optional step, depending on how you want to handle NaN values)
90
- df = df.dropna(how='any')
91
 
92
 
93
 
 
58
  return f"✅ Target column set to: {col_name}"
59
 
60
def clean_data(df):
    """Return a cleaned, fully numeric version of ``df``.

    Steps, in order:

    1. Drop columns, then rows, whose values are all NaN.
    2. If an ``'Amount'`` column exists, strip thousands separators (``,``),
       dollar signs and surrounding whitespace, then convert it to a numeric
       dtype. Values that cannot be parsed become NaN.
    3. Label-encode every remaining object-dtype column. Values are
       stringified first, so a missing entry becomes its own label.
    4. Fill the NaNs left in numeric columns with that column's mean.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        The cleaned DataFrame.
    """
    df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)

    if 'Amount' in df.columns:
        # astype(str) lets the .str accessor work even when the column was
        # already parsed as numeric. regex=False is required because '$' is
        # a regex anchor, and the default of `regex` differs across pandas
        # versions.
        amount_text = (
            df['Amount']
            .astype(str)
            .str.replace(',', '', regex=False)
            .str.replace('$', '', regex=False)
            .str.strip()
        )
        # Convert back to numbers; otherwise the column stays as strings and
        # missing amounts ("nan"/"None" text) are never mean-filled below.
        df['Amount'] = pd.to_numeric(amount_text, errors='coerce')

    # 'Amount' is numeric by now, so it is naturally excluded from encoding.
    for col in df.select_dtypes(include='object').columns:
        # LabelEncoder cannot sort mixed str/NaN values, hence the stringify.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # df.mean(numeric_only=True) yields one mean per numeric column; fillna
    # aligns on column name, so each column is filled with its own mean.
    df = df.fillna(df.mean(numeric_only=True))

    return df
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84