Update app.py
Browse files
app.py
CHANGED
|
@@ -58,36 +58,27 @@ def set_target_column(col_name):
|
|
| 58 |
return f"✅ Target column set to: {col_name}"
|
| 59 |
|
| 60 |
def clean_data(df):
|
| 61 |
-
#
|
| 62 |
df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
|
| 63 |
-
|
| 64 |
-
#
|
| 65 |
for col in df.select_dtypes(include='object').columns:
|
| 66 |
-
# Convert the column to string if it's not already
|
| 67 |
df[col] = df[col].astype(str)
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
df[col] = LabelEncoder().fit_transform(df[col])
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
# You can log or handle this case if you want to skip the problematic column or take other actions
|
| 74 |
-
|
| 75 |
-
# Step 3: Fill missing numeric data with the mean of the column (numeric_only=True ensures this works only for numeric columns)
|
| 76 |
df = df.fillna(df.mean(numeric_only=True))
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
# Step 4: Ensure all numeric columns are properly converted to numeric type (coerce errors to NaN)
|
| 79 |
-
for col in df.select_dtypes(include=['float64', 'int64']).columns:
|
| 80 |
-
df[col] = pd.to_numeric(df[col], errors='coerce')
|
| 81 |
-
|
| 82 |
-
# Step 5: Special handling for 'Amount' column (if exists)
|
| 83 |
-
if 'Amount' in df.columns:
|
| 84 |
-
# Remove commas, dollar signs, and strip any leading/trailing spaces
|
| 85 |
-
df['Amount'] = df['Amount'].str.replace(',', '').str.replace('$', '').str.strip()
|
| 86 |
-
# Convert 'Amount' to numeric, coercing errors into NaN
|
| 87 |
-
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
|
| 88 |
-
|
| 89 |
-
# Step 6: Drop rows with any NaN values after cleaning (optional step, depending on how you want to handle NaN values)
|
| 90 |
-
df = df.dropna(how='any')
|
| 91 |
|
| 92 |
|
| 93 |
|
|
|
|
| 58 |
return f"✅ Target column set to: {col_name}"
|
| 59 |
|
| 60 |
def clean_data(df):
    """Return a cleaned copy of ``df`` with categorical columns encoded.

    Steps, in order:
      1. Drop columns, then rows, whose values are all missing.
      2. If an ``'Amount'`` column exists, remove ``$`` and ``,``, strip
         surrounding whitespace, and convert it to numbers; entries that
         cannot be parsed become NaN.
      3. Cast every remaining object-dtype column to ``str`` and replace it
         with integer codes from ``LabelEncoder``.
      4. Fill remaining NaNs in numeric columns with that column's mean.

    Args:
        df: Input ``pandas.DataFrame``. The caller's frame is not modified,
            because the first ``dropna`` already returns a new frame.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    # Function-scope import so this block does not depend on how the rest of
    # the file spells its pandas import.
    import pandas as pd

    # Drop columns and rows where all values are NaN.
    df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)

    if 'Amount' in df.columns:
        # astype(str) guarantees the .str accessor works even when the column
        # was read as numeric. regex=True is spelled out because the default
        # differs between pandas versions, and a bare '$' pattern would be an
        # end-of-string anchor rather than a dollar sign.
        amount_text = (
            df['Amount']
            .astype(str)
            .str.replace(r'[$,]', '', regex=True)
            .str.strip()
        )
        # Without this conversion 'Amount' stays a string column and is
        # skipped by the mean fill below.
        df['Amount'] = pd.to_numeric(amount_text, errors='coerce')

    # Convert categorical (object) columns to integer codes.
    for col in df.select_dtypes(include='object').columns:
        if col == 'Amount':
            # Defensive: 'Amount' is handled above and must not be encoded.
            continue
        # astype(str) gives LabelEncoder a uniform string column to sort.
        # NOTE(review): this also turns missing entries into ordinary labels
        # (e.g. 'nan'), so they are encoded rather than filled - confirm
        # that is the intended treatment.
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Fill missing values in numeric columns with the mean of each column.
    return df.fillna(df.mean(numeric_only=True))
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
|