AIDataAgentProjectFinal

Paused

App Files Files Community

pavanmutha committed on Apr 18, 2025

Commit

89c639a

verified ·

1 Parent(s): 55b0175

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -24

app.py CHANGED Viewed

@@ -58,36 +58,27 @@ def set_target_column(col_name):
     return f"✅ Target column set to: {col_name}"
 def clean_data(df):
-    # Step 1: Drop columns and rows that are completely empty
     df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
-    # Step 2: Handle categorical string columns (Label Encoding)
     for col in df.select_dtypes(include='object').columns:
-        # Convert the column to string if it's not already
         df[col] = df[col].astype(str)
-        try:
-            # Apply Label Encoding to convert categories into numeric labels
             df[col] = LabelEncoder().fit_transform(df[col])
-        except Exception as e:
-            print(f"Error encoding column {col}: {e}")
-            # You can log or handle this case if you want to skip the problematic column or take other actions
-    # Step 3: Fill missing numeric data with the mean of the column (numeric_only=True ensures this works only for numeric columns)
     df = df.fillna(df.mean(numeric_only=True))
-    # Step 4: Ensure all numeric columns are properly converted to numeric type (coerce errors to NaN)
-    for col in df.select_dtypes(include=['float64', 'int64']).columns:
-        df[col] = pd.to_numeric(df[col], errors='coerce')
-    # Step 5: Special handling for 'Amount' column (if exists)
-    if 'Amount' in df.columns:
-        # Remove commas, dollar signs, and strip any leading/trailing spaces
-        df['Amount'] = df['Amount'].str.replace(',', '').str.replace('$', '').str.strip()
-        # Convert 'Amount' to numeric, coercing errors into NaN
-        df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
-    # Step 6: Drop rows with any NaN values after cleaning (optional step, depending on how you want to handle NaN values)
-    df = df.dropna(how='any')

     return f"✅ Target column set to: {col_name}"
 def clean_data(df):
+    # Drop rows and columns where all values are NaN
     df = df.dropna(how='all', axis=1).dropna(how='all', axis=0)
+    # Convert columns with object (string) data type to strings
     for col in df.select_dtypes(include='object').columns:
         df[col] = df[col].astype(str)
+    # Ensure that the 'Amount' column is treated as a string before using str accessor
+    if 'Amount' in df.columns:
+        df['Amount'] = df['Amount'].astype(str).str.replace(',', '').str.replace('$', '').str.strip()
+    # Convert categorical columns to numeric using LabelEncoder
+    for col in df.select_dtypes(include='object').columns:
+        if col != 'Amount':  # Skip 'Amount' column as it was already cleaned
             df[col] = LabelEncoder().fit_transform(df[col])
+    # Fill missing values in numeric columns with the mean of each column
     df = df.fillna(df.mean(numeric_only=True))
+    return df