Spaces:

varshitha22
/

Cancer_Prediction

Sleeping

varshitha22 committed on Feb 25, 2025

Commit

6a74ea1

verified ·

1 Parent(s): 7a27867

Update cancer.py

Files changed (1) hide show

cancer.py CHANGED Viewed

@@ -15,30 +15,26 @@ from xgboost import XGBClassifier
 def load_data():
     return pd.read_csv('cancer_prediction_data (2).csv')
 # Data Preprocessing
 def preprocess_data(df):
-    numeric = ['Age', 'Tumor_Size']
-    ordinal = ['Tumor_Grade', 'Symptoms_Severity', 'Alcohol_Consumption', 'Exercise_Frequency']
-    nominal = ['Gender', 'Family_History', 'Smoking_History']
     preprocess = ColumnTransformer([
-        ('Numerical Transformation', Pipeline([
-            ("Mean Imputation",SimpleImputer(strategy= 'mean')),
-            ('Scaling',StandardScaler())
-        ]), numeric),
-        ('Ordinal Transformation', Pipeline([
-            ("Mode Imputation",SimpleImputer(strategy= 'most_frequent')),
-            ('Encoding',OrdinalEncoder())
-        ]), ordinal),
-        ('Nominal Transformation', Pipeline([
-            ("Mode Imputation",SimpleImputer(strategy= 'most_frequent')),
-            ('Encoding',OneHotEncoder())
-        ]), nominal)], remainder='passthrough')
-    X = df.drop('Cancer_Present', axis=1)
-    y = df['Cancer_Present']
     return train_test_split(X, y, test_size=0.2, random_state=23), preprocess
 # Train Model

 def load_data():
     return pd.read_csv('cancer_prediction_data (2).csv')
+# Data Preprocessing
 # Data Preprocessing
 def preprocess_data(df):
+    """Split the data and build the (unfitted) preprocessing transformer.
+
+    Separates the 'Cancer_Present' target from the remaining columns, then
+    builds a ColumnTransformer that mean-imputes and standard-scales the
+    int64/float64 feature columns and mode-imputes and one-hot encodes the
+    object-dtype feature columns. Columns of any other dtype are passed
+    through unchanged.
+
+    Returns a 2-tuple: the result of
+    train_test_split(X, y, test_size=0.2, random_state=23), and the
+    transformer.
+    """
+    X = df.drop('Cancer_Present', axis=1)  # Drop target column
+    y = df['Cancer_Present']  # Target column
+    # Identify feature types from X, not df: df still contains the target,
+    # and a transformer that lists 'Cancer_Present' cannot be fitted on a
+    # frame where that column has been dropped (presumably the X splits
+    # returned below -- confirm against the training code).
+    categorical_features = X.select_dtypes(include=['object']).columns
+    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
     preprocess = ColumnTransformer([
+        ('num', Pipeline([
+            ('imputer', SimpleImputer(strategy='mean')),  # For numeric columns
+            ('scaler', StandardScaler())
+        ]), numerical_features),
+        ('cat', Pipeline([
+            ('imputer', SimpleImputer(strategy='most_frequent')),  # For categorical columns
+            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
+        ]), categorical_features)
+    ], remainder='passthrough')
     return train_test_split(X, y, test_size=0.2, random_state=23), preprocess
 # Train Model