Spaces:

varshitha22
/

Cancer_Prediction

Sleeping

varshitha22 committed on Feb 25, 2025

Commit

b1ff86e

verified ·

1 Parent(s): 1e8a483

Update cancer.py

Files changed (1) hide show

cancer.py CHANGED Viewed

@@ -15,26 +15,29 @@ from xgboost import XGBClassifier
 def load_data():
     return pd.read_csv('cancer_prediction_data (2).csv')
-# Data Preprocessing
 # Data Preprocessing
 def preprocess_data(df):
-    # Dynamically identify categorical and numerical features
-    categorical_features = df.select_dtypes(include=['object']).columns
-    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
     preprocess = ColumnTransformer([
         ('num', Pipeline([
-            ('imputer', SimpleImputer(strategy='mean')),  # For numeric columns
             ('scaler', StandardScaler())
-        ]), numerical_features),
-        ('cat', Pipeline([
-            ('imputer', SimpleImputer(strategy='most_frequent')),  # For categorical columns
-            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
-        ]), categorical_features)
     ], remainder='passthrough')
-    x = df.drop('Cancer_Present', axis=1)  # Drop target column
-    y = df['Cancer_Present']  # Target column
     return train_test_split(x, y, test_size=0.2, random_state=23), preprocess
 # Train Model

 def load_data():
     return pd.read_csv('cancer_prediction_data (2).csv')
 # Data Preprocessing
 def preprocess_data(df):
+    numeric = ['Age', 'Tumor_Size']
+    ordinal = ['Tumor_Grade', 'Symptoms_Severity', 'Alcohol_Consumption', 'Exercise_Frequency']
+    nominal = ['Gender', 'Family_History', 'Smoking_History']
     preprocess = ColumnTransformer([
         ('num', Pipeline([
+            ('imputer', SimpleImputer(strategy='mean')),
             ('scaler', StandardScaler())
+        ]), numeric),
+        ('ord', Pipeline([
+            ('imputer', SimpleImputer(strategy='most_frequent')),
+            ('encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
+        ]), ordinal),
+        ('nom', Pipeline([
+            ('imputer', SimpleImputer(strategy='most_frequent')),
+            ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
+        ]), nominal)
     ], remainder='passthrough')
+    x = df.drop('Cancer_Present', axis=1)
+    y = df['Cancer_Present']
     return train_test_split(x, y, test_size=0.2, random_state=23), preprocess
 # Train Model