varshitha22 committed on
Commit
6a74ea1
·
verified ·
1 Parent(s): 7a27867

Update cancer.py

Browse files
Files changed (1) hide show
  1. cancer.py +15 -19
cancer.py CHANGED
@@ -15,30 +15,26 @@ from xgboost import XGBClassifier
15
  def load_data():
16
  return pd.read_csv('cancer_prediction_data (2).csv')
17
 
 
18
  # Data Preprocessing
19
  def preprocess_data(df):
20
- numeric = ['Age', 'Tumor_Size']
21
- ordinal = ['Tumor_Grade', 'Symptoms_Severity', 'Alcohol_Consumption', 'Exercise_Frequency']
22
- nominal = ['Gender', 'Family_History', 'Smoking_History']
23
 
24
  preprocess = ColumnTransformer([
25
- ('Numerical Transformation', Pipeline([
26
- ("Mean Imputation",SimpleImputer(strategy= 'mean')),
27
- ('Scaling',StandardScaler())
28
- ]), numeric),
29
-
30
- ('Ordinal Transformation', Pipeline([
31
- ("Mode Imputation",SimpleImputer(strategy= 'most_frequent')),
32
- ('Encoding',OrdinalEncoder())
33
- ]), ordinal),
34
-
35
- ('Nominal Transformation', Pipeline([
36
- ("Mode Imputation",SimpleImputer(strategy= 'most_frequent')),
37
- ('Encoding',OneHotEncoder())
38
- ]), nominal)], remainder='passthrough')
39
 
40
- X = df.drop('Cancer_Present', axis=1)
41
- y = df['Cancer_Present']
42
  return train_test_split(X, y, test_size=0.2, random_state=23), preprocess
43
 
44
  # Train Model
 
15
  def load_data():
16
  return pd.read_csv('cancer_prediction_data (2).csv')
17
 
18
# Data Preprocessing
def preprocess_data(df):
    """Split *df* into train/test sets and build an unfitted preprocessor.

    Feature columns are grouped by dtype *after* the target column has been
    removed, so the returned ``ColumnTransformer`` only references columns
    that actually exist in the feature matrix it will be fitted on.

    Args:
        df: DataFrame holding the feature columns plus the
            ``'Cancer_Present'`` target column.

    Returns:
        A tuple ``(splits, preprocess)``. ``splits`` is the
        ``[X_train, X_test, y_train, y_test]`` list returned by
        ``train_test_split`` (80/20 split, ``random_state=23``) and
        ``preprocess`` is the unfitted ``ColumnTransformer``.

    Raises:
        KeyError: If *df* has no ``'Cancer_Present'`` column.
    """
    # Separate the target first: selecting feature names from the full frame
    # would list 'Cancer_Present' as a feature, and the transformer would
    # then fail at fit time because X does not contain that column.
    X = df.drop('Cancer_Present', axis=1)
    y = df['Cancer_Present']

    # Dynamically identify categorical and numerical features. Columns of any
    # other dtype are left untouched by remainder='passthrough' below.
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    numerical_features = X.select_dtypes(
        include=['int64', 'float64']
    ).columns.tolist()

    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Ignore categories unseen during fit instead of raising at predict
        # time; dense output keeps downstream estimators simple.
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    preprocess = ColumnTransformer([
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ], remainder='passthrough')

    return train_test_split(X, y, test_size=0.2, random_state=23), preprocess
39
 
40
  # Train Model