varshitha22 committed on
Commit
b1ff86e
·
verified ·
1 Parent(s): 1e8a483

Update cancer.py

Browse files
Files changed (1) hide show
  1. cancer.py +15 -12
cancer.py CHANGED
@@ -15,26 +15,29 @@ from xgboost import XGBClassifier
15
  def load_data():
16
  return pd.read_csv('cancer_prediction_data (2).csv')
17
 
18
- # Data Preprocessing
19
  # Data Preprocessing
20
  def preprocess_data(df):
21
- # Dynamically identify categorical and numerical features
22
- categorical_features = df.select_dtypes(include=['object']).columns
23
- numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
24
 
25
  preprocess = ColumnTransformer([
26
  ('num', Pipeline([
27
- ('imputer', SimpleImputer(strategy='mean')), # For numeric columns
28
  ('scaler', StandardScaler())
29
- ]), numerical_features),
30
- ('cat', Pipeline([
31
- ('imputer', SimpleImputer(strategy='most_frequent')), # For categorical columns
32
- ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
33
- ]), categorical_features)
 
 
 
 
34
  ], remainder='passthrough')
35
 
36
- x = df.drop('Cancer_Present', axis=1) # Drop target column
37
- y = df['Cancer_Present'] # Target column
38
  return train_test_split(x, y, test_size=0.2, random_state=23), preprocess
39
 
40
  # Train Model
 
15
  def load_data():
16
  return pd.read_csv('cancer_prediction_data (2).csv')
17
 
 
18
  # Data Preprocessing
19
  def preprocess_data(df):
20
+ numeric = ['Age', 'Tumor_Size']
21
+ ordinal = ['Tumor_Grade', 'Symptoms_Severity', 'Alcohol_Consumption', 'Exercise_Frequency']
22
+ nominal = ['Gender', 'Family_History', 'Smoking_History']
23
 
24
  preprocess = ColumnTransformer([
25
  ('num', Pipeline([
26
+ ('imputer', SimpleImputer(strategy='mean')),
27
  ('scaler', StandardScaler())
28
+ ]), numeric),
29
+ ('ord', Pipeline([
30
+ ('imputer', SimpleImputer(strategy='most_frequent')),
31
+ ('encoder', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
32
+ ]), ordinal),
33
+ ('nom', Pipeline([
34
+ ('imputer', SimpleImputer(strategy='most_frequent')),
35
+ ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
36
+ ]), nominal)
37
  ], remainder='passthrough')
38
 
39
+ x = df.drop('Cancer_Present', axis=1)
40
+ y = df['Cancer_Present']
41
  return train_test_split(x, y, test_size=0.2, random_state=23), preprocess
42
 
43
  # Train Model