varshitha22 committed on
Commit
8a4d07b
·
verified ·
1 Parent(s): 424eb46

Update cancer.py

Browse files
Files changed (1) hide show
  1. cancer.py +22 -15
cancer.py CHANGED
@@ -10,6 +10,7 @@ from sklearn.linear_model import LogisticRegression
10
  from sklearn.neighbors import KNeighborsClassifier
11
  from sklearn.ensemble import RandomForestClassifier
12
  from xgboost import XGBClassifier
 
13
 
14
  # Load dataset
15
  def load_data():
@@ -34,21 +35,28 @@ def preprocess_data(df):
34
  ('imputer', SimpleImputer(strategy='most_frequent')),
35
  ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
36
  ]), nominal)
37
- ], remainder='passthrough')
38
 
39
  x = df.drop('Cancer_Present', axis=1)
40
  y = df['Cancer_Present']
41
- return train_test_split(x, y, test_size=0.2, random_state=23), preprocess
 
 
 
 
 
 
42
 
43
  # Train Model
44
  def train_model(x_train, y_train, preprocess, model_name):
45
  models = {
46
- 'Decision Tree': DecisionTreeClassifier(),
47
- 'Logistic Regression': LogisticRegression(),
48
- 'KNN': KNeighborsClassifier(),
49
- 'Random Forest': RandomForestClassifier(),
50
- 'XGBoost': XGBClassifier()
51
  }
 
52
  pipeline = Pipeline([
53
  ('preprocessor', preprocess),
54
  ('classifier', models[model_name])
@@ -62,9 +70,10 @@ st.set_page_config(page_title='Cancer Prediction App', layout='wide')
62
  with st.sidebar:
63
  st.markdown("### Select Machine Learning Model")
64
  model_name = st.radio("Choose a Model", ['Decision Tree', 'Logistic Regression', 'KNN', 'Random Forest', 'XGBoost'])
 
65
  if st.button("Train Model"):
66
  df = load_data()
67
- (x_train, x_test, y_train, y_test), preprocess = preprocess_data(df)
68
  model = train_model(x_train, y_train, preprocess, model_name)
69
  accuracy = model.score(x_test, y_test)
70
  st.session_state['trained_model'] = model
@@ -85,8 +94,8 @@ with col1:
85
 
86
  with col2:
87
  smoking_history = st.selectbox("Smoking History", ['Never Smoker', 'Former Smoker', 'Current Smoker'])
88
- alcohol_consumption = st.selectbox("Alcohol Consumption", ['Low','Moderate','High'])
89
- exercise_frequency = st.selectbox("Exercise Frequency", ['Rarely', 'Occasionally', 'Regularly','Never'])
90
  gender = st.selectbox("Gender", ['Male', 'Female'])
91
  family_history = st.selectbox("Family History", ["No", "Yes"])
92
 
@@ -105,11 +114,8 @@ if st.button("Predict Cancer Presence"):
105
  for col in ['Age', 'Tumor_Size']:
106
  input_df[col] = pd.to_numeric(input_df[col], errors='coerce')
107
 
108
- # Apply preprocessing
109
- input_transformed = model.named_steps['preprocessor'].transform(input_df)
110
-
111
- # Make prediction
112
- prediction = model.named_steps['classifier'].predict(input_transformed)
113
 
114
  if prediction[0] == 1:
115
  st.markdown("<h3 style='color: red;'>Cancer Prediction: Positive 🟥</h3>", unsafe_allow_html=True)
@@ -120,3 +126,4 @@ if st.button("Predict Cancer Presence"):
120
  else:
121
  st.error("Please train a model first!")
122
 
 
 
10
  from sklearn.neighbors import KNeighborsClassifier
11
  from sklearn.ensemble import RandomForestClassifier
12
  from xgboost import XGBClassifier
13
+ from imblearn.over_sampling import SMOTE # For handling class imbalance
14
 
15
  # Load dataset
16
  def load_data():
 
35
  ('imputer', SimpleImputer(strategy='most_frequent')),
36
  ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
37
  ]), nominal)
38
+ ], remainder='drop') # Drop unlisted columns
39
 
40
  x = df.drop('Cancer_Present', axis=1)
41
  y = df['Cancer_Present']
42
+
43
+ # Handling class imbalance using SMOTE
44
+ x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=23, stratify=y)
45
+ smote = SMOTE(random_state=23)
46
+ x_train, y_train = smote.fit_resample(x_train, y_train)
47
+
48
+ return x_train, x_test, y_train, y_test, preprocess
49
 
50
  # Train Model
51
  def train_model(x_train, y_train, preprocess, model_name):
52
  models = {
53
+ 'Decision Tree': DecisionTreeClassifier(max_depth=5),
54
+ 'Logistic Regression': LogisticRegression(max_iter=1000),
55
+ 'KNN': KNeighborsClassifier(n_neighbors=5),
56
+ 'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5),
57
+ 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
58
  }
59
+
60
  pipeline = Pipeline([
61
  ('preprocessor', preprocess),
62
  ('classifier', models[model_name])
 
70
  with st.sidebar:
71
  st.markdown("### Select Machine Learning Model")
72
  model_name = st.radio("Choose a Model", ['Decision Tree', 'Logistic Regression', 'KNN', 'Random Forest', 'XGBoost'])
73
+
74
  if st.button("Train Model"):
75
  df = load_data()
76
+ x_train, x_test, y_train, y_test, preprocess = preprocess_data(df)
77
  model = train_model(x_train, y_train, preprocess, model_name)
78
  accuracy = model.score(x_test, y_test)
79
  st.session_state['trained_model'] = model
 
94
 
95
  with col2:
96
  smoking_history = st.selectbox("Smoking History", ['Never Smoker', 'Former Smoker', 'Current Smoker'])
97
+ alcohol_consumption = st.selectbox("Alcohol Consumption", ['Low', 'Moderate', 'High'])
98
+ exercise_frequency = st.selectbox("Exercise Frequency", ['Rarely', 'Occasionally', 'Regularly', 'Never'])
99
  gender = st.selectbox("Gender", ['Male', 'Female'])
100
  family_history = st.selectbox("Family History", ["No", "Yes"])
101
 
 
114
  for col in ['Age', 'Tumor_Size']:
115
  input_df[col] = pd.to_numeric(input_df[col], errors='coerce')
116
 
117
+ # Apply preprocessing using the same pipeline
118
+ prediction = model.predict(input_df)
 
 
 
119
 
120
  if prediction[0] == 1:
121
  st.markdown("<h3 style='color: red;'>Cancer Prediction: Positive 🟥</h3>", unsafe_allow_html=True)
 
126
  else:
127
  st.error("Please train a model first!")
128
 
129
+