"""Streamlit app that trains a classifier on a cancer dataset and predicts
cancer presence for a patient described through sidebar widgets."""

import warnings

import pandas as pd
import streamlit as st
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# Filter out warnings
warnings.filterwarnings("ignore")

st.title("Cancer Prediction App")
st.markdown("### Powered by Innomatics Research Lab")

DATA_PATH = 'cancer_prediction_data (2).csv'


# Load dataset
@st.cache_data
def load_data():
    """Read the training CSV into a DataFrame (cached across reruns)."""
    return pd.read_csv(DATA_PATH)


try:
    data = load_data()
except FileNotFoundError:
    # Show a readable message instead of a raw traceback when the CSV is absent.
    st.error(f"Data file '{DATA_PATH}' not found!")
    st.stop()

# Ensure target column exists
target_col = 'Cancer_Present'
if target_col not in data.columns:
    st.error(f"Target column '{target_col}' not found in data!")
    st.stop()

# Split features and target
X = data.drop(columns=[target_col])
y = data[target_col]

# Define feature categories
numerical_features = ['Age', 'Tumor_Size']
categorical_features = [
    'Gender',
    'Tumor_Grade',
    'Symptoms_Severity',
    'Family_History',
    'Smoking_History',
    'Alcohol_Consumption',
    'Exercise_Frequency',
]

# Ensure every feature the preprocessing step selects is present, mirroring
# the target-column check above.
missing_features = [
    col for col in numerical_features + categorical_features
    if col not in X.columns
]
if missing_features:
    st.error(f"Feature column(s) not found in data: {', '.join(missing_features)}")
    st.stop()


# Preprocessing pipeline
def create_preprocessing_pipeline():
    """Build an unfitted ColumnTransformer for the declared feature lists.

    Numerical columns are mean-imputed then standardized; categorical columns
    are imputed with the most frequent value then one-hot encoded. Categories
    not seen during fitting are ignored by the encoder rather than raising.
    """
    numerical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    return ColumnTransformer([
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ])


# Sidebar - Select Algorithm
st.sidebar.header("Model Selection")
algorithm = st.sidebar.radio(
    "Choose an Algorithm",
    ["SVM", "Random Forest", "Gradient Boosting"],
)

# Candidate models. The tree ensembles are seeded so that the reported
# accuracy and the predictions are reproducible between runs.
model_dict = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}


@st.cache_resource
def train_model(algorithm_name):
    """Fit the preprocessing + classifier pipeline for ``algorithm_name``.

    Cached per algorithm name so the model is not refit on every widget
    interaction (Streamlit reruns the whole script each time).

    Returns:
        A ``(fitted_pipeline, test_accuracy)`` tuple, where the accuracy is
        measured on a held-out 20% split made with ``random_state=42``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Create pipeline
    fitted = Pipeline([
        ('preprocessing', create_preprocessing_pipeline()),
        ('classifier', model_dict[algorithm_name]),
    ])
    fitted.fit(X_train, y_train)
    return fitted, fitted.score(X_test, y_test)


# Train model
pipeline, accuracy = train_model(algorithm)
st.sidebar.write(f"**{algorithm} Accuracy:** {accuracy * 100:.2f}%")


# Sidebar - User input
def user_input_features():
    """Render the patient-input widgets and return a one-row DataFrame.

    The returned columns use the same names as the declared numerical and
    categorical feature lists, so the frame can be passed to the pipeline.
    """
    # NOTE(review): the option labels are hard-coded; any label that does not
    # occur in the training data is silently encoded as "no category" because
    # the encoder uses handle_unknown='ignore' — confirm they match the CSV.
    age = st.sidebar.slider("Age", 0, 120, 50)
    tumor_size = st.sidebar.slider("Tumor Size", 0.0, 100.0, 5.0)
    gender = st.sidebar.selectbox("Gender", ["Male", "Female"])
    tumor_grade = st.sidebar.selectbox("Tumor Grade", ["Low", "Medium", "High"])
    symptoms_severity = st.sidebar.selectbox(
        "Symptoms Severity", ["Mild", "Moderate", "Severe"]
    )
    family_history = st.sidebar.selectbox("Family History", ["Yes", "No"])
    smoking_history = st.sidebar.selectbox(
        "Smoking History", ["Current Smoker", "Non-Smoker"]
    )
    alcohol_consumption = st.sidebar.selectbox(
        "Alcohol Consumption", ["Low", "Moderate", "High"]
    )
    exercise_frequency = st.sidebar.selectbox(
        "Exercise Frequency", ["Never", "Rarely", "Occasionally", "Often"]
    )
    return pd.DataFrame({
        'Age': [age],
        'Tumor_Size': [tumor_size],
        'Gender': [gender],
        'Tumor_Grade': [tumor_grade],
        'Symptoms_Severity': [symptoms_severity],
        'Family_History': [family_history],
        'Smoking_History': [smoking_history],
        'Alcohol_Consumption': [alcohol_consumption],
        'Exercise_Frequency': [exercise_frequency],
    })


st.sidebar.markdown("### Patient Data Input")
input_df = user_input_features()

st.subheader("User Input Data")
st.write(input_df)

# Prediction
if st.button("Predict Cancer Presence"):
    prediction = pipeline.predict(input_df)
    # NOTE(review): assumes the target column is encoded as 0/1 — confirm.
    result = "Cancer Detected" if prediction[0] == 1 else "No Cancer Detected"
    # st.subheader already renders a heading, so no markdown "###" prefix.
    st.subheader(f"Prediction: {result}")