"""Streamlit app: train a classifier on a cancer dataset and make predictions.

The script loads a CSV, builds a scikit-learn preprocessing step, lets the
user pick and train one of several classifiers, and then predicts
``Cancer_Present`` for values entered through the UI widgets.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Single source of truth for the selectable models. Classes (not instances)
# are stored so that each training run gets a fresh, unfitted estimator and
# only the chosen one is ever constructed.
MODEL_FACTORIES = {
    'Decision Tree': DecisionTreeClassifier,
    'SVM': SVC,
    'Logistic Regression': LogisticRegression,
    'KNN': KNeighborsClassifier,
}


# Load dataset
def load_data():
    """Read the dataset CSV from the current working directory.

    Returns:
        The file contents as a pandas DataFrame.
    """
    df = pd.read_csv('cancer_prediction_data (2).csv')
    return df


# Data Preprocessing
def preprocess_data(df):
    """Split ``df`` into train/test sets and build the (unfitted) preprocessor.

    Args:
        df: DataFrame containing the feature columns named below and the
            target column ``Cancer_Present``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, preprocess)`` where
        ``preprocess`` is an unfitted ``ColumnTransformer``.
    """
    numeric = ['Age', 'Tumor_Size']
    ordinal = ['Tumor_Grade', 'Symptoms_Severity', 'Alcohol_Consumption',
               'Exercise_Frequency']
    nominal = ['Gender', 'Family_History', 'Smoking_History']

    # Pipelines
    numeric_preprocess = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    ordinal_preprocess = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories not seen during fit are encoded as -1 instead of raising.
        ('encoder', OrdinalEncoder(handle_unknown="use_encoded_value",
                                   unknown_value=-1)),
    ])
    nominal_preprocess = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories not seen during fit become an all-zero one-hot row.
        ('encoder', OneHotEncoder(sparse_output=False,
                                  handle_unknown='ignore')),
    ])

    # Column Transformer
    preprocess = ColumnTransformer([
        ('num', numeric_preprocess, numeric),
        ('ord', ordinal_preprocess, ordinal),
        ('nom', nominal_preprocess, nominal),
    ], remainder='passthrough')

    X = df.drop('Cancer_Present', axis=1)
    y = df['Cancer_Present']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=23
    )
    return X_train, X_test, y_train, y_test, preprocess


# Train Models
def train_model(X_train, y_train, preprocess, model_name):
    """Fit a preprocessing + classifier pipeline on the training data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        preprocess: Transformer used as the pipeline's first step.
        model_name: One of the keys of ``MODEL_FACTORIES``.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.

    Raises:
        KeyError: If ``model_name`` is not a known model.
    """
    model = MODEL_FACTORIES[model_name]()
    pipeline = Pipeline([
        ('preprocessor', preprocess),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)
    return pipeline


# Streamlit UI
st.title("Cancer Prediction Using Machine Learning")

df = load_data()
X_train, X_test, y_train, y_test, preprocess = preprocess_data(df)

model_name = st.selectbox("Select Model", list(MODEL_FACTORIES))

if st.button("Train Model"):
    model = train_model(X_train, y_train, preprocess, model_name)
    accuracy = model.score(X_test, y_test)
    st.write(f"Model Accuracy: {accuracy:.2f}")
    # Streamlit reruns the script on every interaction; keep the fitted
    # pipeline in session state so the prediction button can reuse it.
    st.session_state['trained_model'] = model
    st.success("Model trained successfully!")

# Prediction Section
st.header("Make a Prediction")

# NOTE(review): the widgets below offer numeric codes. This assumes the CSV
# stores these features with the same numeric coding - confirm against the
# data; values unseen in training are silently encoded as "unknown" by the
# encoders configured in preprocess_data.
age = st.number_input("Age", min_value=18, max_value=100, value=30)
tumor_size = st.number_input("Tumor Size", min_value=1.0, max_value=10.0,
                             value=5.0)
tumor_grade = st.selectbox("Tumor Grade", [1, 2, 3])
symptoms_severity = st.selectbox("Symptoms Severity", [1, 2, 3])
smoking_history = st.selectbox("Smoking History", [0, 1, 2])
alcohol_consumption = st.selectbox("Alcohol Consumption", [0, 1, 2, 3])
exercise_frequency = st.selectbox("Exercise Frequency", [0, 1, 2, 3])
gender = st.selectbox("Gender", [0, 1])
family_history = st.selectbox("Family History", [0, 1])

# Key every value by its column name so the input row does not depend on
# the column order of the CSV file.
feature_values = {
    'Age': age,
    'Tumor_Size': tumor_size,
    'Tumor_Grade': tumor_grade,
    'Symptoms_Severity': symptoms_severity,
    'Smoking_History': smoking_history,
    'Alcohol_Consumption': alcohol_consumption,
    'Exercise_Frequency': exercise_frequency,
    'Gender': gender,
    'Family_History': family_history,
}

if st.button("Predict Cancer Presence"):
    if 'trained_model' in st.session_state:
        model = st.session_state['trained_model']
        # Training columns that have no widget cannot be filled in; report
        # them instead of passing NaN through to the classifier.
        missing = [col for col in X_train.columns if col not in feature_values]
        if missing:
            st.error(f"No input available for dataset columns: {missing}")
        else:
            # Align the single input row to the training column order.
            input_df = pd.DataFrame([feature_values],
                                    columns=list(X_train.columns))
            # The fitted pipeline applies the same preprocessing before
            # calling the classifier.
            prediction = model.predict(input_df)
            st.write("Cancer Prediction:",
                     "Positive" if prediction[0] == 1 else "Negative")
    else:
        st.error("Please train a model first!")