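# NOTE: the three lines below appear to be repository-page residue (uploader
# caption, upload message, commit id) captured along with the file. They are
# kept as comments so the module remains valid Python.
# prahalya's picture
# Upload 3 files
# b9ef127 verified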
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Load dataset
def load_data() -> pd.DataFrame:
    """Read the cancer prediction CSV and return it as a DataFrame.

    The file name is resolved relative to the current working directory.
    """
    return pd.read_csv('cancer_prediction_data (2).csv')
# Data Preprocessing
def preprocess_data(df: pd.DataFrame):
    """Split ``df`` into train/test sets and build the feature preprocessor.

    Despite the name, nothing is transformed here: the returned
    ``ColumnTransformer`` is unfitted and is meant to be fitted as the first
    step of a model pipeline.

    Args:
        df: Frame containing the feature columns and a ``Cancer_Present``
            target column.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, preprocess)`` where
        the split holds out 20% of the rows with a fixed seed (23).
    """
    # Separate the target from the features before anything else.
    target = df['Cancer_Present']
    features = df.drop(columns='Cancer_Present')

    def categorical_steps(encoder):
        # Both categorical groups fill gaps with the most frequent value and
        # differ only in the encoder applied afterwards.
        return Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', encoder),
        ])

    scaled_numeric = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    # Categories unseen during fit are encoded as -1 instead of raising.
    ordinal_encoded = categorical_steps(
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    )
    # Categories unseen during fit become an all-zero one-hot row.
    one_hot_encoded = categorical_steps(
        OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    )

    # Columns that are not listed in any group are passed through unchanged.
    preprocess = ColumnTransformer(
        [
            ('num', scaled_numeric, ['Age', 'Tumor_Size']),
            ('ord', ordinal_encoded, ['Tumor_Grade', 'Symptoms_Severity',
                                      'Alcohol_Consumption', 'Exercise_Frequency']),
            ('nom', one_hot_encoded, ['Gender', 'Family_History', 'Smoking_History']),
        ],
        remainder='passthrough',
    )

    split = train_test_split(features, target, test_size=0.2, random_state=23)
    return (*split, preprocess)
# Train Models
def train_model(X_train, y_train, preprocess, model_name):
    """Fit a preprocessing + classifier pipeline on the training data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        preprocess: Transformer used as the pipeline's ``'preprocessor'``
            step; it is fitted together with the classifier.
        model_name: One of ``'Decision Tree'``, ``'SVM'``,
            ``'Logistic Regression'`` or ``'KNN'``.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and
        ``'classifier'``.

    Raises:
        KeyError: If ``model_name`` is not one of the supported names.
    """
    # Map each display name to its estimator class; all use default settings.
    estimator_classes = {
        'Decision Tree': DecisionTreeClassifier,
        'SVM': SVC,
        'Logistic Regression': LogisticRegression,
        'KNN': KNeighborsClassifier,
    }
    classifier = estimator_classes[model_name]()

    steps = [('preprocessor', preprocess), ('classifier', classifier)]
    # Pipeline.fit returns the pipeline itself, so it can be returned directly.
    return Pipeline(steps).fit(X_train, y_train)
# Streamlit UI
st.title("Cancer Prediction Using Machine Learning")

# Streamlit re-executes this script on every interaction, so the data is
# loaded and split again on each run; the fixed seed inside preprocess_data
# keeps the split identical between runs.
df = load_data()
X_train, X_test, y_train, y_test, preprocess = preprocess_data(df)

# These names must match the keys understood by train_model.
model_choices = ['Decision Tree', 'SVM', 'Logistic Regression', 'KNN']
model_name = st.selectbox("Select Model", model_choices)

train_clicked = st.button("Train Model")
if train_clicked:
    model = train_model(X_train, y_train, preprocess, model_name)
    # Report accuracy on the held-out test split.
    accuracy = model.score(X_test, y_test)
    st.write(f"Model Accuracy: {accuracy:.2f}")
    # Keep the fitted pipeline in the session so it survives later reruns
    # (e.g. when the prediction button is pressed).
    st.session_state['trained_model'] = model
    st.success("Model trained successfully!")
# Prediction Section
st.header("Make a Prediction")

# NOTE(review): the widget options below assume the CSV stores these features
# as numeric codes; confirm against the dataset, since a value the encoders
# never saw during fit is encoded as "unknown" rather than rejected.
age = st.number_input("Age", min_value=18, max_value=100, value=30)
tumor_size = st.number_input("Tumor Size", min_value=1.0, max_value=10.0, value=5.0)
tumor_grade = st.selectbox("Tumor Grade", [1, 2, 3])
symptoms_severity = st.selectbox("Symptoms Severity", [1, 2, 3])
smoking_history = st.selectbox("Smoking History", [0, 1, 2])
alcohol_consumption = st.selectbox("Alcohol Consumption", [0, 1, 2, 3])
exercise_frequency = st.selectbox("Exercise Frequency", [0, 1, 2, 3])
gender = st.selectbox("Gender", [0, 1])
family_history = st.selectbox("Family History", [0, 1])

# Key every value by the column it belongs to. Labelling a positional list
# with X_train.columns would silently put values under the wrong feature
# whenever the CSV column order differs from the order of the widgets.
input_row = {
    'Age': age,
    'Tumor_Size': tumor_size,
    'Tumor_Grade': tumor_grade,
    'Symptoms_Severity': symptoms_severity,
    'Smoking_History': smoking_history,
    'Alcohol_Consumption': alcohol_consumption,
    'Exercise_Frequency': exercise_frequency,
    'Gender': gender,
    'Family_History': family_history,
}

if st.button("Predict Cancer Presence"):
    if 'trained_model' in st.session_state:
        model = st.session_state['trained_model']
        # Training columns for which this form collects no value (for
        # example an extra column in the CSV) cannot be filled in here.
        missing_columns = [
            str(column) for column in X_train.columns if column not in input_row
        ]
        if missing_columns:
            st.error(
                "Cannot predict: no input is collected for column(s): "
                + ", ".join(missing_columns)
            )
        else:
            # Present the single row in exactly the training column order.
            input_df = pd.DataFrame([input_row])[list(X_train.columns)]
            # The stored pipeline applies its fitted preprocessor and then
            # the classifier, so no manual transform step is needed.
            prediction = model.predict(input_df)
            st.write("Cancer Prediction:", "Positive" if prediction[0] == 1 else "Negative")
    else:
        st.error("Please train a model first!")