# Rajusunnam's picture
# Rename app (2).py to app.py
# cf95905 verified
import streamlit as st
import pandas as pd
import warnings
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Suppress all Python warnings for this process.
warnings.filterwarnings(action="ignore")

# Page header: main title followed by the attribution line.
APP_TITLE = "Cancer Prediction App"
APP_ATTRIBUTION = "### Powered by Innomatics Research Lab"
st.title(APP_TITLE)
st.markdown(APP_ATTRIBUTION)
# Load dataset
@st.cache_data
def load_data():
    """Read the cancer prediction dataset from its CSV file.

    The file name is relative, so it is resolved against the process's
    current working directory. The function is wrapped in
    ``st.cache_data``.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    csv_path = 'cancer_prediction_data (2).csv'
    dataset = pd.read_csv(csv_path)
    return dataset
data = load_data()

# Name of the label column the classifiers are trained to predict.
target_col = 'Cancer_Present'

# Define feature categories
numerical_features = ['Age', 'Tumor_Size']
categorical_features = [
    'Gender', 'Tumor_Grade', 'Symptoms_Severity', 'Family_History',
    'Smoking_History', 'Alcohol_Consumption', 'Exercise_Frequency',
]

# Ensure target column exists
if target_col not in data.columns:
    st.error(f"Target column '{target_col}' not found in data!")
    st.stop()

# Ensure every declared feature column exists as well. Without this check a
# missing column only surfaces later, as a raw traceback from the
# ColumnTransformer during fitting, instead of a readable message.
missing_features = [
    column
    for column in numerical_features + categorical_features
    if column not in data.columns
]
if missing_features:
    st.error(f"Feature column(s) not found in data: {', '.join(missing_features)}")
    st.stop()

# Split features and target
X = data.drop(columns=[target_col])
y = data[target_col]
# Preprocessing pipeline
def create_preprocessing_pipeline():
    """Build the column-wise preprocessing step used ahead of the classifier.

    Columns listed in ``numerical_features`` are mean-imputed and then
    standardized. Columns listed in ``categorical_features`` are imputed with
    their most frequent value and then one-hot encoded, with
    ``handle_unknown='ignore'`` set on the encoder.

    Returns:
        An unfitted ``ColumnTransformer`` combining both sub-pipelines.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
    column_transformers = [
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ]
    return ColumnTransformer(transformers=column_transformers)
# Sidebar - Select Algorithm
st.sidebar.header("Model Selection")
algorithm = st.sidebar.radio("Choose an Algorithm", ["SVM", "Random Forest", "Gradient Boosting"])

# Map each selectable algorithm to a zero-argument factory so that only the
# chosen estimator is instantiated. The tree ensembles are seeded (matching
# the seed used for the train/test split) so the reported accuracy is
# reproducible across app restarts.
model_dict = {
    "SVM": SVC,
    "Random Forest": lambda: RandomForestClassifier(random_state=42),
    "Gradient Boosting": lambda: GradientBoostingClassifier(random_state=42),
}


@st.cache_resource
def _train_pipeline(algorithm_name, features, target):
    """Fit the preprocessing + classifier pipeline and score it on a hold-out set.

    Streamlit re-executes the whole script on every widget interaction, so
    training at module level would refit the model each time a slider moves.
    Caching on the arguments means each algorithm is trained once per dataset.

    Args:
        algorithm_name: Key into ``model_dict`` selecting the classifier.
        features: Feature DataFrame (target column removed).
        target: Target labels aligned with ``features``.

    Returns:
        A ``(fitted_pipeline, accuracy)`` tuple, where ``accuracy`` is the
        mean accuracy on the 20% hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    fitted = Pipeline([
        ('preprocessing', create_preprocessing_pipeline()),
        ('classifier', model_dict[algorithm_name]()),
    ])
    fitted.fit(X_train, y_train)
    return fitted, fitted.score(X_test, y_test)


# Train model (served from cache after the first run for a given algorithm)
pipeline, accuracy = _train_pipeline(algorithm, X, y)
st.sidebar.write(f"**{algorithm} Accuracy:** {accuracy * 100:.2f}%")
# Sidebar - User input
def user_input_features():
    """Collect one patient's attributes from sidebar widgets.

    Two sliders capture the numeric fields and seven select boxes capture
    the categorical fields.

    Returns:
        A single-row DataFrame whose column names are the feature names
        declared at module level.
    """
    sidebar = st.sidebar
    # Dict entries are evaluated top to bottom, so the widgets are created in
    # the order listed here.
    patient = {
        'Age': sidebar.slider("Age", 0, 120, 50),
        'Tumor_Size': sidebar.slider("Tumor Size", 0.0, 100.0, 5.0),
        'Gender': sidebar.selectbox("Gender", ["Male", "Female"]),
        'Tumor_Grade': sidebar.selectbox("Tumor Grade", ["Low", "Medium", "High"]),
        'Symptoms_Severity': sidebar.selectbox("Symptoms Severity", ["Mild", "Moderate", "Severe"]),
        'Family_History': sidebar.selectbox("Family History", ["Yes", "No"]),
        'Smoking_History': sidebar.selectbox("Smoking History", ["Current Smoker", "Non-Smoker"]),
        'Alcohol_Consumption': sidebar.selectbox("Alcohol Consumption", ["Low", "Moderate", "High"]),
        'Exercise_Frequency': sidebar.selectbox("Exercise Frequency", ["Never", "Rarely", "Occasionally", "Often"]),
    }
    # Wrap each scalar in a one-element list to form a one-row frame.
    return pd.DataFrame({column: [value] for column, value in patient.items()})
# Gather the patient's values from the sidebar and echo them on the main page.
st.sidebar.markdown("### Patient Data Input")
input_df = user_input_features()

st.subheader("User Input Data")
st.write(input_df)

# Prediction
if st.button("Predict Cancer Presence"):
    prediction = pipeline.predict(input_df)
    # NOTE(review): assumes the target column encodes "cancer present" as the
    # integer 1 — confirm against the dataset; any other label falls through
    # to "No Cancer Detected".
    result = "Cancer Detected" if prediction[0] == 1 else "No Cancer Detected"
    # st.subheader already renders its body as a heading, so a leading "###"
    # would be displayed as literal text; pass the plain message instead.
    st.subheader(f"Prediction: {result}")