import streamlit as st
import pandas as pd
import warnings
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Install a blanket "ignore" filter: no category, module or message pattern
# is given, so every warning raised after this point is suppressed.
warnings.filterwarnings("ignore")

# Page heading: a title followed by a level-3 markdown heading.
st.title("Cancer Prediction App")
st.markdown("### Powered by Innomatics Research Lab")

# Load dataset
@st.cache_data
def load_data(path: str = 'cancer_prediction_data (2).csv') -> pd.DataFrame:
    """Read the cancer dataset from a CSV file.

    The function is wrapped in ``st.cache_data``, so the file is parsed once
    per distinct ``path`` and later script reruns reuse the cached frame.

    Args:
        path: Location of the CSV file. Defaults to the file name the app
            has always used, so ``load_data()`` behaves as before.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by
            ``pandas.read_csv``).
    """
    return pd.read_csv(path)

data = load_data()

# Ensure target column exists
target_col = 'Cancer_Present'
if target_col not in data.columns:
    st.error(f"Target column '{target_col}' not found in data!")
    st.stop()

# Define feature categories
numerical_features = ['Age', 'Tumor_Size']
categorical_features = ['Gender', 'Tumor_Grade', 'Symptoms_Severity', 'Family_History',
                        'Smoking_History', 'Alcohol_Consumption', 'Exercise_Frequency']

# Ensure every feature column exists. Without this check a missing column
# only surfaces later as an error raised while fitting the ColumnTransformer.
missing_features = [
    col for col in numerical_features + categorical_features
    if col not in data.columns
]
if missing_features:
    st.error(f"Feature column(s) not found in data: {', '.join(missing_features)}")
    st.stop()

# Split features and target
X = data.drop(columns=[target_col])
y = data[target_col]

# Preprocessing pipeline
def create_preprocessing_pipeline():
    """Build the (unfitted) column-wise preprocessing transformer.

    Columns listed in ``numerical_features`` are mean-imputed and then
    standardised; columns listed in ``categorical_features`` are imputed with
    the most frequent value and then one-hot encoded, with categories unseen
    during fitting ignored at transform time.

    Returns:
        A ``ColumnTransformer`` with a ``'num'`` and a ``'cat'`` branch.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
    branches = [
        ('num', Pipeline(numeric_steps), numerical_features),
        ('cat', Pipeline(categorical_steps), categorical_features),
    ]
    return ColumnTransformer(branches)

preprocess = create_preprocessing_pipeline()

# Sidebar - Select Algorithm
st.sidebar.header("Model Selection")
algorithm = st.sidebar.radio("Choose an Algorithm", ["SVM", "Random Forest", "Gradient Boosting"])

# Candidate models. The tree ensembles are seeded so that the reported
# accuracy and the predictions do not change from one run to the next.
model_dict = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Hold out 20% of the rows for the accuracy estimate (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


@st.cache_resource
def _fit_pipeline(algorithm_name):
    """Fit preprocessing + classifier for ``algorithm_name`` and score it.

    Streamlit re-executes the whole script on every widget interaction.
    Caching on the algorithm name means each model is trained once instead
    of on every rerun.

    Args:
        algorithm_name: Key into ``model_dict``.

    Returns:
        Tuple ``(fitted_pipeline, test_accuracy)``.
    """
    fitted = Pipeline([
        ('preprocessing', preprocess),
        ('classifier', model_dict[algorithm_name]),
    ])
    fitted.fit(X_train, y_train)
    return fitted, fitted.score(X_test, y_test)


# Create pipeline / train model (served from the cache after the first run)
pipeline, accuracy = _fit_pipeline(algorithm)
st.sidebar.write(f"**{algorithm} Accuracy:** {accuracy * 100:.2f}%")

# Sidebar - User input
def user_input_features():
    """Render the patient input widgets in the sidebar and collect the values.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are the model's feature
        names, holding the values currently selected in the sidebar.
    """
    sidebar = st.sidebar
    # Dict entries are evaluated top to bottom, so the widgets appear in the
    # sidebar in exactly this order.
    selections = {
        'Age': sidebar.slider("Age", 0, 120, 50),
        'Tumor_Size': sidebar.slider("Tumor Size", 0.0, 100.0, 5.0),
        'Gender': sidebar.selectbox("Gender", ["Male", "Female"]),
        'Tumor_Grade': sidebar.selectbox("Tumor Grade", ["Low", "Medium", "High"]),
        'Symptoms_Severity': sidebar.selectbox("Symptoms Severity", ["Mild", "Moderate", "Severe"]),
        'Family_History': sidebar.selectbox("Family History", ["Yes", "No"]),
        'Smoking_History': sidebar.selectbox("Smoking History", ["Current Smoker", "Non-Smoker"]),
        'Alcohol_Consumption': sidebar.selectbox("Alcohol Consumption", ["Low", "Moderate", "High"]),
        'Exercise_Frequency': sidebar.selectbox(
            "Exercise Frequency", ["Never", "Rarely", "Occasionally", "Often"]
        ),
    }
    # Wrap each value in a single-element list to get a one-row frame.
    single_row = {column: [value] for column, value in selections.items()}
    return pd.DataFrame(single_row)

st.sidebar.markdown("### Patient Data Input")
input_df = user_input_features()
st.subheader("User Input Data")
st.write(input_df)

# Prediction
if st.button("Predict Cancer Presence"):
    prediction = pipeline.predict(input_df)
    # NOTE(review): assumes the target column encodes "cancer present" as the
    # integer 1 -- confirm against the dataset.
    result = "Cancer Detected" if prediction[0] == 1 else "No Cancer Detected"
    # st.subheader already renders a heading; a leading "### " in the body is
    # redundant markup and can show up literally in the heading text.
    st.subheader(f"Prediction: {result}")