|
|
import numpy as np
import optuna
import pandas as pd
import streamlit as st
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.svm import SVC
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Streamlit app: upload a CSV, explore it, preprocess (encoding, class
# balancing, scaling), then tune a classifier with Optuna and evaluate it.
# ---------------------------------------------------------------------------

uploaded_file = st.file_uploader("Upload your dataset (CSV format):", type=["csv"])

if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    st.write("### Uploaded Dataset:")
    st.dataframe(data)

    st.write("### Dataset Overview:")
    st.write(data.describe())

    st.write("### Missing Values:")
    st.write(data.isnull().sum())

    target_column = st.selectbox("Select target column", data.columns)

    encoding_method = st.selectbox("Select Encoding Method", ["None", "LabelEncoding", "OneHotEncoding"])
    if encoding_method == "LabelEncoding":
        label_encoder = LabelEncoder()
        data = data.apply(lambda col: label_encoder.fit_transform(col) if col.dtype == 'object' else col)
        st.write("Applied Label Encoding to categorical variables.")
    elif encoding_method == "OneHotEncoding":
        # Exclude the target column: one-hot encoding it would replace it with
        # dummy columns and `data[target_column]` below would raise KeyError.
        categorical_columns = data.select_dtypes(include=['object']).columns.drop(target_column, errors='ignore')
        data = pd.get_dummies(data, columns=categorical_columns)
        st.write("Applied One-Hot Encoding to categorical variables.")

    y = data[target_column]
    X = data.drop(columns=[target_column])

    value_counts = y.value_counts()
    st.write(f"Class distribution in {target_column}:")
    st.write(value_counts)

    # Split BEFORE resampling/scaling: applying SMOTE or fitting the scaler on
    # the full dataset leaks test-set information into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Oversample only the training fold when the classes are badly imbalanced
    # (minority class under 25% of the majority class).
    if value_counts.min() / value_counts.max() < 0.25:
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        st.write("Applied SMOTE for balancing classes.")

    scaling_method = st.selectbox("Select Scaling Method", ["None", "StandardScaler", "MinMaxScaler"])
    if scaling_method != "None":
        scaler = StandardScaler() if scaling_method == "StandardScaler" else MinMaxScaler()
        # Fit on the training data only; the test set is transformed with the
        # training statistics.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    algorithms = st.multiselect("Select algorithms", ["RandomForest", "LogisticRegression", "SVC", "KNN"])

    metrics = st.multiselect("Select evaluation metrics", ["Accuracy", "Precision", "Recall", "F1-score"])

    def objective(trial):
        """Optuna objective: sample a model + hyperparameters from the chosen
        algorithms and return the mean 5-fold CV accuracy on the training set."""
        model_type = trial.suggest_categorical("model", algorithms)

        if model_type == "KNN":
            n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
            p = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean
            model = KNeighborsClassifier(n_neighbors=n_neighbors, p=p)

        elif model_type == "LogisticRegression":
            # Optuna categorical choices must be primitives (None/bool/int/
            # float/str) — tuples are unsupported — so the valid
            # (solver, penalty) pairs are encoded as "solver:penalty" strings.
            solver_penalty = trial.suggest_categorical("solver_penalty", [
                "lbfgs:l2", "newton-cg:l2", "sag:l2", "saga:l1",
                "saga:l2", "saga:elasticnet"])
            solver, penalty = solver_penalty.split(":")
            # suggest_loguniform is deprecated; suggest_float(..., log=True)
            # is the supported equivalent.
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            if penalty == "elasticnet":
                model = LogisticRegression(C=C, solver=solver, penalty=penalty, l1_ratio=0.3)
            else:
                model = LogisticRegression(C=C, solver=solver, penalty=penalty)

        elif model_type == "RandomForest":
            n_estimators = trial.suggest_int("n_estimators", 50, 200)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

        elif model_type == "SVC":
            C = trial.suggest_float("C", 1e-5, 1e2, log=True)
            kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
            model = SVC(C=C, kernel=kernel, random_state=42)

        return cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()

    if st.button("Start Hyperparameter Tuning"):
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=100)
        best = study.best_trial
        st.write(f"Best trial: {best.params}")
        st.write(f"Best score: {best.value}")

        # Rebuild the winning model from the best trial's parameters. Only the
        # keys sampled for the winning model type are present in best.params.
        best_model_type = best.params['model']
        if best_model_type == "KNN":
            model = KNeighborsClassifier(n_neighbors=best.params['n_neighbors'], p=best.params['p'])
        elif best_model_type == "LogisticRegression":
            solver, penalty = best.params['solver_penalty'].split(":")
            kwargs = {"C": best.params['C'], "solver": solver, "penalty": penalty}
            if penalty == "elasticnet":
                kwargs["l1_ratio"] = 0.3  # must match the value used in objective()
            model = LogisticRegression(**kwargs)
        elif best_model_type == "RandomForest":
            model = RandomForestClassifier(n_estimators=best.params['n_estimators'],
                                           max_depth=best.params['max_depth'], random_state=42)
        elif best_model_type == "SVC":
            model = SVC(C=best.params['C'], kernel=best.params['kernel'], random_state=42)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        st.write("### Model Evaluation:")
        if "Accuracy" in metrics:
            accuracy = accuracy_score(y_test, y_pred)
            st.write(f"Accuracy: {accuracy}")
        if "Precision" in metrics:
            precision = precision_score(y_test, y_pred, average='weighted')
            st.write(f"Precision: {precision}")
        if "Recall" in metrics:
            recall = recall_score(y_test, y_pred, average='weighted')
            st.write(f"Recall: {recall}")
        if "F1-score" in metrics:
            f1 = f1_score(y_test, y_pred, average='weighted')
            st.write(f"F1-score: {f1}")

        st.write("### Classification Report:")
        st.write(classification_report(y_test, y_pred))
else:
    st.warning("Please upload a dataset to proceed with EDA.")
|
|
|