|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.linear_model import LinearRegression, LogisticRegression |
|
|
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier |
|
|
from xgboost import XGBRegressor, XGBClassifier |
|
|
from sklearn.svm import SVR, SVC |
|
|
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier |
|
|
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier |
|
|
from sklearn.linear_model import ElasticNet, BayesianRidge |
|
|
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier, AdaBoostClassifier |
|
|
from sklearn.naive_bayes import GaussianNB |
|
|
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis |
|
|
from sklearn.linear_model import Ridge, Lasso |
|
|
from sklearn.impute import SimpleImputer |
|
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.pipeline import Pipeline as SkPipeline |
|
|
|
|
|
import streamlit as st |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_model(task_type, model_name, hyperparams=None):
    """Return an unfitted estimator for the requested task and model name.

    Args:
        task_type: Either ``"regression"`` or ``"classification"``.
        model_name: Display name of the estimator, exactly as it appears in
            the registry below (e.g. ``"Random Forest Regressor"``).
        hyperparams: Mapping of keyword arguments forwarded to the estimator
            constructor. ``None`` or an empty mapping builds the estimator
            with its default settings.

    Returns:
        A new, unfitted instance of the selected estimator class.

    Raises:
        ValueError: If ``task_type`` is unknown or ``model_name`` is not
            registered for that task type.
    """
    # Registry of display name -> estimator class, grouped by task type.
    models = {
        "regression": {
            "Linear Regression": LinearRegression,
            "Random Forest Regressor": RandomForestRegressor,
            "XGBoost Regressor": XGBRegressor,
            "Support Vector Regressor": SVR,
            "Decision Tree Regressor": DecisionTreeRegressor,
            "K-Nearest Neighbors Regressor": KNeighborsRegressor,
            "ElasticNet": ElasticNet,
            "Gradient Boosting Regressor": GradientBoostingRegressor,
            "AdaBoost Regressor": AdaBoostRegressor,
            "Bayesian Ridge": BayesianRidge,
            "Ridge Regression": Ridge,
            "Lasso Regression": Lasso,
        },
        "classification": {
            "Logistic Regression": LogisticRegression,
            "Random Forest": RandomForestClassifier,
            "XGBoost": XGBClassifier,
            "Support Vector Classifier": SVC,
            "Decision Tree Classifier": DecisionTreeClassifier,
            "K-Nearest Neighbors Classifier": KNeighborsClassifier,
            "Gradient Boosting Classifier": GradientBoostingClassifier,
            "AdaBoost Classifier": AdaBoostClassifier,
            "Gaussian Naive Bayes": GaussianNB,
            "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis,
            "Linear Discriminant Analysis": LinearDiscriminantAnalysis,
        },
    }

    try:
        model_cls = models[task_type][model_name]
    except KeyError:
        # Same message for an unknown task type and an unknown model name.
        raise ValueError(
            f"Invalid model selection: {model_name} for {task_type}"
        ) from None

    # Treat a missing hyperparameter mapping as "use the defaults" rather
    # than failing on ``**None``.
    return model_cls(**(hyperparams or {}))
|
|
|
|
|
|
|
|
def _build_preprocessor(X):
    """Build the column-wise preprocessing step for the feature frame ``X``.

    Numeric columns are median-imputed and standardized; ``object`` and
    ``category`` columns are imputed with their most frequent value and
    one-hot encoded. Columns in neither group are not listed in the
    transformer, so with ``ColumnTransformer``'s default ``remainder`` they
    are not passed on to the model.
    """
    # "number" matches every numeric width (int32, float32, ...), not only
    # the 64-bit defaults. timedelta64 is a NumPy integer subtype, so it is
    # excluded explicitly to keep such columns out of the median/scale path.
    num_cols = X.select_dtypes(include=["number"], exclude=["timedelta"]).columns
    cat_cols = X.select_dtypes(include=["object", "category"]).columns

    num_pipeline = SkPipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_pipeline = SkPipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Categories unseen during fit are encoded as all-zero rows.
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    return ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ])


def train_model(df, target_column, task_type, selected_model_name, hyperparams):
    """Preprocess ``df``, fit the selected model, and return the fitted pipeline.

    The frame is split 80/20 (``random_state=42``) into train and test
    parts; the pipeline (preprocessing + estimator) is fitted on the train
    part only.

    Side effects on ``st.session_state``:
        * ``X_test``, ``y_test``, ``task_type`` and ``label_encoder`` are set
          from this run.
        * ``test_results_calculated`` is reset to ``False`` if it exists.
        * Cached results from a previous run (``test_metrics``,
          ``test_y_pred``, ``test_y_test``, ``test_cm``,
          ``sampling_message``) are deleted if present.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column in ``df`` to predict.
        task_type: ``"regression"`` or ``"classification"``.
        selected_model_name: Display name understood by ``get_model``.
        hyperparams: Keyword arguments forwarded to the estimator constructor.

    Returns:
        For ``task_type == "classification"``: a tuple
        ``(pipeline, label_encoder)``, where ``label_encoder`` is ``None``
        when the target was already numeric. For any other task type: the
        fitted pipeline alone.

    Raises:
        ValueError: If the task type / model name combination is rejected
            by ``get_model``.
        KeyError: Raised by pandas if ``target_column`` is not in ``df``.
    """
    # Imported locally so this block does not depend on a file-level pandas import.
    from pandas.api.types import is_numeric_dtype

    with st.spinner(" Training model... Please wait!"):
        model = get_model(task_type, selected_model_name, hyperparams)

        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Encode any non-numeric classification target (object, category,
        # string dtype), not only plain ``object`` columns, so estimators
        # that require numeric class labels receive integers.
        label_encoder = None
        if task_type == "classification" and not is_numeric_dtype(y):
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        pipeline = SkPipeline([
            ("preprocessor", _build_preprocessor(X)),
            ("model", model),
        ])
        pipeline.fit(X_train, y_train)

        # Keep the held-out split and run metadata for later evaluation.
        st.session_state.X_test = X_test
        st.session_state.y_test = y_test
        st.session_state.task_type = task_type
        st.session_state.label_encoder = label_encoder

        # Invalidate evaluation results computed for a previous model.
        if "test_results_calculated" in st.session_state:
            st.session_state.test_results_calculated = False

        for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
            if key in st.session_state:
                del st.session_state[key]

        if task_type == "classification":
            return pipeline, label_encoder
        return pipeline