# AutoML / src / training / train.py
# akash
# all files
# 890025a
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import Ridge, Lasso
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
import streamlit as st
def get_model(task_type, model_name, hyperparams):
    """Instantiate the estimator selected by the user.

    Args:
        task_type: ``"regression"`` or ``"classification"``; selects which
            registry of estimators is searched.
        model_name: Display name of the estimator (for example
            ``"Ridge Regression"``). Must be a key of the registry for
            ``task_type``.
        hyperparams: Mapping of keyword arguments forwarded unchanged to the
            estimator constructor. ``None`` is accepted and means "use the
            estimator's defaults".

    Returns:
        A new, unfitted estimator instance.

    Raises:
        ValueError: If ``task_type`` is unknown or ``model_name`` is not
            registered for that task type.
    """
    models = {
        "regression": {
            "Linear Regression": LinearRegression,
            "Random Forest Regressor": RandomForestRegressor,
            "XGBoost Regressor": XGBRegressor,
            "Support Vector Regressor": SVR,
            "Decision Tree Regressor": DecisionTreeRegressor,
            "K-Nearest Neighbors Regressor": KNeighborsRegressor,
            "ElasticNet": ElasticNet,
            "Gradient Boosting Regressor": GradientBoostingRegressor,
            "AdaBoost Regressor": AdaBoostRegressor,
            "Bayesian Ridge": BayesianRidge,
            "Ridge Regression": Ridge,
            "Lasso Regression": Lasso,
        },
        "classification": {
            "Logistic Regression": LogisticRegression,
            "Random Forest": RandomForestClassifier,
            "XGBoost": XGBClassifier,
            "Support Vector Classifier": SVC,
            "Decision Tree Classifier": DecisionTreeClassifier,
            "K-Nearest Neighbors Classifier": KNeighborsClassifier,
            "Gradient Boosting Classifier": GradientBoostingClassifier,
            "AdaBoost Classifier": AdaBoostClassifier,
            "Gaussian Naive Bayes": GaussianNB,
            "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis,
            "Linear Discriminant Analysis": LinearDiscriminantAnalysis,
        },
    }

    if task_type not in models or model_name not in models[task_type]:
        raise ValueError(f"Invalid model selection: {model_name} for {task_type}")

    model_cls = models[task_type][model_name]
    # ``**None`` would raise TypeError; a missing hyperparameter mapping simply
    # means the estimator is built with its own defaults.
    return model_cls(**(hyperparams or {}))
def _build_preprocessor(X):
    """Build the feature preprocessor for the columns present in ``X``.

    Numeric columns are median-imputed and standardised; ``object`` and
    ``category`` columns are imputed with the most frequent value and one-hot
    encoded. Columns matching neither group are not listed in the transformer.
    """
    # "number" covers every numeric width (int32, float32, uint8, ...), not
    # only int64/float64, so narrower numeric columns are no longer left out
    # of the transformer. Timedeltas are excluded explicitly because NumPy
    # models them as a numeric subtype and they cannot be median-imputed.
    num_cols = X.select_dtypes(include=["number"], exclude=["timedelta"]).columns
    cat_cols = X.select_dtypes(include=["object", "category"]).columns

    num_pipeline = SkPipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_pipeline = SkPipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])
    return ColumnTransformer([
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ])


def train_model(df, target_column, task_type, selected_model_name, hyperparams):
    """Preprocess ``df``, fit the selected model and return the fitted pipeline.

    The data is split 80/20 (``random_state=42``); the pipeline is fitted on
    the training part only. The held-out split and related metadata are
    written to ``st.session_state`` and previously cached test results are
    cleared.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        target_column: Name of the column in ``df`` to predict.
        task_type: ``"regression"`` or ``"classification"``.
        selected_model_name: Display name passed to ``get_model``.
        hyperparams: Keyword arguments passed to ``get_model`` for the
            estimator constructor.

    Returns:
        For ``task_type == "classification"``: a tuple
        ``(pipeline, label_encoder)`` where ``label_encoder`` is ``None`` when
        the target did not need encoding. For any other task type: the fitted
        pipeline alone.

    Raises:
        ValueError: Propagated from ``get_model`` for an invalid selection.
    """
    # Imported locally because pandas is not imported at module level here.
    from pandas.api.types import is_numeric_dtype

    with st.spinner(" Training model... Please wait!"):
        # Build the estimator first so an invalid selection fails before any
        # data is touched.
        model = get_model(task_type, selected_model_name, hyperparams)

        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Encode any non-numeric classification target (object, category or
        # string dtype) as integers; checking only for "object" let
        # categorical/string-typed labels through unencoded.
        label_encoder = None
        if task_type == "classification" and not is_numeric_dtype(y):
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)

        # Train-test split (80-20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        pipeline = SkPipeline([
            ("preprocessor", _build_preprocessor(X)),
            ("model", model),
        ])
        pipeline.fit(X_train, y_train)

        # Keep the held-out split and metadata for later evaluation.
        st.session_state.X_test = X_test
        st.session_state.y_test = y_test
        st.session_state.task_type = task_type
        # Stored so predictions can be decoded back to the original labels.
        st.session_state.label_encoder = label_encoder

        # A new model invalidates any earlier evaluation: reset the flag (only
        # if it already exists) and drop cached metrics so stale results are
        # not reused.
        if "test_results_calculated" in st.session_state:
            st.session_state.test_results_calculated = False
        for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']:
            if key in st.session_state:
                del st.session_state[key]

        # The return shape depends on the task type; callers rely on this.
        if task_type == "classification":
            return pipeline, label_encoder
        return pipeline