# anyonehomep1mane
# Latest Code Changes and Bug Fixes
# 4928a1a
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from core.detection import detect_target_type
from models.registry import REGRESSION_MODELS, CLASSIFICATION_MODELS
from preprocessing.transformers import build_preprocessor
from utils.metrics import regression_metrics, classification_metrics
from core.visuals import regression_graphs, classification_graphs
from models.registry import MODEL_GROUPS
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np
def build_preprocessor(df):
    """Split *df* into features and target and assemble an unfitted preprocessor.

    The last column of ``df`` is taken as the target; every column before it
    is a candidate feature.

    Args:
        df: DataFrame whose final column holds the target.

    Returns:
        A ``(X, y, preprocessor)`` tuple: the feature frame, the target
        column, and an unfitted ``ColumnTransformer`` covering the numeric
        and categorical feature columns.

    Raises:
        ValueError: If no feature column is of dtype int64/float64 or
            object/category/bool.
    """
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    numeric_frame = features.select_dtypes(include=["int64", "float64"])
    categorical_frame = features.select_dtypes(include=["object", "category", "bool"])
    numeric_names = numeric_frame.columns.tolist()
    categorical_names = categorical_frame.columns.tolist()

    if not numeric_names and not categorical_names:
        raise ValueError("No usable feature columns found")

    # Numeric features: fill gaps with the median, then standardise.
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
    # Categorical features: fill gaps with the mode, then one-hot encode into
    # a dense array; categories unseen during fit are ignored at transform.
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    # Columns matched by neither dtype group are dropped from the output.
    column_transformer = ColumnTransformer(
        transformers=[
            ("num", Pipeline(numeric_steps), numeric_names),
            ("cat", Pipeline(categorical_steps), categorical_names),
        ],
        remainder="drop",
    )
    return features, target, column_transformer
def build_pipeline(model, preprocessor):
    """Chain a preprocessor and an estimator into a single ``Pipeline``.

    Args:
        model: Estimator used as the final step, named ``"model"``.
        preprocessor: Transformer used as the first step, named
            ``"preprocessor"``.

    Returns:
        An unfitted ``Pipeline`` running ``preprocessor`` and then ``model``.
    """
    steps = [("preprocessor", preprocessor), ("model", model)]
    return Pipeline(steps=steps)
def _error_result(message):
    """Return the ``(error frame, None)`` pair used for user-facing failures."""
    return pd.DataFrame({"Error": [message]}), None


def _stratify_labels(y, task_type):
    """Return *y* if a stratified split is appropriate for it, else ``None``."""
    # Stratifying a regression target makes train_test_split treat every
    # distinct value as a class and fail as soon as one value occurs once.
    if task_type != "Classification":
        return None
    _, class_counts = np.unique(y, return_counts=True)
    # Keep the "fewer than 20 classes" rule and additionally require two
    # samples per class, the minimum a stratified split accepts.
    if 0 < class_counts.size < 20 and class_counts.min() >= 2:
        return y
    return None


def train_model(file, task_type, model_group, model_name, graph_type):
    """Train the selected model on an uploaded CSV and report metrics.

    The CSV's last column is used as the target (see ``build_preprocessor``).
    The data is split 80/20 with a fixed seed, the model is fitted inside a
    preprocessing pipeline, and metrics plus a figure are produced for the
    held-out part.

    Args:
        file: Uploaded file. Either an object exposing the path as ``.name``
            or a plain ``str`` / path-like; ``None`` means nothing uploaded.
        task_type: ``"Regression"`` or ``"Classification"``; must match what
            ``detect_target_type`` reports for the target column.
        model_group: First-level key into ``MODEL_GROUPS``.
        model_name: Key of the model within the group and task.
        graph_type: Passed through to the graph helper for the task.

    Returns:
        ``(metrics_df, fig)`` on success, where ``metrics_df`` has the columns
        ``"Metric"`` and ``"Value"``. On a user-facing problem (no file, task
        mismatch, unknown model, or any ``ValueError`` raised while loading or
        training) returns ``(error_df, None)`` where ``error_df`` has a single
        ``"Error"`` column.

    Raises:
        Exceptions other than ``ValueError`` (for example ``FileNotFoundError``
        from reading the CSV) are not handled here and propagate.
    """
    import os

    if file is None:
        return _error_result("Please upload a csv file first.")

    try:
        # Upload widgets may hand over a wrapper with ``.name`` or the path
        # itself; only fall back to ``.name`` when it is not already a path.
        csv_path = file if isinstance(file, (str, os.PathLike)) else file.name
        df = pd.read_csv(csv_path)
        X, y, preprocessor = build_preprocessor(df)

        detected_task = detect_target_type(y)
        if task_type != detected_task:
            return _error_result(
                f"Detected {detected_task} target, but {task_type} selected."
            )

        # Encode every non-numeric label column (object, string, category),
        # not only ``object`` dtype, so estimators always get numeric classes.
        if task_type == "Classification" and not pd.api.types.is_numeric_dtype(y):
            y = LabelEncoder().fit_transform(y)

        try:
            model = MODEL_GROUPS[model_group][task_type][model_name]
        except KeyError:
            # Report a stale or invalid selection like any other user error
            # instead of letting the KeyError escape.
            return _error_result(
                f"Model '{model_name}' is not available for {task_type} "
                f"in group '{model_group}'."
            )

        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=0.2,
            random_state=42,
            stratify=_stratify_labels(y, task_type),
        )

        pipeline = build_pipeline(model, preprocessor)
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        if task_type == "Regression":
            metrics = regression_metrics(y_test, preds)
            fig = regression_graphs(graph_type, X, y, model, pipeline, y_test, preds)
        else:
            metrics = classification_metrics(pipeline, X_test, y_test, preds)
            fig = classification_graphs(graph_type, pipeline, X_test, y_test, preds)

        metrics_df = pd.DataFrame(metrics.items(), columns=["Metric", "Value"])
        return metrics_df, fig
    except ValueError as e:
        # Covers pandas parsing errors (ParserError/EmptyDataError subclass
        # ValueError) as well as validation errors raised during training.
        return _error_result(str(e))