|
|
import optuna |
|
|
import yaml |
|
|
import joblib |
|
|
import os |
|
|
import pandas as pd |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
from sklearn.datasets import load_iris |
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.metrics import accuracy_score |
|
|
from optuna.visualization.matplotlib import plot_optimization_history, plot_param_importances |
|
|
|
|
|
# Make sure the output directories exist before anything is written to them.
for _output_dir in ("models", "plots"):
    os.makedirs(_output_dir, exist_ok=True)
|
|
|
|
|
def objective(trial):
    """Score one Optuna trial of a random forest on the iris dataset.

    Samples ``n_estimators`` (50 to 300) and ``max_depth`` (2 to 32) from
    the trial, fits a ``RandomForestClassifier`` on a train split of iris
    and returns its accuracy on the matching test split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted classifier on the held-out split.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 32)

    data = load_iris()
    # Fixed seeds: every trial is scored on the same split with the same
    # forest randomness, so score differences between trials come from the
    # hyperparameters rather than from a lucky or unlucky split.
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=42
    )

    clf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )
    clf.fit(X_train, y_train)

    return accuracy_score(y_test, clf.predict(X_test))
|
|
|
|
|
# Run a 30-trial search that maximizes the value returned by `objective`.
# The sampler is given an explicit seed so that its own randomness no longer
# varies from one run of the script to the next.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
|
|
|
|
|
|
|
|
# Write the per-trial records of the study to CSV, without the index column.
study_df = study.trials_dataframe()
study_df.to_csv(path_or_buf="models/study_trials.csv", index=False)

# Write the hyperparameters of the best trial to a YAML file.
with open("models/best_params.yaml", mode="w") as f:
    yaml.dump(study.best_trial.params, stream=f)
|
|
|
|
|
|
|
|
# Refit a forest with the hyperparameters of the best trial and save it.
best_params = study.best_trial.params
# Seed the forest and the split so the saved model is the same on every run
# for a given set of best parameters.
final_model = RandomForestClassifier(**best_params, random_state=42)
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=42
)
# NOTE(review): X_test / y_test are not used after this point in the visible
# code; confirm whether the final model should be evaluated on them or
# refit on the full dataset instead.
final_model.fit(X_train, y_train)

joblib.dump(final_model, "models/best_model.pkl")
|
|
|
|
|
|
|
|
# Render each study plot and save it as a PNG. The figure is taken from the
# Axes returned by the plotting call instead of pyplot's implicit "current
# figure", and is closed once saved so it does not stay alive in pyplot.
for _plot_fn, _png_path in (
    (plot_optimization_history, "plots/optimization_history.png"),
    (plot_param_importances, "plots/param_importances.png"),
):
    _axes = _plot_fn(study)
    _axes.figure.savefig(_png_path)
    plt.close(_axes.figure)
|
|
|
|
|
|