# dsccds / test.py
# Anuj6263333's picture
# Upload 2 files
# 96aea66 verified
import optuna
import yaml
import joblib
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from optuna.visualization.matplotlib import plot_optimization_history, plot_param_importances
# Make sure the directories that receive saved artifacts and figures exist.
for output_dir in ("models", "plots"):
    os.makedirs(output_dir, exist_ok=True)
def objective(trial):
    """Score one Optuna trial: held-out accuracy of a random forest on Iris.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` (50-300) and
            ``max_depth`` (2-32).

    Returns:
        Accuracy of the fitted classifier on the held-out split.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 32)

    data = load_iris()
    # Fixed seeds: every trial must be scored on the same split with the same
    # forest randomness, otherwise accuracy differences between trials reflect
    # split/seed luck rather than the sampled hyperparameters.
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=42, stratify=data.target
    )
    clf = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=42
    )
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))
# Run the hyperparameter search, maximising the value returned by `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Persist the per-trial results table.
study_df = study.trials_dataframe()
study_df.to_csv("models/study_trials.csv", index=False)

# Persist the hyperparameters of the best trial.
with open("models/best_params.yaml", "w") as params_file:
    yaml.dump(study.best_params, params_file)
# Train final model with best parameters
best_params = study.best_trial.params
# Seed the forest and the split so that, for given hyperparameters, the saved
# model is reproducible from run to run.
final_model = RandomForestClassifier(**best_params, random_state=42)
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=42, stratify=data.target
)
final_model.fit(X_train, y_train)

# Score the model on the held-out part of the split as a sanity check before
# it is saved.
final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
print(f"Final model held-out accuracy: {final_accuracy:.4f}")

# Save model
joblib.dump(final_model, "models/best_model.pkl")
# Save Optuna plots
# Each plotting helper returns the Axes it drew on. Save through that Axes'
# own figure rather than pyplot's implicit "current figure", and close the
# figure afterwards so it does not stay open in pyplot's figure registry.
history_ax = plot_optimization_history(study)
history_ax.figure.savefig("plots/optimization_history.png")
plt.close(history_ax.figure)

importance_ax = plot_param_importances(study)
importance_ax.figure.savefig("plots/param_importances.png")
plt.close(importance_ax.figure)