"""Train, evaluate and persist a drug-classification pipeline.

The script reads ``./data/drug200.csv``, fits a preprocessing +
random-forest pipeline on a 70/30 train/test split, prints accuracy and
macro F1, saves a confusion-matrix plot and a metrics text file under
``./results/``, and pickles the fitted pipeline under ``./model/``.
"""

import pickle

import matplotlib.pyplot as plt
import pandas as pd
import skops.io as sio
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Single seed shared by every random step so that runs are reproducible.
RANDOM_STATE = 125

# Loading data.
drug_df = pd.read_csv("./data/drug200.csv")
# Shuffle the rows; seeded (the original shuffle was not) so that the split
# below, and therefore the reported metrics, are the same on every run.
drug_df = drug_df.sample(frac=1, random_state=RANDOM_STATE)

# Train/test split.
X = drug_df.drop("Drug", axis=1).values
y = drug_df.Drug.values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Pipeline.
# Positional column indices into X (the frame with "Drug" dropped).
cat_col = [1, 2, 3]
num_col = [0, 4]

# Imputation and scaling must be chained: transformers listed side by side
# in a ColumnTransformer each receive the ORIGINAL columns and their outputs
# are concatenated, so a sibling scaler would never see the imputed values.
numeric_steps = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("num_scaler", StandardScaler()),
    ]
)

transform = ColumnTransformer(
    [
        ("encoder", OrdinalEncoder(), cat_col),
        ("numeric", numeric_steps, num_col),
    ]
)

pipe = Pipeline(
    steps=[
        ("preprocessing", transform),
        (
            "model",
            RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE),
        ),
    ]
)

# Train.
pipe.fit(X_train, y_train)

# Model evaluation.
predictions = pipe.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="macro")

# Format the percentage with a format spec instead of multiplying a rounded
# float by 100, which can print artefacts such as "56.99999999999999%".
print("Accuracy: ", f"{round(accuracy, 2):.1%}", "F1: ", round(f1, 2))

# Confusion matrix (reuses the predictions computed above).
cm = confusion_matrix(y_test, predictions, labels=pipe.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipe.classes_)
disp.plot()
plt.savefig("./results/model_result.png", dpi=120)

# Write metrics to file.
with open("./results/metrics.txt", "w", encoding="utf-8") as outfile:
    outfile.write(f"\nAccuracy={round(accuracy, 2)}, F1_score = {round(f1, 2)}")

# Save the model to disk; the context manager guarantees the handle is
# flushed and closed even if pickling raises.
filename = "./model/drug_pipeline.sav"
with open(filename, "wb") as model_file:
    pickle.dump(pipe, model_file)

# Alternative persistence via skops, left disabled as in the original:
# sio.dump(pipe, "./model/drug_pipeline.skops")