"""Iris classification with per-class explainability.

Trains one-vs-rest logistic regression and one-vs-rest decision trees (plus a
multinomial logistic-regression / pairwise-tree section), plots per-class
feature importances and confusion matrices, logs metrics and figures to
MLflow, and finally prints a per-sample prediction table.
"""

from itertools import combinations

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from tabulate import tabulate

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
# BUG FIX: `sklearn.tree.export` was removed in scikit-learn >= 0.24;
# export_text lives directly in sklearn.tree.
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

# ---------------------------------------------------------------------------
# Load the data and build a labelled dataframe
# ---------------------------------------------------------------------------
data = load_iris()
iris = data

df = pd.DataFrame(data.data, columns=data.feature_names)
df['Species'] = data.target

# Replace the integer targets (0, 1, 2) with the actual species names.
target = np.unique(data.target)
target_names = np.unique(data.target_names)
targets = dict(zip(target, target_names))
df['Species'] = df['Species'].replace(targets)

# Extract feature matrix and target vector.
x = df.drop(columns="Species")
y = df["Species"]

# Save the feature names and the distinct class labels.
feature_names = x.columns
labels = y.unique()

# 60/40 train/test split used by all model-evaluation sections below.
X_train, test_x, y_train, test_lab = train_test_split(
    x, y, test_size=0.4, random_state=42)

# ---------------------------------------------------------------------------
# One-vs-rest logistic regression; explainability via the coefficients
# ---------------------------------------------------------------------------
# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5; kept here
# because the OvR coefficient-per-class interpretation below depends on it.
clf = LogisticRegression(random_state=0, multi_class='ovr')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One subplot per class, bars sorted by descending coefficient value.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    importance = clf.coef_[i]
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title(
        'Feature Importance for Class {}'.format(iris.target_names[i]))
fig.tight_layout()
plt.show()

# Evaluate on the held-out 40% split.
val_pred = clf.predict(test_x)
accuracy = accuracy_score(test_lab, val_pred)
# BUG FIX: this is the logistic-regression model, not the decision tree, so
# log it under its own metric key.
mlflow.log_metric('lr accuracy', accuracy)

cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# BUG FIX: every section previously logged to fig/confusion_matrix.png,
# overwriting earlier artifacts; use a distinct name per model.
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_lr_ovr.png')

# ---------------------------------------------------------------------------
# One-vs-rest decision trees; importances are Gini impurity reductions
# ---------------------------------------------------------------------------
clf_pre = DecisionTreeClassifier(random_state=0)
clf = OneVsRestClassifier(clf_pre)
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    # Per-class tree importances from the fitted OvR estimators.
    importance = clf.estimators_[i].feature_importances_
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title(
        'Feature Importance for Class {}'.format(iris.target_names[i]))
fig.tight_layout()
plt.show()

y_pred_DTC = clf.predict(test_x)
# BUG FIX: the accuracy and confusion matrix previously reused `val_pred`
# (the logistic-regression predictions) instead of the tree predictions.
accuracy = accuracy_score(test_lab, y_pred_DTC)
mlflow.log_metric('dtc accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_DTC, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_dtc_ovr.png')

# Show the decision tree for each class: two methods.
feature_names = iris.feature_names

# Method 1: textual decision rules for each per-class tree.
for i, estimator in enumerate(clf.estimators_):
    tree_rules = export_text(estimator, feature_names=feature_names)
    print(f"Decision rules for tree for cluster {i}:")
    print(tree_rules)

# Method 2: graphical plot of each per-class tree.
for i, estimator in enumerate(clf.estimators_):
    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator, feature_names=feature_names, class_names=labels,
                   rounded=True, filled=True, fontsize=14, ax=ax)
    ax.set_title(f'Tree {i+1}')
    plt.show()

# ---------------------------------------------------------------------------
# Multinomial logistic regression (labelled "one vs one" in the original)
# ---------------------------------------------------------------------------
# NOTE(review): multi_class='multinomial' is softmax regression, not a true
# one-vs-one scheme; coef_ has one row per CLASS. With 3 classes the number
# of pairwise combinations (3) coincidentally equals the number of rows, so
# the pairwise loop below indexes coef_ without error, but each row is a
# per-class coefficient vector — confirm the intended interpretation.
clf = LogisticRegression(random_state=0, multi_class='multinomial',
                         solver='lbfgs')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

n_pairs = n_classes * (n_classes - 1) // 2
fig, axs = plt.subplots(n_pairs, 1, figsize=(10, 5 * n_pairs))

index = 0
for i in range(n_classes):
    for j in range(i + 1, n_classes):
        importance = clf.coef_[index]
        indices = np.argsort(importance)[::-1]
        axs[index].bar(range(n_features), importance[indices])
        axs[index].set_xticks(range(n_features))
        axs[index].set_xticklabels(
            np.array(iris.feature_names)[indices], rotation=90)
        axs[index].set_xlabel('Features')
        axs[index].set_ylabel('Importance')
        axs[index].set_title(
            'Feature Importance for Class Combination {} vs {}'.format(
                iris.target_names[i], iris.target_names[j]))
        index += 1
fig.tight_layout()
plt.show()

y_pred_ovo = clf.predict(test_x)
# BUG FIX: previously scored `val_pred` instead of this model's predictions.
accuracy = accuracy_score(test_lab, y_pred_ovo)
mlflow.log_metric('blr accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_blr.png')

# ---------------------------------------------------------------------------
# Pairwise (one-vs-one) decision trees with feature importances
# ---------------------------------------------------------------------------
# BUG FIX: the original trained on (y == c1) | (y == c2), which is a
# pair-vs-REST label, not c1 vs c2. Restrict to the rows of the pair and use
# the real class labels so each tree genuinely discriminates c1 from c2.
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    pair_mask = y_train.isin([c1, c2])
    estimator = DecisionTreeClassifier()
    estimator.fit(X_train[pair_mask], y_train[pair_mask])

    importances = estimator.feature_importances_

    # Bar plot of this pair's feature importances.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(np.arange(len(feature_names)), importances)
    ax.set_xticks(np.arange(len(feature_names)))
    ax.set_xticklabels(feature_names, rotation=45, ha='right')
    ax.set_title(f'Tree {i+1}: {c1} vs {c2} Feature Importances')
    ax.set_ylabel('Importance')
    plt.tight_layout()
    plt.show()

# Same pairwise trees again, this time plotting the trees themselves and
# collecting the importances.
importances_all = []
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    pair_mask = y_train.isin([c1, c2])
    estimator = DecisionTreeClassifier()
    estimator.fit(X_train[pair_mask], y_train[pair_mask])

    importances = estimator.feature_importances_
    importances_all.append(importances)

    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator, feature_names=feature_names,
                   class_names=[str(c1), str(c2)],
                   rounded=True, filled=True, fontsize=14, ax=ax)

    # Embed the importances in the title for quick reading.
    title = f'Tree {i+1}: {c1} vs {c2}\n'
    title += 'Feature importances:\n'
    for feature, importance in zip(feature_names, importances):
        title += f'{feature}: {importance:.3f}\n'
    ax.set_title(title)

# Confusion matrix for the multinomial model's test predictions.
# BUG FIX: previously reused `val_pred` and the shared artifact name.
cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_pairwise.png')

# ---------------------------------------------------------------------------
# Explainability for a single instance (one-vs-rest trees)
# ---------------------------------------------------------------------------
# Fresh 80/20 split on the raw arrays for this demonstration only; note this
# rebinds X_train / y_train, which are not reused afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42)

# Train one binary (class-vs-rest) tree per class.
binary_classifiers = {}
for i in range(len(iris.target_names)):
    binary_y_train = np.where(y_train == i, 1, 0)
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, binary_y_train)
    binary_classifiers[i] = model

# Choose a specific instance to explain (the eighth row of the test set).
instance = X_test[7]

# Probability of the positive ("is this class") label from each binary tree.
probs = []
for i in range(len(iris.target_names)):
    binary_classifier = binary_classifiers[i]
    prob = binary_classifier.predict_proba(instance.reshape(1, -1))[0, 1]
    probs.append(prob)

# The winning class is the one whose binary tree is most confident.
predicted_class = np.argmax(probs)
binary_classifier = binary_classifiers[predicted_class]

# Plot the winning binary tree as the explanation for this prediction.
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(binary_classifier, filled=True, rounded=True, ax=ax,
          feature_names=iris.feature_names,
          class_names=['not ' + iris.target_names[predicted_class],
                       iris.target_names[predicted_class]])
plt.show()

predicted_prob = probs[predicted_class]
print('Predicted Class:', predicted_class)
print('Predicted Probability:', predicted_prob)

# ---------------------------------------------------------------------------
# Per-sample prediction table for the test data
# ---------------------------------------------------------------------------
# BUG FIX: the original stacked y_test (30 rows from the 80/20 split above)
# with y_pred_ovo / y_pred_DTC (60 rows predicted on test_x from the 60/40
# split), which raises ValueError. Use the 60/40 test split consistently.
table_test = np.column_stack((np.arange(len(test_lab)) + 1,
                              test_x, test_lab, y_pred_ovo, y_pred_DTC))
header_test = np.concatenate(
    (['ID'], iris.feature_names,
     ['True Class', 'Predicted Class_BLR', 'Predicted Class_DTC']))
table_test = np.vstack((header_test, table_test))
print(tabulate(table_test))