copd-model-e / training /src /modelling /additional_code_onevsone_onevsrest_approaches.py
IamGrooooot's picture
Model E: Unsupervised PCA + clustering risk stratification
53a6def
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import plot_tree
from tabulate import tabulate
from sklearn.linear_model import LogisticRegression
import mlflow
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.tree.export import export_text
from sklearn import tree
from itertools import combinations
# Load the Iris dataset. `iris` is kept as a second name for the same object
# because later sections read the raw arrays and names through it.
data = load_iris()
iris = data
# One DataFrame column per measured feature.
df = pd.DataFrame(data.data, columns=data.feature_names)
# Integer class codes first; they are swapped for species names just below.
df['Species'] = data.target
# Build the code -> name lookup by position: the name of class code i is
# data.target_names[i]. Passing the names through np.unique would re-sort them
# alphabetically, which lines up with the codes only when the names already
# happen to be in alphabetical order.
target = np.unique(data.target)
target_names = np.asarray(data.target_names)
targets = {code: target_names[code] for code in target}
df['Species'] = df['Species'].map(targets)
# Split into the feature matrix and the (string) target column.
x = df.drop(columns="Species")
y = df["Species"]
# Keep the feature names and the distinct class labels for later plots.
feature_names = x.columns
labels = y.unique()
# Hold out 40% of the rows for testing; the seed keeps the split repeatable.
X_train, test_x, y_train, test_lab = train_test_split(
    x, y, test_size=0.4, random_state=42
)
# One-vs-rest binary logistic regression (BLR).
# Explainability is based on the fitted coefficients: clf.coef_ holds one
# coefficient vector per class, in the order of clf.classes_.
clf = LogisticRegression(random_state=0, multi_class='ovr')
clf.fit(X_train, y_train)

# Number of classes and features, taken from the raw dataset.
n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One subplot per class, stacked vertically.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    # Coefficients of the "class i vs rest" model are used as importances.
    coef = clf.coef_[i]
    importance = coef
    # Largest coefficient first.
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    # Title by clf.classes_ so the label always matches the coef_ row plotted.
    axs[i].set_title('Feature Importance for Class {}'.format(clf.classes_[i]))
fig.tight_layout()
plt.show()

# Evaluate on the held-out data.
val_pred = clf.predict(test_x)
accuracy = accuracy_score(test_lab, val_pred)
# This is the logistic-regression score, so it gets its own metric key rather
# than the decision-tree key it was previously logged under.
mlflow.log_metric('blr ovr accuracy', accuracy)
cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so this figure is not overwritten by a later
# confusion matrix logged to the same path.
mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix_blr_ovr' + '.png')
# One-vs-rest decision trees. Explainability uses each tree's
# feature_importances_, i.e. the impurity (Gini) reduction credited to every
# feature.
clf_pre = DecisionTreeClassifier(random_state=0)
clf = OneVsRestClassifier(clf_pre)
clf.fit(X_train, y_train)

# Number of classes and features, taken from the raw dataset.
n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One subplot per class, stacked vertically.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    # Importances of the binary "class i vs rest" tree.
    importance = clf.estimators_[i].feature_importances_
    # Most important feature first.
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    # Title by clf.classes_ so the label always matches the estimator plotted.
    axs[i].set_title('Feature Importance for Class {}'.format(clf.classes_[i]))
fig.tight_layout()
plt.show()

# Evaluate the decision-tree model on its OWN predictions. Scoring the
# predictions of the previously fitted model here would report that model's
# accuracy and confusion matrix under the decision-tree name.
y_pred_DTC = clf.predict(test_x)
accuracy = accuracy_score(test_lab, y_pred_DTC)
mlflow.log_metric('dtc accuracy', accuracy)
cm = confusion_matrix(test_lab, y_pred_DTC, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so this figure is not overwritten by another
# confusion matrix logged to the same path.
mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix_dtc_ovr' + '.png')
# Show the decision tree of each class, method 1: textual decision rules.
# Feature names as a plain list, in the column order the trees were fitted on.
feature_names = iris.feature_names
# clf.estimators_ holds one binary tree per class of the one-vs-rest model.
for i, estimator in enumerate(clf.estimators_):
    # export_text is reached through the public sklearn.tree module (already
    # imported as `tree`) instead of the old sklearn.tree.export path, which
    # newer scikit-learn releases no longer provide.
    tree_rules = tree.export_text(estimator, feature_names=feature_names)
    print(f"Decision rules for tree for cluster {i}:")
    print(tree_rules)
# Show the decision tree of each class, method 2: tree diagrams.
# Assumes clf is the fitted one-vs-rest classifier, so estimators_[i] is the
# binary tree separating clf.classes_[i] from every other class.
for i, estimator in enumerate(clf.estimators_):
    fig, ax = plt.subplots(figsize=(12, 8))
    positive_label = str(clf.classes_[i])
    tree.plot_tree(
        estimator,
        feature_names=feature_names,
        # Each tree is binary (rest = 0, this class = 1), so it needs exactly
        # two class names in that order; passing the full list of species
        # would label the nodes of every tree with the first two species.
        class_names=['not ' + positive_label, positive_label],
        rounded=True,
        filled=True,
        fontsize=14,
        ax=ax,
    )
    ax.set_title(f'Tree {i+1}')
    plt.show()
# One-vs-one approach, binary logistic regression (BLR).
# Imported here so this section brings its own dependency into scope.
from sklearn.multiclass import OneVsOneClassifier

# Fit one binary logistic regression per PAIR of classes. A single multinomial
# model has one coefficient row per class, not per pair, so its rows cannot be
# read as "class A vs class B" importances.
clf = OneVsOneClassifier(LogisticRegression(random_state=0, solver='lbfgs'))
clf.fit(X_train, y_train)

# Number of classes, features and class pairs.
n_classes = len(set(iris.target))
n_features = iris.data.shape[1]
n_pairs = n_classes * (n_classes - 1) // 2

# One subplot per class pair; atleast_1d keeps indexing valid when there is
# only a single pair (plt.subplots then returns a bare Axes).
fig, axs = plt.subplots(n_pairs, 1, figsize=(10, 5 * n_pairs))
axs = np.atleast_1d(axs)

# estimators_ is ordered by (i, j) with i < j over clf.classes_, which is the
# same order itertools.combinations produces.
for index, (i, j) in enumerate(combinations(range(n_classes), 2)):
    # Coefficients of the binary model trained on this pair only.
    # NOTE(review): in scikit-learn's pairwise fit the second class of the
    # pair appears to be the positive label, so a positive coefficient would
    # favour clf.classes_[j] - confirm against the installed version.
    coef = clf.estimators_[index].coef_[0]
    importance = coef
    # Largest coefficient first.
    indices = np.argsort(importance)[::-1]
    axs[index].bar(range(n_features), importance[indices])
    axs[index].set_xticks(range(n_features))
    axs[index].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[index].set_xlabel('Features')
    axs[index].set_ylabel('Importance')
    axs[index].set_title(
        'Feature Importance for Class Combination {} vs {}'.format(
            clf.classes_[i], clf.classes_[j]))
fig.tight_layout()
plt.show()

# Evaluate this model on its OWN predictions rather than on predictions left
# over from an earlier section.
y_pred_ovo = clf.predict(test_x)
accuracy = accuracy_score(test_lab, y_pred_ovo)
mlflow.log_metric('blr accuracy', accuracy)
cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so this figure is not overwritten by another
# confusion matrix logged to the same path.
mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix_blr_ovo' + '.png')
# One-vs-one decision tree classifiers: feature importances per class pair.
# clf is only used for its sorted class list (clf.classes_).
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    # One-vs-one means: keep ONLY the rows belonging to c1 or c2 and learn to
    # tell those two apart. Labelling every row as "in the pair / not in the
    # pair" would instead train a model that isolates the remaining classes.
    pair_mask = y_train.isin([c1, c2])
    X_pair = X_train.loc[pair_mask]
    y_pair = y_train.loc[pair_mask]
    # Fixed seed so repeated runs give the same tree and importances.
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(X_pair, y_pair)
    importances = estimator.feature_importances_
    # Bar chart of the importances for this pair's tree.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(np.arange(len(feature_names)), importances)
    ax.set_xticks(np.arange(len(feature_names)))
    ax.set_xticklabels(feature_names, rotation=45, ha='right')
    ax.set_title(f'Tree {i+1}: {c1} vs {c2} Feature Importances')
    ax.set_ylabel('Importance')
    plt.tight_layout()
    plt.show()
# Feature importances of every pairwise tree, in combinations() order.
importances_all = []
# clf is only used for its sorted class list (clf.classes_).
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    # One-vs-one: train on the rows of c1 and c2 only, with their real labels.
    # (A "row is in the pair" flag over all rows would give a tree whose two
    # outputs are False/True, not c1/c2.)
    pair_mask = y_train.isin([c1, c2])
    # Fixed seed so repeated runs give the same tree and importances.
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(X_train.loc[pair_mask], y_train.loc[pair_mask])
    importances = estimator.feature_importances_
    importances_all.append(importances)
    # Draw the tree. The fitted classes are sorted, and combinations() over
    # the sorted clf.classes_ yields c1 before c2, so [c1, c2] is the order
    # plot_tree expects for class_names.
    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(
        estimator,
        feature_names=feature_names,
        class_names=[str(c1), str(c2)],
        rounded=True,
        filled=True,
        fontsize=14,
        ax=ax,
    )
    # Title: pair name followed by one "feature: importance" line per feature.
    title_lines = [f'Tree {i+1}: {c1} vs {c2}', 'Feature importances:']
    title_lines.extend(
        f'{feature}: {value:.3f}'
        for feature, value in zip(feature_names, importances)
    )
    ax.set_title('\n'.join(title_lines) + '\n')
# Confusion matrix for the model currently bound to `clf`.
# Predictions are taken from that same model so the matrix and its class
# labels (clf.classes_) describe one classifier; reusing a prediction vector
# produced by a different, earlier model would silently mix the two.
# NOTE(review): the pairwise decision trees trained just above are never
# combined into a predictor, so they are not what is evaluated here - confirm
# which model this figure is meant to describe.
current_pred = clf.predict(test_x)
cm = confusion_matrix(test_lab, current_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix' + '.png')
# Explainability example: one-vs-rest trees applied to one specific sample.
# A fresh 80/20 split of the raw arrays is drawn here. Note that it rebinds
# X_train and y_train, and that y_train / y_test hold integer class codes.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)
class_indices = range(len(iris.target_names))
# One binary tree per class: label 1 for that class, 0 for everything else.
binary_classifiers = {
    class_idx: DecisionTreeClassifier(random_state=42).fit(
        X_train, np.where(y_train == class_idx, 1, 0)
    )
    for class_idx in class_indices
}
# Sample to explain: the test-set row at zero-based index 7.
instance = X_test[7]
instance_row = instance.reshape(1, -1)
# Probability of the positive ("is this class") label from every binary tree.
probs = [
    binary_classifiers[class_idx].predict_proba(instance_row)[0, 1]
    for class_idx in class_indices
]
# The class whose tree reports the highest probability is the prediction.
predicted_class = np.argmax(probs)
binary_classifier = binary_classifiers[predicted_class]
predicted_prob = probs[predicted_class]
# Draw the tree of the winning binary classifier.
winner_name = iris.target_names[predicted_class]
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(
    binary_classifier,
    filled=True,
    rounded=True,
    ax=ax,
    feature_names=iris.feature_names,
    class_names=['not ' + winner_name, winner_name],
)
plt.show()
# Report the predicted class index and its probability for the sample.
print('Predicted Class:', predicted_class)
print('Predicted Probability:', predicted_prob)
# Table of the held-out samples: running ID, feature values, true class, and
# the class predicted by the logistic-regression and decision-tree models.
# y_pred_ovo and y_pred_DTC were produced from test_x, so the features and the
# true labels must come from that same split (test_x / test_lab). Rows from a
# different split have a different length, which makes column_stack raise, and
# would not correspond to the predictions anyway.
sample_ids = np.arange(len(test_lab)) + 1
table_test = np.column_stack(
    (sample_ids, np.asarray(test_x), np.asarray(test_lab), y_pred_ovo, y_pred_DTC)
)
header_test = np.concatenate(
    (['ID'], iris.feature_names,
     ['True Class', 'Predicted Class_BLR', 'Predicted Class_DTC'])
)
# The header is stacked on top as the first row of the printed table.
table_test = np.vstack((header_test, table_test))
# Print the table for the test data.
print(tabulate(table_test))