copd-model-e / training /src /modelling /additional_code_onevsone_onevsrest_approaches.py

Model E: Unsupervised PCA + clustering risk stratification

53a6def 5 days ago

12 kB

	from itertools import combinations

	import matplotlib.pyplot as plt
	import mlflow
	import numpy as np
	import pandas as pd
	from sklearn import tree
	from sklearn.datasets import load_iris
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import ConfusionMatrixDisplay
	from sklearn.metrics import accuracy_score
	from sklearn.metrics import confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.multiclass import OneVsOneClassifier
	from sklearn.multiclass import OneVsRestClassifier
	from sklearn.tree import DecisionTreeClassifier
	# export_text is part of the public sklearn.tree namespace; the private
	# sklearn.tree.export module is not importable on current scikit-learn.
	from sklearn.tree import export_text
	from sklearn.tree import plot_tree
	from tabulate import tabulate


	# Load the Iris data set; `iris` is the alias used by the later sections.
	data = load_iris()
	iris = data

	# Build a feature frame and attach the species as a readable label.
	df = pd.DataFrame(data.data, columns=data.feature_names)
	df['Species'] = data.target

	# `data.target_names[k]` is the name of integer class code k, so the mapping
	# is built by indexing with the code. Sorting the names (np.unique) would
	# only line up with the codes when the names happen to be alphabetical.
	target = np.unique(data.target)
	target_names = np.asarray(data.target_names)
	targets = {code: target_names[code] for code in target}
	df['Species'] = df['Species'].map(targets)

	# Separate the features from the target column.
	x = df.drop(columns="Species")
	y = df["Species"]

	# Keep the feature names and the distinct labels for the plots below.
	feature_names = x.columns
	labels = y.unique()

	# Hold out 40% of the rows for evaluation; fixed seed for reproducibility.
	X_train, test_x, y_train, test_lab = train_test_split(
	    x, y, test_size=0.4, random_state=42
	)


	# One-vs-rest logistic regression. Explainability is based on the fitted
	# coefficients of each per-class binary model.

	# OneVsRestClassifier fits one binary logistic regression per class, which
	# avoids the deprecated `multi_class` argument of LogisticRegression.
	clf = OneVsRestClassifier(LogisticRegression(random_state=0))

	# Train the classifier on the training split
	clf.fit(X_train, y_train)

	# Number of classes and features seen during fitting
	n_classes = len(clf.classes_)
	n_features = X_train.shape[1]

	# One subplot per class. squeeze=False keeps the result an array even when
	# there is a single row, so indexing below always works.
	fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes), squeeze=False)
	axs = axs.ravel()

	# Pair each class label with its binary estimator
	for i, (class_name, estimator) in enumerate(zip(clf.classes_, clf.estimators_)):
	    # Coefficients of the "class_name vs rest" model (single row)
	    importance = estimator.coef_[0]

	    # Order features from the largest to the smallest coefficient
	    indices = np.argsort(importance)[::-1]

	    # Bar plot of the coefficients, labelled with the feature names
	    axs[i].bar(range(n_features), importance[indices])
	    axs[i].set_xticks(range(n_features))
	    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
	    axs[i].set_xlabel('Features')
	    axs[i].set_ylabel('Importance')
	    axs[i].set_title('Feature Importance for Class {}'.format(class_name))

	# Adjust the spacing between subplots
	fig.tight_layout()

	# Show the plot
	plt.show()


	# Evaluate the one-vs-rest logistic regression on the held-out split.
	val_pred = clf.predict(test_x)
	accuracy = accuracy_score(test_lab, val_pred)
	# This model is a logistic regression, so it gets its own metric key rather
	# than sharing 'dtc accuracy' with the decision-tree section.
	mlflow.log_metric('blr ovr accuracy', accuracy)

	# Confusion matrix, logged under a model-specific artifact name so the
	# matrices produced by the other sections do not overwrite it.
	cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
	disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
	disp.plot()
	plt.tight_layout()
	mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_ovr_blr.png')


	# One-vs-rest decision trees. Explainability uses each tree's
	# `feature_importances_` (impurity reduction; Gini is the default criterion).

	# Create a One-vs-Rest Decision Tree classifier
	clf_pre = DecisionTreeClassifier(random_state=0)
	clf = OneVsRestClassifier(clf_pre)

	# Train the classifier on the training split
	clf.fit(X_train, y_train)

	# Number of classes and features seen during fitting
	n_classes = len(clf.classes_)
	n_features = X_train.shape[1]

	# One subplot per class. squeeze=False keeps the result an array even when
	# there is a single row, so indexing below always works.
	fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes), squeeze=False)
	axs = axs.ravel()

	# Pair each class label with its binary tree
	for i, (class_name, estimator) in enumerate(zip(clf.classes_, clf.estimators_)):
	    # Feature importances of the "class_name vs rest" tree
	    importance = estimator.feature_importances_

	    # Order features from the most to the least important
	    indices = np.argsort(importance)[::-1]

	    # Bar plot of the importances, labelled with the feature names
	    axs[i].bar(range(n_features), importance[indices])
	    axs[i].set_xticks(range(n_features))
	    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
	    axs[i].set_xlabel('Features')
	    axs[i].set_ylabel('Importance')
	    axs[i].set_title('Feature Importance for Class {}'.format(class_name))

	# Adjust the spacing between subplots
	fig.tight_layout()

	# Show the plot
	plt.show()


	# Evaluate the one-vs-rest decision trees on the held-out split.
	y_pred_DTC = clf.predict(test_x)
	# Score the predictions made just above, not `val_pred` from the earlier
	# logistic regression section.
	accuracy = accuracy_score(test_lab, y_pred_DTC)
	mlflow.log_metric('dtc accuracy', accuracy)

	# Confusion matrix, logged under a model-specific artifact name so the
	# matrices produced by the other sections do not overwrite it.
	cm = confusion_matrix(test_lab, y_pred_DTC, labels=clf.classes_)
	disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
	disp.plot()
	plt.tight_layout()
	mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_ovr_dtc.png')


	# Show the decision tree for each class, two ways: text rules and a plot.

	# Feature names as a plain list
	feature_names = iris.feature_names

	# Text rules for every binary tree inside the one-vs-rest classifier
	for i, estimator in enumerate(clf.estimators_):
	    # Export the decision rules for the current tree
	    tree_rules = export_text(estimator, feature_names=feature_names)

	    # Print the decision rules for the current tree
	    print(f"Decision rules for tree for cluster {i}:")
	    print(tree_rules)

	# Graphical rendering of the same trees (clf is the one-vs-rest classifier)
	for i, estimator in enumerate(clf.estimators_):
	    fig, ax = plt.subplots(figsize=(12, 8))
	    # Each sub-tree is binary ("rest" = 0, target class = 1), so it needs
	    # exactly two class names rather than the full list of species.
	    target_class = clf.classes_[i]
	    tree.plot_tree(
	        estimator,
	        feature_names=feature_names,
	        class_names=[f'not {target_class}', str(target_class)],
	        rounded=True,
	        filled=True,
	        fontsize=14,
	        ax=ax,
	    )
	    ax.set_title(f'Tree {i+1}')
	    plt.show()


	# One vs one approach

	# BLR
	# A multinomial LogisticRegression exposes one coefficient row per class,
	# not per class pair. OneVsOneClassifier fits one binary logistic regression
	# for every pair of classes, which is what the plots below describe.
	clf = OneVsOneClassifier(LogisticRegression(random_state=0, solver='lbfgs'))

	# Train the classifier on the training split
	clf.fit(X_train, y_train)

	# Number of classes and features seen during fitting
	n_classes = len(clf.classes_)
	n_features = X_train.shape[1]

	# Class pairs in the order (0,1), (0,2), ..., (1,2), ...
	# NOTE(review): this is assumed to match the order of `clf.estimators_` -
	# confirm against the OneVsOneClassifier documentation for your version.
	class_pairs = list(combinations(clf.classes_, 2))
	n_pairs = len(class_pairs)

	# One subplot per class pair. squeeze=False keeps the result an array even
	# when there is a single row, so indexing below always works.
	fig, axs = plt.subplots(n_pairs, 1, figsize=(10, 5 * n_pairs), squeeze=False)
	axs = axs.ravel()

	# Loop over each class pair together with its binary estimator
	for index, ((first, second), estimator) in enumerate(zip(class_pairs, clf.estimators_)):
	    # Coefficients of the binary model for this pair (single row)
	    importance = estimator.coef_[0]

	    # Order features from the largest to the smallest coefficient
	    indices = np.argsort(importance)[::-1]

	    # Bar plot of the coefficients, labelled with the feature names
	    axs[index].bar(range(n_features), importance[indices])
	    axs[index].set_xticks(range(n_features))
	    axs[index].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
	    axs[index].set_xlabel('Features')
	    axs[index].set_ylabel('Importance')
	    axs[index].set_title('Feature Importance for Class Combination {} vs {}'.format(first, second))

	# Adjust the spacing between subplots
	fig.tight_layout()

	# Show the plot
	plt.show()


	# Evaluate the classifier fitted in the one-vs-one section on the held-out split.
	y_pred_ovo = clf.predict(test_x)
	# Score the predictions made just above, not `val_pred` from the earlier
	# one-vs-rest logistic regression section.
	accuracy = accuracy_score(test_lab, y_pred_ovo)
	mlflow.log_metric('blr accuracy', accuracy)

	# Confusion matrix, logged under a model-specific artifact name so the
	# matrices produced by the other sections do not overwrite it.
	cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
	disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
	disp.plot()
	plt.tight_layout()
	mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_ovo_blr.png')


	# Decision tree classifier (one vs one): feature importances per class pair

	# `clf.classes_` supplies the class labels; one tree is trained per pair
	for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
	    # Keep only the training rows that belong to one of the two classes, so
	    # the tree really separates c1 from c2 (not "c1 or c2" from the rest).
	    pair_mask = (y_train == c1) | (y_train == c2)

	    # Train a decision tree on the current pair of classes; the seed is
	    # fixed for reproducibility, like the other trees in this script.
	    estimator = DecisionTreeClassifier(random_state=0)
	    estimator.fit(X_train[pair_mask], y_train[pair_mask])

	    # Get feature importances
	    importances = estimator.feature_importances_

	    # Bar plot showing feature importances for the current tree
	    fig, ax = plt.subplots(figsize=(8, 6))
	    positions = np.arange(len(feature_names))
	    ax.bar(positions, importances)
	    ax.set_xticks(positions)
	    ax.set_xticklabels(feature_names, rotation=45, ha='right')
	    ax.set_title(f'Tree {i+1}: {c1} vs {c2} Feature Importances')
	    ax.set_ylabel('Importance')
	    plt.tight_layout()
	    plt.show()

	# Collect the feature importances of every pairwise tree
	importances_all = []

	# `clf.classes_` supplies the class labels; one tree is trained per pair
	for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
	    # Keep only the training rows that belong to one of the two classes, so
	    # the tree really separates c1 from c2 (not "c1 or c2" from the rest).
	    pair_mask = (y_train == c1) | (y_train == c2)

	    # Train a decision tree on the current pair of classes; the seed is
	    # fixed for reproducibility, like the other trees in this script.
	    estimator = DecisionTreeClassifier(random_state=0)
	    estimator.fit(X_train[pair_mask], y_train[pair_mask])

	    # Get feature importances and store them in the list
	    importances = estimator.feature_importances_
	    importances_all.append(importances)

	    # Plot the tree. Class names are taken from the fitted estimator so the
	    # leaf labels always match the classes the tree was trained on.
	    fig, ax = plt.subplots(figsize=(12, 8))
	    tree.plot_tree(
	        estimator,
	        feature_names=feature_names,
	        class_names=[str(name) for name in estimator.classes_],
	        rounded=True,
	        filled=True,
	        fontsize=14,
	        ax=ax,
	    )

	    # Add the feature importances to the title, one per line
	    title_lines = [f'Tree {i+1}: {c1} vs {c2}', 'Feature importances:']
	    title_lines.extend(
	        f'{feature}: {value:.3f}' for feature, value in zip(feature_names, importances)
	    )
	    ax.set_title('\n'.join(title_lines) + '\n')

	    # Display the figure, as the other plotting loops in this script do
	    plt.show()


	# Confusion matrix for `clf`, the classifier fitted most recently above.
	# Predictions are taken from `clf` itself so they match `clf.classes_`;
	# the earlier `val_pred` belongs to a different model.
	# NOTE(review): the pairwise decision trees above are never used for
	# prediction, so this matrix does not evaluate them - confirm the intent.
	cm = confusion_matrix(test_lab, clf.predict(test_x), labels=clf.classes_)
	disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
	disp.plot()
	plt.tight_layout()
	# Distinct artifact name so the earlier confusion matrices are not overwritten.
	mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_ovo_final.png')


	# Example of explainability for one specific instance (one vs rest)

	# Split the raw arrays into training and testing sets.
	# NOTE: this rebinds X_train / y_train from the earlier 60/40 split and uses
	# the integer class codes rather than the species names.
	X_train, X_test, y_train, y_test = train_test_split(
	    iris.data, iris.target, test_size=0.2, random_state=42
	)

	# Train one binary "class i vs rest" tree per class code
	binary_classifiers = {}
	for i in range(len(iris.target_names)):
	    binary_y_train = np.where(y_train == i, 1, 0)
	    model = DecisionTreeClassifier(random_state=42)
	    model.fit(X_train, binary_y_train)
	    binary_classifiers[i] = model

	# Choose a specific instance to explain (here the test row at index 7)
	instance = X_test[7]

	# Probability of the positive label (column 1) from every binary tree
	probs = [
	    binary_classifiers[i].predict_proba(instance.reshape(1, -1))[0, 1]
	    for i in range(len(iris.target_names))
	]

	# Index of the class with the highest probability score
	predicted_class = np.argmax(probs)

	# The binary tree that produced the highest probability score
	binary_classifier = binary_classifiers[predicted_class]

	# Plot that tree with "not <class>" / "<class>" as its two leaf labels
	predicted_name = iris.target_names[predicted_class]
	fig, ax = plt.subplots(figsize=(12, 12))
	plot_tree(
	    binary_classifier,
	    filled=True,
	    rounded=True,
	    ax=ax,
	    feature_names=iris.feature_names,
	    class_names=['not ' + predicted_name, predicted_name],
	)
	plt.show()

	# Print the predicted class and probability for the instance
	predicted_prob = probs[predicted_class]
	print('Predicted Class:', predicted_class)
	print('Predicted Probability:', predicted_prob)


	# Table with the ID, features, true label and both predicted labels for
	# every sample in the test data. `y_pred_ovo` and `y_pred_DTC` were
	# predicted on `test_x`, so the features and true labels must come from
	# that same split (`test_x` / `test_lab`) for the row counts to agree.
	table_test = np.column_stack((
	    np.arange(len(test_lab)) + 1,
	    test_x.to_numpy(),
	    test_lab.to_numpy(),
	    y_pred_ovo,
	    y_pred_DTC,
	))
	header_test = np.concatenate((['ID'], iris.feature_names, ['True Class', 'Predicted Class_BLR', 'Predicted Class_DTC']))
	table_test = np.vstack((header_test, table_test))

	# Print the table for the test data
	print(tabulate(table_test))