from itertools import combinations

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
# BUG FIX: the `sklearn.tree.export` module path was removed from scikit-learn;
# `export_text` is importable from `sklearn.tree` directly.
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from tabulate import tabulate
| |
|
| |
|
| | |
# --- Load the iris dataset and assemble a labelled DataFrame. ---
data = load_iris()
iris = data  # alias kept: later sections refer to the dataset as `iris`

df = pd.DataFrame(data.data, columns=data.feature_names)
df['Species'] = data.target

# Map the integer codes (0, 1, 2) to their species names.
# BUG FIX: the original zipped the codes with np.unique(data.target_names),
# which SORTS the names alphabetically -- that only matched the code order
# because the iris names happen to be alphabetical already. Zip the sorted
# unique codes with data.target_names, whose index order matches the codes
# by construction.
target = np.unique(data.target)
target_names = data.target_names
targets = dict(zip(target, target_names))
df['Species'] = df['Species'].replace(targets)

# Features / label split.
x = df.drop(columns="Species")
y = df["Species"]

feature_names = x.columns
labels = y.unique()

# 60/40 train/test split (fixed seed for reproducibility).
X_train, test_x, y_train, test_lab = train_test_split(x, y, test_size=0.4, random_state=42)
| |
|
| |
|
| | |
| | |
| |
|
| | |
# --- One-vs-rest logistic regression: fit, per-class coefficient plots, evaluation. ---
clf = LogisticRegression(random_state=0, multi_class='ovr')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One bar chart per class; in the OvR scheme clf.coef_[i] holds the signed
# weights of the binary "class i vs rest" model.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    importance = clf.coef_[i]
    indices = np.argsort(importance)[::-1]  # largest coefficient first

    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title('Feature Importance for Class {}'.format(iris.target_names[i]))

fig.tight_layout()
plt.show()

# Evaluate on the held-out 40% split and log to mlflow.
val_pred = clf.predict(test_x)
accuracy = accuracy_score(test_lab, val_pred)
# BUG FIX: this model is logistic regression, not a decision tree; the metric
# was previously logged under the copy-pasted key 'dtc accuracy'.
mlflow.log_metric('ovr lr accuracy', accuracy)

cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so later sections do not overwrite this figure.
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_ovr_lr.png')
| |
|
| |
|
| | |
| | |
| | |
# --- One-vs-rest decision trees: fit, per-class feature-importance plots, evaluation. ---
clf_pre = DecisionTreeClassifier(random_state=0)
clf = OneVsRestClassifier(clf_pre)
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One bar chart per class; clf.estimators_[i] is the binary "class i vs rest" tree.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    importance = clf.estimators_[i].feature_importances_
    indices = np.argsort(importance)[::-1]  # most important feature first

    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title('Feature Importance for Class {}'.format(iris.target_names[i]))

fig.tight_layout()
plt.show()

# Evaluate on the held-out split and log to mlflow.
y_pred_DTC = clf.predict(test_x)
# BUG FIX: accuracy and the confusion matrix were previously computed from
# `val_pred` (the logistic-regression predictions), not this model's
# `y_pred_DTC`, so the tree model was never actually evaluated.
accuracy = accuracy_score(test_lab, y_pred_DTC)
mlflow.log_metric('dtc accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_DTC, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so other sections do not overwrite this figure.
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_dtc.png')
| |
|
| |
|
| | |
| |
|
| | |
# --- Inspect the one-vs-rest decision trees: text rules and plots. ---
feature_names = iris.feature_names

# Print the learned decision rules of each binary tree.
for i, estimator in enumerate(clf.estimators_):
    tree_rules = export_text(estimator, feature_names=feature_names)
    print(f"Decision rules for tree for cluster {i}:")
    print(tree_rules)

# Plot each binary tree. BUG FIX: each OvR estimator separates ONE species
# from the rest, so it has exactly two classes; the original passed all
# three species names as `class_names`, mislabelling the leaves. Label them
# "not <species>" / "<species>" instead.
for i, estimator in enumerate(clf.estimators_):
    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator,
                   feature_names=feature_names,
                   class_names=['not ' + str(clf.classes_[i]), str(clf.classes_[i])],
                   rounded=True,
                   filled=True,
                   fontsize=14,
                   ax=ax)
    ax.set_title(f'Tree {i+1}')
    plt.show()
| |
|
| |
|
| | |
| |
|
| | |
| | |
# --- Multinomial logistic regression: fit and per-class coefficient plots. ---
clf = LogisticRegression(random_state=0, multi_class='multinomial', solver='lbfgs')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# BUG FIX: for a *multinomial* model, clf.coef_ has one row per CLASS
# (shape: n_classes x n_features) -- NOT one row per class pair. The
# original iterated over the C(n_classes, 2) pairwise combinations and
# titled the plots "Class A vs Class B"; it only avoided an IndexError
# because C(3, 2) == 3, and every title was wrong. Plot the per-class rows
# with per-class titles instead.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    importance = clf.coef_[i]
    indices = np.argsort(importance)[::-1]  # largest coefficient first

    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title('Feature Importance for Class {}'.format(iris.target_names[i]))

fig.tight_layout()
plt.show()
| |
|
| |
|
| | |
# --- Evaluate the multinomial logistic regression on the held-out split. ---
y_pred_ovo = clf.predict(test_x)
# BUG FIX: accuracy and the confusion matrix were previously computed from
# `val_pred` (an earlier model's predictions), not this model's `y_pred_ovo`.
accuracy = accuracy_score(test_lab, y_pred_ovo)
mlflow.log_metric('blr accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# Model-specific artifact name so this figure is not overwritten.
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_blr.png')
| |
|
| |
|
| | |
| |
|
| | |
# --- One-vs-one decision trees: feature importances per class pair. ---
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    # BUG FIX: the original used `(y_train == c1) | (y_train == c2)` as the
    # TARGET, which trains "is the sample in {c1, c2}?" on ALL samples --
    # a pair-vs-rest problem, not c1 vs c2. For one-vs-one, keep only the
    # samples belonging to the two classes and predict which of the two.
    pair_mask = y_train.isin([c1, c2])
    X_pair = X_train[pair_mask]
    y_pair = y_train[pair_mask]

    # Fixed seed for reproducible trees (tie-breaks are otherwise random).
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(X_pair, y_pair)

    importances = estimator.feature_importances_

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(np.arange(len(feature_names)), importances)
    ax.set_xticks(np.arange(len(feature_names)))
    ax.set_xticklabels(feature_names, rotation=45, ha='right')
    ax.set_title(f'Tree {i+1}: {c1} vs {c2} Feature Importances')
    ax.set_ylabel('Importance')
    plt.tight_layout()
    plt.show()
| |
|
| | |
# --- One-vs-one decision trees: plot each pair's tree with its importances. ---
importances_all = []

for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    # BUG FIX: as in the section above, the original trained on the boolean
    # mask `(y_train == c1) | (y_train == c2)` as the target ("pair vs
    # rest"), so the plotted class_names [c1, c2] did not match what the
    # tree actually separated. Restrict the samples to the two classes.
    # Since clf.classes_ is sorted and c1 < c2, [c1, c2] matches the fitted
    # estimator.classes_ ordering.
    pair_mask = y_train.isin([c1, c2])
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(X_train[pair_mask], y_train[pair_mask])

    importances = estimator.feature_importances_
    importances_all.append(importances)

    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator,
                   feature_names=feature_names,
                   class_names=[str(c1), str(c2)],
                   rounded=True,
                   filled=True,
                   fontsize=14,
                   ax=ax)

    # Embed the numeric importances in the figure title.
    title = f'Tree {i+1}: {c1} vs {c2}\n'
    title += 'Feature importances:\n'
    for feature, importance in zip(feature_names, importances):
        title += f'{feature}: {importance:.3f}\n'
    ax.set_title(title)
| |
|
| |
|
| | |
# NOTE(review): this repeats the earlier confusion-matrix logging verbatim.
# It plots `val_pred` -- the FIRST model's (OvR logistic regression)
# predictions -- against the current `clf.classes_`, and writes to the same
# 'fig/confusion_matrix.png' artifact path, overwriting previously logged
# figures. Looks like a stale copy-paste; confirm intent before keeping.
cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/' + 'confusion_matrix' + '.png')
| | |
| |
|
| | |
| |
|
| | |
# --- Hand-rolled one-vs-rest: one binary tree per species, applied to one sample. ---
# Fresh 80/20 split on the raw numpy arrays for this demonstration.
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Train a binary "class k vs rest" decision tree for every species.
binary_classifiers = {}
for class_idx in range(len(iris.target_names)):
    one_vs_rest_target = np.where(y_train == class_idx, 1, 0)
    fitted = DecisionTreeClassifier(random_state=42)
    fitted.fit(X_train, one_vs_rest_target)
    binary_classifiers[class_idx] = fitted

# Pick a single held-out sample to classify.
instance = X_test[7]

# Probability of "is class k" from each binary tree (column 1 of predict_proba).
probs = [
    binary_classifiers[class_idx].predict_proba(instance.reshape(1, -1))[0, 1]
    for class_idx in range(len(iris.target_names))
]

# The class whose binary tree is most confident wins.
predicted_class = np.argmax(probs)

# Visualise the winning binary tree.
binary_classifier = binary_classifiers[predicted_class]
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(binary_classifier, filled=True, rounded=True, ax=ax, feature_names=iris.feature_names, class_names=['not ' + iris.target_names[predicted_class], iris.target_names[predicted_class]])
plt.show()

# Report the winning class index and its probability.
predicted_prob = probs[predicted_class]
print('Predicted Class:', predicted_class)
print('Predicted Probability:', predicted_prob)
| |
|
| |
|
| | |
# --- Tabulate per-sample predictions of the two multiclass models. ---
# BUG FIX: `y_pred_ovo` / `y_pred_DTC` were predicted on `test_x` (the 60
# samples of the 60/40 split), while `X_test`/`y_test` come from the later
# 80/20 split (30 samples) -- np.column_stack would raise on the length
# mismatch. Build the table from the split the predictions belong to.
table_test = np.column_stack((
    np.arange(len(test_lab)) + 1,   # 1-based row ID
    np.asarray(test_x),             # the four feature columns
    np.asarray(test_lab),           # ground-truth species
    y_pred_ovo,                     # multinomial logistic regression
    y_pred_DTC,                     # one-vs-rest decision trees
))
header_test = np.concatenate((['ID'], iris.feature_names, ['True Class', 'Predicted Class_BLR', 'Predicted Class_DTC']))
table_test = np.vstack((header_test, table_test))

print(tabulate(table_test))