"""Iris classification with per-class explainability.

Trains one-vs-rest logistic regression and one-vs-rest decision trees (plus a
multinomial logistic-regression / pairwise-tree section), plots per-class
feature importances and confusion matrices, logs metrics and figures to
MLflow, and finally prints a per-sample prediction table.
"""

from itertools import combinations

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from tabulate import tabulate

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
# BUG FIX: `sklearn.tree.export` was removed in scikit-learn >= 0.24;
# export_text lives directly in sklearn.tree.
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree

# ---------------------------------------------------------------------------
# Load the data and build a labelled dataframe
# ---------------------------------------------------------------------------
data = load_iris()
iris = data

df = pd.DataFrame(data.data, columns=data.feature_names)
df['Species'] = data.target

# Replace the integer targets (0, 1, 2) with the actual species names.
target = np.unique(data.target)
target_names = np.unique(data.target_names)
targets = dict(zip(target, target_names))
df['Species'] = df['Species'].replace(targets)

# Extract feature matrix and target vector.
x = df.drop(columns="Species")
y = df["Species"]

# Save the feature names and the distinct class labels.
feature_names = x.columns
labels = y.unique()

# 60/40 train/test split used by all model-evaluation sections below.
X_train, test_x, y_train, test_lab = train_test_split(
    x, y, test_size=0.4, random_state=42)

# ---------------------------------------------------------------------------
# One-vs-rest logistic regression; explainability via the coefficients
# ---------------------------------------------------------------------------
# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5; kept here
# because the OvR coefficient-per-class interpretation below depends on it.
clf = LogisticRegression(random_state=0, multi_class='ovr')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

# One subplot per class, bars sorted by descending coefficient value.
fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    importance = clf.coef_[i]
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title(
        'Feature Importance for Class {}'.format(iris.target_names[i]))
fig.tight_layout()
plt.show()

# Evaluate on the held-out 40% split.
val_pred = clf.predict(test_x)
accuracy = accuracy_score(test_lab, val_pred)
# BUG FIX: this is the logistic-regression model, not the decision tree, so
# log it under its own metric key.
mlflow.log_metric('lr accuracy', accuracy)

cm = confusion_matrix(test_lab, val_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
# BUG FIX: every section previously logged to fig/confusion_matrix.png,
# overwriting earlier artifacts; use a distinct name per model.
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_lr_ovr.png')

# ---------------------------------------------------------------------------
# One-vs-rest decision trees; importances are Gini impurity reductions
# ---------------------------------------------------------------------------
clf_pre = DecisionTreeClassifier(random_state=0)
clf = OneVsRestClassifier(clf_pre)
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

fig, axs = plt.subplots(n_classes, 1, figsize=(10, 5 * n_classes))
for i in range(n_classes):
    # Per-class tree importances from the fitted OvR estimators.
    importance = clf.estimators_[i].feature_importances_
    indices = np.argsort(importance)[::-1]
    axs[i].bar(range(n_features), importance[indices])
    axs[i].set_xticks(range(n_features))
    axs[i].set_xticklabels(np.array(iris.feature_names)[indices], rotation=90)
    axs[i].set_xlabel('Features')
    axs[i].set_ylabel('Importance')
    axs[i].set_title(
        'Feature Importance for Class {}'.format(iris.target_names[i]))
fig.tight_layout()
plt.show()

y_pred_DTC = clf.predict(test_x)
# BUG FIX: the accuracy and confusion matrix previously reused `val_pred`
# (the logistic-regression predictions) instead of the tree predictions.
accuracy = accuracy_score(test_lab, y_pred_DTC)
mlflow.log_metric('dtc accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_DTC, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_dtc_ovr.png')

# Show the decision tree for each class: two methods.
feature_names = iris.feature_names

# Method 1: textual decision rules for each per-class tree.
for i, estimator in enumerate(clf.estimators_):
    tree_rules = export_text(estimator, feature_names=feature_names)
    print(f"Decision rules for tree for cluster {i}:")
    print(tree_rules)

# Method 2: graphical plot of each per-class tree.
for i, estimator in enumerate(clf.estimators_):
    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator, feature_names=feature_names, class_names=labels,
                   rounded=True, filled=True, fontsize=14, ax=ax)
    ax.set_title(f'Tree {i+1}')
    plt.show()

# ---------------------------------------------------------------------------
# Multinomial logistic regression (labelled "one vs one" in the original)
# ---------------------------------------------------------------------------
# NOTE(review): multi_class='multinomial' is softmax regression, not a true
# one-vs-one scheme; coef_ has one row per CLASS. With 3 classes the number
# of pairwise combinations (3) coincidentally equals the number of rows, so
# the pairwise loop below indexes coef_ without error, but each row is a
# per-class coefficient vector — confirm the intended interpretation.
clf = LogisticRegression(random_state=0, multi_class='multinomial',
                         solver='lbfgs')
clf.fit(X_train, y_train)

n_classes = len(set(iris.target))
n_features = iris.data.shape[1]

n_pairs = n_classes * (n_classes - 1) // 2
fig, axs = plt.subplots(n_pairs, 1, figsize=(10, 5 * n_pairs))

index = 0
for i in range(n_classes):
    for j in range(i + 1, n_classes):
        importance = clf.coef_[index]
        indices = np.argsort(importance)[::-1]
        axs[index].bar(range(n_features), importance[indices])
        axs[index].set_xticks(range(n_features))
        axs[index].set_xticklabels(
            np.array(iris.feature_names)[indices], rotation=90)
        axs[index].set_xlabel('Features')
        axs[index].set_ylabel('Importance')
        axs[index].set_title(
            'Feature Importance for Class Combination {} vs {}'.format(
                iris.target_names[i], iris.target_names[j]))
        index += 1
fig.tight_layout()
plt.show()

y_pred_ovo = clf.predict(test_x)
# BUG FIX: previously scored `val_pred` instead of this model's predictions.
accuracy = accuracy_score(test_lab, y_pred_ovo)
mlflow.log_metric('blr accuracy', accuracy)

cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_blr.png')

# ---------------------------------------------------------------------------
# Pairwise (one-vs-one) decision trees with feature importances
# ---------------------------------------------------------------------------
# BUG FIX: the original trained on (y == c1) | (y == c2), which is a
# pair-vs-REST label, not c1 vs c2. Restrict to the rows of the pair and use
# the real class labels so each tree genuinely discriminates c1 from c2.
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    pair_mask = y_train.isin([c1, c2])
    estimator = DecisionTreeClassifier()
    estimator.fit(X_train[pair_mask], y_train[pair_mask])

    importances = estimator.feature_importances_

    # Bar plot of this pair's feature importances.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(np.arange(len(feature_names)), importances)
    ax.set_xticks(np.arange(len(feature_names)))
    ax.set_xticklabels(feature_names, rotation=45, ha='right')
    ax.set_title(f'Tree {i+1}: {c1} vs {c2} Feature Importances')
    ax.set_ylabel('Importance')
    plt.tight_layout()
    plt.show()

# Same pairwise trees again, this time plotting the trees themselves and
# collecting the importances.
importances_all = []
for i, (c1, c2) in enumerate(combinations(clf.classes_, 2)):
    pair_mask = y_train.isin([c1, c2])
    estimator = DecisionTreeClassifier()
    estimator.fit(X_train[pair_mask], y_train[pair_mask])

    importances = estimator.feature_importances_
    importances_all.append(importances)

    fig, ax = plt.subplots(figsize=(12, 8))
    tree.plot_tree(estimator, feature_names=feature_names,
                   class_names=[str(c1), str(c2)],
                   rounded=True, filled=True, fontsize=14, ax=ax)

    # Embed the importances in the title for quick reading.
    title = f'Tree {i+1}: {c1} vs {c2}\n'
    title += 'Feature importances:\n'
    for feature, importance in zip(feature_names, importances):
        title += f'{feature}: {importance:.3f}\n'
    ax.set_title(title)

# Confusion matrix for the multinomial model's test predictions.
# BUG FIX: previously reused `val_pred` and the shared artifact name.
cm = confusion_matrix(test_lab, y_pred_ovo, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.tight_layout()
mlflow.log_figure(disp.figure_, 'fig/confusion_matrix_pairwise.png')

# ---------------------------------------------------------------------------
# Explainability for a single instance (one-vs-rest trees)
# ---------------------------------------------------------------------------
# Fresh 80/20 split on the raw arrays for this demonstration only; note this
# rebinds X_train / y_train, which are not reused afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42)

# Train one binary (class-vs-rest) tree per class.
binary_classifiers = {}
for i in range(len(iris.target_names)):
    binary_y_train = np.where(y_train == i, 1, 0)
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train, binary_y_train)
    binary_classifiers[i] = model

# Choose a specific instance to explain (the eighth row of the test set).
instance = X_test[7]

# Probability of the positive ("is this class") label from each binary tree.
probs = []
for i in range(len(iris.target_names)):
    binary_classifier = binary_classifiers[i]
    prob = binary_classifier.predict_proba(instance.reshape(1, -1))[0, 1]
    probs.append(prob)

# The winning class is the one whose binary tree is most confident.
predicted_class = np.argmax(probs)
binary_classifier = binary_classifiers[predicted_class]

# Plot the winning binary tree as the explanation for this prediction.
fig, ax = plt.subplots(figsize=(12, 12))
plot_tree(binary_classifier, filled=True, rounded=True, ax=ax,
          feature_names=iris.feature_names,
          class_names=['not ' + iris.target_names[predicted_class],
                       iris.target_names[predicted_class]])
plt.show()

predicted_prob = probs[predicted_class]
print('Predicted Class:', predicted_class)
print('Predicted Probability:', predicted_prob)

# ---------------------------------------------------------------------------
# Per-sample prediction table for the test data
# ---------------------------------------------------------------------------
# BUG FIX: the original stacked y_test (30 rows from the 80/20 split above)
# with y_pred_ovo / y_pred_DTC (60 rows predicted on test_x from the 60/40
# split), which raises ValueError. Use the 60/40 test split consistently.
table_test = np.column_stack((np.arange(len(test_lab)) + 1,
                              test_x, test_lab, y_pred_ovo, y_pred_DTC))
header_test = np.concatenate(
    (['ID'], iris.feature_names,
     ['True Class', 'Predicted Class_BLR', 'Predicted Class_DTC']))
table_test = np.vstack((header_test, table_test))
print(tabulate(table_test))