import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    balanced_accuracy_score,
    confusion_matrix,
)

from utils.data_helper import *


def pred(model, x_test, y_test):
    """Predict pass/fail labels for x_test and report balanced accuracy.

    Label convention (from the thresholding below): 1 = fail, 0 = pass.
    Model scores are thresholded at 0.5 on the first output column.

    Returns the balanced-accuracy score, or None when the prediction and
    label arrays differ in length (score is then not computed).
    """
    y_pred = model.predict(x_test)
    # Threshold the model's first output column at 0.5 -> hard 0/1 labels.
    y_pred = np.array([1 if y[0] >= 0.5 else 0 for y in y_pred])
    print('num students:', len(y_pred))
    score = None
    if len(y_pred) == len(y_test):
        score = balanced_accuracy_score(y_test, y_pred)
        print('BAC score:', score)
    # Correctly-identified failing / passing students over the true totals.
    print(f'num fail: {sum(y_pred[y_test==1])}/{sum(y_test)}')
    print(f'num pass: {sum(y_pred[y_test==0] == 0)}/{sum(y_test==0)}')
    return score


def pred_pass_fail(model, x_test, label=None):
    """Predict pass/fail for a cohort whose true label is uniform.

    If `label` is given (0 = everyone passes, 1 = everyone fails), the
    score is the fraction of predictions matching that label.

    Returns a dict with keys 'score', 'num fail', 'num pass'.
    """
    y_pred = model.predict(x_test)
    y_pred = np.array([1 if y[0] >= 0.5 else 0 for y in y_pred])
    print('num students:', len(y_pred))
    print(f'num fail: {sum(y_pred)}/{len(y_pred)}')
    print(f'num pass: {sum(y_pred == 0)}/{len(y_pred)}')
    score = None
    if label == 0:
        score = sum(y_pred == 0) / len(y_pred)
    elif label == 1:
        score = sum(y_pred) / len(y_pred)
    print(f'Percentage correct: ', score)
    return {'score': score, 'num fail': sum(y_pred), 'num pass': sum(y_pred == 0)}


def pred_change_label(model, x_test_real, x_test_syn, y_test=None):
    """Count students whose predicted label flips between real and synthetic input.

    Returns (pass_to_fail, fail_to_pass): how many students predicted to
    pass on the real data are predicted to fail on the synthetic data, and
    vice versa.
    """
    y_pred_real = np.array([1 if y[0] >= 0.5 else 0 for y in model.predict(x_test_real)])
    y_pred_syn = np.array([1 if y[0] >= 0.5 else 0 for y in model.predict(x_test_syn)])
    # Number of students whose prediction did NOT change.
    print(np.sum(y_pred_real == y_pred_syn))
    pass_to_fail = np.sum((y_pred_real == 0) & (y_pred_syn == 1))
    fail_to_pass = np.sum((y_pred_real == 1) & (y_pred_syn == 0))
    print('>>>>> Number of students predicted to be pass change to fail after introducing new content: ', pass_to_fail)
    print('>>>>> Number of students predicted to be fail change to pass after introducing new content: ', fail_to_pass)
    if y_test is not None:
        print(f'>>>>> Ground Truth, Num passing students: {np.sum(y_test==0)}, Num failing students: {np.sum(y_test==1)}')
    return pass_to_fail, fail_to_pass


def pred_change_percentage(model, x_test_real, x_test_syn, y_test=None):
    """Measure how raw model scores shift between real and synthetic input.

    Reports the mean/std score change over all students and over the
    subset predicted to fail (label 1) on the real data.

    Returns (mean_all, std_all, mean_low_performing, std_low_performing).
    """
    y_pred_real = model.predict(x_test_real)
    y_pred_syn = model.predict(x_test_syn)
    y_pred_label_real = np.array([1 if y[0] >= 0.5 else 0 for y in y_pred_real])
    # BUGFIX: the 0/1 label array must be turned into a boolean mask.
    # Indexing with the int labels themselves would fancy-index rows 0 and 1
    # repeatedly instead of selecting the predicted-failing students.
    fail_mask = y_pred_label_real == 1
    impact_low_performing = y_pred_syn[fail_mask] - y_pred_real[fail_mask]
    print('>>>>> New assignment impact on average students: ', np.mean(y_pred_syn - y_pred_real), np.std(y_pred_syn - y_pred_real))
    print('>>>>> New assignment impact on predicted-low-performing students', np.mean(impact_low_performing), np.std(impact_low_performing))
    return (
        np.mean(y_pred_syn - y_pred_real),
        np.std(y_pred_syn - y_pred_real),
        np.mean(impact_low_performing),
        np.std(impact_low_performing),
    )


def first_trail_prediction(pass_reward, fail_reward,
                           DATA_DIR="Y:/data/result/easy-fail/eq_week-marras_et_al-dsp_002",
                           size=np.array([3, 3, 4, 3, 2, 2, 3, 2]),
                           feature_list=["competency_strength", "competency_alignment",
                                         "competency_anticipation", "content_alignment",
                                         "content_anticipation", "student_speed", "student_shape"],
                           show=False):
    """Load a course's feature data, discretize it, and evaluate both reward
    functions on pass/fail student trajectories, saving plots under results/.

    pass_reward / fail_reward are per-state reward tables indexed by state id
    (see reward_each_student). Plots and a confusion matrix are written to
    results/<course-name>/.
    """
    features_2 = np.load(DATA_DIR + "/feature_values.npz")
    data = features_2['feature_values']
    label_df = pd.read_csv(DATA_DIR + "/feature_labels.csv")
    y = np.array(label_df['label-pass-fail']).astype(int)
    # Log scale time (second-to-last feature). Assumes strictly positive
    # values — np.log would produce -inf/nan otherwise; TODO confirm.
    data[:, :, -2] = np.log(data[:, :, -2])
    pass_data = data[y == 0]
    fail_data = data[y == 1]
    if show:
        print('Student pass the course:', pass_data.shape)
        print('Student fail the course:', fail_data.shape)
    filna_pass_data = dealing_missing_value(pass_data)
    filna_fail_data = dealing_missing_value(fail_data)
    filna_all_data = dealing_missing_value(data)
    # Single source of truth for the per-feature cluster counts
    # (was repeated three times inline).
    clusters_list = [3, 3, 4, 3, 2, 2, 3]
    discretized_all_data = discretized_feature(clusters_list=clusters_list, feature=filna_all_data, feature_list=feature_list)
    discretized_pass_data = discretized_feature(clusters_list=clusters_list, feature=filna_pass_data, feature_list=feature_list)
    discretized_fail_data = discretized_feature(clusters_list=clusters_list, feature=filna_fail_data, feature_list=feature_list)
    student_world = World(size=size, samples_trajectory=discretized_pass_data)
    trajectories_pass = feature_table_to_trajectories(student_world, discretized_pass_data)
    trajectories_fail = feature_table_to_trajectories(student_world, discretized_fail_data)
    trajectories_all = feature_table_to_trajectories(student_world, discretized_all_data)
    save_dir = f"results/{DATA_DIR.split('/')[-1]}"
    # Check if the directory exists, and create it if it doesn't
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    plot_confusion_matrix(pass_reward, fail_reward, trajectories_all, y, save_dir)
    reward_each_student(pass_reward, trajectories_pass, 'Pass Reward, Pass Students', save_dir)
    reward_each_student(pass_reward, trajectories_fail, 'Pass Reward, Fail Students', save_dir)
    reward_each_student(fail_reward, trajectories_pass, 'Fail Reward, Pass Students', save_dir)
    reward_each_student(fail_reward, trajectories_fail, 'Fail Reward, Fail Students', save_dir)


def reward_each_student(reward, trajectories, title=None, path=None):
    """Sum the per-state reward along each student's trajectory.

    `reward` is indexed as reward[state_id][0]; each trajectory is an
    iterable of transitions whose first element is the state id.
    Plots a histogram of the totals (saved to path/<title>.jpg if `path`
    is given) and returns the list of per-student reward sums.
    """
    ans = []
    for t in trajectories:
        R = 0
        for state in t:
            R += reward[state[0]][0]
        ans.append(R)
    sns.histplot(ans)
    plt.title(title)
    if path:
        plt.savefig(path + f'/{title}.jpg')
        plt.close()
    return ans


def plot_confusion_matrix(pass_reward, fail_reward, trajectories_all, y, path):
    """Plot and save a normalized confusion matrix for reward-based predictions.

    NOTE(review): this calls the local pred_pass_fail(model, x_test, label=None)
    with (pass_reward, fail_reward, trajectories_all) — the arguments do not
    match that signature, and its dict return is then used as a label array.
    It looks like a different prediction helper (one that compares the two
    reward sums per trajectory) was intended — confirm against the caller.
    """
    pred_label = pred_pass_fail(pass_reward, fail_reward, trajectories_all)
    y_test = np.array(y)
    pred_label = np.array(pred_label)
    print(np.sum(pred_label == 0), np.sum(pred_label == 1))
    # With normalize='true', these are row-normalized rates, not raw counts.
    tn, fp, fn, tp = confusion_matrix(y_test, pred_label, normalize='true').ravel()
    print('tn, fp, fn, tp:', tn, fp, fn, tp)
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test, pred_label,
        display_labels=['Pass', 'Fail'],
        cmap=plt.cm.Blues,
        normalize='true'
    )
    print('Balanced Accuracy Score:', balanced_accuracy_score(y_test, pred_label))
    if path:
        plt.savefig(path + f'/confusion_matrix.jpg')
        plt.close()


def create_attributes(DATA_DIR, pass_reward, fail_reward,
                      size=np.array([3, 3, 4, 3, 2, 2, 3, 2]),
                      feature_list=["competency_strength", "competency_alignment",
                                    "competency_anticipation", "content_alignment",
                                    "content_anticipation", "student_speed", "student_shape"],
                      show=True):
    """Build a 2-feature matrix (pass-reward sum, fail-reward sum) per student.

    Returns (X, y): X has shape (#students, 2) with each row
    (pass_reward_total, fail_reward_total); y is the 0/1 pass/fail label.
    """
    features_2 = np.load(DATA_DIR + "/feature_values.npz")
    data = features_2['feature_values']
    label_df = pd.read_csv(DATA_DIR + "/feature_labels.csv")
    y = np.array(label_df['label-pass-fail']).astype(int)
    # Log scale time (second-to-last feature) — same transform as
    # first_trail_prediction; assumes strictly positive values.
    data[:, :, -2] = np.log(data[:, :, -2])
    pass_data = data[y == 0]
    fail_data = data[y == 1]
    if show:
        print('Student pass the course:', pass_data.shape)
        print('Student fail the course:', fail_data.shape)
    filna_all_data = dealing_missing_value(data)
    discretized_all_data = discretized_feature(clusters_list=[3, 3, 4, 3, 2, 2, 3], feature=filna_all_data, feature_list=feature_list)
    student_world = World(size=size, samples_trajectory=discretized_all_data)
    trajectories_all = feature_table_to_trajectories(student_world, discretized_all_data)
    student_pass_score = reward_each_student(pass_reward, trajectories_all)
    student_fail_score = reward_each_student(fail_reward, trajectories_all)
    X = np.column_stack((student_pass_score, student_fail_score))
    return X, y