# Stray VCS metadata (author / commit message / short hash) — commented out
# so the module parses as valid Python.
# tranhuonglan
# first commit
# e448441
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from utils.data_helper import *
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
import os
from sklearn.metrics import balanced_accuracy_score as bac
def pred(model, x_test, y_test):
    """Threshold the model's scores into pass/fail labels and report the
    balanced accuracy against the ground-truth labels ``y_test``.

    The model's output is read as a 2-D array whose column 0 holds a
    probability-like score; >= 0.5 means predicted fail (1), else pass (0).
    Returns the balanced-accuracy score, or None when the prediction and
    label lengths disagree.
    """
    raw_scores = model.predict(x_test)
    predicted = np.where(np.asarray([row[0] for row in raw_scores]) >= 0.5, 1, 0)
    print('num students:', len(predicted))
    score = None
    if len(predicted) == len(y_test):
        score = bac(y_test, predicted)
        print('BAC score:', score)
        # ConfusionMatrixDisplay(confusion_matrix(y_test, predicted)).plot()
        # Breakdown by ground truth: correctly flagged fails / all fails,
        # correctly flagged passes / all passes. (The boolean masks require
        # matching lengths, hence inside this guard.)
        print(f'num fail: {sum(predicted[y_test==1])}/{sum(y_test)}')
        print(f'num pass: {sum(predicted[y_test==0] == 0)}/{sum(y_test==0)}')
    return score
def pred_pass_fail(model, x_test, label=None):
    """Predict pass/fail for every row of ``x_test`` and summarise counts.

    Parameters
    ----------
    model : object exposing ``predict(x) -> array of shape (n, 1)`` scores;
        column 0 is thresholded at 0.5 (1 = fail, 0 = pass).
    x_test : features forwarded unchanged to ``model.predict``.
    label : optional ground-truth class shared by ALL rows (0 = pass,
        1 = fail); when given, the fraction of predictions matching it is
        reported as the score.

    Returns
    -------
    dict with keys ``'score'`` (float or None), ``'num fail'``, ``'num pass'``.
    """
    raw_scores = model.predict(x_test)
    y_pred = np.array([1 if row[0] >= 0.5 else 0 for row in raw_scores])
    n = len(y_pred)
    num_fail = int(np.sum(y_pred))
    num_pass = n - num_fail
    print('num students:', n)
    print(f'num fail: {num_fail}/{n}')
    print(f'num pass: {num_pass}/{n}')
    score = None
    # Guard n > 0 so an empty batch doesn't raise ZeroDivisionError.
    if label == 0 and n:
        score = num_pass / n
    elif label == 1 and n:
        score = num_fail / n
    # Bug fix: the original used a placeholder-less f-string and printed
    # "Percentage correct:  None" even when no label was supplied.
    if score is not None:
        print('Percentage correct:', score)
    return {'score': score, 'num fail': num_fail, 'num pass': num_pass}
def pred_change_label(model, x_test_real, x_test_syn, y_test=None):
    """Count students whose predicted pass/fail label flips between the
    real features and the synthetic (new-content) features.

    Returns ``(pass_to_fail, fail_to_pass)`` counts. When ``y_test`` is
    given, the ground-truth pass/fail totals are also printed.
    """
    def _hard_labels(features):
        # Threshold column 0 of the model's score at 0.5: 1 = fail, 0 = pass.
        return np.array([1 if row[0] >= 0.5 else 0 for row in model.predict(features)])

    real_labels = _hard_labels(x_test_real)
    syn_labels = _hard_labels(x_test_syn)
    # Number of students whose prediction is unchanged.
    print(np.sum(real_labels == syn_labels))
    pass_to_fail = np.sum((real_labels == 0) & (syn_labels == 1))
    fail_to_pass = np.sum((real_labels == 1) & (syn_labels == 0))
    print('>>>>> Number of students predicted to be pass change to fail after introducing new content: ',
          pass_to_fail)
    print('>>>>> Number of students predicted to be fail change to pass after introducing new content: ',
          fail_to_pass)
    if y_test is not None:
        print(f'>>>>> Ground Truth, Num passing students: {np.sum(y_test==0)}, Num failing students: {np.sum(y_test==1)}')
    return pass_to_fail, fail_to_pass
def pred_change_percentage(model, x_test_real, x_test_syn, y_test=None):
    """Measure how much the model's predicted scores move when the real
    features are replaced by the synthetic (new-content) features.

    Parameters
    ----------
    model : object exposing ``predict(x) -> array of shape (n, 1)`` scores.
    x_test_real, x_test_syn : paired feature sets for the same students.
    y_test : unused; kept for interface compatibility with callers.

    Returns
    -------
    (mean_all, std_all, mean_low, std_low) — score deltas over all
    students, and restricted to students predicted to fail (score >= 0.5)
    on the real data.
    """
    y_pred_real = model.predict(x_test_real)
    y_pred_syn = model.predict(x_test_syn)
    y_pred_label_real = np.array([1 if y[0] >= 0.5 else 0 for y in y_pred_real])
    # Bug fix: the 0/1 label array was used directly as an index
    # (integer fancy-indexing, i.e. repeatedly selecting rows 0 and 1)
    # instead of boolean-masking the predicted-fail students.
    low_mask = y_pred_label_real.astype(bool)
    impact_low_performing = y_pred_syn[low_mask] - y_pred_real[low_mask]
    delta = y_pred_syn - y_pred_real
    print('>>>>> New assignment impact on average students: ', np.mean(delta), np.std(delta))
    print('>>>>> New assignment impact on predicted-low-performing students', np.mean(impact_low_performing), np.std(impact_low_performing))
    return np.mean(delta), np.std(delta), np.mean(impact_low_performing), np.std(impact_low_performing)
def first_trail_prediction(pass_reward, fail_reward, DATA_DIR="Y:/data/result/easy-fail/eq_week-marras_et_al-dsp_002",
                           size=np.array([3, 3, 4, 3, 2, 2, 3, 2]),
                           feature_list=["competency_strength", "competency_alignment", "competency_anticipation", "content_alignment",
                                         "content_anticipation", "student_speed", "student_shape"],
                           show=False):
    """Evaluate learned pass/fail reward functions on one course dataset.

    Loads per-student feature values and pass/fail labels from DATA_DIR,
    imputes and discretizes the features, converts them to trajectories,
    then saves a confusion matrix and four per-group reward histograms
    under ``results/<dataset-name>/``.

    NOTE(review): ``size`` (8 entries) and the hard-coded clusters_list
    (7 entries) disagree by one dimension — presumably World's state has
    one extra component; confirm against utils.data_helper.World.
    NOTE(review): the np.array/list defaults are shared across calls; they
    are only read here, so this is harmless but worth knowing.
    """
    features_2 = np.load(DATA_DIR + "/feature_values.npz")
    data = features_2['feature_values']
    label_df = pd.read_csv(DATA_DIR + "/feature_labels.csv")
    # 0 = pass, 1 = fail (see the y==0 / y==1 splits below).
    y = np.array(label_df['label-pass-fail']).astype(int)
    # Log scale time
    data[:, :, -2] = np.log(data[:, :, -2])
    pass_data = data[y==0]
    fail_data = data[y==1]
    if show:
        print('Student pass the course:', pass_data.shape)
        print('Student fail the course:', fail_data.shape)
    # Impute missing values per group and for the full cohort
    # (helper from utils.data_helper).
    filna_pass_data = dealing_missing_value(pass_data)
    filna_fail_data = dealing_missing_value(fail_data)
    filna_all_data = dealing_missing_value(data)
    # Discretize each feature into the given number of clusters.
    discretized_all_data = discretized_feature(clusters_list=[3, 3, 4, 3, 2, 2, 3], feature=filna_all_data, feature_list=feature_list)
    discretized_pass_data = discretized_feature(clusters_list=[3, 3, 4, 3, 2, 2, 3], feature=filna_pass_data, feature_list=feature_list)
    discretized_fail_data = discretized_feature(clusters_list=[3, 3, 4, 3, 2, 2, 3], feature=filna_fail_data, feature_list=feature_list)
    # World built from the passing students' trajectories; used to map all
    # three discretized tables into state trajectories.
    student_world = World(size=size, samples_trajectory=discretized_pass_data)
    trajectories_pass = feature_table_to_trajectories(student_world, discretized_pass_data)
    trajectories_fail = feature_table_to_trajectories(student_world, discretized_fail_data)
    trajectories_all = feature_table_to_trajectories(student_world, discretized_all_data)
    save_dir = f"results/{DATA_DIR.split('/')[-1]}"
    # Check if the directory exists, and create it if it doesn't
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    plot_confusion_matrix(pass_reward, fail_reward, trajectories_all, y, save_dir)
    # Histogram each reward function over each student group (2x2 grid of plots).
    reward_each_student(pass_reward, trajectories_pass, 'Pass Reward, Pass Students', save_dir)
    reward_each_student(pass_reward, trajectories_fail, 'Pass Reward, Fail Students', save_dir)
    reward_each_student(fail_reward, trajectories_pass, 'Fail Reward, Pass Students', save_dir)
    reward_each_student(fail_reward, trajectories_fail, 'Fail Reward, Fail Students', save_dir)
def reward_each_student(reward, trajectories, title=None, path=None):
    """Sum each trajectory's per-state reward and plot a histogram.

    ``reward`` is indexed as ``reward[state_id][0]`` where ``step[0]`` is
    the state id of each trajectory step. When ``path`` is given the
    histogram is saved as ``<path>/<title>.jpg`` and the figure closed.
    Returns the list of per-student reward totals.
    """
    # Total reward accumulated along each student's trajectory.
    totals = [sum(reward[step[0]][0] for step in trajectory)
              for trajectory in trajectories]
    sns.histplot(totals)
    plt.title(title)
    if path:
        plt.savefig(path + f'/{title}.jpg')
        plt.close()
    return totals
def plot_confusion_matrix(pass_reward, fail_reward, trajectories_all, y, path):
    """Plot and save a row-normalized confusion matrix plus balanced
    accuracy for reward-based pass/fail predictions.

    NOTE(review): the module-level ``pred_pass_fail`` defined above takes
    (model, x_test, label) and returns a dict, but this call passes
    (pass_reward, fail_reward, trajectories_all) and then treats the
    result as a per-student label array — presumably a different
    ``pred_pass_fail`` (e.g. from utils.data_helper, shadowed by the local
    definition) was intended. TODO confirm against utils.data_helper.
    """
    pred_label = pred_pass_fail(pass_reward, fail_reward, trajectories_all)
    y_test = np.array(y)
    pred_label = np.array(pred_label)
    # Predicted pass / predicted fail counts.
    print(np.sum(pred_label==0), np.sum(pred_label==1))
    # With normalize='true' these are per-true-class RATES, not raw counts.
    tn, fp, fn, tp = confusion_matrix(y_test, pred_label, normalize='true').ravel()
    print('tn, fp, fn, tp:', tn, fp, fn, tp)
    disp = ConfusionMatrixDisplay.from_predictions(
        y_test,
        pred_label,
        display_labels=['Pass', 'Fail'],
        cmap=plt.cm.Blues,
        normalize='true'
    )
    print('Balanced Accuracy Score:', balanced_accuracy_score(y_test, pred_label))
    if path:
        plt.savefig(path + f'/confusion_matrix.jpg')
        plt.close()
def create_attributes(DATA_DIR, pass_reward, fail_reward,
                      size=np.array([3, 3, 4, 3, 2, 2, 3, 2]),
                      feature_list=["competency_strength", "competency_alignment", "competency_anticipation", "content_alignment",
                                    "content_anticipation", "student_speed", "student_shape"],
                      show=True):
    """Score every student under both reward functions.

    Loads the dataset from DATA_DIR, imputes and discretizes the
    features, converts them to trajectories, and evaluates each student
    under the pass and fail reward functions.

    Returns
    -------
    (X, y) where X has shape (#students, 2) with columns
    (pass_reward score, fail_reward score) and y holds the 0/1
    pass/fail ground-truth labels.
    """
    archive = np.load(DATA_DIR + "/feature_values.npz")
    feature_values = archive['feature_values']
    label_df = pd.read_csv(DATA_DIR + "/feature_labels.csv")
    y = np.array(label_df['label-pass-fail']).astype(int)
    # Log-scale the time feature (second-to-last feature dimension).
    feature_values[:, :, -2] = np.log(feature_values[:, :, -2])
    if show:
        print('Student pass the course:', feature_values[y==0].shape)
        print('Student fail the course:', feature_values[y==1].shape)
    imputed = dealing_missing_value(feature_values)
    discretized = discretized_feature(clusters_list=[3, 3, 4, 3, 2, 2, 3], feature=imputed, feature_list=feature_list)
    world = World(size=size, samples_trajectory=discretized)
    trajectories = feature_table_to_trajectories(world, discretized)
    pass_scores = reward_each_student(pass_reward, trajectories)
    fail_scores = reward_each_student(fail_reward, trajectories)
    X = np.column_stack((pass_scores, fail_scores))
    return X, y