"""Baseline BiLSTM training script for early pass/fail prediction on one course.

Loads precomputed course features and pass/fail labels, trains each of the
bidirectional-LSTM model builders exposed by ``utils.rnn_models``, and writes
scores, loss histories and the held-out test indices to the output directory.
"""
import sys
import os

# Make the IRL-MOOC checkout importable; this must run before the
# ``utils.rnn_models`` import below.
sys.path.append('../IRL-MOOC/')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.rnn_models import *

# Kept after the star import on purpose: if ``utils.rnn_models`` happens to
# export names called ``argparse`` or ``time``, these module bindings win.
import argparse
import time
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument call.

    Returns:
        Namespace with the attributes ``data_dir``, ``course_id``,
        ``meta_data`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory containing the raw clickstream data.")
    parser.add_argument('--course_id', type=str, default='dsp-002',
                        help="Course ID.")
    parser.add_argument('--meta_data', type=str, default='../IRL-MOOC/metadata/metadata.csv',
                        help="Meta data file to get total number of weeks")
    parser.add_argument('--output_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory where results are written.")
    return parser.parse_args(argv)
# Parse the command line as soon as the script starts so that invalid options
# abort before any further work is done.
# NOTE(review): main() parses the arguments again into its own local variable,
# so this module-level ``args`` looks redundant - confirm nothing else relies
# on it before removing.
if __name__ == '__main__':
    args = parse_args()
def main() -> None:
    """Train and evaluate every baseline BiLSTM model for the selected course.

    Steps, per early-prediction percentile in ``early_predict``:

    1. Load the feature array for that percentile from ``--data_dir`` and the
       ``label-pass-fail`` column of the course labels CSV.
    2. Split the students 80/10/10 into train/test/validation sets
       (stratified on the label, fixed random seeds) and save the test
       indices to ``test_students_<percentile>.npy`` in ``--output_dir``.
    3. Call each model builder in ``rnn_models``; each call is expected to
       return ``(history, scores, val_scores, best_model)``.
    4. Record the score rows, plot the training history, dump the loss curve
       and rewrite the two score CSVs under ``<output_dir>/run_history/``.

    The model builders and ``plot_history`` are not defined in this file;
    presumably they come from the ``utils.rnn_models`` star import.
    """
    week_type = 'eq_week'
    feature_types = []

    args = parse_args()
    data_dir = args.data_dir
    save_dir = args.output_dir
    course = args.course_id
    metadata = pd.read_csv(args.meta_data)

    rnn_models = [bidirectional_lstm_32_64, bidirectional_lstm_64, bidirectional_lstm_128, bidirectional_lstm_32]

    save_name = 'run_history/' + course + '_baseline_model_' + week_type + '_bilstm'
    save_stats = save_name + ".csv"
    save_val_stats = save_name + "val.csv"

    # Both result tables share the same schema, so define it once.
    score_columns = ['acc', 'bac', 'prec', 'rec', 'f1', 'auc', 'feature_type', 'week_type',
                     'course', 'model_name', 'data_balance', 'timestamp', 'percentile']
    experiment_scores = pd.DataFrame(columns=score_columns)
    val_exp_scores = pd.DataFrame(columns=score_columns)
    counter = 0

    early_predict = np.arange(4, 5)
    epochs = 1

    # Create the output tree up front: the test indices are written to
    # save_dir before any model has run, so the directory must already exist.
    run_history_dir = f'{save_dir}/run_history/'
    os.makedirs(run_history_dir, exist_ok=True)

    # Neither the course length nor the labels depend on the percentile, so
    # they are read once instead of on every loop iteration.
    # Metadata course ids use hyphens, hence the underscore normalisation.
    total_weeks = metadata[metadata['course_id'] == course.replace('_', '-')]['weeks'].values[0]
    labels = pd.read_csv(f'../IRL-MOOC/data/{course}/early-prediction_{course}_labels.csv')['label-pass-fail']

    for percentile in early_predict:
        num_weeks = int(np.round(total_weeks * percentile / 10))

        # The file name follows the selected course (it used to be hard-coded
        # to 'dsp-002', which paired other courses' labels with dsp-002
        # features whenever --course_id was changed).
        course_features = np.load(
            f'{data_dir}/real-data-early-prediction_{course}_1to{percentile}_ver2.npy_features.npy')
        print('course:', course, 'total_weeks:', total_weeks, 'num_weeks:', num_weeks, 'course_feature_shape:', course_features.shape, 'percentile', percentile)

        # Carry the row indices through both splits so the students that end
        # up in the test set can be identified afterwards.
        indices = np.arange(course_features.shape[0])
        x_train, x_test_v, y_train, y_test_v, indices_train, indices_test_v = train_test_split(
            course_features, labels.values, indices,
            train_size=0.8, random_state=0, stratify=labels)
        # Split the remaining 20% evenly into test and validation sets.
        x_test, x_val, y_test, y_val, indices_test, indices_val = train_test_split(
            x_test_v, y_test_v, indices_test_v,
            train_size=0.5, random_state=1, stratify=y_test_v)

        # The last printed value is a sanity check that the saved indices
        # really point at the test rows.
        print('Train shape:', x_train.shape, 'Val shape:', x_val.shape, 'Test shape:', x_test.shape, len(indices_test), np.all(x_test[0] == course_features[indices_test[0]]))
        np.save(f'{save_dir}/test_students_{percentile}.npy', indices_test)

        print('course: ', course)
        print('week_type: ', week_type)

        for model in rnn_models:
            # Second-resolution-ish timestamp string used to tag this run.
            current_timestamp = str(time.time())[:-2]
            print(model.__name__, current_timestamp)

            history, scores, val_scores, _best_model = model(
                x_train, y_train, x_test, y_test, x_val, y_val,
                week_type, feature_types, course, percentile,
                current_timestamp, num_epochs=epochs)

            experiment_scores.loc[counter] = scores
            val_exp_scores.loc[counter] = val_scores
            counter += 1

            run_name = 'baseline_' + course + model.__name__ + "_ep" + str(percentile) + "_" + current_timestamp
            # plot_history receives the already-incremented counter, as in the
            # original script.
            plot_history(history, run_history_dir + run_name, counter)

            numpy_loss_history = np.array(history.history['loss'])
            np.savetxt(run_history_dir + run_name + "_loss_history.txt", numpy_loss_history, delimiter=",")

            # Rewrite the score tables after every model so partial results
            # survive a crash in a later run.
            experiment_scores.to_csv(f'{save_dir}/' + save_stats)
            val_exp_scores.to_csv(f'{save_dir}/' + save_val_stats)
# Script entry point: run the full training pipeline only when the file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()