# Repository page residue (not part of the script):
# author "tranhuonglan", commit message "first commit", commit e448441.
#!/usr/bin/env python
# coding: utf-8
import sys
import os
sys.path.append('../IRL-MOOC/')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.rnn_models import *
import argparse
import time
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what the original zero-argument call did.

    Returns:
        argparse.Namespace with the string attributes ``data_dir``,
        ``course_id``, ``meta_data`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory containing the raw clickstream data.")
    parser.add_argument('--course_id', type=str, default='dsp-002',
                        help="Course ID.")
    parser.add_argument('--meta_data', type=str, default='../IRL-MOOC/metadata/metadata.csv',
                        help="Meta data file to get total number of weeks")
    parser.add_argument('--output_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory where the results are written.")
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Parse the command line as soon as the script starts, so a bad option or
    # --help exits before anything else runs.
    # NOTE(review): main() calls parse_args() again and uses its own local
    # result; this module-level ``args`` is not read anywhere in the visible
    # code, so this early call looks redundant - confirm before removing.
    args = parse_args()
def main():
    """Train every baseline BiLSTM model on the course features and save the results.

    Steps, for each early-prediction level in ``early_predict``:
      1. Load the feature array for the course from ``--data_dir`` and the
         pass/fail labels from the course's labels CSV.
      2. Split into 80% train / 10% test / 10% validation, stratified on the
         labels, and save the test-set row indices to ``--output_dir``.
      3. Call every model function in ``rnn_models``, append its test and
         validation scores to the score tables, plot its history, and dump
         its loss history and the score tables under
         ``<output_dir>/run_history/``.

    Raises:
        ValueError: if the course id is not present in the metadata file.
    """
    week_type = 'eq_week'
    feature_types = []
    args = parse_args()
    data_dir = args.data_dir
    save_dir = args.output_dir
    course = args.course_id
    metadata = pd.read_csv(args.meta_data)
    rnn_models = [bidirectional_lstm_32_64, bidirectional_lstm_64, bidirectional_lstm_128, bidirectional_lstm_32]

    # Create the output tree once, up front: np.save() below writes into
    # save_dir before the first model has run, so the directory must already
    # exist at that point.
    run_dir = os.path.join(save_dir, 'run_history')
    os.makedirs(run_dir, exist_ok=True)

    save_name = course + '_baseline_model_' + week_type + '_bilstm'
    save_stats = os.path.join(run_dir, save_name + ".csv")
    save_val_stats = os.path.join(run_dir, save_name + "val.csv")

    score_columns = ['acc', 'bac', 'prec', 'rec', 'f1', 'auc', 'feature_type', 'week_type',
                     'course', 'model_name', 'data_balance', 'timestamp', 'percentile']
    experiment_scores = pd.DataFrame(columns=score_columns)
    val_exp_scores = pd.DataFrame(columns=score_columns)
    counter = 0

    early_predict = np.arange(4, 5)  # currently the single level 4
    epochs = 1

    # Course-level inputs do not depend on the prediction level, so they are
    # read once. The metadata lookup uses '-' in place of '_' in the course id.
    course_weeks = metadata.loc[metadata['course_id'] == course.replace('_', '-'), 'weeks']
    if course_weeks.empty:
        raise ValueError(f"course_id {course!r} not found in metadata file {args.meta_data!r}")
    total_weeks = course_weeks.values[0]
    labels = pd.read_csv(f'../IRL-MOOC/data/{course}/early-prediction_{course}_labels.csv')['label-pass-fail']

    for percentile in early_predict:
        # `percentile` is applied in tenths: level 4 -> 40% of the course weeks.
        num_weeks = int(np.round(total_weeks * percentile / 10))

        # The file name follows the selected course rather than a hard-coded
        # 'dsp-002', so --course_id is honoured (same name for the default).
        features_file = f'real-data-early-prediction_{course}_1to{percentile}_ver2.npy_features.npy'
        course_features = np.load(os.path.join(data_dir, features_file))
        print('course:', course, 'total_weeks:', total_weeks, 'num_weeks:', num_weeks, 'course_feature_shape:', course_features.shape, 'percentile', percentile)

        # Row indices travel through both splits so the test rows can be
        # traced back to the original feature array.
        indices = np.arange(course_features.shape[0])
        x_train, x_test_v, y_train, y_test_v, indices_train, indices_test_v = train_test_split(
            course_features, labels.values, indices, train_size=0.8, random_state=0, stratify=labels)
        x_test, x_val, y_test, y_val, indices_test, indices_val = train_test_split(
            x_test_v, y_test_v, indices_test_v, train_size=0.5, random_state=1, stratify=y_test_v)
        print('Train shape:', x_train.shape, 'Val shape:', x_val.shape, 'Test shape:', x_test.shape, len(indices_test), np.all(x_test[0] == course_features[indices_test[0]]))
        np.save(os.path.join(save_dir, f'test_students_{percentile}.npy'), indices_test)

        print('course: ', course)
        print('week_type: ', week_type)
        for model in rnn_models:
            # Epoch seconds with the last two characters of the float repr dropped.
            current_timestamp = str(time.time())[:-2]
            print(model.__name__, current_timestamp)
            history, scores, val_scores, best_model = model(
                x_train, y_train, x_test, y_test, x_val, y_val, week_type, feature_types,
                course, percentile, current_timestamp, num_epochs=epochs)
            experiment_scores.loc[counter] = scores
            val_exp_scores.loc[counter] = val_scores
            counter += 1

            run_name = 'baseline_' + course + model.__name__ + "_ep" + str(percentile) + "_" + current_timestamp
            run_prefix = os.path.join(run_dir, run_name)
            plot_history(history, run_prefix, counter)
            # presumably `history` is a Keras-style History object exposing a
            # `.history` dict - TODO confirm against utils.rnn_models
            numpy_loss_history = np.array(history.history['loss'])
            np.savetxt(run_prefix + "_loss_history.txt", numpy_loss_history, delimiter=",")

            # Rewrite the score tables after every model so partial results
            # survive an interrupted run.
            experiment_scores.to_csv(save_stats)
            val_exp_scores.to_csv(save_val_stats)
# Script entry point: run the training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()