#!/usr/bin/env python
# coding: utf-8
"""Train baseline bidirectional-LSTM models on early-prediction course features.

For each early-prediction cut-off the script loads a pre-computed feature
array and the pass/fail labels of one course, makes a stratified
80/10/10 train/test/validation split, trains every model listed in
``rnn_models`` and writes the scores, loss histories and held-out test
indices below ``--output_dir``.
"""

import sys
import os

# Must run before the project-local ``utils`` import below so it can resolve.
sys.path.append('../IRL-MOOC/')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.rnn_models import *  # provides the bidirectional_lstm_* builders and plot_history
import argparse
import time

# Column layout shared by the test-score and validation-score tables.
_SCORE_COLUMNS = ['acc', 'bac', 'prec', 'rec', 'f1', 'auc', 'feature_type', 'week_type',
                  'course', 'model_name', 'data_balance', 'timestamp', 'percentile']


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the script.

    Returns:
        Namespace with ``data_dir``, ``course_id``, ``meta_data`` and
        ``output_dir`` attributes (all strings).
    """
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory containing the raw clickstream data.")
    parser.add_argument('--course_id', type=str, default='dsp-002', help="Course ID.")
    parser.add_argument('--meta_data', type=str, default='../IRL-MOOC/metadata/metadata.csv',
                        help="Meta data file to get total number of weeks")
    parser.add_argument('--output_dir', type=str, default='../IRL-MOOC/results/whatif/dsp-002/',
                        help="Directory where scores, loss histories and test indices are written.")
    return parser.parse_args()


def main() -> None:
    """Run the baseline BiLSTM experiment for the course given on the command line.

    Raises:
        ValueError: if the course id is not listed in the metadata file.
    """
    week_type = 'eq_week'
    feature_types = []

    args = parse_args()
    data_dir = args.data_dir
    save_dir = args.output_dir
    course = args.course_id
    metadata = pd.read_csv(args.meta_data)

    rnn_models = [bidirectional_lstm_32_64, bidirectional_lstm_64,
                  bidirectional_lstm_128, bidirectional_lstm_32]

    save_name = 'run_history/' + course + '_baseline_model_' + week_type + '_bilstm'
    save_stats = save_name + ".csv"
    save_val_stats = save_name + "val.csv"

    # Create the output tree once, up front: the test-index file is written
    # into save_dir before any model runs, so the directory has to exist by
    # then. exist_ok avoids the check-then-create race.
    run_history_dir = f'{save_dir}/run_history/'
    os.makedirs(run_history_dir, exist_ok=True)

    counter = 0
    experiment_scores = pd.DataFrame(columns=_SCORE_COLUMNS)
    val_exp_scores = pd.DataFrame(columns=_SCORE_COLUMNS)

    # Cut-offs are expressed in tenths of the course length (4 -> first 40%).
    early_predict = np.arange(4, 5)
    epochs = 1

    # Neither the course length nor the labels depend on the cut-off, so they
    # are loaded once instead of on every loop iteration.
    course_rows = metadata[metadata['course_id'] == course.replace('_', '-')]
    if course_rows.empty:
        raise ValueError(f"Course '{course}' not found in metadata file {args.meta_data}")
    total_weeks = course_rows['weeks'].values[0]
    labels = pd.read_csv(f'../IRL-MOOC/data/{course}/early-prediction_{course}_labels.csv')['label-pass-fail']

    for percentile in early_predict:
        num_weeks = int(np.round(total_weeks * percentile / 10))

        # The file name follows the selected course (previously hard-coded to
        # 'dsp-002', which only matched the default --course_id).
        course_features = np.load(
            f'{data_dir}/real-data-early-prediction_{course}_1to{percentile}_ver2.npy_features.npy')
        print('course:', course, 'total_weeks:', total_weeks, 'num_weeks:', num_weeks,
              'course_feature_shape:', course_features.shape, 'percentile', percentile)

        # Row indices travel through both splits so the held-out test rows can
        # be mapped back to the original feature array.
        indices = np.arange(course_features.shape[0])
        x_train, x_test_v, y_train, y_test_v, _indices_train, indices_test_v = train_test_split(
            course_features, labels.values, indices,
            train_size=0.8, random_state=0, stratify=labels)
        x_test, x_val, y_test, y_val, indices_test, _indices_val = train_test_split(
            x_test_v, y_test_v, indices_test_v,
            train_size=0.5, random_state=1, stratify=y_test_v)
        print('Train shape:', x_train.shape, 'Val shape:', x_val.shape, 'Test shape:', x_test.shape,
              len(indices_test), np.all(x_test[0] == course_features[indices_test[0]]))
        np.save(f'{save_dir}/test_students_{percentile}.npy', indices_test)

        print('course: ', course)
        print('week_type: ', week_type)

        for model in rnn_models:
            # Timestamp string with its last two characters trimmed; it is
            # part of the run name, so the format is kept unchanged.
            current_timestamp = str(time.time())[:-2]
            print(model.__name__, current_timestamp)

            history, scores, val_scores, _best_model = model(
                x_train, y_train, x_test, y_test, x_val, y_val,
                week_type, feature_types, course, percentile, current_timestamp,
                num_epochs=epochs)
            experiment_scores.loc[counter] = scores
            val_exp_scores.loc[counter] = val_scores
            counter += 1

            run_name = 'baseline_' + course + model.__name__ + "_ep" + str(percentile) + "_" + current_timestamp
            plot_history(history, run_history_dir + run_name, counter)
            numpy_loss_history = np.array(history.history['loss'])
            np.savetxt(run_history_dir + run_name + "_loss_history.txt", numpy_loss_history, delimiter=",")

            # Rewritten after every model so partial results survive a crash.
            experiment_scores.to_csv(f'{save_dir}/' + save_stats)
            val_exp_scores.to_csv(f'{save_dir}/' + save_val_stats)


if __name__ == '__main__':
    main()