File size: 4,488 Bytes
e448441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# coding: utf-8
import sys
import os
sys.path.append('../IRL-MOOC/')

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.rnn_models import *
import argparse
import time 

def parse_args():
    """Parse this script's command-line options.

    Every option is a string with a default, so the script can be run
    without any arguments.

    Returns:
        argparse.Namespace: exposes ``data_dir``, ``course_id``,
        ``meta_data`` and ``output_dir``.
    """
    # (flag, default value, help text); ``None`` means "no help text".
    option_table = (
        ('--data_dir', '../IRL-MOOC/results/whatif/dsp-002/',
         "Directory containing the raw clickstream data."),
        ('--course_id', 'dsp-002',
         "Course ID."),
        ('--meta_data', '../IRL-MOOC/metadata/metadata.csv',
         "Meta data file to get total number of weeks"),
        ('--output_dir', '../IRL-MOOC/results/whatif/dsp-002/',
         None),
    )

    cli = argparse.ArgumentParser(description="Process course data.")
    for flag, fallback, help_text in option_table:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)
    return cli.parse_args()

# NOTE: command-line arguments are parsed inside main(); the script entry
# point is the `if __name__ == '__main__'` guard at the bottom of this file,
# so nothing needs to run here.

def main():
    """Run the baseline BiLSTM experiments for one course and save the results.

    Steps, for each early-prediction percentile in ``early_predict``:

    1. Load the course feature array and the pass/fail labels.
    2. Split the data 80/10/10 into train/test/validation sets, stratified
       on the labels, and save the row indices of the test set.
    3. Call every model function listed in ``rnn_models`` (imported from
       ``utils.rnn_models``) and record the test and validation scores it
       returns.
    4. Save the loss history and a history plot for each run, and rewrite
       the two score CSV files after every run so partial results survive
       an interrupted experiment.

    All settings come from the command line (see ``parse_args``).

    Raises:
        ValueError: if the course is not listed in the metadata file.
    """
    week_type = 'eq_week'
    feature_types = []
    args = parse_args()
    data_dir = args.data_dir
    save_dir = args.output_dir
    course = args.course_id
    metadata = pd.read_csv(args.meta_data)

    rnn_models = [bidirectional_lstm_32_64, bidirectional_lstm_64, bidirectional_lstm_128, bidirectional_lstm_32]
    save_name = 'run_history/' + course + '_baseline_model_' + week_type + '_bilstm'
    save_stats = save_name + ".csv"
    save_val_stats = save_name + "val.csv"
    score_columns = ['acc', 'bac', 'prec', 'rec', 'f1', 'auc', 'feature_type', 'week_type', 'course',
                     'model_name', 'data_balance', 'timestamp', 'percentile']
    experiment_scores = pd.DataFrame(columns=score_columns)
    val_exp_scores = pd.DataFrame(columns=score_columns)
    early_predict = np.arange(4, 5)
    epochs = 1

    # Create the output tree up front: the test-index file is written to
    # save_dir before any model runs, so the directory must already exist.
    run_history_dir = f'{save_dir}/run_history/'
    os.makedirs(run_history_dir, exist_ok=True)

    # Loop-invariant inputs: the course length and the labels do not depend
    # on the percentile, so read them once.
    metadata_course_id = course.replace('_', '-')  # metadata is looked up with dashes
    course_weeks = metadata.loc[metadata['course_id'] == metadata_course_id, 'weeks']
    if course_weeks.empty:
        raise ValueError(
            f"Course '{metadata_course_id}' not found in metadata file '{args.meta_data}'"
        )
    total_weeks = course_weeks.values[0]
    labels = pd.read_csv(f'../IRL-MOOC/data/{course}/early-prediction_{course}_labels.csv')['label-pass-fail']

    counter = 0  # row index into the score tables, one row per model run
    for percentile in early_predict:
        num_weeks = int(np.round(total_weeks * percentile / 10))
        # Use the requested course id in the file name (it used to be
        # hard-coded to 'dsp-002', which ignored --course_id).
        course_features = np.load(
            f'{data_dir}/real-data-early-prediction_{course}_1to{percentile}_ver2.npy_features.npy'
        )
        print('course:', course, 'total_weeks:', total_weeks, 'num_weeks:', num_weeks, 'course_feature_shape:', course_features.shape, 'percentile', percentile)

        # The row indices are split alongside the data so the held-out
        # students can be identified later.
        indices = np.arange(course_features.shape[0])
        # First split: 80% train, 20% held out.
        x_train, x_test_v, y_train, y_test_v, indices_train, indices_test_v = train_test_split(
            course_features, labels.values, indices,
            train_size=0.8, random_state=0, stratify=labels)
        # Second split: the held-out 20% is halved into test and validation.
        x_test, x_val, y_test, y_val, indices_test, indices_val = train_test_split(
            x_test_v, y_test_v, indices_test_v,
            train_size=0.5, random_state=1, stratify=y_test_v)

        # The trailing boolean is a sanity check that indices_test really
        # points at the rows that ended up in x_test.
        print('Train shape:', x_train.shape, 'Val shape:', x_val.shape, 'Test shape:', x_test.shape, len(indices_test), np.all(x_test[0] == course_features[indices_test[0]]))
        np.save(f'{save_dir}/test_students_{percentile}.npy', indices_test)
        print('course: ', course)
        print('week_type: ', week_type)

        for model in rnn_models:
            current_timestamp = str(time.time())[:-2]
            print(model.__name__, current_timestamp)
            history, scores, val_scores, best_model = model(
                x_train, y_train, x_test, y_test, x_val, y_val,
                week_type, feature_types, course, percentile, current_timestamp,
                num_epochs=epochs)
            experiment_scores.loc[counter] = scores
            val_exp_scores.loc[counter] = val_scores
            counter += 1

            run_name = 'baseline_' + course + model.__name__ + "_ep" + str(percentile) + "_" + current_timestamp
            plot_history(history, run_history_dir + run_name, counter)
            numpy_loss_history = np.array(history.history['loss'])
            np.savetxt(run_history_dir + run_name + "_loss_history.txt", numpy_loss_history, delimiter=",")

            # Rewrite the score tables after every run so partial results
            # are kept if a later model fails.
            experiment_scores.to_csv(f'{save_dir}/' + save_stats)
            val_exp_scores.to_csv(f'{save_dir}/' + save_val_stats)

# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()