Spaces:

tranhuonglan
/

what-if-simulation-app

Sleeping

what-if-simulation-app / src /scripts /BiLSTM-train.py

tranhuonglan

first commit

e448441 6 months ago

4.49 kB

	#!/usr/bin/env python
	# coding: utf-8
	import sys
	import os
	sys.path.append('../IRL-MOOC/')

	import numpy as np
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from utils.rnn_models import *
	import argparse
	import time

	def parse_args(argv=None):
	    """Parse the command-line options of the BiLSTM training script.

	    Args:
	        argv: Optional list of argument strings to parse. When ``None``
	            (the default) argparse reads ``sys.argv[1:]``, so existing
	            ``parse_args()`` callers behave exactly as before.

	    Returns:
	        argparse.Namespace with the attributes ``data_dir``, ``course_id``,
	        ``meta_data`` and ``output_dir``.
	    """
	    parser = argparse.ArgumentParser(description="Process course data.")
	    parser.add_argument(
	        '--data_dir',
	        type=str,
	        default='../IRL-MOOC/results/whatif/dsp-002/',
	        help="Directory containing the raw clickstream data.",
	    )
	    parser.add_argument(
	        '--course_id',
	        type=str,
	        default='dsp-002',
	        help="Course ID.",
	    )
	    parser.add_argument(
	        '--meta_data',
	        type=str,
	        default='../IRL-MOOC/metadata/metadata.csv',
	        help="Meta data file to get total number of weeks",
	    )
	    parser.add_argument(
	        '--output_dir',
	        type=str,
	        default='../IRL-MOOC/results/whatif/dsp-002/',
	        help="Directory where test-split indices, run history and score CSVs are written.",
	    )
	    # Passing None makes argparse fall back to sys.argv[1:].
	    return parser.parse_args(argv)

	# Command-line arguments are parsed inside main(); parsing them here as
	# well would only produce a module-level value that nothing reads.

	def main():
	    """Train every BiLSTM baseline on the early-prediction features and save results.

	    For each early-prediction percentile the function:

	    1. loads the course feature array and the pass/fail labels,
	    2. makes a stratified 80/10/10 train/test/validation split and stores
	       the test-split row indices in ``<output_dir>/test_students_<p>.npy``,
	    3. trains every model in ``rnn_models`` and, per model, records test and
	       validation scores, a history plot and the loss curve under
	       ``<output_dir>/run_history/``.

	    The two score tables are rewritten after every trained model so that
	    results collected so far are on disk if a later model fails.

	    The model builders and ``plot_history`` are expected to come from the
	    star-import of ``utils.rnn_models`` at the top of the file.
	    """
	    week_type = 'eq_week'
	    feature_types = []
	    args = parse_args()
	    DATA_DIR = args.data_dir
	    SAVE_DIR = args.output_dir
	    course = args.course_id
	    metadata = pd.read_csv(args.meta_data)

	    # Create the whole output tree up front: np.save() below writes into
	    # SAVE_DIR before any model has run, and both score CSVs live in
	    # run_history/. exist_ok avoids the check-then-create race.
	    history_dir = f'{SAVE_DIR}/run_history/'
	    os.makedirs(history_dir, exist_ok=True)

	    rnn_models = [bidirectional_lstm_32_64, bidirectional_lstm_64, bidirectional_lstm_128, bidirectional_lstm_32]
	    save_name = 'run_history/' + course + '_baseline_model_' + week_type + '_bilstm'
	    save_stats = save_name + ".csv"
	    save_val_stats = save_name + "val.csv"
	    counter = 0
	    # Test and validation score tables share the same layout.
	    score_columns = ['acc', 'bac', 'prec', 'rec', 'f1', 'auc', 'feature_type', 'week_type', 'course', 'model_name', 'data_balance', 'timestamp', 'percentile']
	    experiment_scores = pd.DataFrame(columns=score_columns)
	    val_exp_scores = pd.DataFrame(columns=score_columns)
	    early_predict = np.arange(4, 5)  # currently a single setting: percentile 4
	    epochs = 1

	    # The course length does not depend on the percentile, so look it up once.
	    # Metadata stores course ids with dashes, hence the replace().
	    total_weeks = metadata[metadata['course_id'] == course.replace('_', '-')]['weeks'].values[0]

	    for percentile in early_predict:
	        num_weeks = int(np.round(total_weeks * percentile / 10))
	        labels = pd.read_csv(f'../IRL-MOOC/data/{course}/early-prediction_{course}_labels.csv')['label-pass-fail']
	        # NOTE(review): "dsp-002" is hard-coded in this file name although the
	        # labels above follow --course_id; confirm the naming of the feature
	        # files for other courses before parameterising it.
	        course_features = np.load(f'{DATA_DIR}/real-data-early-prediction_dsp-002_1to{percentile}_ver2.npy_features.npy')
	        print('course:', course, 'total_weeks:', total_weeks, 'num_weeks:', num_weeks, 'course_feature_shape:', course_features.shape, 'percentile', percentile)

	        # Carry the row indices through both splits so the test students can
	        # be identified later.
	        indices = np.arange(course_features.shape[0])
	        # 80% train, remaining 20% held out ...
	        x_train, x_test_v, y_train, y_test_v, indices_train, indices_test_v = train_test_split(course_features, labels.values, indices, train_size=0.8, random_state=0, stratify=labels)
	        # ... and split evenly into test and validation.
	        x_test, x_val, y_test, y_val, indices_test, indices_val = train_test_split(x_test_v, y_test_v, indices_test_v, train_size=0.5, random_state=1, stratify=y_test_v)

	        # The last printed value checks that indices_test really addresses the
	        # rows of x_test.
	        print('Train shape:', x_train.shape, 'Val shape:', x_val.shape, 'Test shape:', x_test.shape, len(indices_test), np.all(x_test[0] == course_features[indices_test[0]]))
	        np.save(f'{SAVE_DIR}/test_students_{percentile}.npy', indices_test)
	        print('course: ', course)
	        print('week_type: ', week_type)

	        for model in rnn_models:
	            # Timestamp string with its last two characters dropped; used to
	            # tag this run's artefacts.
	            current_timestamp = str(time.time())[:-2]
	            print(model.__name__, current_timestamp)
	            history, scores, val_scores, best_model = model(x_train, y_train, x_test, y_test, x_val, y_val, week_type, feature_types, course, percentile, current_timestamp, num_epochs=epochs)
	            experiment_scores.loc[counter] = scores
	            val_exp_scores.loc[counter] = val_scores
	            counter += 1

	            run_name = 'baseline_' + course + model.__name__ + "_ep" + str(percentile) + "_" + current_timestamp
	            plot_history(history, history_dir + run_name, counter)
	            numpy_loss_history = np.array(history.history['loss'])
	            np.savetxt(history_dir + run_name + "_loss_history.txt", numpy_loss_history, delimiter=",")

	            # Rewrite both tables after each model so partial results survive
	            # a failure in a later run.
	            experiment_scores.to_csv(f'{SAVE_DIR}/' + save_stats)
	            val_exp_scores.to_csv(f'{SAVE_DIR}/' + save_val_stats)

	# Run the training sweep only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    main()