| import numpy as np
|
| from sklearn.cluster import KMeans
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
| from datetime import datetime
|
| from tqdm import tqdm
|
| import pandas as pd
|
|
|
def filter_range_dates(df, start_date, end_date):
    """Keep only rows of ``df`` whose 'date' falls within [start_date, end_date].

    :param df: DataFrame with a 'date' column of datetime values
    :param start_date: inclusive lower bound, '%Y-%m-%d %H:%M:%S' string
    :param end_date: inclusive upper bound string, or NaN to leave the upper
        end open
    :return: the filtered DataFrame
    """
    df = df[df['date'] >= str2dt(start_date)]
    # pd.notna() recognises every NaN variant (np.nan, float('nan'), pd.NA),
    # unlike the fragile identity test `end_date is not np.nan`, which only
    # matched the np.nan singleton itself.
    if pd.notna(end_date):
        df = df[df['date'] <= str2dt(end_date)]
    return df
|
|
|
def tmp2dt(x):
    """Convert a POSIX timestamp to a naive UTC datetime, truncated to whole seconds.

    Replaces the old strftime/strptime string round-trip (which also relied on
    the deprecated ``datetime.utcfromtimestamp``) with a direct conversion;
    sub-second precision is dropped exactly as before.

    :param x: seconds since the epoch (int or float)
    :return: naive ``datetime`` in UTC with microsecond == 0
    """
    from datetime import timezone  # local import keeps file-level imports untouched
    return datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None, microsecond=0)
|
|
|
def str2dt(ts, format='%Y-%m-%d %H:%M:%S'):
    """Parse the timestamp string ``ts`` into a datetime using ``format``."""
    parsed = datetime.strptime(ts, format)
    return parsed
|
|
|
def dt2w(dt):
    """Return the zero-padded week-of-year string ('%W', Monday-first) of ``dt``."""
    week_str = format(dt, '%W')
    return week_str
|
|
|
def outliers_iqr(data, percentiles=(10, 90)):
    """Return (lower, upper) outlier bounds via a Tukey-fence rule.

    NOTE(review): despite the name, the original hard-coded the 10th/90th
    percentiles (not the classical 25th/75th quartiles). That behaviour is
    kept as the default but is now configurable, and the misleading
    ``quartile_*`` names are gone.

    :param data: array-like of numeric values
    :param percentiles: (low, high) percentiles defining the spread;
        default (10, 90) preserves the original behaviour
    :return: tuple (lower_bound, upper_bound)
    """
    q_low, q_high = np.percentile(data, list(percentiles))
    spread = q_high - q_low
    lower_bound = q_low - 1.5 * spread
    upper_bound = q_high + 1.5 * spread
    return lower_bound, upper_bound
|
|
|
|
|
def dealing_missing_value(data):
    """
    Replace NaNs with 0 and append a per-week missing-data indicator feature.

    Features 2 and 4 of the final week are zero-filled *before* the indicator
    is computed, so NaNs in those two slots do not count as missing.

    :param data: array of shape (num_student, num_week, num_feature)
    :return: array of shape (num_student, num_week, num_feature + 1); the
        extra feature is 1 if any remaining feature was NaN that week, else 0
    """
    filled = np.copy(data)
    for feat in (2, 4):
        filled[:, -1, feat] = np.nan_to_num(filled[:, -1, feat])
    has_missing = np.isnan(filled).any(axis=2).astype(int)
    filled = np.dstack((filled, has_missing))
    return np.nan_to_num(filled)
|
|
|
def discretized_feature(clusters_list, feature, feature_list, kmeans, show=False, discretized_by_percentile=False, mids=None):
    """
    discretized_feature: group each feature in feature_list into discrete bins
    and replace the float values by the bin index.

    Two modes:
      * percentile mode: 3 bins -- zero, (0, mid], and (mid, inf), where
        ``mids`` holds the per-feature split point
      * KMeans mode: the fitted ``kmeans[i]`` model assigns bins, relabelled
        so that bin indices increase with cluster-center value

    :param clusters_list: [n_feature] number of groups for each feature
    :param feature: array [n_student, n_week, n_feature]
    :param feature_list: [n_feature] feature names, for logging only
    :param kmeans: [n_feature] fitted KMeans models (ignored in percentile mode)
    :param show: print per-feature progress
    :param discretized_by_percentile: choose percentile mode over KMeans
    :param mids: [n_feature] split points for percentile mode (default: empty;
        was a mutable default argument before)
    :return: int array [n_student, n_week, n_feature] of bin indices
    """
    if mids is None:
        mids = []
    data_arr = np.transpose(feature, (2, 0, 1))
    # Work on a copy: np.transpose returns a *view*, so writing into it
    # (as the original `discretized_arr = data_arr` alias did) silently
    # mutated the caller's `feature` array in place.
    discretized_arr = data_arr.copy()
    if discretized_by_percentile:
        print('50 percentile of non zero value:', mids)
        print("Discretized by percentile")
    else:
        print("Discretized by KMeans")
    for idx, column in enumerate(feature_list):
        if show:
            print("Discritize feature in progress:", column)
            print('Number of groups:', clusters_list[idx])
        if discretized_by_percentile:
            X = data_arr[idx]
            y_pred = np.zeros(X.shape)
            y_pred[X == 0] = 0
            y_pred[(X > 0) & (X <= mids[idx])] = 1
            y_pred[X > mids[idx]] = 2
            discretized_arr[idx] = y_pred.astype(int)
        else:
            X = np.reshape(data_arr[idx], (-1, 1))
            y_pred = kmeans[idx].predict(X)
            # Relabel clusters so bin numbers are ordered by center value.
            map_group = dict(zip(np.argsort(kmeans[idx].cluster_centers_[:, 0]), np.arange(clusters_list[idx])))
            y_pred = np.array([map_group[x] for x in y_pred])
            discretized_arr[idx] = np.reshape(y_pred, (data_arr[idx].shape))
    return np.transpose(discretized_arr, (1, 2, 0)).astype(int)
|
|
|
def visualize_skewness(data):
    """
    visualize_skewness: show a 2x4 grid of histograms (with KDE) for every
    column of ``data``, restricted to strictly positive, non-NaN values.
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for idx, name in enumerate(data.columns):
        values = data[name].dropna()
        positive = values[values > 0]
        sns.histplot(positive, kde=True, ax=axes[idx // 4, idx % 4])
    plt.show()
|
|
|
def plot_distortion(n_clusters_list, X, title, ax, seed=0):
    """
    Plot the distortion (in-cluster variance) on the y-axis and
    the number of clusters on the x-axis.

    :param n_clusters_list: List of number of clusters to explore
    :param X: np array of data points
    :param title: subplot title
    :param ax: matplotlib axis to draw on
    :param seed: random_state forwarded to KMeans for reproducibility
    """
    distortion_list = [
        KMeans(n_clusters=k, random_state=seed).fit(X).inertia_
        for k in n_clusters_list
    ]
    ax.plot(n_clusters_list, distortion_list, 'o-')
    ax.set_ylabel('Distortion')
    ax.set_title(title)
|
|
|
def plot_all_distortion(feature_df):
    """
    Draw the distortion-vs-number-of-clusters curve for every feature column
    of ``feature_df`` in a 2x4 grid.
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for idx, name in enumerate(feature_df.columns):
        print("Features:", name)
        values = np.array(feature_df[name].dropna()).reshape(-1, 1)
        plot_distortion(range(2, 10), values, name, axes[idx // 4, idx % 4])
    plt.show()
|
|
|
def feature_table_to_trajectories(student_world, samples_trajectory):
    """
    Transform students' features into trajectories of fixed length.

    Actions: (0/1) indicate whether the student moved to a different state
    between consecutive weeks.
    States: (int) computed from the feature vector via
    ``student_world.state_point_to_index``.

    :param student_world: world object exposing ``state_point_to_index``
    :param samples_trajectory: [n_student, n_week, n_feature] feature array
    :return: np array of shape [n_student, n_week - 1, 3] where each row of a
        trajectory is a (state_from, action, state_to) tuple
    """
    trajectory_list = []
    n_student = samples_trajectory.shape[0]
    n_week = samples_trajectory.shape[1]
    for s in range(n_student):
        steps = []
        for w in range(n_week - 1):
            src = student_world.state_point_to_index(samples_trajectory[s, w])
            dst = student_world.state_point_to_index(samples_trajectory[s, w + 1])
            steps.append((src, int(src != dst), dst))
        trajectory_list.append(steps)
    return np.array(trajectory_list)
|
|
|
def data_to_trajectories(dataset, map_action, weeks=None, all=True, remove_empty=True):
    """
    Return a list of trajectories, each trajectory is a list of (state_from, action, state_to) tuples

    :param dataset: DataFrame where each row is one user; the 'event_id',
        'week' and 'action' cells each hold position-aligned sequences --
        presumably one entry per logged event (TODO confirm against caller)
    :param map_action: dict mapping action names (including the synthetic
        'Move To Quiz' / 'Move To Video' keys) to integer action ids
    :param weeks: weeks to keep when ``all`` is False
    :param all: keep every week when True (NOTE(review): shadows the builtin)
    :param remove_empty: drop users whose trajectory comes out empty
    """
    def user_to_trajectory(x, weeks):
        # Build one user's trajectory by scanning their event sequence.
        trajectory = []
        for i in range(len(x['event_id'])):
            if (all==True or x['week'][i] in weeks):
                if (i != 0 and x['event_id'][i] != x['event_id'][i-1]):
                    # Transition between two different events: emit a synthetic
                    # "move" step whose action depends on the target type.
                    action = map_action['Move To Quiz'] if 'Quiz' in x['action'][i] else map_action['Move To Video']
                    trajectory.append((x['event_id'][i - 1], action, x['event_id'][i]))
                # Self-loop step carrying the actual action at this event.
                trajectory.append([x['event_id'][i], map_action[x['action'][i]], x['event_id'][i]])
        return np.array(trajectory)
    trajectories = dataset[['event_id', 'week', 'action']].apply(lambda x: user_to_trajectory(x, weeks), axis=1)
    if remove_empty:
        trajectories = [x for x in trajectories if x.size != 0]
    assert len(trajectories) > 0
    # Logged for sanity-checking the spread of trajectory lengths.
    max_length = max([len(t) for t in trajectories])
    min_length = min([len(t) for t in trajectories])
    print('max length, min length', max_length, min_length)
    print(f'num of student in {weeks} week:', len(trajectories))
    print('----------------------')
    return trajectories
|
|
|
def trajectories_to_features(trajectories, world, path="", upper_bound=100, num_week=6, syn=True, save_to_disk=True):
    """
    Convert per-week (state, action) trajectories into fixed-size feature
    arrays, padding/truncating each week to ``upper_bound`` steps.

    Missing weeks (empty trajectories) are encoded as blocks of -1 sentinels.

    :param trajectories: when syn=True, a list indexed by student (for
        num_week == 1) or by student then week; when syn=False, a list of
        per-week DataFrames indexed via ``.iloc[student]``
    :param world: object exposing ``n_features`` and
        ``state_to_array(state, action)``
    :param path: prefix of the saved ``{path}_features.npy`` file
    :param upper_bound: maximum number of steps kept per week
    :param num_week: number of weeks per student
    :param syn: selects the trajectory container layout (see above)
    :param save_to_disk: persist the feature array to disk
    :return: np array [n_student, num_week * upper_bound, world.n_features]
    :raises ValueError: when a week's data cannot be sliced as [:, :2]
    """
    course_features, next_states_actions = [], []
    if syn:
        n_trajectory = len(trajectories)
    else:
        n_trajectory = len(trajectories[0])
        student = []
    print('num of student:', n_trajectory)
    for t in tqdm(range(n_trajectory)):
        array_features, array_next_states_actions = [], []
        if syn:
            if num_week == 1:
                student = [np.array(trajectories[t])] if len(trajectories[t]) else [[]]
            else:
                student = [np.array(trajectories[t][i]) if len(trajectories[t]) else [[]] for i in range(num_week)]
        else:
            student = [trajectories[i].iloc[t] for i in range(num_week)]
        for i in range(num_week):
            length = len(student[i])
            if length == 0:
                # Empty week: fill the whole block with -1 sentinels.
                array_features.append(-1 * np.ones((upper_bound, world.n_features)))
                array_next_states_actions.append(-1 * np.ones((upper_bound, 2)))
                continue
            if length > upper_bound:
                tmp = student[i][:upper_bound, :2]
            else:
                try:
                    tmp = student[i][:, :2]
                except (TypeError, IndexError) as e:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # unrelated failures are no longer swallowed; the original
                    # error is chained for debugging.
                    print('Error in student[i][:, :2], length:', length)
                    print(student[i])
                    raise ValueError("Invalid student data format") from e
            array_next_states_actions.append(np.pad(tmp,
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
            array_features.append(np.pad([world.state_to_array(state, action) for state, action in tmp],
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
        array_next_states_actions = np.concatenate(array_next_states_actions, axis=0)
        array_features = np.concatenate(array_features, axis=0)
        next_states_actions.append(array_next_states_actions)
        course_features.append(array_features)
    course_features = np.array(course_features)
    if save_to_disk:
        np.save(f'{path}_features.npy', course_features)
    return course_features
|
|
|
|
|
def whatif_values(df, schedule, map_event_id, problem_event):
    """
    return a dict-like table of whatif features per event:
    chapter: week/topic the video/quiz is introduced; when absent from the
        schedule it is approximated by the most frequent week the event occurs
    difficulty: 1 - mean(grade / grade_max) for each problem event
    duration: min-max normalised video length (videos only; NaNs get the mean)
    is_problem: whether the event id belongs to ``problem_event``

    NOTE(review): this function adds an 'event_id' column to the caller's
    ``schedule`` frame in place (original behaviour, kept).

    :param df: interactions with 'event_id', 'week', 'problem_grade' columns
    :param schedule: frame with 'id', 'chapter', 'type', 'duration' (and
        'grade_max' for problems)
    :param map_event_id: mapping from schedule 'id' to integer event_id
    :param problem_event: collection of event_ids that are problems
    :return: (event_value frame sorted by event_id, (max_len, min_len))
    """
    schedule['event_id'] = schedule['id'].apply(lambda x: map_event_id[x] if x in list(map_event_id.keys()) else None)
    schedule = schedule.drop_duplicates(subset='event_id', keep='first')

    # Fallback chapter per event: the mode of observed interaction weeks (+1).
    event_week = df.groupby('event_id')['week'].apply(lambda x: x.value_counts().idxmax() + 1)
    chapter_arr = []
    for i in range(len(map_event_id)):
        chapter_arr.append(schedule[schedule['event_id'] == i]['chapter'].values[0]
                           if i in schedule['event_id'].values else
                           event_week[i])

    df = df.merge(schedule, on='event_id', how='left')
    # Missing max grades fall back to the best grade observed for that event.
    df['grade_max'] = df['grade_max'].fillna(df.groupby('event_id')['problem_grade'].transform(lambda x: x.max()))

    df['difficulty'] = np.where(df['grade_max'] != 0, df['problem_grade'] / df['grade_max'], 1)

    problem_difficulty = df.groupby('event_id')['difficulty'].apply(lambda x: 1 - x.mean())

    # .copy() prevents SettingWithCopyWarning when normalising 'duration'.
    video_length = schedule[['event_id', 'duration']].copy()
    min_len, max_len = schedule[schedule['type'] == 'video']['duration'].min(), schedule[schedule['type'] == 'video']['duration'].max()
    video_length['duration'] = (video_length['duration'] - min_len) / (max_len - min_len)
    event_value = pd.DataFrame({})
    event_value['difficulty'] = problem_difficulty
    event_value['chapter'] = chapter_arr
    event_value = event_value.merge(video_length, on='event_id', how='left')
    # Plain assignment instead of chained `inplace=True` fillna: the chained
    # form is a no-op under pandas copy-on-write (pandas >= 3.0).
    event_value['duration'] = event_value['duration'].fillna(event_value['duration'].mean())
    event_value['is_problem'] = event_value['event_id'].apply(lambda x: x in problem_event)
    return event_value.sort_values('event_id'), (max_len, min_len)
|
|
|
| |