# what-if-simulation-app / src / utils / data_helper.py
# (original commit e448441, "first commit", by tranhuonglan)
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm import tqdm
import pandas as pd
def filter_range_dates(df, start_date, end_date):
    """
    Keep only rows of *df* whose 'date' column falls in [start_date, end_date].

    :param df: DataFrame with a datetime-comparable 'date' column
    :param start_date: inclusive lower bound, '%Y-%m-%d %H:%M:%S' string
    :param end_date: inclusive upper bound string, or a missing value
        (NaN/None/NaT) to leave the range open-ended on the right
    :return: the filtered DataFrame
    """
    df = df[df['date'] >= str2dt(start_date)]
    # pd.isna covers None, float('nan') and NaT; the original identity test
    # `end_date is not np.nan` only matched the np.nan singleton object.
    if not pd.isna(end_date):
        df = df[df['date'] <= str2dt(end_date)]
    return df
def tmp2dt(x):
    """
    Convert a POSIX timestamp to a naive UTC datetime truncated to whole seconds.

    The original implementation formatted the datetime to a string and re-parsed
    it (via str2dt) purely to drop microseconds, using the deprecated
    datetime.utcfromtimestamp. This does the same truncation directly.

    :param x: POSIX timestamp (int or float seconds since the epoch)
    :return: naive datetime in UTC with microsecond == 0
    """
    from datetime import timezone  # local: module header imports only `datetime`
    return datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None, microsecond=0)
def str2dt(ts, format='%Y-%m-%d %H:%M:%S'):
    """Parse the timestamp string *ts* into a datetime according to *format*."""
    parsed = datetime.strptime(ts, format)
    return parsed
def dt2w(dt):
    """Return the zero-padded week-of-year ('%W', Monday as first day) for *dt*."""
    return f"{dt:%W}"
def outliers_iqr(data):
    """
    Compute outlier fences using a 1.5x rule on the 10th/90th percentile spread.

    NOTE: despite the name, the spread here is p90 - p10 (not the classic
    quartile-based IQR); behavior kept exactly as written.

    :param data: array-like of numeric values
    :return: (lower_bound, upper_bound) fence values
    """
    p10, p90 = np.percentile(data, [10, 90])
    spread = p90 - p10
    return p10 - 1.5 * spread, p90 + 1.5 * spread
def dealing_missing_value(data):
    """
    Fill NaNs with 0 and append a per-week missing-value indicator feature.

    Feature indices 2 and 4 (content-anticipation and competency-anticipation,
    per the original comments) are structurally absent in the last week, so they
    are zeroed first and do NOT count toward the missing-week flag. The appended
    column is 1 when the week still contained any NaN, 0 otherwise; every
    remaining NaN is then replaced by 0.

    :param data: np.ndarray of shape (num_student, num_week, num_feature)
    :return: np.ndarray of shape (num_student, num_week, num_feature + 1)
    """
    filled = np.copy(data)
    # Zero the always-missing last-week anticipation features before masking.
    for feat_idx in (2, 4):
        filled[:, -1, feat_idx] = np.nan_to_num(filled[:, -1, feat_idx])
    # Per (student, week): is any feature value still missing?
    week_has_nan = np.isnan(filled).any(axis=2)
    # Append the 0/1 indicator as an extra feature column, then fill the rest.
    filled = np.dstack((filled, week_has_nan.astype(int)))
    return np.nan_to_num(filled)
def discretized_feature(clusters_list, feature, feature_list, kmeans, show=False, discretized_by_percentile=False, mids=None):
    """
    Discretize each feature into integer groups, either by fitted KMeans models
    or by a 3-bucket percentile rule (0 / (0, mid] / > mid).

    Fixes vs. original: the output buffer is now a real copy, so the caller's
    `feature` array is no longer mutated through the transposed view; the
    mutable default `mids=[]` is replaced with None.

    :param clusters_list: [n_feature] number of groups per feature
    :param feature: np.ndarray [n_student, n_week, n_feature]
    :param feature_list: [n_feature] feature names (for progress output only)
    :param kmeans: [n_feature] fitted KMeans models (ignored in percentile mode)
    :param show: print per-feature progress information
    :param discretized_by_percentile: use the 3-bucket percentile rule instead of KMeans
    :param mids: [n_feature] bucket midpoints; required in percentile mode
    :return: int np.ndarray [n_student, n_week, n_feature] of group labels
    """
    mids = [] if mids is None else mids
    data_arr = np.transpose(feature, (2, 0, 1))
    # np.transpose returns a VIEW; copy so writes below don't clobber `feature`.
    discretized_arr = data_arr.copy()
    if discretized_by_percentile:
        print('50 percentile of non zero value:', mids)
        print("Discretized by percentile")
    else:
        print("Discretized by KMeans")
    for idx, column in enumerate(feature_list):
        if show:
            print("Discritize feature in progress:", column)
            print('Number of groups:', clusters_list[idx])
        if discretized_by_percentile:
            X = data_arr[idx]
            y_pred = np.zeros(X.shape)
            y_pred[X == 0] = 0  # explicit for readability (zeros-init already covers it)
            y_pred[(X > 0) & (X <= mids[idx])] = 1
            y_pred[X > mids[idx]] = 2
            discretized_arr[idx] = y_pred.astype(int)
        else:
            X = np.reshape(data_arr[idx], (-1, 1))
            y_pred = kmeans[idx].predict(X)
            # Relabel clusters so group numbers increase with the cluster center value.
            map_group = dict(zip(np.argsort(kmeans[idx].cluster_centers_[:, 0]), np.arange(clusters_list[idx])))
            y_pred = np.array([map_group[x] for x in y_pred])
            discretized_arr[idx] = np.reshape(y_pred, data_arr[idx].shape)
    return np.transpose(discretized_arr, (1, 2, 0)).astype(int)
def visualize_skewness(data):
    """
    Plot a histogram (with KDE) of the strictly-positive values of every column
    of *data* on a 2x4 grid of axes.

    :param data: presumably a DataFrame of numeric feature columns — the code
        only relies on .columns and per-column .dropna()
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for i, col in enumerate(data.columns):
        values = data[col].dropna()
        positive = values[values > 0]
        sns.histplot(positive, kde=True, ax=axes[i // 4, i % 4])
    plt.show()
def plot_distortion(n_clusters_list, X, title, ax, seed=0):
    """
    Plot KMeans distortion (inertia, i.e. in-cluster variance) against the
    number of clusters, for elbow-method inspection.

    :param n_clusters_list: iterable of cluster counts to try
    :param X: np array of data points, shape (n_samples, n_features)
    :param title: title for the axes
    :param ax: matplotlib axes to draw on
    :param seed: random_state forwarded to KMeans for reproducibility
    """
    distortions = [
        KMeans(n_clusters=k, random_state=seed).fit(X).inertia_
        for k in n_clusters_list
    ]
    ax.plot(n_clusters_list, distortions, 'o-')
    ax.set_ylabel('Distortion')
    ax.set_title(title)
def plot_all_distortion(feature_df):
    """
    Draw the KMeans distortion curve (k = 2..9) for every column of
    *feature_df* on a 2x4 grid, via plot_distortion.
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for i, col in enumerate(feature_df.columns):
        print("Features:", col)
        # Drop NaNs and reshape to the (n_samples, 1) layout KMeans expects.
        X = np.array(feature_df[col].dropna()).reshape(-1, 1)
        plot_distortion(range(2, 10), X, col, axes[i // 4, i % 4])
    plt.show()
def feature_table_to_trajectories(student_world, samples_trajectory):
    """
    Transform students' weekly feature vectors into fixed-length trajectories.

    Each consecutive pair of weeks becomes one (state_from, action, state_to)
    step: states come from student_world.state_point_to_index on the week's
    feature vector, and action is 1 when the state changes, 0 when it stays.

    :param student_world: world object exposing state_point_to_index(point)
    :param samples_trajectory: array [n_student, n_week, n_feature]
    :return: np.array of shape [n_student, n_week - 1, 3]
    """
    n_students = samples_trajectory.shape[0]
    n_weeks = samples_trajectory.shape[1]
    trajectory_list = []
    for s in range(n_students):
        steps = []
        for w in range(n_weeks - 1):
            state_from = student_world.state_point_to_index(samples_trajectory[s, w])
            state_to = student_world.state_point_to_index(samples_trajectory[s, w + 1])
            # Action encodes whether the student moved to a different state.
            steps.append((state_from, int(state_from != state_to), state_to))
        trajectory_list.append(steps)
    return np.array(trajectory_list)
def data_to_trajectories(dataset, map_action, weeks=None, all=True, remove_empty=True):
    """
    Return a list of trajectories, each trajectory is a list of (state_from, action, state_to) tuples

    :param dataset: DataFrame where each row is one student; the 'event_id',
        'week' and 'action' cells are indexed with len() and integer positions
        below, so each cell presumably holds a per-interaction sequence —
        TODO confirm against the caller
    :param map_action: dict mapping action names (including the keys
        'Move To Quiz' and 'Move To Video') to action ids
    :param weeks: weeks to keep; only consulted when `all` is falsy
    :param all: keep every interaction regardless of week
        (NOTE(review): parameter shadows the builtin `all`; part of the
        public signature, so left unchanged)
    :param remove_empty: drop students whose trajectory came out empty
    """
    def user_to_trajectory(x, weeks):
        # Build one student's trajectory from their interaction sequences.
        trajectory = []
        for i in range(len(x['event_id'])):
            if (all==True or x['week'][i] in weeks):
                # When the event changes between consecutive interactions, emit an
                # explicit "move" transition first (quiz vs video move by action text).
                if (i != 0 and x['event_id'][i] != x['event_id'][i-1]):
                    action = map_action['Move To Quiz'] if 'Quiz' in x['action'][i] else map_action['Move To Video']
                    trajectory.append((x['event_id'][i - 1], action, x['event_id'][i]))
                # Self-transition carrying the interaction's own action id.
                # NOTE(review): appended as a list while the move above is a tuple —
                # preserved as-is in case downstream code distinguishes them.
                trajectory.append([x['event_id'][i], map_action[x['action'][i]], x['event_id'][i]])
        return np.array(trajectory)
    trajectories = dataset[['event_id', 'week', 'action']].apply(lambda x: user_to_trajectory(x, weeks), axis=1)
    if remove_empty:
        trajectories = [x for x in trajectories if x.size != 0]
    # At least one non-empty trajectory is required downstream.
    assert len(trajectories) > 0
    max_length = max([len(t) for t in trajectories])
    min_length = min([len(t) for t in trajectories])
    print('max length, min length', max_length, min_length)
    print(f'num of student in {weeks} week:', len(trajectories))
    print('----------------------')
    return trajectories
def trajectories_to_features(trajectories, world, path="", upper_bound=100, num_week=6, syn=True, save_to_disk=True):
    """
    Convert per-student trajectories into fixed-size per-week feature arrays.

    Each student yields `num_week` segments; every segment is truncated to at
    most `upper_bound` (state, action) steps and padded with -1 rows up to
    `upper_bound`, then the segments are concatenated along the step axis.

    :param trajectories: when syn is True, indexed as trajectories[student]
        (num_week == 1) or trajectories[student][week]; when syn is False,
        indexed as trajectories[week].iloc[student] — presumably a list of
        per-week pandas Series; TODO confirm upstream format
    :param world: provides n_features and state_to_array(state, action)
    :param path: filename prefix; features saved to f'{path}_features.npy'
    :param upper_bound: maximum number of steps kept per week
    :param num_week: number of weekly segments per student
    :param syn: selects the trajectory container layout (see :param trajectories:)
    :param save_to_disk: when True, write the feature array to disk
    :return: np.array of shape [n_students, num_week * upper_bound, world.n_features]
    """
    course_features, next_states_actions = [], []
    n_trajectory = 0
    if syn:
        n_trajectory = len(trajectories)
    else:
        n_trajectory = len(trajectories[0])
    student = []
    print('num of student:', n_trajectory)
    for t in tqdm(range(n_trajectory)):
        array_features, array_next_states_actions = [], []
        # Assemble this student's per-week step arrays (layout depends on `syn`).
        if syn:
            if num_week == 1:
                student = [np.array(trajectories[t])] if len(trajectories[t]) else [[]]
            else:
                student = [np.array(trajectories[t][i]) if len(trajectories[t]) else [[]] for i in range(num_week)]
        else:
            student = [trajectories[i].iloc[t] for i in range(num_week)]
        for i in range(num_week):
            length = len(student[i])
            if length == 0:
                # Empty week: emit full -1 padding blocks for both outputs.
                array_features.append(-1*np.ones((upper_bound, world.n_features)))
                array_next_states_actions.append(-1*np.ones((upper_bound, 2)))
                continue
            if (length) > upper_bound:
                # Truncate overly long weeks to the first upper_bound steps.
                tmp = student[i][:upper_bound, :2]
            else:
                try:
                    tmp = student[i][:, :2]
                except:
                    # Reached when student[i] is not 2-D sliceable (malformed data).
                    print('Error in student[i][:, :2], length:', length)
                    print(student[i])
                    raise ValueError("Invalid student data format")
            array_next_states_actions.append(np.pad(tmp,
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
            # Expand each (state, action) pair into the world's feature vector, then pad.
            array_features.append(np.pad([world.state_to_array(state, action) for state, action in tmp],
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
        array_next_states_actions = np.concatenate(array_next_states_actions, axis=0)
        array_features = np.concatenate(array_features, axis=0)
        next_states_actions.append(array_next_states_actions)
        course_features.append(array_features)
    course_features = np.array(course_features)
    # next_states_actions = np.array(next_states_actions)
    # student = [x for x in student]
    if save_to_disk:
        np.save(f'{path}_features.npy', course_features)
    return course_features
def whatif_values(df, schedule, map_event_id, problem_event):
    """
    Build per-event "what-if" feature values and video-length normalization bounds.

    event week: week the event is introduced; when missing from the schedule it is
    approximated by the week with most interactions. problem difficulty: 1 minus the
    mean grade ratio. video length: min-max normalized duration over videos.

    Fixes vs. original (behavior-preserving): works on a copy of `schedule` instead
    of mutating the caller's frame; copies the `video_length` slice before assigning
    (avoids SettingWithCopy); replaces deprecated chained `inplace=True` fillna;
    makes `event_id` an explicit column before merging rather than relying on the
    index-level name.

    :param df: interaction log with 'event_id', 'week' and 'problem_grade' columns
    :param schedule: schedule with 'id', 'chapter', 'type', 'duration', 'grade_max'
    :param map_event_id: dict mapping schedule 'id' values to integer event ids
    :param problem_event: collection of event ids that are problems/quizzes
    :return: (event_value sorted by 'event_id' with columns event_id, difficulty,
             chapter, duration, is_problem; (max_len, min_len) raw duration bounds)
    """
    # NOTE(review): week_skill is unused here — kept for reference; confirm before removing.
    week_skill = [[1], [1], [2], [3, 4, 5, 6], [7, 8], [9, 10], [11], [3, 4, 7], [5, 12], [13]]
    # Copy so the caller's schedule is not mutated by the event_id column below.
    schedule = schedule.copy()
    schedule['event_id'] = schedule['id'].apply(lambda x: map_event_id[x] if x in map_event_id else None)
    schedule = schedule.drop_duplicates(subset='event_id', keep='first')
    # Week with the most interactions, shifted to the 1..10 range.
    event_week = df.groupby('event_id')['week'].apply(lambda x: x.value_counts().idxmax() + 1)
    # Chapter from the schedule when known, otherwise the most-frequent interaction week.
    chapter_arr = [
        schedule.loc[schedule['event_id'] == i, 'chapter'].values[0]
        if i in schedule['event_id'].values else event_week[i]
        for i in range(len(map_event_id))
    ]
    df = df.merge(schedule, on='event_id', how='left')
    # Missing grade_max: approximate with the max observed grade for that event.
    df['grade_max'] = df['grade_max'].fillna(df.groupby('event_id')['problem_grade'].transform('max'))
    # Guard against division by zero: grade_max == 0 is treated as full marks.
    df['difficulty'] = np.where(df['grade_max'] != 0, df['problem_grade'] / df['grade_max'], 1)
    problem_difficulty = df.groupby('event_id')['difficulty'].apply(lambda x: 1 - x.mean())
    # .copy() so the normalized-duration assignment doesn't hit a view of `schedule`.
    video_length = schedule[['event_id', 'duration']].copy()
    video_durations = schedule.loc[schedule['type'] == 'video', 'duration']
    min_len, max_len = video_durations.min(), video_durations.max()
    video_length['duration'] = (video_length['duration'] - min_len) / (max_len - min_len)
    # Materialize event_id as a column (groupby left it as the index name).
    event_value = pd.DataFrame({'difficulty': problem_difficulty}).reset_index()
    event_value['chapter'] = chapter_arr
    event_value = event_value.merge(video_length, on='event_id', how='left')
    # Non-video events get the mean normalized duration.
    event_value['duration'] = event_value['duration'].fillna(event_value['duration'].mean())
    event_value['is_problem'] = event_value['event_id'].apply(lambda x: x in problem_event)
    return event_value.sort_values('event_id'), (max_len, min_len)