# what-if-simulation-app / src / utils / data_helper.py
# (original commit e448441, "first commit", by tranhuonglan)
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from tqdm import tqdm
import pandas as pd
def filter_range_dates(df, start_date, end_date):
    """
    Keep only rows of *df* whose 'date' column falls in [start_date, end_date].

    :param df: DataFrame with a datetime-comparable 'date' column
    :param start_date: inclusive lower bound, '%Y-%m-%d %H:%M:%S' string
    :param end_date: inclusive upper bound string, or a missing value
        (NaN/None/NaT) to leave the range open-ended on the right
    :return: the filtered DataFrame
    """
    df = df[df['date'] >= str2dt(start_date)]
    # pd.isna covers None, float('nan') and NaT; the original identity test
    # `end_date is not np.nan` only matched the np.nan singleton object.
    if not pd.isna(end_date):
        df = df[df['date'] <= str2dt(end_date)]
    return df
def tmp2dt(x):
    """
    Convert a POSIX timestamp to a naive UTC datetime truncated to whole seconds.

    The original implementation formatted the datetime to a string and re-parsed
    it (via str2dt) purely to drop microseconds, using the deprecated
    datetime.utcfromtimestamp. This does the same truncation directly.

    :param x: POSIX timestamp (int or float seconds since the epoch)
    :return: naive datetime in UTC with microsecond == 0
    """
    from datetime import timezone  # local: module header imports only `datetime`
    return datetime.fromtimestamp(x, tz=timezone.utc).replace(tzinfo=None, microsecond=0)
def str2dt(ts, format='%Y-%m-%d %H:%M:%S'):
    """Parse the timestamp string *ts* into a datetime according to *format*."""
    parsed = datetime.strptime(ts, format)
    return parsed
def dt2w(dt):
    """Return the zero-padded week-of-year ('%W', Monday as first day) for *dt*."""
    return f"{dt:%W}"
def outliers_iqr(data):
    """
    Compute outlier fences using a 1.5x rule on the 10th/90th percentile spread.

    NOTE: despite the name, the spread here is p90 - p10 (not the classic
    quartile-based IQR); behavior kept exactly as written.

    :param data: array-like of numeric values
    :return: (lower_bound, upper_bound) fence values
    """
    p10, p90 = np.percentile(data, [10, 90])
    spread = p90 - p10
    return p10 - 1.5 * spread, p90 + 1.5 * spread
def dealing_missing_value(data):
    """
    Fill NaNs with 0 and append a per-week missing-value indicator feature.

    Feature indices 2 and 4 (content-anticipation and competency-anticipation,
    per the original comments) are structurally absent in the last week, so they
    are zeroed first and do NOT count toward the missing-week flag. The appended
    column is 1 when the week still contained any NaN, 0 otherwise; every
    remaining NaN is then replaced by 0.

    :param data: np.ndarray of shape (num_student, num_week, num_feature)
    :return: np.ndarray of shape (num_student, num_week, num_feature + 1)
    """
    filled = np.copy(data)
    # Zero the always-missing last-week anticipation features before masking.
    for feat_idx in (2, 4):
        filled[:, -1, feat_idx] = np.nan_to_num(filled[:, -1, feat_idx])
    # Per (student, week): is any feature value still missing?
    week_has_nan = np.isnan(filled).any(axis=2)
    # Append the 0/1 indicator as an extra feature column, then fill the rest.
    filled = np.dstack((filled, week_has_nan.astype(int)))
    return np.nan_to_num(filled)
def discretized_feature(clusters_list, feature, feature_list, kmeans, show=False, discretized_by_percentile=False, mids=None):
    """
    Discretize each feature into integer groups, either by fitted KMeans models
    or by a 3-bucket percentile rule (0 / (0, mid] / > mid).

    Fixes vs. original: the output buffer is now a real copy, so the caller's
    `feature` array is no longer mutated through the transposed view; the
    mutable default `mids=[]` is replaced with None.

    :param clusters_list: [n_feature] number of groups per feature
    :param feature: np.ndarray [n_student, n_week, n_feature]
    :param feature_list: [n_feature] feature names (for progress output only)
    :param kmeans: [n_feature] fitted KMeans models (ignored in percentile mode)
    :param show: print per-feature progress information
    :param discretized_by_percentile: use the 3-bucket percentile rule instead of KMeans
    :param mids: [n_feature] bucket midpoints; required in percentile mode
    :return: int np.ndarray [n_student, n_week, n_feature] of group labels
    """
    mids = [] if mids is None else mids
    data_arr = np.transpose(feature, (2, 0, 1))
    # np.transpose returns a VIEW; copy so writes below don't clobber `feature`.
    discretized_arr = data_arr.copy()
    if discretized_by_percentile:
        print('50 percentile of non zero value:', mids)
        print("Discretized by percentile")
    else:
        print("Discretized by KMeans")
    for idx, column in enumerate(feature_list):
        if show:
            print("Discritize feature in progress:", column)
            print('Number of groups:', clusters_list[idx])
        if discretized_by_percentile:
            X = data_arr[idx]
            y_pred = np.zeros(X.shape)
            y_pred[X == 0] = 0  # explicit for readability (zeros-init already covers it)
            y_pred[(X > 0) & (X <= mids[idx])] = 1
            y_pred[X > mids[idx]] = 2
            discretized_arr[idx] = y_pred.astype(int)
        else:
            X = np.reshape(data_arr[idx], (-1, 1))
            y_pred = kmeans[idx].predict(X)
            # Relabel clusters so group numbers increase with the cluster center value.
            map_group = dict(zip(np.argsort(kmeans[idx].cluster_centers_[:, 0]), np.arange(clusters_list[idx])))
            y_pred = np.array([map_group[x] for x in y_pred])
            discretized_arr[idx] = np.reshape(y_pred, data_arr[idx].shape)
    return np.transpose(discretized_arr, (1, 2, 0)).astype(int)
def visualize_skewness(data):
    """
    Plot a histogram (with KDE) of the strictly-positive values of every column
    of *data* on a 2x4 grid of axes.

    :param data: presumably a DataFrame of numeric feature columns — the code
        only relies on .columns and per-column .dropna()
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for i, col in enumerate(data.columns):
        values = data[col].dropna()
        positive = values[values > 0]
        sns.histplot(positive, kde=True, ax=axes[i // 4, i % 4])
    plt.show()
def plot_distortion(n_clusters_list, X, title, ax, seed=0):
    """
    Plot KMeans distortion (inertia, i.e. in-cluster variance) against the
    number of clusters, for elbow-method inspection.

    :param n_clusters_list: iterable of cluster counts to try
    :param X: np array of data points, shape (n_samples, n_features)
    :param title: title for the axes
    :param ax: matplotlib axes to draw on
    :param seed: random_state forwarded to KMeans for reproducibility
    """
    distortions = [
        KMeans(n_clusters=k, random_state=seed).fit(X).inertia_
        for k in n_clusters_list
    ]
    ax.plot(n_clusters_list, distortions, 'o-')
    ax.set_ylabel('Distortion')
    ax.set_title(title)
def plot_all_distortion(feature_df):
    """
    Draw the KMeans distortion curve (k = 2..9) for every column of
    *feature_df* on a 2x4 grid, via plot_distortion.
    """
    fig, axes = plt.subplots(2, 4, figsize=(16, 5))
    fig.tight_layout(pad=3.0)
    for i, col in enumerate(feature_df.columns):
        print("Features:", col)
        # Drop NaNs and reshape to the (n_samples, 1) layout KMeans expects.
        X = np.array(feature_df[col].dropna()).reshape(-1, 1)
        plot_distortion(range(2, 10), X, col, axes[i // 4, i % 4])
    plt.show()
def feature_table_to_trajectories(student_world, samples_trajectory):
    """
    Transform students' weekly feature vectors into fixed-length trajectories.

    Each consecutive pair of weeks becomes one (state_from, action, state_to)
    step: states come from student_world.state_point_to_index on the week's
    feature vector, and action is 1 when the state changes, 0 when it stays.

    :param student_world: world object exposing state_point_to_index(point)
    :param samples_trajectory: array [n_student, n_week, n_feature]
    :return: np.array of shape [n_student, n_week - 1, 3]
    """
    n_students = samples_trajectory.shape[0]
    n_weeks = samples_trajectory.shape[1]
    trajectory_list = []
    for s in range(n_students):
        steps = []
        for w in range(n_weeks - 1):
            state_from = student_world.state_point_to_index(samples_trajectory[s, w])
            state_to = student_world.state_point_to_index(samples_trajectory[s, w + 1])
            # Action encodes whether the student moved to a different state.
            steps.append((state_from, int(state_from != state_to), state_to))
        trajectory_list.append(steps)
    return np.array(trajectory_list)
def data_to_trajectories(dataset, map_action, weeks=None, all=True, remove_empty=True):
    """
    Return a list of trajectories, each trajectory is a list of (state_from, action, state_to) tuples

    :param dataset: DataFrame where each row is one student; the 'event_id',
        'week' and 'action' cells are indexed with len() and integer positions
        below, so each cell presumably holds a per-interaction sequence —
        TODO confirm against the caller
    :param map_action: dict mapping action names (including the keys
        'Move To Quiz' and 'Move To Video') to action ids
    :param weeks: weeks to keep; only consulted when `all` is falsy
    :param all: keep every interaction regardless of week
        (NOTE(review): parameter shadows the builtin `all`; part of the
        public signature, so left unchanged)
    :param remove_empty: drop students whose trajectory came out empty
    """
    def user_to_trajectory(x, weeks):
        # Build one student's trajectory from their interaction sequences.
        trajectory = []
        for i in range(len(x['event_id'])):
            if (all==True or x['week'][i] in weeks):
                # When the event changes between consecutive interactions, emit an
                # explicit "move" transition first (quiz vs video move by action text).
                if (i != 0 and x['event_id'][i] != x['event_id'][i-1]):
                    action = map_action['Move To Quiz'] if 'Quiz' in x['action'][i] else map_action['Move To Video']
                    trajectory.append((x['event_id'][i - 1], action, x['event_id'][i]))
                # Self-transition carrying the interaction's own action id.
                # NOTE(review): appended as a list while the move above is a tuple —
                # preserved as-is in case downstream code distinguishes them.
                trajectory.append([x['event_id'][i], map_action[x['action'][i]], x['event_id'][i]])
        return np.array(trajectory)
    trajectories = dataset[['event_id', 'week', 'action']].apply(lambda x: user_to_trajectory(x, weeks), axis=1)
    if remove_empty:
        trajectories = [x for x in trajectories if x.size != 0]
    # At least one non-empty trajectory is required downstream.
    assert len(trajectories) > 0
    max_length = max([len(t) for t in trajectories])
    min_length = min([len(t) for t in trajectories])
    print('max length, min length', max_length, min_length)
    print(f'num of student in {weeks} week:', len(trajectories))
    print('----------------------')
    return trajectories
def trajectories_to_features(trajectories, world, path="", upper_bound=100, num_week=6, syn=True, save_to_disk=True):
    """
    Convert per-student trajectories into fixed-size per-week feature arrays.

    Each student yields `num_week` segments; every segment is truncated to at
    most `upper_bound` (state, action) steps and padded with -1 rows up to
    `upper_bound`, then the segments are concatenated along the step axis.

    :param trajectories: when syn is True, indexed as trajectories[student]
        (num_week == 1) or trajectories[student][week]; when syn is False,
        indexed as trajectories[week].iloc[student] — presumably a list of
        per-week pandas Series; TODO confirm upstream format
    :param world: provides n_features and state_to_array(state, action)
    :param path: filename prefix; features saved to f'{path}_features.npy'
    :param upper_bound: maximum number of steps kept per week
    :param num_week: number of weekly segments per student
    :param syn: selects the trajectory container layout (see :param trajectories:)
    :param save_to_disk: when True, write the feature array to disk
    :return: np.array of shape [n_students, num_week * upper_bound, world.n_features]
    """
    course_features, next_states_actions = [], []
    n_trajectory = 0
    if syn:
        n_trajectory = len(trajectories)
    else:
        n_trajectory = len(trajectories[0])
    student = []
    print('num of student:', n_trajectory)
    for t in tqdm(range(n_trajectory)):
        array_features, array_next_states_actions = [], []
        # Assemble this student's per-week step arrays (layout depends on `syn`).
        if syn:
            if num_week == 1:
                student = [np.array(trajectories[t])] if len(trajectories[t]) else [[]]
            else:
                student = [np.array(trajectories[t][i]) if len(trajectories[t]) else [[]] for i in range(num_week)]
        else:
            student = [trajectories[i].iloc[t] for i in range(num_week)]
        for i in range(num_week):
            length = len(student[i])
            if length == 0:
                # Empty week: emit full -1 padding blocks for both outputs.
                array_features.append(-1*np.ones((upper_bound, world.n_features)))
                array_next_states_actions.append(-1*np.ones((upper_bound, 2)))
                continue
            if (length) > upper_bound:
                # Truncate overly long weeks to the first upper_bound steps.
                tmp = student[i][:upper_bound, :2]
            else:
                try:
                    tmp = student[i][:, :2]
                except:
                    # Reached when student[i] is not 2-D sliceable (malformed data).
                    print('Error in student[i][:, :2], length:', length)
                    print(student[i])
                    raise ValueError("Invalid student data format")
            array_next_states_actions.append(np.pad(tmp,
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
            # Expand each (state, action) pair into the world's feature vector, then pad.
            array_features.append(np.pad([world.state_to_array(state, action) for state, action in tmp],
                ((0, upper_bound - len(tmp)), (0, 0)), 'constant', constant_values=-1))
        array_next_states_actions = np.concatenate(array_next_states_actions, axis=0)
        array_features = np.concatenate(array_features, axis=0)
        next_states_actions.append(array_next_states_actions)
        course_features.append(array_features)
    course_features = np.array(course_features)
    # next_states_actions = np.array(next_states_actions)
    # student = [x for x in student]
    if save_to_disk:
        np.save(f'{path}_features.npy', course_features)
    return course_features
def whatif_values(df, schedule, map_event_id, problem_event):
    """
    Build per-event "what-if" feature values and video-length normalization bounds.

    event week: week the event is introduced; when missing from the schedule it is
    approximated by the week with most interactions. problem difficulty: 1 minus the
    mean grade ratio. video length: min-max normalized duration over videos.

    Fixes vs. original (behavior-preserving): works on a copy of `schedule` instead
    of mutating the caller's frame; copies the `video_length` slice before assigning
    (avoids SettingWithCopy); replaces deprecated chained `inplace=True` fillna;
    makes `event_id` an explicit column before merging rather than relying on the
    index-level name.

    :param df: interaction log with 'event_id', 'week' and 'problem_grade' columns
    :param schedule: schedule with 'id', 'chapter', 'type', 'duration', 'grade_max'
    :param map_event_id: dict mapping schedule 'id' values to integer event ids
    :param problem_event: collection of event ids that are problems/quizzes
    :return: (event_value sorted by 'event_id' with columns event_id, difficulty,
             chapter, duration, is_problem; (max_len, min_len) raw duration bounds)
    """
    # NOTE(review): week_skill is unused here — kept for reference; confirm before removing.
    week_skill = [[1], [1], [2], [3, 4, 5, 6], [7, 8], [9, 10], [11], [3, 4, 7], [5, 12], [13]]
    # Copy so the caller's schedule is not mutated by the event_id column below.
    schedule = schedule.copy()
    schedule['event_id'] = schedule['id'].apply(lambda x: map_event_id[x] if x in map_event_id else None)
    schedule = schedule.drop_duplicates(subset='event_id', keep='first')
    # Week with the most interactions, shifted to the 1..10 range.
    event_week = df.groupby('event_id')['week'].apply(lambda x: x.value_counts().idxmax() + 1)
    # Chapter from the schedule when known, otherwise the most-frequent interaction week.
    chapter_arr = [
        schedule.loc[schedule['event_id'] == i, 'chapter'].values[0]
        if i in schedule['event_id'].values else event_week[i]
        for i in range(len(map_event_id))
    ]
    df = df.merge(schedule, on='event_id', how='left')
    # Missing grade_max: approximate with the max observed grade for that event.
    df['grade_max'] = df['grade_max'].fillna(df.groupby('event_id')['problem_grade'].transform('max'))
    # Guard against division by zero: grade_max == 0 is treated as full marks.
    df['difficulty'] = np.where(df['grade_max'] != 0, df['problem_grade'] / df['grade_max'], 1)
    problem_difficulty = df.groupby('event_id')['difficulty'].apply(lambda x: 1 - x.mean())
    # .copy() so the normalized-duration assignment doesn't hit a view of `schedule`.
    video_length = schedule[['event_id', 'duration']].copy()
    video_durations = schedule.loc[schedule['type'] == 'video', 'duration']
    min_len, max_len = video_durations.min(), video_durations.max()
    video_length['duration'] = (video_length['duration'] - min_len) / (max_len - min_len)
    # Materialize event_id as a column (groupby left it as the index name).
    event_value = pd.DataFrame({'difficulty': problem_difficulty}).reset_index()
    event_value['chapter'] = chapter_arr
    event_value = event_value.merge(video_length, on='event_id', how='left')
    # Non-video events get the mean normalized duration.
    event_value['duration'] = event_value['duration'].fillna(event_value['duration'].mean())
    event_value['is_problem'] = event_value['event_id'].apply(lambda x: x in problem_event)
    return event_value.sort_values('event_id'), (max_len, min_len)