|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import os |
|
|
import argparse |
|
|
|
|
|
def check_fail(df):
    """Return the share of failing students among unique users.

    Deduplicates *df* on ``user_id`` (keeping the first row per user) and
    computes the mean of the binary ``label-pass-fail`` column over those
    unique users.
    """
    unique_users = df.drop_duplicates('user_id')
    return unique_users['label-pass-fail'].sum() / len(unique_users)
|
|
|
|
|
def parse_args():
    """Parse command-line options for the course-preprocessing script.

    Returns:
        argparse.Namespace with ``data_dir``, ``feature_dir``, ``course_id``
        and ``output_dir`` attributes.
    """
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='data/mooc_raw/coursera',
                        help="Directory containing the raw clickstream data.")
    # Fixed typo in help text: "meata data" -> "metadata".
    parser.add_argument('--feature_dir', type=str, default='data/mooc_features',
                        help="Directory containing the metadata.")
    parser.add_argument('--course_id', type=str, default='dsp-006',
                        help="Course ID.")
    parser.add_argument('--output_dir', type=str, default='data')
    return parser.parse_args()
|
|
|
|
|
def main():
    """Preprocess MOOC clickstream data for a single course.

    Reads the grade, video-event and problem-event CSVs from ``--data_dir``,
    joins them with the per-user feature labels in ``--feature_dir``, and
    writes one combined interaction table to ``--output_dir/<course_id>``.
    """
    args = parse_args()
    DATA_DIR = args.data_dir
    course = args.course_id
    FEATURE_DIR = args.feature_dir
    SAVE_DIR = f'{args.output_dir}/{course}'

    print(f'Processing course: {course}')
    grade = pd.read_csv(f'{DATA_DIR}/grade/{course}.csv')
    video = pd.read_csv(f'{DATA_DIR}/video_event/{course}.csv')
    video = video.dropna(axis=1)  # drop columns containing any NaN
    problem = pd.read_csv(f'{DATA_DIR}/problem_event/{course}.csv')
    # Rename so it does not clash with the course-level 'grade' column
    # merged in at the end.
    problem.rename(columns={'grade': 'problem_grade'}, inplace=True)

    # Feature labels live in a per-course directory whose name uses '_'
    # instead of '-'. Originally hard-coded to 'dsp_006'; generalized here
    # (identical path for the default course_id 'dsp-006').
    row_nums = pd.read_csv(
        f'{FEATURE_DIR}/eq_week-marras_et_al-{course.replace("-", "_")}/feature_labels.csv')
    if 'Unnamed: 0' in row_nums.columns:
        # NOTE(review): the original renamed this column to 'user_index',
        # which made the merge on 'rown' below raise and fall into the
        # error branch — renamed to 'rown' so the merge key exists.
        row_nums.rename(columns={'Unnamed: 0': 'rown'}, inplace=True)
    else:
        print(row_nums.columns)
        row_nums.rename(columns={'user_index': 'rown'}, inplace=True)

    useridmap = pd.read_csv(f'{FEATURE_DIR}/user_id_mapping-{course.replace("-", "_")}.csv')
    if 'Unnamed: 0' in useridmap.columns:
        useridmap.rename(columns={'Unnamed: 0': 'rown'}, inplace=True)
    try:
        users = pd.merge(row_nums, useridmap, on='rown', how='left')
    except KeyError as err:
        # Was a bare `except:` — narrowed so unrelated errors are not masked.
        raise ValueError('User ID mapping file does not contain expected columns.') from err

    # Computed on its own line: single quotes nested inside a single-quoted
    # f-string are a SyntaxError before Python 3.12.
    fail_share = np.round(sum(users['label-pass-fail']) / len(users), 2)
    print(f'Number of unique users: {len(users)}, % of fail students: {fail_share}')
    print('Number of total interactions in video, problem: ', len(video), len(problem))

    # `df.user_id = ...` sets an *attribute*, not a column, when 'user_id'
    # is absent — item assignment is required to actually create the column.
    if 'user_id' not in video:
        video['user_id'] = video.index
    if 'user_id' not in problem:
        problem['user_id'] = problem.index

    # Drop transcript-related video events, keep only labeled users, and
    # attach the per-user labels to each interaction.
    video = video.loc[~video['event_type'].str.contains('Transcript', case=False)]
    uv = video[video.user_id.isin(users.user_id)]
    uvl = pd.merge(uv, users, on='user_id', how='left')
    up = problem[problem.user_id.isin(users.user_id)]
    upl = pd.merge(up, users, on='user_id', how='left')

    v_df = uvl.dropna(axis=1)
    v_df['action'] = v_df.event_type
    v_df = v_df.rename(columns={'video_id': 'event_id'})
    v_df.drop('event_type', axis=1, inplace=True)

    upl['action'] = upl['event_type']
    p_df = upl.drop(['event_type', 'problem_type'], axis=1)
    p_df = p_df.dropna(axis=0)
    p_df = p_df.rename(columns={'problem_id': 'event_id'})
    p_df['original_grade'] = p_df.problem_grade

    combined = pd.concat([v_df, p_df])
    # Video rows have no problem grade; use -1 as the sentinel.
    combined.fillna({'problem_grade': -1}, inplace=True)
    print('Number of unique events:', len(combined.event_id.unique()))
    print('Number of unique actions:', len(combined.action.unique()))

    combinedg = pd.merge(combined, grade, on='user_id', how='left')
    # NOTE(review): assumes a course grade below 4 means "fail" — confirm
    # against the grading scale used by this course.
    combinedg['pass_fail'] = combinedg['grade'] < 4
    combinedg_drop = combinedg.drop(columns=['rown'])
    combinedg_drop.drop_duplicates(inplace=True)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(SAVE_DIR, exist_ok=True)
    out_path = f'{SAVE_DIR}/combinedg_features_{course}.csv'
    combinedg_drop.to_csv(out_path)
    print('Logging: Done saving preprocessed data to', out_path)


if __name__ == '__main__':
    main()
|
|
|
|
|
|