import argparse
import os

import numpy as np
import pandas as pd


def check_fail(df):
    """Return the fraction of unique users whose 'label-pass-fail' is truthy.

    Deduplicates on 'user_id' first so repeated interaction rows do not skew
    the ratio. Assumes label 1 == fail -- TODO confirm against the upstream
    feature-extraction pipeline.
    """
    unique_users = df.drop_duplicates('user_id')
    return sum(unique_users['label-pass-fail']) / len(unique_users)


def parse_args():
    """Parse command-line arguments for the preprocessing script."""
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='data/mooc_raw/coursera',
                        help="Directory containing the raw clickstream data.")
    parser.add_argument('--feature_dir', type=str, default='data/mooc_features',
                        help="Directory containing the meata data.")
    parser.add_argument('--course_id', type=str, default='dsp-006',
                        help="Course ID.")
    parser.add_argument('--output_dir', type=str, default='data')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    DATA_DIR = args.data_dir
    course = args.course_id
    FEATURE_DIR = args.feature_dir
    SAVE_DIR = f'{args.output_dir}/{course}'
    print(f'Processing course: {course}')

    # Raw per-course tables: final grades, video events, problem events.
    grade = pd.read_csv(f'{DATA_DIR}/grade/{course}.csv')
    video = pd.read_csv(f'{DATA_DIR}/video_event/{course}.csv')
    video = video.dropna(axis=1)
    problem = pd.read_csv(f'{DATA_DIR}/problem_event/{course}.csv')
    # Avoid clashing with the course-level 'grade' column merged in later.
    problem.rename(columns={'grade': 'problem_grade'}, inplace=True)

    # BUGFIX: the feature-label directory was hard-coded to 'dsp_006'; derive
    # it from the requested course instead (backward-compatible with the
    # default --course_id dsp-006, which maps to dsp_006).
    course_us = course.replace('-', '_')
    row_nums = pd.read_csv(
        f'{FEATURE_DIR}/eq_week-marras_et_al-{course_us}/feature_labels.csv')
    if 'Unnamed: 0' in row_nums.columns:
        row_nums.rename(columns={'Unnamed: 0': 'user_index'}, inplace=True)
    else:
        print(row_nums.columns)
    row_nums.rename(columns={'user_index': 'rown'}, inplace=True)

    useridmap = pd.read_csv(f'{FEATURE_DIR}/user_id_mapping-{course_us}.csv')
    if 'Unnamed: 0' in useridmap.columns:
        useridmap.rename(columns={'Unnamed: 0': 'rown'}, inplace=True)
    # BUGFIX: replaced a bare 'except:' (which masked the real failure) with
    # an explicit check for the merge key before merging.
    if 'rown' not in useridmap.columns:
        raise ValueError('User ID mapping file does not contain expected columns.')
    users = pd.merge(row_nums, useridmap, on='rown', how='left')

    # BUGFIX: the original f-string nested single quotes inside single quotes
    # (a SyntaxError before Python 3.12); compute the ratio separately.
    fail_pct = np.round(sum(users['label-pass-fail']) / len(users), 2)
    print(f'Number of unique users: {len(users)}, % of fail students: {fail_pct}')
    # BUGFIX: the original print contained a literal newline inside the string
    # literal, which is a SyntaxError.
    print('Number of total interactions in video, problem:', len(video), len(problem))

    # BUGFIX: 'video.user_id = ...' assigns an *attribute*, not a column, so
    # the later merge on 'user_id' would raise KeyError; use item assignment.
    if 'user_id' not in video:
        video['user_id'] = video.index
    if 'user_id' not in problem:
        problem['user_id'] = problem.index

    # Transcript events are not modeled interactions; drop them.
    video = video.loc[~video['event_type'].str.contains('Transcript', case=False)]
    uv = video[video.user_id.isin(users.user_id)]
    uvl = pd.merge(uv, users, on='user_id', how='left')
    up = problem[problem.user_id.isin(users.user_id)]
    upl = pd.merge(up, users, on='user_id', how='left')

    # Normalize video events: 'action' column, unified 'event_id'.
    v_df = uvl.dropna(axis=1)
    v_df['action'] = v_df.event_type
    v_df = v_df.rename(columns={'video_id': 'event_id'})  # rename column
    v_df.drop('event_type', axis=1, inplace=True)  # drop useless column

    # Normalize problem events the same way, keeping the raw grade around.
    upl['action'] = upl['event_type']
    p_df = upl.drop(['event_type', 'problem_type'], axis=1)
    p_df = p_df.dropna(axis=0)
    p_df = p_df.rename(columns={'problem_id': 'event_id'})
    p_df['original_grade'] = p_df.problem_grade

    combined = pd.concat([v_df, p_df])
    # Video rows have no problem grade; -1 is the sentinel for "not a problem".
    combined.fillna({'problem_grade': -1}, inplace=True)
    print('Number of unique events:', len(combined.event_id.unique()))
    print('Number of unique actions:', len(combined.action.unique()))

    combinedg = pd.merge(combined, grade, on='user_id', how='left')
    # NOTE(review): grade < 4 is treated as failing -- presumably the course
    # pass threshold; confirm against the grading scheme.
    combinedg['pass_fail'] = combinedg['grade'] < 4
    combinedg_drop = combinedg.drop(columns=['rown'])
    combinedg_drop.drop_duplicates(inplace=True)

    # exist_ok avoids the check-then-create race and is idempotent on re-runs.
    os.makedirs(SAVE_DIR, exist_ok=True)
    save_path = f'{SAVE_DIR}/combinedg_features_{course}.csv'
    combinedg_drop.to_csv(save_path)
    print('Logging: Done saving preprocessed data to', save_path)