File size: 3,867 Bytes
e448441 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import numpy as np
import pandas as pd
import os
import argparse
def check_fail(df):
    """Return the average ``label-pass-fail`` value across distinct users.

    Only the first row seen for each ``user_id`` is kept, so a user that
    appears on several rows is counted once.

    Args:
        df: DataFrame with ``user_id`` and ``label-pass-fail`` columns.

    Returns:
        Sum of ``label-pass-fail`` over the de-duplicated rows divided by the
        number of those rows.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
    """
    # NOTE(review): presumably label 1 marks a failing student (the script's
    # log line calls this ratio "% of fail students") -- confirm upstream.
    per_user = df.drop_duplicates(subset='user_id')
    label_total = sum(per_user['label-pass-fail'])
    user_count = len(per_user)
    return label_total / user_count
def parse_args(argv=None):
    """Parse the command-line options of the course preprocessing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace with the attributes ``data_dir``, ``feature_dir``,
        ``course_id`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser(description="Process course data.")
    parser.add_argument('--data_dir', type=str, default='data/mooc_raw/coursera',
                        help="Directory containing the raw clickstream data.")
    parser.add_argument('--feature_dir', type=str, default='data/mooc_features',
                        help="Directory containing the metadata.")
    parser.add_argument('--course_id', type=str, default='dsp-006',
                        help="Course ID.")
    parser.add_argument('--output_dir', type=str, default='data',
                        help="Base directory under which the per-course "
                             "output folder is created.")
    return parser.parse_args(argv)
def _load_users(feature_dir, course):
    """Load the per-user label rows and attach user IDs via the mapping file.

    Args:
        feature_dir: Directory holding the feature-label folder and the
            ``user_id_mapping-<course>.csv`` file.
        course: Course ID as given on the command line (e.g. ``dsp-006``).

    Returns:
        DataFrame produced by left-merging the label rows with the user-ID
        mapping on the ``rown`` column.

    Raises:
        ValueError: If either file lacks the ``rown`` join column, or the
            merged result lacks ``user_id`` / ``label-pass-fail``.
    """
    course_key = course.replace('-', '_')

    # The label directory used to be hard-coded to dsp_006; derive it from the
    # course ID the same way the mapping file name is derived so that other
    # courses do not silently pick up the dsp_006 labels.
    row_nums = pd.read_csv(
        f'{feature_dir}/eq_week-marras_et_al-{course_key}/feature_labels.csv')
    if 'Unnamed: 0' in row_nums.columns:
        row_nums = row_nums.rename(columns={'Unnamed: 0': 'user_index'})
    else:
        print(row_nums.columns)
    row_nums = row_nums.rename(columns={'user_index': 'rown'})

    useridmap = pd.read_csv(f'{feature_dir}/user_id_mapping-{course_key}.csv')
    if 'Unnamed: 0' in useridmap.columns:
        useridmap = useridmap.rename(columns={'Unnamed: 0': 'rown'})

    # Validate the join key explicitly instead of hiding every possible
    # failure behind a bare ``except``.
    if 'rown' not in row_nums.columns:
        raise ValueError(
            'Feature label file does not contain expected columns. '
            f'Found: {list(row_nums.columns)}')
    if 'rown' not in useridmap.columns:
        raise ValueError(
            'User ID mapping file does not contain expected columns. '
            f'Found: {list(useridmap.columns)}')

    users = pd.merge(row_nums, useridmap, on='rown', how='left')

    missing = [c for c in ('user_id', 'label-pass-fail') if c not in users.columns]
    if missing:
        raise ValueError(
            'User ID mapping file does not contain expected columns. '
            f'Missing after merge: {missing}')
    return users


def _with_user_id(events):
    """Return ``events`` with a ``user_id`` column, falling back to its index."""
    if 'user_id' not in events:
        # Attribute assignment (``events.user_id = ...``) does not create a
        # column, so the later merge on 'user_id' would fail; use item
        # assignment on a copy instead.
        events = events.copy()
        events['user_id'] = events.index
    return events


def main():
    """Combine video and problem events with user labels and grades.

    Reads the grade, video-event and problem-event CSVs for the selected
    course, keeps only events of users present in the label/mapping files,
    merges in the course grade and writes the combined table to
    ``<output_dir>/<course>/combinedg_features_<course>.csv``.
    """
    args = parse_args()
    data_dir = args.data_dir
    course = args.course_id
    save_dir = f'{args.output_dir}/{course}'
    print(f'Processing course: {course}')

    grade = pd.read_csv(f'{data_dir}/grade/{course}.csv')
    # Columns containing any missing value are discarded from the video log.
    video = pd.read_csv(f'{data_dir}/video_event/{course}.csv').dropna(axis=1)
    # Renamed so it does not clash with the 'grade' column of the grade file.
    problem = pd.read_csv(f'{data_dir}/problem_event/{course}.csv').rename(
        columns={'grade': 'problem_grade'})

    users = _load_users(args.feature_dir, course)

    # Computed outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    fail_ratio = np.round(sum(users['label-pass-fail']) / len(users), 2)
    print(f'Number of unique users: {len(users)}, % of fail students: {fail_ratio}')
    print('Number of total interactions in video, problem: ', len(video), len(problem))

    video = _with_user_id(video)
    problem = _with_user_id(problem)

    # Drop transcript-related video events (case-insensitive match).
    video = video.loc[~video['event_type'].str.contains('Transcript', case=False)]

    known_ids = users['user_id']
    uvl = pd.merge(video[video['user_id'].isin(known_ids)], users,
                   on='user_id', how='left')
    upl = pd.merge(problem[problem['user_id'].isin(known_ids)], users,
                   on='user_id', how='left')

    # Video events: 'event_type' becomes 'action', 'video_id' becomes 'event_id'.
    v_df = uvl.dropna(axis=1)
    v_df = (v_df.assign(action=v_df['event_type'])
                .rename(columns={'video_id': 'event_id'})
                .drop(columns='event_type'))

    # Problem events: same normalisation, rows with any missing value removed.
    p_df = (upl.assign(action=upl['event_type'])
               .drop(columns=['event_type', 'problem_type'])
               .dropna(axis=0)
               .rename(columns={'problem_id': 'event_id'}))
    p_df = p_df.assign(original_grade=p_df['problem_grade'])

    # Rows without a problem grade get the sentinel value -1.
    combined = pd.concat([v_df, p_df]).fillna({'problem_grade': -1})
    print('Number of unique events:', len(combined.event_id.unique()))
    print('Number of unique actions:', len(combined.action.unique()))

    combinedg = pd.merge(combined, grade, on='user_id', how='left')
    # NOTE(review): threshold 4 presumably is the passing grade -- confirm the
    # grading scale. Users with no grade row compare as False here.
    combinedg['pass_fail'] = combinedg['grade'] < 4
    combinedg_drop = combinedg.drop(columns=['rown']).drop_duplicates()

    os.makedirs(save_dir, exist_ok=True)
    out_path = f'{save_dir}/combinedg_features_{course}.csv'
    combinedg_drop.to_csv(out_path)
    print('Logging: Done saving preprocessed data to', out_path)


if __name__ == '__main__':
    main()
|