Spaces:
Build error
Build error
| import argparse | |
| import pickle | |
| import random | |
| import copy | |
| import pandas as pd | |
| import numpy as np | |
| from collections import Counter | |
| import os | |
| from data_preprocessor import DataPreprocessor | |
def prepare_pretraining_files(data_processor, options):
    """Write pre-training step-sequence files for the hard-coded
    'ratio_proportion_change3' workspace.

    For every (student, problem) group that was completed (a 'Done' action
    present) and contains more than 4 distinct non-optional steps, the
    de-duplicated step sequence is written to one of three files chosen at
    random (~80% train / ~10% test / ~10% val), with a matching metadata
    line in the corresponding *_info file.

    Parameters
    ----------
    data_processor : object whose ``load_file_iterator()`` yields pandas
        DataFrames (chunks of the interaction log).
    options : namespace providing the six output paths
        (train/val/test file and info) plus ``opt_step1``/``opt_step2``,
        the lists of optional-step names.
    """
    chunk_iterator = data_processor.load_file_iterator()
    # NOTE(review): plain open() calls; files stay open if an exception is
    # raised mid-run — consider 'with' / ExitStack.
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            # Only this one workspace is processed.
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        if unique_steps_len < 4:
                            continue
                        # Timestamps that follow their predecessor within 1800s;
                        # optional steps hit at these times are treated as
                        # auto-completed and skipped below.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i+1])
                        step_names_token = []
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'CF (Etalon)', 'Outcome', 'Help Level','CF (Workspace Progress Status)']].iterrows():
                            step = row["Step Name"]
                            progress = row["CF (Workspace Progress Status)"]
                            etalon = row["CF (Etalon)"]
                            if not pd.isna(step):
                                # An etalon that parses as float but not int marks
                                # the means-and-extremes problem variant.
                                if step in options.opt_step1 and not means_and_extremes:
                                    try:
                                        etalon = int(etalon)
                                    except Exception as e:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception as e:
                                            pass
                                # Skip auto-completed optional steps.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome values: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                else:
                                    # Consecutive repeats of a step: fold their
                                    # outcome/help level into the previous entry.
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        # Keep only sequences with more than 4 distinct
                        # non-optional steps.
                        if step_names_token and unique_steps_len > 4:
                            # Per-token marker: "1"/"2" for optional steps, "0" otherwise.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            proba = random.random()
                            if proba <= 0.8:
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                # progress, problem name, student id, auto_complete, total steps length, er or me, outcome seq, help_level seq, encoding in steps length
                                train_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                                           f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)),
                                                           "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]))
                                train_info.write("\n")
                            elif proba > 0.9:
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                # Same field layout as the train info line.
                                test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                                          f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)),
                                                          "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]))
                                test_info.write("\n")
                            else:
                                val_file.write("\t".join(step_names_token))
                                val_file.write("\n")
                                # Same field layout as the train info line.
                                val_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                                         f"{1 if means_and_extremes else 0}", "\t".join(map(str, outcome)),
                                                         "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]))
                                val_info.write("\n")
    train_file.close()
    train_info.close()
    val_file.close()
    val_info.close()
    test_file.close()
    test_info.close()
def prepare_school_pretraining_files(data_processor, options):
    """Write pre-training step-sequence files from school export data.

    Iterates school -> class -> student -> problem groups and collects, per
    problem, tokens of the form "<step>:<action>:<attempt>" for every
    non-autofilled row with a step name.  A problem is kept only when it
    shows a "strategy": the head step of opt_step1 (or opt_step2) was
    visited before one of its follow-up steps, and the final token is a
    'Done' action.  Kept sequences (action/attempt stripped, 'nan' and
    consecutive duplicates removed) are split ~80/10/10 into
    train/test/val files, with a metadata line per sequence.

    Parameters
    ----------
    data_processor : object whose ``load_file_iterator(sep=",")`` yields
        pandas DataFrames.
    options : namespace providing the six output paths, an optional
        ``school`` collection (falsy = keep all schools) and
        ``opt_step1``/``opt_step2`` lists whose first element is the
        strategy's opening step.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                    for student, student_groups in class_group.groupby("Anon Student Id"):
                        # BUG FIX: sort_values() previously returned a new,
                        # discarded frame, so rows were never time-ordered.
                        # Sort inplace like the other prepare_* functions.
                        student_groups.sort_values(by="Time", inplace=True)
                        for prob, prob_groups in student_groups.groupby("Problem Name"):
                            step_names_token = []
                            means_and_extremes = False
                            for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                                progress = row["CF (Workspace Progress Status)"]
                                action = row["Action"]
                                attempt = row["Attempt At Step"]
                                autofilled = row["CF (Is Autofilled)"]
                                step = row["Step Name"]
                                scenario = row['CF (Problem Scenario Tags)']
                                if not pd.isna(step):
                                    # Etalon looks like "{key=value}"; a value that
                                    # parses as float but not int marks the
                                    # means-and-extremes problem variant.
                                    if step in options.opt_step1 and not means_and_extremes:
                                        etalon = row["CF (Etalon)"]
                                        if not pd.isna(etalon):
                                            etalon = etalon.strip('{}')
                                            key, value = etalon.split('=')
                                            etalon = value
                                            try:
                                                etalon = int(etalon)
                                            except Exception:
                                                try:
                                                    etalon = float(etalon)
                                                    means_and_extremes = True
                                                except Exception:
                                                    pass
                                    # Autofilled rows are tutor-generated, not
                                    # student behavior — exclude them.
                                    if not autofilled:
                                        new_step = f"{step}:{action}:{attempt}"
                                        step_names_token.append(new_step)
                            if step_names_token:
                                where_opt = []
                                step1 = False
                                step2 = False
                                strategy_data = False
                                for step_oh in step_names_token:
                                    step = step_oh.split(":")
                                    if len(step) == 3:
                                        step = step[0]
                                    else:
                                        # The step name itself contained a ':'.
                                        step = ":".join(step[:2])
                                    if step == options.opt_step1[0]:
                                        where_opt.append("_1")
                                        step1 = True
                                    elif step == options.opt_step2[0]:
                                        where_opt.append("_2")
                                        step2 = True
                                    elif step in options.opt_step1[1:]:
                                        where_opt.append("1")
                                        if step1:
                                            # Follow-up seen after its head step.
                                            strategy_data = True
                                    elif step in options.opt_step2[1:]:
                                        where_opt.append("2")
                                        if step2:
                                            strategy_data = True
                                    else:
                                        where_opt.append("0")
                                # The sequence must end with a Done action.
                                if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                                    strategy_data = False
                                if strategy_data:
                                    proba = random.random()
                                    # Strip ":<action>:<attempt>", drop 'nan' steps and
                                    # collapse consecutive duplicates.
                                    step_names_tokens = [":".join(s.split(":")[:-2]) for s in step_names_token]
                                    step_names_token = []
                                    for s in step_names_tokens:
                                        if s != "nan":
                                            if not step_names_token or s != step_names_token[-1]:
                                                step_names_token.append(s)
                                    if proba <= 0.8:
                                        train_file.write("\t".join(step_names_token))
                                        train_file.write("\n")
                                        # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length
                                        train_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))]))
                                        train_info.write("\n")
                                    elif proba > 0.9:
                                        test_file.write("\t".join(step_names_token))
                                        test_file.write("\n")
                                        # Same field layout as the train info line.
                                        test_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))]))
                                        test_info.write("\n")
                                    else:
                                        val_file.write("\t".join(step_names_token))
                                        val_file.write("\n")
                                        # Same field layout as the train info line.
                                        val_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))]))
                                        val_info.write("\n")
    train_file.close()
    train_info.close()
    val_file.close()
    val_info.close()
    test_file.close()
    test_info.close()
def prepare_school_coded_pretraining_files(data_processor, options):
    """Write pre-training files where each step token carries a
    correctness/hint code suffix.

    Rows are filtered to non-step-by-step, first-encounter, non-review
    records.  Only GRADUATED problems in the later half of a student's
    graduated-problem list are considered.  Each non-autofilled step
    becomes "<step>-<code>" (0 = correct attempt, 1 = hint,
    2 = incorrect attempt; optional steps keep their bare name), and
    qualifying sequences are split ~80/10/10 into train/test/val files,
    each with a metadata line in the matching *_info file.

    Parameters
    ----------
    data_processor : object whose ``load_file_iterator(sep=",")`` yields
        pandas DataFrames.
    options : namespace with the six output paths, an optional ``school``
        filter (falsy = keep all) and ``opt_step1``/``opt_step2``
        optional-step name lists.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only organic, first-encounter, non-review interactions.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    # Keep the later half of the student's graduated problems
                    # (at least the most recent ones).
                    prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])
                    prob_list = prob_list[-int(len(prob_list)/2):]
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        if not prob in prob_list:
                            continue
                        progress = list(pd.unique(prob_groups["CF (Workspace Progress Status)"]))[0]
                        if progress != "GRADUATED":
                            continue
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            scenario = row['CF (Problem Scenario Tags)']
                            if not pd.isna(step):
                                # Etalon like "{key=value}"; a float (non-int) value
                                # marks the means-and-extremes variant.
                                if step in options.opt_step1 and not means_and_extremes:
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                # Previous emitted token and its bare step name.
                                # NOTE(review): split("-")[0] assumes step names
                                # contain no '-' of their own — confirm.
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        new_step = step
                                    else:
                                        # Code: 2 = wrong attempt, 1 = hint, 0 = OK/Done.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                    step_names_token.append(new_step)
                                else:
                                    # Repeat of the same step: keep the token with the
                                    # highest code (string comparison on the suffix).
                                    if not (step in options.opt_step1 or step in options.opt_step2):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # NOTE(review): unlike the earlier filters this counts
                        # occurrences, not distinct steps (no set()) — confirm intent.
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            proba = random.random()
                            if proba <= 0.8:
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                # school, class, student id, progress, problem name, scenario,
                                # prefered ER or ME, total steps length,
                                # original seq-action-attempt-help_level-outcome
                                train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                                           f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                                           "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                                train_info.write("\n")
                            elif proba > 0.9:
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                # Same field layout as the train info line.
                                test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                                          f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                                          "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                                test_info.write("\n")
                            else:
                                val_file.write("\t".join(step_names_token))
                                val_file.write("\n")
                                # Same field layout as the train info line.
                                val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                                         f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                                         "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                                val_info.write("\n")
    train_file.close()
    train_info.close()
    val_file.close()
    val_info.close()
    test_file.close()
    test_info.close()
def prepare_school_attention_files(data_processor, options):
    """Write attention-analysis files split by workspace progress status.

    Token construction and the "strategy" filter are the same as in
    prepare_school_pretraining_files, but the split is deterministic:
    GRADUATED problem sequences go to the train files and PROMOTED ones to
    the test files.  The val files are created (and closed) empty.

    Parameters
    ----------
    data_processor : object whose ``load_file_iterator(sep=",")`` yields
        pandas DataFrames.
    options : namespace providing the six output paths, an optional
        ``school`` collection (falsy = keep all schools) and
        ``opt_step1``/``opt_step2`` lists whose first element is the
        strategy's opening step.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                    for student, student_groups in class_group.groupby("Anon Student Id"):
                        # BUG FIX: sort_values() previously returned a new,
                        # discarded frame, so rows were never time-ordered.
                        # Sort inplace like the other prepare_* functions.
                        student_groups.sort_values(by="Time", inplace=True)
                        for prob, prob_groups in student_groups.groupby("Problem Name"):
                            step_names_token = []
                            means_and_extremes = False
                            for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', 'CF (Workspace Progress Status)', 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                                progress = row["CF (Workspace Progress Status)"]
                                action = row["Action"]
                                attempt = row["Attempt At Step"]
                                autofilled = row["CF (Is Autofilled)"]
                                step = row["Step Name"]
                                scenario = row['CF (Problem Scenario Tags)']
                                if not pd.isna(step):
                                    # Etalon looks like "{key=value}"; a value that
                                    # parses as float but not int marks the
                                    # means-and-extremes problem variant.
                                    if step in options.opt_step1 and not means_and_extremes:
                                        etalon = row["CF (Etalon)"]
                                        if not pd.isna(etalon):
                                            etalon = etalon.strip('{}')
                                            key, value = etalon.split('=')
                                            etalon = value
                                            try:
                                                etalon = int(etalon)
                                            except Exception:
                                                try:
                                                    etalon = float(etalon)
                                                    means_and_extremes = True
                                                except Exception:
                                                    pass
                                    # Autofilled rows are tutor-generated — exclude.
                                    if not autofilled:
                                        new_step = f"{step}:{action}:{attempt}"
                                        step_names_token.append(new_step)
                            if step_names_token:
                                where_opt = []
                                step1 = False
                                step2 = False
                                strategy_data = False
                                for step_oh in step_names_token:
                                    step = step_oh.split(":")
                                    if len(step) == 3:
                                        step = step[0]
                                    else:
                                        # The step name itself contained a ':'.
                                        step = ":".join(step[:2])
                                    if step == options.opt_step1[0]:
                                        where_opt.append("_1")
                                        step1 = True
                                    elif step == options.opt_step2[0]:
                                        where_opt.append("_2")
                                        step2 = True
                                    elif step in options.opt_step1[1:]:
                                        where_opt.append("1")
                                        if step1:
                                            # Follow-up seen after its head step.
                                            strategy_data = True
                                    elif step in options.opt_step2[1:]:
                                        where_opt.append("2")
                                        if step2:
                                            strategy_data = True
                                    else:
                                        where_opt.append("0")
                                # The sequence must end with a Done action.
                                if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                                    strategy_data = False
                                if strategy_data:
                                    # Strip ":<action>:<attempt>", drop 'nan' steps and
                                    # collapse consecutive duplicates.
                                    step_names_tokens = [":".join(s.split(":")[:-2]) for s in step_names_token]
                                    step_names_token = []
                                    for s in step_names_tokens:
                                        if s != "nan":
                                            if not step_names_token or s != step_names_token[-1]:
                                                step_names_token.append(s)
                                    # Deterministic split by progress status.
                                    if progress == "GRADUATED":
                                        train_file.write("\t".join(step_names_token))
                                        train_file.write("\n")
                                        # school, class, student id, progress, problem name, scenario, prefered ER or ME, total steps length, encoding in steps length
                                        train_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))]))
                                        train_info.write("\n")
                                    elif progress == "PROMOTED":
                                        test_file.write("\t".join(step_names_token))
                                        test_file.write("\n")
                                        # Same field layout as the train info line.
                                        test_info.write(",".join([str(school), str(class_id), str(student), str(progress), str(prob), str(scenario), f"{1 if means_and_extremes else 0}", str(len(step_names_token)), "\t".join(map(str, where_opt))]))
                                        test_info.write("\n")
    train_file.close()
    train_info.close()
    val_file.close()
    val_info.close()
    test_file.close()
    test_info.close()
def prepare_finetuning_10per_files(data_processor, options):
    """Build a balanced 10% fine-tuning split (used for the L@S paper).

    Only two strategies were defined as:
        0: non-opt strategy
        1: opt used strategy

    All qualifying (student, problem) sequences from the
    'ratio_proportion_change3' workspace are collected with their labels.
    10% of them — balanced between the two labels — are sampled for the
    train files; a label-balanced sample of the full pool feeds the test
    files (any index also drawn for train stays in train only).

    Parameters
    ----------
    data_processor : object whose ``load_file_iterator()`` yields pandas
        DataFrames.
    options : namespace with train/test file, info and label output paths
        plus ``opt_step1``/``opt_step2`` optional-step name lists.
    """
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        if unique_steps_len < 4:
                            continue
                        step_names_token = []
                        # Timestamps within 1800s of their predecessor; optional
                        # steps hit at these times are treated as auto-completed.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i+1])
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                # A float (non-int) etalon marks the
                                # means-and-extremes problem variant.
                                if step in options.opt_step1:
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                # Skip auto-completed optional steps.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome values: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                else:
                                    # Fold consecutive repeats into the previous entry.
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        # Keep sequences with more than 4 distinct non-optional steps.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Label "1" when any follow-up optional step occurs
                            # (substring match) in the sequence.
                            label_opt = "0"
                            if options.opt_step1:
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "1"
                            # progress, problem name, student id, auto_complete, total steps length, outcome seq, help_level seq, encoding in steps length, er or me
                            info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                             "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             f"{1 if means_and_extremes else 0}"])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(label_opt)
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len/2)
    print(f"sample_size: {sample_size}")
    # NOTE(review): random.sample raises ValueError if either class has fewer
    # than sample_size members — original behavior kept.
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # PERF FIX: the membership tests below were O(sample size) per index on
    # lists; sets give O(1) lookups.  The train branch is checked first, so an
    # index drawn for both samples still lands in train only (unchanged).
    sampled_set = set(sampled_instances)
    test_sampled_set = set(test_sampled_instances)
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_set:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_set:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_finetuning_IS_FS_files(data_processor, options):
    '''
    Used for L@S paper. This function gathers first three problems of each student.
    Only two strategies were defined as:
    0: non-opt strategy
    1: opt used strategy
    train: IS
    test: FS

    Writes each student's first `selected` problems to the train files
    (Initial Strategies) and the last `selected` problems to the test files
    (Final Strategies), together with a per-problem info line and an
    opt-strategy label per sequence.
    '''
    chunk_iterator = data_processor.load_file_iterator()
    # Output handles; closed at the end of the function.
    # NOTE(review): not exception-safe — an error mid-loop leaks the handles.
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    # val_file = open(options.val_file_path, "w")
    # val_info = open(options.val_info_path, "w")
    # val_label = open(options.val_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            # NOTE(review): workspace is hard-coded here; sibling functions
            # filter on options.workspace_name instead — confirm intent.
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    # Need at least 3 problems so both IS and FS sides exist.
                    if len(prob_list) < 3:
                        continue
                    selected = 3 #1. int(len(prob_list)/2)
                    #2. 3 & <6
                    #3. 3 & <3
                    first_prob_list = prob_list[:selected]
                    last_prob_list = prob_list[-selected:]
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        # Count distinct non-opt steps; skip trivially short problems.
                        unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        if unique_steps_len < 4:
                            continue
                        step_names_token = []
                        time_stamps = list(prob_groups["Time"])
                        # Timestamps that follow the previous event within 1800s;
                        # opt steps landing on these are treated as auto-completed.
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i+1])
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Non-integer etalon on an opt step marks the
                                    # "means and extremes" problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception as e:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                            # break
                                        except Exception as e:
                                            pass
                                # Skip opt steps that were auto-completed by the tutor.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                # if not step_names_token or step != step_names_token[-1]:
                                #     step_names_token.append(step)
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                else:
                                    # Repeated step: collapse outcome/help level with ":".
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        # Re-count distinct non-opt steps after dedup/auto-complete filtering.
                        unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        # 4 and more in sequence
                        # NOTE(review): the pre-filter rejects < 4 but this requires > 4,
                        # so problems with exactly 4 unique steps are dropped here — confirm.
                        if step_names_token and unique_steps_len > 4:
                            # Encode each step: 1 = opt_step1, 2 = opt_step2, 0 = other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Label "1" when any non-initial opt step occurs in the sequence.
                            label_opt = "0"
                            if options.opt_step1:
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "1"
                            # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                            info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                             "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             f"{1 if means_and_extremes else 0}"])
                            # Route the sequence to train (IS) or test (FS) by problem position.
                            if prob in first_prob_list:
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_info.write(info)
                                train_info.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                            elif prob in last_prob_list:
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_info.write(info)
                                test_info.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    # val_file.close()
    # val_info.close()
    # val_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_finetuning_IS_files_old(data_processor, opts):
    '''
    Used for L@S paper. This function gathers first three problems of each student.
    Only two strategies were defined as:
    0: non-opt strategy
    1: opt used strategy

    Writes each student's first-three-problem sequences (Initial Strategies)
    plus info/label/correctness/ground-truth files under an "IS/" sub-folder
    of the configured train paths. `opts` is deep-copied and not mutated.
    '''
    # Redirect every train*/test* path into an "IS/" sub-directory without
    # mutating the caller's options object.
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0]+"/"+f_path[1]+"/IS/"+f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")
    # NOTE(review): `steps` is loaded but never used below; kept so that a
    # missing pickle still fails loudly, as before.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl", "rb"))
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # BUG FIX: sort_values() without inplace/assignment was a
                    # no-op, so "first three problems" followed file order
                    # instead of time order. Keep the sorted frame.
                    student_groups = student_groups.sort_values(by="Time")
                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    if len(prob_list) < 3:
                        continue
                    first_prob_list = prob_list[:3]
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # Keep only each student's first three problems (IS).
                        if not prob in first_prob_list:
                            continue
                        step_names_token = []
                        time_stamps = list(prob_groups["Time"])
                        # Timestamps following the previous event within 2000s;
                        # opt steps at these times are treated as auto-completed.
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i+1])
                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Non-integer etalon on an opt step marks the
                                    # "means and extremes" problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                # Skip opt steps that were auto-completed by the tutor.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                if not step_names_token or step != step_names_token[-1]:
                                    # New step; outcomes are one of
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'].
                                    step_names_token.append(step)
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: collapse outcome/help level with ":".
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        # Count steps whose collapsed outcomes contain any error/hint marker.
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1
                        if finals:
                            totals = finals
                        if step_names_token:  # and len(step_names_token) > 3
                            # Encode each step: 1 = opt_step1, 2 = opt_step2, 0 = other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Opt label "1" when any non-initial opt step appears.
                            label_opt = "0"
                            if options.opt_step1:
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "1"
                            correctness = 1 - errors/totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"
                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_opt)
                    # Blank sentinel marks the boundary between students.
                    overall_data.append('')
                    overall_labels.append('')
    # Write everything to the train split (test split was disabled upstream).
    writtenTrain = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq = all_data[0]
            strat_correct = all_data[1]
            info = all_data[2]
            me_opt = all_data[3]
            writtenTrain = True
            train_file.write(steps_seq)
            train_file.write("\n")
            train_label.write(label)
            train_label.write("\n")
            trainr_label.write(strat_correct)
            trainr_label.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_gt_label.write(me_opt)
            train_gt_label.write("\n")
        else:
            # Sentinel hit: emit one empty line per student boundary.
            if writtenTrain:
                writtenTrain = False
                train_file.write("\n")
                train_info.write("\n")
                train_label.write("\n")
                trainr_label.write("\n")
                train_gt_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    trainr_label.close()
    train_gt_label.close()
def prepare_finetuning_FS_files_old(data_processor, opts):
    '''
    Used for L@S paper. This function gathers last three problems of each student.
    Only two strategies were defined as:
    0: non-opt strategy
    1: opt used strategy

    Writes each student's last-three-problem sequences (Final Strategies)
    plus info/label/correctness/ground-truth files under an "FS/" sub-folder
    of the configured train paths. `opts` is deep-copied and not mutated.
    '''
    # Redirect every train*/test* path into an "FS/" sub-directory without
    # mutating the caller's options object.
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0]+"/"+f_path[1]+"/FS/"+f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")
    # NOTE(review): `steps` is loaded but never used below; kept so that a
    # missing pickle still fails loudly, as before.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl", "rb"))
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # BUG FIX: sort_values() without inplace/assignment was a
                    # no-op, so "last three problems" followed file order
                    # instead of time order. Keep the sorted frame.
                    student_groups = student_groups.sort_values(by="Time")
                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    if len(prob_list) < 3:
                        continue
                    last_prob_list = prob_list[-3:]
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # Keep only each student's last three problems (FS).
                        if not prob in last_prob_list:
                            continue
                        step_names_token = []
                        time_stamps = list(prob_groups["Time"])
                        # Timestamps following the previous event within 2000s;
                        # opt steps at these times are treated as auto-completed.
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i+1])
                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Non-integer etalon on an opt step marks the
                                    # "means and extremes" problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                # Skip opt steps that were auto-completed by the tutor.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                if not step_names_token or step != step_names_token[-1]:
                                    # New step; outcomes are one of
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'].
                                    step_names_token.append(step)
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: collapse outcome/help level with ":".
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        # Count steps whose collapsed outcomes contain any error/hint marker.
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1
                        if finals:
                            totals = finals
                        if step_names_token:  # and len(step_names_token) > 3
                            # Encode each step: 1 = opt_step1, 2 = opt_step2, 0 = other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Opt label "1" when any non-initial opt step appears.
                            label_opt = "0"
                            if options.opt_step1:
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "1"
                            correctness = 1 - errors/totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"
                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_opt)
                    # Blank sentinel marks the boundary between students.
                    overall_data.append('')
                    overall_labels.append('')
    # Write everything to the train split (test split was disabled upstream).
    writtenTrain = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq = all_data[0]
            strat_correct = all_data[1]
            info = all_data[2]
            me_opt = all_data[3]
            writtenTrain = True
            train_file.write(steps_seq)
            train_file.write("\n")
            train_label.write(label)
            train_label.write("\n")
            trainr_label.write(strat_correct)
            trainr_label.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_gt_label.write(me_opt)
            train_gt_label.write("\n")
        else:
            # Sentinel hit: emit one empty line per student boundary.
            if writtenTrain:
                writtenTrain = False
                train_file.write("\n")
                train_info.write("\n")
                train_label.write("\n")
                trainr_label.write("\n")
                train_gt_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    trainr_label.close()
    train_gt_label.close()
def prepare_finetuning_correctness_files(data_processor, options):
    '''
    Ongoing research. Student strategy learning/predicting.
    FinalAnswer step
    Correct: 1 , correctness of final strategy > 0.75
    Incorrect: 0 , else < 0.75

    Collects one labelled step sequence per completed problem, then splits
    them into a class-balanced ~10% train sample and the remaining test set.
    '''
    chunk_iterator = data_processor.load_file_iterator()
    # Output handles; closed at the end of the function (not exception-safe).
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    # val_file = open(options.val_file_path, "w")
    # val_info = open(options.val_info_path, "w")
    # val_label = open(options.val_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            # NOTE(review): workspace is hard-coded here; sibling functions
            # filter on options.workspace_name instead — confirm intent.
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        # Count distinct non-opt steps; skip trivially short problems.
                        unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        if unique_steps_len < 4:
                            continue
                        step_names_token = []
                        time_stamps = list(prob_groups["Time"])
                        # Timestamps following the previous event within 1800s;
                        # opt steps at these times are treated as auto-completed.
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i+1])
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        final_correct = 0
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Non-integer etalon on an opt step marks the
                                    # "means and extremes" problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception as e:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                            # break
                                        except Exception as e:
                                            pass
                                # Skip opt steps that were auto-completed by the tutor.
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue
                                # if not step_names_token or step != step_names_token[-1]:
                                #     step_names_token.append(step)
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                else:
                                    # Repeated step: collapse outcome/help level with ":".
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                                # Count FinalAnswer rows; used below to label the sequence.
                                if step == "FinalAnswer":
                                    final_correct += 1
                        # Re-count distinct non-opt steps after dedup/auto-complete filtering.
                        unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)]))
                        # 4 and more in sequence
                        if step_names_token and unique_steps_len > 4:
                            # Encode each step: 1 = opt_step1, 2 = opt_step2, 0 = other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Label "1" only when exactly one FinalAnswer row occurred
                            # (presumably a single, first-attempt answer — TODO confirm).
                            label_opt = "0"
                            if final_correct == 1:
                                label_opt = "1"
                            # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                            info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                             "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             f"{1 if means_and_extremes else 0}"])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(label_opt)
    # overall_data.append('')
    # overall_labels.append('')
    # Balanced sampling: 10% of all sequences, half from each class.
    # NOTE(review): random.sample raises ValueError if either class has fewer
    # than sample_size members — confirm class sizes upstream.
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len/2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # writtenTrain = False
    # writtenTest = False
    # Sampled indices go to train; everything else goes to test.
    # (Membership test on a list is O(n) per lookup.)
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        else:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
            # else:
            #     val_file.write(steps_seq)
            #     val_file.write("\n")
            #     val_info.write(info)
            #     val_info.write("\n")
            #     val_label.write(label)
            #     val_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    # val_file.close()
    # val_info.close()
    # val_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
| def prepare_finetuning_correctness_files_old(data_processor, opts): | |
| ''' | |
| Ongoing research. Student strategy learning/predicting. | |
| Correct, 1: correctness of final strategy > 0.75 | |
| Incorrect, 0: else < 0.75 | |
| ''' | |
| options = copy.deepcopy(opts) | |
| for k,v in vars(opts).items(): | |
| if k.startswith("train") or k.startswith("test"): | |
| if v: | |
| f_path = v.split("/") | |
| f_path = f_path[0]+"/"+f_path[1]+"/fa_correctness/"+f_path[2] | |
| # f_path = f_path[0]+"/"+f_path[1]+"/check2/"+f_path[2] | |
| setattr(options, f"{k}", f_path) | |
| print(f"options.{k} : {getattr(options, f'{k}')}") | |
| steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) | |
| chunk_iterator = data_processor.load_file_iterator() | |
| train_file = open(options.train_file_path, "w") | |
| train_info = open(options.train_info_path, "w") | |
| train_label = open(options.train_label_path, "w") | |
| # trainr_label = open(options.trainr_label_path, "w") | |
| # train_gt_label = open(options.train_gt_label_path, "w") | |
| test_file = open(options.test_file_path, "w") | |
| test_info = open(options.test_info_path, "w") | |
| test_label = open(options.test_label_path, "w") | |
| # testr_label = open(options.testr_label_path, "w") | |
| # test_gt_label = open(options.test_gt_label_path, "w") | |
| ws = "_".join(options.workspace_name.split("_")[:-1]) | |
| print("Workspace: ", ws) | |
| overall_data = [] | |
| overall_labels = [] | |
| for chunk_data in chunk_iterator: | |
| for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): | |
| if ws == section: | |
| for student, student_groups in section_groups.groupby("Anon Student Id"): | |
| student_groups.sort_values(by="Time") | |
| prob_list = list(pd.unique(student_groups["Problem Name"])) | |
| # if len(prob_list) < 3: | |
| # continue | |
| # first_prob_list = prob_list[:3] | |
| # last_prob_list = prob_list[-3:] | |
| # # print(len(first_prob_list), len(last_prob_list)) | |
| # final_prob_list = first_prob_list + last_prob_list | |
| # print(len(prob_list), len(final_prob_list)) #, final_prob_list) | |
| for prob, prob_groups in student_groups.groupby("Problem Name"): | |
| # For first 3 and last 3 only | |
| # if not prob in last_prob_list: | |
| # continue | |
| # print(options.final_step in list(prob_groups["Step Name"])) | |
| # if not (options.final_step in list(prob_groups["Step Name"])): | |
| # continue | |
| step_names_token = [] | |
| time_stamps = list(prob_groups["Time"]) | |
| time_stamps_list = set() | |
| for i in range(len(time_stamps)-1): | |
| if (time_stamps[i+1] - time_stamps[i]) < 2000: | |
| time_stamps_list.add(time_stamps[i+1]) | |
| progress = "" | |
| outcome = [] | |
| help_level = [] | |
| auto_complete = False | |
| means_and_extremes = False | |
| # finals = len(options.final_step) | |
| totals = 0 | |
| for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): | |
| step = row["Step Name"] | |
| etalon = row["CF (Etalon)"] | |
| progress = row["CF (Workspace Progress Status)"] | |
| if not pd.isna(step): | |
| if step in options.opt_step1: | |
| try: | |
| etalon = int(etalon) | |
| except Exception as e: | |
| try: | |
| etalon = float(etalon) | |
| means_and_extremes = True | |
| # break | |
| except Exception as e: | |
| pass | |
| if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: | |
| # if row["Time"] in time_stamps_list: | |
| auto_complete = True | |
| # print(row) | |
| continue | |
| # if not step_names_token or step != step_names_token[-1]: | |
| # step_names_token.append(step) | |
| if not step_names_token or step != step_names_token[-1]: | |
| step_names_token.append(step) | |
| # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] | |
| outcome.append(row['Outcome']) | |
| help_level.append(str(row["Help Level"])) | |
| # if finals == 0: | |
| # totals += 1 | |
| else: | |
| outcome[-1] = outcome[-1]+":"+row['Outcome'] | |
| help_level[-1] = help_level[-1]+":"+str(row['Help Level']) | |
| error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] | |
| errors = 0 | |
| for step, out in zip(step_names_token, outcome): | |
| if (step in options.final_step):# or totals > 0: | |
| out = out.split(":") | |
| totals = len(out) | |
| # print(totals) | |
| for ind in error_ind: | |
| if ind in out: | |
| errors +=1 | |
| # if finals: | |
| # totals = finals | |
| # 4 and more in sequence | |
| if step_names_token and totals>0: # and len(step_names_token) > 3 | |
| where_opt = [] | |
| for stp in step_names_token: | |
| if stp in options.opt_step1: | |
| where_opt.append("1") | |
| elif stp in options.opt_step2: | |
| where_opt.append("2") | |
| else: | |
| where_opt.append("0") | |
| label_opt = "0" | |
| if options.opt_step1: | |
| all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) | |
| any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) | |
| if any_opt1: | |
| label_opt = "2" | |
| if all_opt1: | |
| label_opt = "1" | |
| if options.opt_step2: | |
| all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) | |
| any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) | |
| if any_opt2: | |
| label_opt = "4" | |
| if all_opt2: | |
| label_opt = "3" | |
| if any_opt1 and any_opt2: | |
| label_opt = "5" | |
| if any_opt1 and all_opt2: | |
| label_opt = "6" | |
| if all_opt1 and any_opt2: | |
| label_opt = "7" | |
| if all_opt1 and all_opt2: | |
| label_opt = "8" | |
| correctness = 1 - errors/totals | |
| strat_correct = "0" | |
| if correctness > 0.75: | |
| strat_correct = "1" | |
| # if not means_and_extremes and label_opt == "2": | |
| # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length | |
| info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}"]) | |
| overall_data.append(["\t".join(step_names_token), label_opt, info]) | |
| overall_labels.append(strat_correct) | |
| overall_data.append('') | |
| overall_labels.append('') | |
| overall_labels = np.array(overall_labels, dtype=str) | |
| indices_of_zeros = list(np.where(overall_labels == '0')[0]) | |
| indices_of_ones = list(np.where(overall_labels == '1')[0]) | |
| per = 0.20 | |
| zeros_instances_size = int(per * len(indices_of_zeros)) | |
| ones_instances_size = int(per * len(indices_of_ones)) | |
| sample_size = min(zeros_instances_size, ones_instances_size) | |
| sampled_instances = random.sample(indices_of_zeros, sample_size) | |
| sampled_instances.extend(random.sample(indices_of_ones, sample_size)) | |
| writtenTrain = False | |
| writtenTest = False | |
| for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): | |
| if all_data: | |
| steps_seq = all_data[0] | |
| label_opt = all_data[1] | |
| info = all_data[2] | |
| # me_opt = all_data[3] | |
| if index in sampled_instances: | |
| writtenTrain = True | |
| train_file.write(steps_seq) | |
| train_file.write("\n") | |
| train_label.write(label) | |
| train_label.write("\n") | |
| # trainr_label.write(label_opt) | |
| # trainr_label.write("\n") | |
| train_info.write(info) | |
| train_info.write("\n") | |
| # train_gt_label.write(me_opt) | |
| # train_gt_label.write("\n") | |
| else: | |
| writtenTest = True | |
| test_file.write(steps_seq) | |
| test_file.write("\n") | |
| # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length | |
| test_label.write(label) | |
| test_label.write("\n") | |
| # testr_label.write(str(correctness)) | |
| # testr_label.write(label_opt) | |
| # testr_label.write("\n") | |
| test_info.write(info) | |
| test_info.write("\n") | |
| # test_gt_label.write(me_opt) | |
| # test_gt_label.write("\n") | |
| else: | |
| # Indicates actions of next student | |
| # Indicates next problem | |
| if writtenTrain: | |
| writtenTrain = False | |
| train_file.write("\n") | |
| train_info.write("\n") | |
| train_label.write("\n") | |
| # trainr_label.write("\n") | |
| # train_gt_label.write("\n") | |
| if writtenTest: | |
| writtenTest = False | |
| test_file.write("\n") | |
| test_info.write("\n") | |
| test_label.write("\n") | |
| # testr_label.write("\n") | |
| # test_gt_label.write("\n") | |
| train_file.close() | |
| train_info.close() | |
| train_label.close() | |
| # trainr_label.close() | |
| # train_gt_label.close() | |
| test_file.close() | |
| test_info.close() | |
| test_label.close() | |
| # testr_label.close() | |
| # test_gt_label.close() | |
def prepare_finetuning_correctness_aaai_files(data_processor, opts):
    '''
    Ongoing research. Student strategy learning/predicting.
    Correct, 1: correctness of final strategy > 0.75
    Incorrect, 0: else < 0.75

    Builds train/val/test sequence, label, and info files for a binary
    correctness task. Instances are routed by pre-computed performer cohorts
    (loaded from pickles): high performers -> train, mid -> val, low -> test.
    `opts` is never mutated; a deep copy has its paths redirected into the
    "<...>/aaai/" sub-folder.
    '''
    options = copy.deepcopy(opts)
    # Redirect every train*/val*/test* output path into the "aaai" sub-folder.
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test") or k.startswith("val"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0] + "/" + f_path[1] + "/aaai/" + f_path[2]
                setattr(options, k, f_path)
                print(f"options.{k} : {getattr(options, k)}")
    chunk_iterator = data_processor.load_file_iterator()
    # Output streams: step sequences, per-instance metadata, correctness labels.
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")
    val_label = open(options.val_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    # Pre-computed student cohorts and the problem subset used for this study.
    high_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_high_performers.pkl", "rb"))
    mid_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_mid_performers.pkl", "rb"))
    low_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_low_performers.pkl", "rb"))
    prob_sel_list = pickle.load(open(f"{options.workspace_name}/aaai/change3_problem_list.pkl", "rb"))
    # Workspace id without its trailing "_<suffix>" token.
    ws = "_".join(options.workspace_name.split("_")[:-1])
    print(ws, len(high_performer), len(mid_performer), len(low_performer), len(prob_sel_list))
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if ws == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    if student in high_performer or student in mid_performer or student in low_performer:
                        # BUG FIX: sort_values() is not in-place; the previous code
                        # discarded the sorted frame, leaving events unordered.
                        student_groups = student_groups.sort_values(by="Time")
                        for prob, prob_groups in student_groups.groupby("Problem Name"):
                            # Only the study's selected problems are kept.
                            if prob not in prob_sel_list:
                                continue
                            step_names_token = []
                            # Events that follow the previous one by < 2000
                            # (presumably milliseconds -- TODO confirm units) are
                            # treated as auto-completed and skipped below.
                            time_stamps = list(prob_groups["Time"])
                            time_stamps_list = set()
                            for i in range(len(time_stamps) - 1):
                                if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                    time_stamps_list.add(time_stamps[i + 1])
                            progress = ""
                            outcome = []
                            help_level = []
                            auto_complete = False
                            means_and_extremes = False
                            totals = 0
                            for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                                step = row["Step Name"]
                                etalon = row["CF (Etalon)"]
                                progress = row["CF (Workspace Progress Status)"]
                                if not pd.isna(step):
                                    if step in options.opt_step1:
                                        # Etalon parse probe: an etalon that parses as
                                        # float but not int flags a Means-and-Extremes
                                        # problem variant.
                                        try:
                                            etalon = int(etalon)
                                        except (TypeError, ValueError):
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except (TypeError, ValueError):
                                                pass
                                    if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                        # Optional step fired automatically; skip it.
                                        auto_complete = True
                                        continue
                                    if not step_names_token or step != step_names_token[-1]:
                                        step_names_token.append(step)
                                        # Outcome values:
                                        # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                        outcome.append(row['Outcome'])
                                        help_level.append(str(row["Help Level"]))
                                    else:
                                        # Repeated step: accumulate ":"-separated
                                        # outcome/help-level histories on the last entry.
                                        outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                        help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])
                            error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            errors = 0
                            # Correctness is judged on final steps only: totals is the
                            # attempt count on the final step, errors counts each error
                            # kind present among those attempts.
                            for step, out in zip(step_names_token, outcome):
                                if step in options.final_step:
                                    out = out.split(":")
                                    totals = len(out)
                                    for ind in error_ind:
                                        if ind in out:
                                            errors += 1
                            if step_names_token and totals > 0:
                                # Per-step location code:
                                # "1" -> opt_step1, "2" -> opt_step2, "0" -> other.
                                where_opt = []
                                for stp in step_names_token:
                                    if stp in options.opt_step1:
                                        where_opt.append("1")
                                    elif stp in options.opt_step2:
                                        where_opt.append("2")
                                    else:
                                        where_opt.append("0")
                                correctness = 1 - errors / totals
                                strat_correct = "0"
                                if correctness > 0.75:
                                    strat_correct = "1"
                                # info fields: progress, correctness, ME flag, problem,
                                # student, auto-complete flag, steps length, outcome seq,
                                # help-level seq, opt-step encoding.
                                info = ",".join([str(progress), str(correctness), f"{1 if means_and_extremes else 0}", str(prob), str(student), str(auto_complete), str(len(step_names_token)), "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])
                                overall_data.append(["\t".join(step_names_token), info])
                                overall_labels.append(strat_correct)
    overall_labels = np.array(overall_labels)
    # Route each instance by its student's performance cohort.
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq = all_data[0]
            info = all_data[1]
            student = info.split(",")[4]  # student id is field 4 of info (see above)
            if student in high_performer:
                train_file.write(steps_seq)
                train_file.write("\n")
                train_label.write(label)
                train_label.write("\n")
                train_info.write(info)
                train_info.write("\n")
            elif student in mid_performer:
                val_file.write(steps_seq)
                val_file.write("\n")
                val_label.write(label)
                val_label.write("\n")
                val_info.write(info)
                val_info.write("\n")
            elif student in low_performer:
                test_file.write(steps_seq)
                test_file.write("\n")
                test_label.write(label)
                test_label.write("\n")
                test_info.write(info)
                test_info.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    val_file.close()
    val_info.close()
    val_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_finetuning_SL_files(data_processor, opts):
    '''
    Ongoing research. Student strategy learning/predicting.
    We have defined 9 strategy as:
        Notation; Label
        UU; 0
        CU; 1
        PU; 2
        UC; 3
        UP; 4
        PP; 5
        PC; 6
        CP; 7
        CC; 8

    The first letter encodes opt_step1 usage and the second opt_step2 usage;
    from the label assignment below, C corresponds to all sub-steps observed,
    P to a partial subset, U to unused -- TODO confirm the letter meanings.
    Writes step-sequence, info, strategy-label, correctness-label (r) and
    ER/ME ground-truth (gt) files for a balanced train split and a test split.
    '''
    options = copy.deepcopy(opts)
    # Redirect every train*/test* output path into the "SL/" sub-folder;
    # the caller's opts is left untouched.
    for k,v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0]+"/"+f_path[1]+"/SL/"+f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")
    # NOTE(review): `steps` is loaded but never used in this function.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb"))
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # NOTE(review): sort_values() is NOT in-place; the sorted
                    # result is discarded, so rows keep their original order.
                    # Compare the pretraining variant, which passes inplace=True.
                    student_groups.sort_values(by="Time")
                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    # if len(prob_list) < 3:
                    # continue
                    # first_prob_list = prob_list[:3]
                    # last_prob_list = prob_list[-3:]
                    # # print(len(first_prob_list), len(last_prob_list))
                    # final_prob_list = first_prob_list + last_prob_list
                    # print(len(prob_list), len(final_prob_list)) #, final_prob_list)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # For first 3 and last 3 only
                        # if not prob in last_prob_list:
                        # continue
                        step_names_token = []
                        # Events following the previous one by < 2000 (presumably
                        # milliseconds -- TODO confirm units) are treated as
                        # tutor-auto-completed and skipped below.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i+1])
                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        # finals: count of designated final steps; when zero,
                        # every newly recorded step increments `totals` instead.
                        finals = len(options.final_step)
                        totals = 0
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Etalon parse probe: float-but-not-int etalon
                                    # flags a Means-and-Extremes problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception as e:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                            # break
                                        except Exception as e:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    # if row["Time"] in time_stamps_list:
                                    # Optional step fired automatically; skip it.
                                    auto_complete = True
                                    # print(row)
                                    continue
                                # if not step_names_token or step != step_names_token[-1]:
                                # step_names_token.append(step)
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: accumulate ":"-separated
                                    # outcome/help-level histories on the last entry.
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        # One error per step whose outcome history contains any
                        # error/hint marker (final steps only when finals > 0).
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors +=1
                        if finals:
                            totals = finals
                        # 4 and more in sequence
                        if step_names_token: # and len(step_names_token) > 3
                            # Per-step location code:
                            # "1" -> opt_step1, "2" -> opt_step2, "0" -> other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Strategy label per the docstring table:
                            # any_opt* = some non-first sub-steps seen (partial),
                            # all_opt* = every sub-step seen (complete).
                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"
                            correctness = 1 - errors/totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"
                            # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                            info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_opt)
                    # Empty sentinel marks the boundary between students.
                    overall_data.append('')
                    overall_labels.append('')
    overall_labels = np.array(overall_labels)
    # Balanced train sampling: 20% of each of the 9 strategy classes, capped at
    # the rarest class's 20% size; sampled indices go to train, rest to test.
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    indices_of_twos = list(np.where(overall_labels == '2')[0])
    indices_of_threes = list(np.where(overall_labels == '3')[0])
    indices_of_fours = list(np.where(overall_labels == '4')[0])
    indices_of_fives = list(np.where(overall_labels == '5')[0])
    indices_of_sixes = list(np.where(overall_labels == '6')[0])
    indices_of_sevens = list(np.where(overall_labels == '7')[0])
    indices_of_eights = list(np.where(overall_labels == '8')[0])
    per = 0.20
    zeros_instances_size = int(per * len(indices_of_zeros))
    ones_instances_size = int(per * len(indices_of_ones))
    twos_instances_size = int(per * len(indices_of_twos))
    threes_instances_size = int(per * len(indices_of_threes))
    fours_instances_size = int(per * len(indices_of_fours))
    fives_instances_size = int(per * len(indices_of_fives))
    sixes_instances_size = int(per * len(indices_of_sixes))
    sevens_instances_size = int(per * len(indices_of_sevens))
    eights_instances_size = int(per * len(indices_of_eights))
    sample_size = min(zeros_instances_size, ones_instances_size, twos_instances_size, threes_instances_size, fours_instances_size, fives_instances_size, sixes_instances_size, sevens_instances_size, eights_instances_size)
    print(f"Sample size.... {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    sampled_instances.extend(random.sample(indices_of_twos, sample_size))
    sampled_instances.extend(random.sample(indices_of_threes, sample_size))
    sampled_instances.extend(random.sample(indices_of_fours, sample_size))
    sampled_instances.extend(random.sample(indices_of_fives, sample_size))
    sampled_instances.extend(random.sample(indices_of_sixes, sample_size))
    sampled_instances.extend(random.sample(indices_of_sevens, sample_size))
    sampled_instances.extend(random.sample(indices_of_eights, sample_size))
    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq = all_data[0]
            strat_correct = all_data[1]
            info = all_data[2]
            me_opt = all_data[3]
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq)
                train_file.write("\n")
                train_label.write(label)
                train_label.write("\n")
                trainr_label.write(strat_correct)
                trainr_label.write("\n")
                train_info.write(info)
                train_info.write("\n")
                train_gt_label.write(me_opt)
                train_gt_label.write("\n")
            else:
                writtenTest = True
                test_file.write(steps_seq)
                test_file.write("\n")
                # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                test_label.write(label)
                test_label.write("\n")
                # testr_label.write(str(correctness))
                testr_label.write(strat_correct)
                testr_label.write("\n")
                test_info.write(info)
                test_info.write("\n")
                test_gt_label.write(me_opt)
                test_gt_label.write("\n")
        else:
            # Indicates actions of next student
            # Indicates next problem
            # Mirror the student-boundary sentinel into whichever split(s)
            # received data since the last boundary.
            if writtenTrain:
                writtenTrain = False
                train_file.write("\n")
                train_info.write("\n")
                train_label.write("\n")
                trainr_label.write("\n")
                train_gt_label.write("\n")
            if writtenTest:
                writtenTest = False
                test_file.write("\n")
                test_info.write("\n")
                test_label.write("\n")
                testr_label.write("\n")
                test_gt_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    trainr_label.close()
    train_gt_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
    testr_label.close()
    test_gt_label.close()
def prepare_finetuning_effectiveness_files(data_processor, opts):
    '''
    Ongoing research. Student strategy learning/predicting.
    We have defined 9 strategy as:
        Notation; Label
        UU; 0
        CU; 1
        PU; 2
        UC; 3
        UP; 4
        PP; 5
        PC; 6
        CP; 7
        CC; 8
    if UU and CU and PU and gt = ER and correct, a positive instance
    if UU and UC and UP and gt = ME and correct, a positive instance
    else a strategy PP, PC, CP, CC and gt = ER/ME or incorrect, a negative instance

    Binary effectiveness labels: a strategy is effective ("1") when its
    opt-step usage matches the problem's ground truth (ER vs ME) and the
    final answer is correct; otherwise "0". Writes balanced train and test
    sequence/info/label/correctness/gt files under "effectiveness/".
    '''
    options = copy.deepcopy(opts)
    # Redirect every train*/test* output path into "effectiveness/";
    # the caller's opts is left untouched.
    for k,v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0]+"/"+f_path[1]+"/effectiveness/"+f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")
    # NOTE(review): `steps` is loaded but never used in this function.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb"))
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # NOTE(review): sort_values() is NOT in-place; the sorted
                    # result is discarded, so rows keep their original order.
                    student_groups.sort_values(by="Time")
                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    # if len(prob_list) < 3:
                    # continue
                    # first_prob_list = prob_list[:3]
                    # last_prob_list = prob_list[-3:]
                    # # print(len(first_prob_list), len(last_prob_list))
                    # final_prob_list = first_prob_list + last_prob_list
                    # print(len(prob_list), len(final_prob_list)) #, final_prob_list)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # For first 3 and last 3 only
                        # if not prob in last_prob_list:
                        # continue
                        step_names_token = []
                        # Events following the previous one by < 2000 (presumably
                        # milliseconds -- TODO confirm units) count as
                        # tutor-auto-completed and are skipped below.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps)-1):
                            if (time_stamps[i+1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i+1])
                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        # finals: count of designated final steps; when zero,
                        # every newly recorded step increments `totals` instead.
                        finals = len(options.final_step)
                        totals = 0
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Etalon parse probe: float-but-not-int etalon
                                    # flags a Means-and-Extremes (ME) problem; the
                                    # default is Equivalent Ratios (ER).
                                    try:
                                        etalon = int(etalon)
                                    except Exception as e:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                            # break
                                        except Exception as e:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    # if row["Time"] in time_stamps_list:
                                    # Optional step fired automatically; skip it.
                                    auto_complete = True
                                    # print(row)
                                    continue
                                # if not step_names_token or step != step_names_token[-1]:
                                # step_names_token.append(step)
                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: accumulate ":"-separated
                                    # outcome/help-level histories on the last entry.
                                    outcome[-1] = outcome[-1]+":"+row['Outcome']
                                    help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        # One error per step whose outcome history contains any
                        # error/hint marker (final steps only when finals > 0).
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors +=1
                        if finals:
                            totals = finals
                        # 4 and more in sequence
                        if step_names_token: # and len(step_names_token) > 3
                            # Per-step location code:
                            # "1" -> opt_step1, "2" -> opt_step2, "0" -> other.
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # Strategy label per the docstring table:
                            # any_opt* = some non-first sub-steps seen (partial),
                            # all_opt* = every sub-step seen (complete).
                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"
                            correctness = 1 - errors/totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"
                            # Effective = strategy family matches the problem's
                            # ground truth (ER: labels 0/1/2, ME: labels 0/3/4)
                            # AND the attempt was correct.
                            label_effectiveness = "0"
                            if label_opt in ["0", "1", "2"] and not means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"
                            elif label_opt in ["0", "3", "4"] and means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"
                            # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                            info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), label_opt, f"{1 if means_and_extremes else 0}"])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_effectiveness)
                    # Empty sentinel marks the boundary between students.
                    overall_data.append('')
                    overall_labels.append('')
    overall_labels = np.array(overall_labels)
    # Balanced train sampling: 20% of each class, capped at the rarer class's
    # 20% size; sampled indices go to train, the remainder to test.
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    per = 0.20
    zeros_instances_size = int(per * len(indices_of_zeros))
    ones_instances_size = int(per * len(indices_of_ones))
    sample_size = min(zeros_instances_size, ones_instances_size)
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq = all_data[0]
            strat_correct = all_data[1]
            info = all_data[2]
            me_opt = all_data[3]
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq)
                train_file.write("\n")
                train_label.write(label)
                train_label.write("\n")
                trainr_label.write(strat_correct)
                trainr_label.write("\n")
                train_info.write(info)
                train_info.write("\n")
                train_gt_label.write(me_opt)
                train_gt_label.write("\n")
            else:
                writtenTest = True
                test_file.write(steps_seq)
                test_file.write("\n")
                # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length
                test_label.write(label)
                test_label.write("\n")
                # testr_label.write(str(correctness))
                testr_label.write(strat_correct)
                testr_label.write("\n")
                test_info.write(info)
                test_info.write("\n")
                test_gt_label.write(me_opt)
                test_gt_label.write("\n")
        else:
            # Indicates actions of next student
            # Indicates next problem
            # Mirror the student-boundary sentinel into whichever split(s)
            # received data since the last boundary.
            if writtenTrain:
                writtenTrain = False
                train_file.write("\n")
                train_info.write("\n")
                train_label.write("\n")
                trainr_label.write("\n")
                train_gt_label.write("\n")
            if writtenTest:
                writtenTest = False
                test_file.write("\n")
                test_info.write("\n")
                test_label.write("\n")
                testr_label.write("\n")
                test_gt_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    trainr_label.close()
    train_gt_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
    testr_label.close()
    test_gt_label.close()
| def prepare_attn_test_files(data_processor, opts): | |
| options = copy.deepcopy(opts) | |
| if options.code: | |
| new_folder = f"{options.workspace_name}/{options.code}" | |
| if not os.path.exists(new_folder): | |
| os.makedirs(new_folder) | |
| for k,v in vars(opts).items(): | |
| if k.startswith("train") or k.startswith("test"): | |
| if v: | |
| f_path = (f"/{options.code}/").join(v.split("/")) | |
| setattr(options, f"{k}", f_path) | |
| print(f"options.{k} : {getattr(options, f'{k}')}") | |
| steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) | |
| chunk_iterator = data_processor.load_file_iterator() | |
| train_file = open(options.train_file_path, "w") | |
| train_info = open(options.train_info_path, "w") | |
| if options.code != "full": | |
| test_file = open(options.test_file_path, "w") | |
| test_info = open(options.test_info_path, "w") | |
| for chunk_data in chunk_iterator: | |
| for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): | |
| if options.workspace_name == section: | |
| for student, student_groups in section_groups.groupby("Anon Student Id"): | |
| student_groups.sort_values(by="Time") | |
| prob_list = list(pd.unique(student_groups["Problem Name"])) | |
| for prob, prob_groups in student_groups.groupby("Problem Name"): | |
| step_names_token = [] | |
| time_stamps = list(prob_groups["Time"]) | |
| time_stamps_list = set() | |
| for i in range(len(time_stamps)-1): | |
| if (time_stamps[i+1] - time_stamps[i]) < 2000: | |
| time_stamps_list.add(time_stamps[i+1]) | |
| progress = "" | |
| outcome = [] | |
| help_level = [] | |
| auto_complete = False | |
| means_and_extremes = False | |
| finals = len(options.final_step) | |
| totals = 0 | |
| for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): | |
| step = row["Step Name"] | |
| etalon = row["CF (Etalon)"] | |
| progress = row["CF (Workspace Progress Status)"] | |
| if not pd.isna(step): | |
| if step in options.opt_step1: | |
| try: | |
| etalon = int(etalon) | |
| except Exception as e: | |
| try: | |
| etalon = float(etalon) | |
| means_and_extremes = True | |
| # break | |
| except Exception as e: | |
| pass | |
| if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: | |
| # if row["Time"] in time_stamps_list: | |
| auto_complete = True | |
| # print(row) | |
| continue | |
| # if not step_names_token or step != step_names_token[-1]: | |
| # step_names_token.append(step) | |
| if not step_names_token or step != step_names_token[-1]: | |
| step_names_token.append(step) | |
| # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] | |
| outcome.append(row['Outcome']) | |
| help_level.append(str(row["Help Level"])) | |
| if finals == 0: | |
| totals += 1 | |
| else: | |
| outcome[-1] = outcome[-1]+":"+row['Outcome'] | |
| help_level[-1] = help_level[-1]+":"+str(row['Help Level']) | |
| error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] | |
| errors = 0 | |
| for step, out in zip(step_names_token, outcome): | |
| if (finals and step in options.final_step) or totals > 0: | |
| out = out.split(":") | |
| if any(any(ind in o for o in out) for ind in error_ind): | |
| errors +=1 | |
| if finals: | |
| totals = finals | |
| # 4 and more in sequence | |
| if step_names_token: # and len(step_names_token) > 3 | |
| where_opt = [] | |
| for stp in step_names_token: | |
| if stp in options.opt_step1: | |
| where_opt.append("1") | |
| elif stp in options.opt_step2: | |
| where_opt.append("2") | |
| else: | |
| where_opt.append("0") | |
| label_opt = "0" | |
| if options.opt_step1: | |
| all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) | |
| any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) | |
| if any_opt1: | |
| label_opt = "2" | |
| if all_opt1: | |
| label_opt = "1" | |
| if options.opt_step2: | |
| all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) | |
| any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) | |
| if any_opt2: | |
| label_opt = "4" | |
| if all_opt2: | |
| label_opt = "3" | |
| if any_opt1 and any_opt2: | |
| label_opt = "5" | |
| if any_opt1 and all_opt2: | |
| label_opt = "6" | |
| if all_opt1 and any_opt2: | |
| label_opt = "7" | |
| if all_opt1 and all_opt2: | |
| label_opt = "8" | |
| correctness = 1 - errors/totals | |
| opt_correct = "0" | |
| if correctness > 0.75: | |
| opt_correct = "1" | |
| proba = random.random() | |
| # if proba <= 0.1: | |
| # if not means_and_extremes: | |
| # if prob in first_prob_list: | |
| if options.code == "full" or (options.code == "gt" and not means_and_extremes) or (options.code == "correct" and opt_correct == "1") or (options.code == "progress" and progress == "GRADUATED"): | |
| if label_opt == "0": | |
| continue | |
| train_file.write("\t".join(step_names_token)) | |
| train_file.write("\n") | |
| # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length | |
| train_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), | |
| "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), | |
| str(correctness), f"{1 if means_and_extremes else 0}", label_opt])) | |
| train_info.write("\n") | |
| # if means_and_extremes: | |
| # if prob in last_prob_list: | |
| else: | |
| test_file.write("\t".join(step_names_token)) | |
| test_file.write("\n") | |
| # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length | |
| test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), | |
| "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), | |
| str(correctness), f"{1 if means_and_extremes else 0}", label_opt])) | |
| test_info.write("\n") | |
| train_file.close() | |
| train_info.close() | |
| if options.code != "full": | |
| test_file.close() | |
| test_info.close() | |
def prepare_finetuning_future_files(data_processor, opts):
    """Write finetuning train/test files split by solution strategy.

    Per-problem step sequences solved with an integer etalon (equivalent
    ratios) go to the train files; sequences whose etalon was fractional
    ("means and extremes") go to the test files.  Output paths are the
    train*/test* paths from ``opts`` with an ``effectiveness`` folder
    inserted into them.

    Args:
        data_processor: preprocessor exposing ``load_file_iterator()``
            yielding pandas DataFrame chunks of the interaction log.
        opts: parsed command-line options; not mutated (a deep copy is
            modified instead).
    """
    # Work on a copy so the caller's option object is left untouched.
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                # Insert "effectiveness" between path components:
                # "dir/file" -> "dir/effectiveness/file".
                # NOTE(review): with more than one "/" in the path it is
                # inserted at every separator -- confirm all of these paths
                # have a single directory level.
                f_path = ("/effectiveness/").join(v.split("/"))
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")
    # NOTE(review): `steps` is never used below; the load is kept for its
    # side effect of failing fast when the dataset folder is missing.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl", "rb"))
    chunk_iterator = data_processor.load_file_iterator()
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    writtenTrain = False
                    writtenTest = False
                    # BUG FIX: sort_values() returns a new frame; the original
                    # call discarded the result, so rows were never actually
                    # time-ordered (the sibling preparation functions use
                    # inplace=True for the same sort).
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []
                        # Events less than 2s apart are treated as tutor
                        # auto-fill, not student actions.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])
                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        errors = 0
                        totals = 0
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                       'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # A non-integer etalon means the problem
                                    # required the "means and extremes"
                                    # (fractional) solution strategy.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    # Optional step auto-filled by the tutor; skip it.
                                    auto_complete = True
                                    continue
                                if not step_names_token or step != step_names_token[-1]:
                                    # New step: record it with its first outcome
                                    # and help level.
                                    step_names_token.append(step)
                                    # Outcome vocabulary:
                                    # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    totals += 1
                                else:
                                    # Repeated step: append to its outcome/help
                                    # history, colon-separated.
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])
                        # A step counts as one error if any of its outcomes was
                        # a bug, an error or a hint request.
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        for out in outcome:
                            out = out.split(":")
                            if any(any(ind in o for o in out) for ind in error_ind):
                                errors += 1
                        if step_names_token:
                            # Per-step encoding: "1" if the step belongs to the
                            # first optional strategy, "2" for the second, else "0".
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")
                            # label_opt: 0 none, 1 all of opt1, 2 some of opt1,
                            # 3 all of opt2, 4 some of opt2, 5-8 combinations.
                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"
                            correctness = 1 - errors / totals
                            # Bucket correctness into quartiles "0".."3" for the
                            # regression-style label file.
                            opt_correct = "0"
                            if correctness < 0.25:
                                opt_correct = "0"
                            elif correctness < 0.5:
                                opt_correct = "1"
                            elif correctness < 0.75:
                                opt_correct = "2"
                            else:
                                opt_correct = "3"
                            # NOTE(review): `proba` is unused; the call is kept
                            # so the global RNG stream is unchanged for any
                            # seeded code that runs after this function.
                            proba = random.random()
                            if not means_and_extremes:
                                # Equivalent-ratios solution -> training split.
                                writtenTrain = True
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                                trainr_label.write(opt_correct)
                                trainr_label.write("\n")
                                # info row: progress, problem name, student id,
                                # auto-complete flag, total steps length,
                                # outcome seq, help-level seq, opt encoding.
                                train_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                                           "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]))
                                train_info.write("\n")
                            if means_and_extremes:
                                # Means-and-extremes solution -> test split.
                                writtenTest = True
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")
                                testr_label.write(opt_correct)
                                testr_label.write("\n")
                                # info row: same layout as the train info row.
                                test_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
                                                          "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]))
                                test_info.write("\n")
                    # A blank line separates one student's problems from the
                    # next student's in every output file.
                    if writtenTrain:
                        train_file.write("\n")
                        train_info.write("\n")
                        train_label.write("\n")
                        trainr_label.write("\n")
                    if writtenTest:
                        test_file.write("\n")
                        test_info.write("\n")
                        test_label.write("\n")
                        testr_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    trainr_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
    testr_label.close()
def prepare_school_coded_finetuning_partial_seq_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
        Correct: 0 if attempt at step>1
                 1 if attempt at step==1

    Collects coded per-problem step sequences for the selected schools,
    labels each sequence by first-attempt correctness of FinalAnswer, then
    writes a balanced train split (10% of the data, half "0" / half "1")
    and a balanced test split drawn from the remainder.

    Args:
        data_processor: preprocessor exposing ``load_file_iterator(sep)``
            yielding pandas DataFrame chunks of the interaction log.
        options: parsed command-line options (paths, school filter,
            opt_step1/opt_step2 step lists).
    '''
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1)]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require at least 4 distinct non-optional steps and at
                        # least 2 distinct optional steps (beyond each list's
                        # first entry) among the non-autofilled rows.
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        correctness = "0"
                        opt_used = False
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            if not pd.isna(step):
                                if step in options.opt_step1 and not means_and_extremes:
                                    # Etalon comes as "{key=value}"; a value that
                                    # parses as float but not int marks the
                                    # means-and-extremes (fractional) strategy.
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception:
                                                pass
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim.
                                        new_step = step
                                        opt_used = True
                                    else:
                                        # Suffix codes the first interaction:
                                        # -2 failed attempt, -1 hint, -0 success.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step + "-2"
                                        elif "Hint" in action:
                                            new_step = step + "-1"
                                        else:
                                            new_step = step + "-0"
                                    if step != "FinalAnswer":
                                        step_names_token.append(new_step)
                                    else:
                                        step_names_token.append("FinalAnswer")
                                else:
                                    # Same step again: upgrade its suffix if the
                                    # new interaction codes "worse" (lexicographically
                                    # larger suffix).
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step + "-2"
                                        elif "Hint" in action:
                                            new_step = step + "-1"
                                        else:
                                            new_step = step + "-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if step == "FinalAnswer" and opt_used:
                                    # Label: correct only if FinalAnswer was OK
                                    # on the very first attempt.
                                    if attempt == 1 and outcome == "OK":
                                        correctness = "1"
                                    else:
                                        correctness = "0"
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # info row: school, classes, student id, progress,
                            # problem name, scenario, preferred ER(0)/ME(1),
                            # total steps length, original
                            # step-action-attempt-help_level-outcome sequence.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                             f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                             "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # Train split: 10% of all sequences, balanced across the two labels.
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len / 2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # PERF FIX: membership tests below were O(n) list scans inside loops
    # (quadratic overall); a set makes each test O(1).  Sampling above still
    # uses the lists, so the RNG stream and the chosen indices are unchanged.
    sampled_set = set(sampled_instances)
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_set]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_set]
    # Test split: as many of each label as the rarer one allows.
    balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    test_sampled_set = set(test_sampled_instances)
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_set:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_set:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_school_coded_finetuning_opts_files(data_processor, options):
    '''
    Ongoing research.
    Labels:
        0 - Opt 1
        1 - Opt 2
        2 - Both Opt

    Collects coded per-problem step sequences for the selected schools,
    labels each sequence by which optional strategy (or both) the student
    used, then writes a balanced train split (10% of the data, a third per
    label) and a balanced test split drawn from the remainder.

    Args:
        data_processor: preprocessor exposing ``load_file_iterator(sep)``
            yielding pandas DataFrame chunks of the interaction log.
        options: parsed command-line options (paths, school filter,
            opt_step1/opt_step2 step lists).
    '''
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1)]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require at least 4 distinct non-optional steps and at
                        # least 2 distinct optional steps (beyond each list's
                        # first entry) among the non-autofilled rows.
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        print(unique_steps, unique_opt_steps_len)
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        opt1_used = False
                        opt2_used = False
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            if not pd.isna(step):
                                if step in options.opt_step1 and not means_and_extremes:
                                    # Etalon comes as "{key=value}"; a value that
                                    # parses as float but not int marks the
                                    # means-and-extremes (fractional) strategy.
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception:
                                                pass
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # BUG FIX: was opt_step2[2:], inconsistent
                                        # with opt_step1[1:] above and with the
                                        # unique_opt_steps_len filter that counts
                                        # opt_step2[1:]; the second entry of
                                        # opt_step2 was never flagging opt2_used.
                                        elif step in options.opt_step2[1:]:
                                            opt2_used = True
                                    else:
                                        # Suffix codes the first interaction:
                                        # -2 failed attempt, -1 hint, -0 success.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step + "-2"
                                        elif "Hint" in action:
                                            new_step = step + "-1"
                                        else:
                                            new_step = step + "-0"
                                    step_names_token.append(new_step)
                                else:
                                    # Same step again: upgrade its suffix if the
                                    # new interaction codes "worse" (lexicographically
                                    # larger suffix).
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step + "-2"
                                        elif "Hint" in action:
                                            new_step = step + "-1"
                                        else:
                                            new_step = step + "-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Sequences that used neither optional strategy carry no
                        # label under this scheme; drop them.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # info row: school, classes, student id, progress,
                            # problem name, scenario, preferred ER(0)/ME(1),
                            # total steps length, original
                            # step-action-attempt-help_level-outcome sequence.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                             f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                             "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])
                            overall_data.append(["\t".join(step_names_token), info])
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
                            overall_labels.append(label)
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    indices_of_twos = list(np.where(overall_labels == '2')[0])
    # Train split: 10% of all sequences, balanced across the three labels.
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len / 3)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    sampled_instances.extend(random.sample(indices_of_twos, sample_size))
    # PERF FIX: membership tests below were O(n) list scans inside loops
    # (quadratic overall); a set makes each test O(1).  Sampling above still
    # uses the lists, so the RNG stream and the chosen indices are unchanged.
    sampled_set = set(sampled_instances)
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_set]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_set]
    indices_of_twos = [i for i in indices_of_twos if not i in sampled_set]
    # Test split: as many of each label as the rarest one allows.
    balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))
    test_sampled_set = set(test_sampled_instances)
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_set:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_set:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
| def prepare_school_coded_finetuning_opts_intentional_files(data_processor, options): | |
| ''' | |
| Ongoing research. | |
| Labels: | |
| 0 - Opt 1 | |
| 1 - Opt 2 | |
| 2 - Both Opt | |
| ''' | |
| chunk_iterator = data_processor.load_file_iterator(sep=",") | |
| train_file = open(options.train_file_path, "w") | |
| train_info = open(options.train_info_path, "w") | |
| train_label = open(options.train_label_path, "w") | |
| val_file = open(options.val_file_path, "w") | |
| val_info = open(options.val_info_path, "w") | |
| val_label = open(options.val_label_path, "w") | |
| test_file = open(options.test_file_path, "w") | |
| test_info = open(options.test_info_path, "w") | |
| test_label = open(options.test_label_path, "w") | |
| # overall_data = [] | |
| # overall_labels = [] | |
| # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) | |
| # kcs = [kc if not pd.isna(kc) for kc in kcs] | |
| for chunk_data in chunk_iterator: | |
| for school, school_group in chunk_data.groupby('CF (Anon School Id)'): | |
| if not options.school or school in options.school: | |
| print(f"{school} : {school_group.shape}") | |
| school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & | |
| (school_group['CF (Encounter)'] == 0) & | |
| (school_group['CF (Is Review Mode)'] == -1) ] | |
| print(f"{school} : {school_group.shape}") | |
| # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): | |
| for student, student_groups in school_group.groupby("Anon Student Id"): | |
| student_groups.sort_values(by="Time", inplace=True) | |
| # prob_list = list(pd.unique(student_groups["Problem Name"])) | |
| prob_list= list(pd.unique(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])) | |
| # prob_list = prob_list[-int(len(prob_list)/2):] | |
| if len(prob_list) == 0: | |
| continue | |
| for prob, prob_groups in student_groups.groupby("Problem Name"): | |
| # if not prob in prob_list: | |
| # continue | |
| actions = list(prob_groups["Action"]) | |
| # A problem should be completed by a student clicking Done button. | |
| if not "Done" in actions: | |
| continue | |
| unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) | |
| unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) | |
| if unique_steps_len < 4: | |
| continue | |
| unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) | |
| if unique_opt_steps_len < 2: | |
| continue | |
| # print(unique_steps, unique_opt_steps_len) | |
| class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) | |
| step_names_token = [] | |
| original_steps_actions_attempts_help_levels_outcomes = [] | |
| original_steps = [] | |
| means_and_extremes = False | |
| opt1_used = False | |
| opt2_used = False | |
| for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', | |
| 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', | |
| 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): | |
| step = row["Step Name"] | |
| action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] | |
| attempt = row["Attempt At Step"] # number | |
| outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] | |
| help_level = row["Help Level"] # number | |
| progress = row["CF (Workspace Progress Status)"] | |
| scenario = row['CF (Problem Scenario Tags)'] | |
| if not pd.isna(step): | |
| if step in options.opt_step1 and not means_and_extremes: | |
| etalon = row["CF (Etalon)"] | |
| if not pd.isna(etalon): | |
| etalon = etalon.strip('{}') | |
| key, value = etalon.split('=') | |
| etalon = value | |
| try: | |
| etalon = int(etalon) | |
| except Exception as e: | |
| try: | |
| etalon = float(etalon) | |
| means_and_extremes = True | |
| except Exception as e: | |
| pass | |
| if row['CF (Is Autofilled)'] == True: | |
| continue | |
| prev = step_names_token[-1] if step_names_token else "" | |
| prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" | |
| if not step_names_token or step != prev_step: | |
| if step in options.opt_step1 or step in options.opt_step2: | |
| new_step = step | |
| if step in options.opt_step1[1:]: | |
| opt1_used = True | |
| elif step in options.opt_step2[2:]: | |
| opt2_used = True | |
| else: | |
| if action == "Attempt" and outcome != "OK": | |
| new_step = step+"-2" | |
| elif "Hint" in action: | |
| new_step = step+"-1" | |
| else: | |
| new_step = step+"-0" | |
| step_names_token.append(new_step) | |
| else: | |
| if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): | |
| if action == "Attempt" and outcome != "OK": | |
| new_step = step+"-2" | |
| elif "Hint" in action: | |
| new_step = step+"-1" | |
| else: | |
| new_step = step+"-0" | |
| if prev < new_step: | |
| step_names_token[-1] = new_step | |
| original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") | |
| original_steps.append(step) | |
| # if (not opt1_used) and (not opt2_used): | |
| # continue | |
| unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) | |
| if step_names_token and unique_steps_len > 4: | |
| info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), | |
| f"{1 if means_and_extremes else 0}", str(len(step_names_token)), | |
| "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) | |
| # overall_data.append(["\t".join(step_names_token), info]) | |
| # label = None | |
| # if opt1_used and opt2_used: | |
| # label = "2" | |
| # if (not opt1_used) and opt2_used: | |
| # label = "1" | |
| # if opt1_used and (not opt2_used): | |
| # label = "0" | |
| # print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") | |
| # overall_labels.append(label) | |
| proba = random.random() | |
| # if prob in first_prob_list: | |
| if proba <= 0.8: | |
| train_file.write("\t".join(step_names_token)) | |
| train_file.write("\n") | |
| # school, class, student id, progress, problem name, scenario, | |
| # prefered ER or ME, total steps length, | |
| # original seq-action-attempt-help_level-outcome | |
| train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), | |
| f"{1 if means_and_extremes else 0}", str(len(step_names_token)), | |
| "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) | |
| train_info.write("\n") | |
| elif proba > 0.9: | |
| # elif prob in last_prob_list: | |
| test_file.write("\t".join(step_names_token)) | |
| test_file.write("\n") | |
| # school, class, student id, progress, problem name, scenario, | |
| # prefered ER or ME, total steps length, | |
| # original seq-action-attempt-help_level-outcome | |
| test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), | |
| f"{1 if means_and_extremes else 0}", str(len(step_names_token)), | |
| "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) | |
| test_info.write("\n") | |
| else: | |
| val_file.write("\t".join(step_names_token)) | |
| val_file.write("\n") | |
| # school, class, student id, progress, problem name, scenario, | |
| # prefered ER or ME, total steps length, | |
| # original seq-action-attempt-help_level-outcome | |
| val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), | |
| f"{1 if means_and_extremes else 0}", str(len(step_names_token)), | |
| "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) | |
| val_info.write("\n") | |
| # break | |
| # break | |
| # break | |
| # break | |
| # break | |
| # overall_labels = np.array(overall_labels) | |
| # indices_of_zeros = list(np.where(overall_labels == '0')[0]) | |
| # indices_of_ones = list(np.where(overall_labels == '1')[0]) | |
| # indices_of_twos = list(np.where(overall_labels == '2')[0]) | |
| # train_len = int(len(overall_labels) * 0.10) | |
| # sample_size = int(train_len/3) | |
| # print(f"sample_size: {sample_size}") | |
| # sampled_instances = random.sample(indices_of_zeros, sample_size) | |
| # sampled_instances.extend(random.sample(indices_of_ones, sample_size)) | |
| # sampled_instances.extend(random.sample(indices_of_twos, sample_size)) | |
| # indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] | |
| # indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] | |
| # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] | |
| # balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) | |
| # print(f"balanced_test: {balanced_test}") | |
| # test_sampled_instances = random.sample(indices_of_zeros, balanced_test) | |
| # test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) | |
| # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) | |
| # for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): | |
| # steps_seq = all_data[0] | |
| # info = all_data[1] | |
| # if index in sampled_instances: | |
| # train_file.write(steps_seq) | |
| # train_file.write("\n") | |
| # train_info.write(info) | |
| # train_info.write("\n") | |
| # train_label.write(label) | |
| # train_label.write("\n") | |
| # elif index in test_sampled_instances: | |
| # # proba = random.random() | |
| # # if proba <0.5: | |
| # test_file.write(steps_seq) | |
| # test_file.write("\n") | |
| # test_info.write(info) | |
| # test_info.write("\n") | |
| # test_label.write(label) | |
| # test_label.write("\n") | |
| # # else: | |
| # # val_file.write(steps_seq) | |
| # # val_file.write("\n") | |
| # # val_info.write(info) | |
| # # val_info.write("\n") | |
| # # val_label.write(label) | |
| # # val_label.write("\n") | |
| train_file.close() | |
| train_info.close() | |
| train_label.close() | |
| val_file.close() | |
| val_info.close() | |
| val_label.close() | |
| test_file.close() | |
| test_info.close() | |
| test_label.close() | |
def prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options):
    '''
    Build balanced fine-tuning train/test files labeled by FinalAnswer
    correctness after an optional strategy step was used.

    Ongoing research.

    Target label ("correctness", written to the *_label files):
        "1" if the first FinalAnswer encountered after an optional step
            (opt1/opt2) had Outcome == "OK" on its first attempt,
        "0" otherwise.

    A per-student, per-problem step sequence is kept only when:
      * the problem was completed (a "Done" action exists),
      * it has at least 4 unique non-optional, non-autofilled steps, and
      * at least 2 optional-step encounters,
      * at least one optional strategy step was actually used.

    Outputs (paths taken from `options`):
        train_file/test_file   -- tab-joined encoded step tokens, one problem per line
        train_info/test_info   -- comma-joined metadata (see `info` comment below)
        train_label/test_label -- the "correctness" target, one per line

    The train split is a class-balanced ~10% sample; the test split is a
    class-balanced sample of the remainder.
    '''
    # Sorted KC (knowledge component) vocabulary; positions index the
    # per-problem skill vectors built below.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    overall_data = []    # [encoded step sequence, info line] per kept problem
    overall_labels = []  # correctness target per kept problem
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1)]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []  # encoded tokens, e.g. "StepName-0/1/2"
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False  # True => Means-and-Extremes (ME) problem
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False  # first FinalAnswer after an opt step seen
                        correctness = "0"
                        kcs_skills = [0 for _ in kcs]         # last seen p-known per KC
                        diff_skills = [0 for _ in kcs]        # p-known change per KC
                        finalanswer_skill = [0 for _ in kcs]  # p-known at FinalAnswer per KC
                        for _, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                   'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                   'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                   'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            if not pd.isna(step):
                                if step in options.opt_step1 and not means_and_extremes:
                                    # The etalon (expected answer) encodes the problem
                                    # type: a float etalon marks an ME problem.
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception:
                                                pass
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    # New step (different from the previous token's step).
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        new_step = step  # optional steps are kept unsuffixed
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): opt_step2 skips the first TWO entries
                                        # ([2:]) while opt_step1 skips one ([1:]) — confirm intended.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Encode outcome: -2 wrong attempt, -1 hint, -0 correct/other.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        # First FinalAnswer after any opt step decides the target.
                                        if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                            final_after_opts = True
                                            if outcome == "OK":
                                                correctness = "1"
                                    step_names_token.append(new_step)
                                else:
                                    # Repeat of the same step: keep the "worst" suffix
                                    # (string order "-0" < "-1" < "-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    kc_idx = np.argwhere(kcs == kc).flatten()[0]
                                    kcs_skills[kc_idx] = prev_skill
                                    # NOTE(review): stores the p-known DROP (prev - new);
                                    # confirm the sign is intended.
                                    diff_skills[kc_idx] = prev_skill - curr_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[kc_idx] = prev_skill
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Keep only problems where an optional strategy step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # 3-way opt-usage label, stored in the info line (not the target):
                            # "0" opt1 only, "1" opt2 only, "2" both.
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # info columns: school, class ids, student, progress, problem,
                            # scenario, ER/ME flag, sequence length, original
                            # step-action-attempt-help-outcome details, opt-usage label,
                            # KC skills, skill diffs, FinalAnswer skills.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                             f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                             "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                             "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                             "\t".join(map(str, finalanswer_skill))])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)

    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    train_len = int(len(overall_labels) * 0.10)
    # Cap by class sizes so random.sample cannot raise ValueError when one
    # class has fewer instances than the requested sample.
    sample_size = min(int(train_len / 2), len(indices_of_zeros), len(indices_of_ones))
    print(f"sample_size: {sample_size}")
    # Class-balanced ~10% train sample.
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    sampled_set = set(sampled_instances)  # O(1) membership for filters/loop below
    indices_of_zeros = [i for i in indices_of_zeros if i not in sampled_set]
    indices_of_ones = [i for i in indices_of_ones if i not in sampled_set]
    # Class-balanced test sample from the remainder.
    balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    test_sampled_set = set(test_sampled_instances)
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_set:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_set:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options):
    '''
    Build fine-tuning train/test files labeled by FinalAnswer correctness
    after an optional strategy step, tracking skill change ACROSS problems.

    Ongoing research.

    Same per-problem filtering and "correctness" target as
    prepare_school_coded_finetuning_correctness_after_opts_files, with two
    differences:
      * the train/test split is done per STUDENT (each student is assigned
        to train or test with probability ~0.5, so all of a student's
        problems land in the same split), and
      * diff_skills compares each KC's p-known against the value carried
        over from the student's PREVIOUS problems (prev_kcs_skills), not
        against the same row's new p-known.

    Outputs (paths taken from `options`):
        train_file/test_file   -- tab-joined encoded step tokens, one problem per line
        train_info/test_info   -- comma-joined metadata (see `info` comment below)
        train_label/test_label -- the "correctness" target, one per line
    '''
    # Sorted KC (knowledge component) vocabulary; positions index the
    # per-problem skill vectors built below.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    train_data = []    # [encoded step sequence, info line] for train students
    train_labels = []
    test_data = []     # same, for test students
    test_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1)]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    # Per-student split: ~50% of students go to the test set.
                    train = True
                    proba = random.random()
                    if proba < 0.5:
                        train = False
                    # p-known carried over from this student's earlier problems.
                    prev_kcs_skills = [0 for _ in kcs]
                    for pi, (prob, prob_groups) in enumerate(student_groups.groupby("Problem Name")):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []  # encoded tokens, e.g. "StepName-0/1/2"
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False  # True => Means-and-Extremes (ME) problem
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False  # first FinalAnswer after an opt step seen
                        correctness = "0"
                        kcs_skills = [0 for _ in kcs]         # last seen p-known per KC
                        diff_skills = [0 for _ in kcs]        # p-known change vs previous problems
                        finalanswer_skill = [0 for _ in kcs]  # p-known at FinalAnswer per KC
                        for _, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                   'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                   'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                   'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            if not pd.isna(step):
                                if step in options.opt_step1 and not means_and_extremes:
                                    # The etalon (expected answer) encodes the problem
                                    # type: a float etalon marks an ME problem.
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception:
                                                pass
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    # New step (different from the previous token's step).
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        new_step = step  # optional steps are kept unsuffixed
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): opt_step2 check skips the first TWO entries
                                        # ([2:]) while opt_step1 skips one ([1:]) — confirm intended.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Encode outcome: -2 wrong attempt, -1 hint, -0 correct/other.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        # First FinalAnswer after any opt step decides the target.
                                        if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                            final_after_opts = True
                                            if outcome == "OK":
                                                correctness = "1"
                                    step_names_token.append(new_step)
                                else:
                                    # Repeat of the same step: keep the "worst" suffix
                                    # (string order "-0" < "-1" < "-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    kc_idx = np.argwhere(kcs == kc).flatten()[0]
                                    kcs_skills[kc_idx] = prev_skill
                                    # Diff against the skill carried over from the student's
                                    # earlier problems; skipped for the first problem (pi == 0).
                                    # NOTE(review): prev - carried (a drop is positive) — confirm sign.
                                    if pi != 0:
                                        diff_skills[kc_idx] = prev_skill - prev_kcs_skills[kc_idx]
                                    prev_kcs_skills[kc_idx] = prev_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[kc_idx] = prev_skill
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Keep only problems where an optional strategy step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # 3-way opt-usage label, stored in the info line (not the target):
                            # "0" opt1 only, "1" opt2 only, "2" both.
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # info columns: school, class ids, student, progress, problem,
                            # scenario, ER/ME flag, sequence length, original
                            # step-action-attempt-help-outcome details, opt-usage label,
                            # KC skills, skill diffs, FinalAnswer skills.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                             f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                             "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                             "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                             "\t".join(map(str, finalanswer_skill))])
                            if train:
                                train_data.append(["\t".join(step_names_token), info])
                                train_labels.append(correctness)
                            else:
                                test_data.append(["\t".join(step_names_token), info])
                                test_labels.append(correctness)

    for all_data, label in zip(train_data, train_labels):
        steps_seq = all_data[0]
        info = all_data[1]
        train_file.write(steps_seq)
        train_file.write("\n")
        train_info.write(info)
        train_info.write("\n")
        train_label.write(label)
        train_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    for all_data, label in zip(test_data, test_labels):
        steps_seq = all_data[0]
        info = all_data[1]
        test_file.write(steps_seq)
        test_file.write("\n")
        test_info.write(info)
        test_info.write("\n")
        test_label.write(label)
        test_label.write("\n")
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
    Correctness after opts:
        0 if attempt at step>1
        1 if attempt at step==1

    Builds train/val/test sequence, info and label files for predicting whether
    the FIRST FinalAnswer attempt made after an optional step was used is
    correct.  For every (school, student, problem) passing the filters, one
    tab-joined step-token sequence plus a CSV info record is collected and
    labelled with `correctness` ("1" iff the first post-optional FinalAnswer
    outcome was OK, else "0").  `options.per` then controls balanced sampling:
        per <  1 : per fraction of instances, split evenly over labels '0'/'1'
        per == 1 : fully balanced train set (min of the two class counts);
                   the val files receive a copy of the train instances
        per >  1 : per is an absolute per-class sample count
    The remaining instances form a balanced test set (val mirrors test when
    per != 1).  NOTE(review): val files are opened in append mode while
    train/test are truncated ("w") — presumably so several runs can share one
    val file; confirm this is intentional.
    '''
    # NOTE(review): hard-coded workspace path — does not use options.dataset_folder.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    # Sorted array of KC names; row position doubles as the skill-vector index below.
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")
    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    # Append mode: val files accumulate across runs (see docstring note).
    val_file = open(options.val_file_path, "a")
    val_info = open(options.val_info_path, "a")
    val_label = open(options.val_label_path, "a")
    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    overall_data = []      # [sequence, info] pairs, parallel to overall_labels
    overall_labels = []    # correctness strings "0"/"1"
    # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    # kcs = [kc if not pd.isna(kc) for kc in kcs]
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            # Empty/None options.school means "keep every school".
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only organic first encounters: no step-by-step tutoring,
                # first encounter of the step, and not in review mode.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'):
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    # Chronological order is required by the per-row state machine below.
                    student_groups.sort_values(by="Time", inplace=True)
                    # prob_list = list(pd.unique(student_groups["Problem Name"]))
                    # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])
                    # prob_list = prob_list[-int(len(prob_list)/2):]
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # if not prob in prob_list:
                        #     continue
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Student-entered (non-autofilled) steps only.
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        # Require at least 4 distinct non-optional steps...
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        # ...and at least 2 distinct optional steps (excluding each
                        # list's first entry, which acts as the trigger step).
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        # print(unique_steps, unique_opt_steps_len)
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []   # encoded tokens, e.g. "Step-0/1/2" or bare optional step
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False   # True once an ME (float etalon) problem is detected
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False     # first FinalAnswer after an optional step seen?
                        correctness = "0"            # label: "1" iff that FinalAnswer attempt was OK
                        kcs_skills = [0 for i in kcs]        # p-known before each KC's last row
                        diff_skills = [0 for i in kcs]       # prev - new p-known per KC
                        finalanswer_skill = [0 for i in kcs] # p-known at FinalAnswer rows, per KC
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"]  # number
                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"]  # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            # print(kc, prev_skill)
                            if not pd.isna(step):
                                if step in options.opt_step1 and not means_and_extremes:
                                    # A float etalon on the opt_step1 trigger marks a
                                    # Means-and-Extremes problem (int => Equivalent Ratios).
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                # Tutor-autofilled rows carry no student signal.
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""
                                if not step_names_token or step != prev_step:
                                    # New step (different from the previous token's base name).
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim (no outcome suffix).
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): opt_step2 is sliced [2:] here but [1:]
                                        # in the uniqueness filter above — confirm intentional.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Suffix encodes the first interaction with the step:
                                        # -2 wrong attempt, -1 hint, -0 correct/other.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        # Label comes from the FIRST FinalAnswer row that
                                        # follows any optional-step usage.
                                        if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                            final_after_opts = True
                                            if outcome == "OK":
                                                correctness = "1"
                                    step_names_token.append(new_step)
                                else:
                                    # Repeat of the same step: keep the worst suffix seen
                                    # ("-2" > "-1" > "-0" lexicographically).
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"
                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    # Map the KC name to its slot in the skill vectors.
                                    # NOTE: shadows the iterrows() loop variable `index`.
                                    index = np.argwhere(kcs==kc).flatten()[0]
                                    # print(index, type(index))
                                    kcs_skills[index] = prev_skill
                                    diff_skills[index] = prev_skill - curr_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[index] = prev_skill
                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Only keep problems where at least one optional step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        # Recount non-optional steps over the actual (repeated) rows;
                        # note the strict > 4 here vs >= 4 in the pre-filter above.
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # `label` records WHICH optional path was used (0=opt1, 1=opt2,
                            # 2=both); it goes into the info record, while the written
                            # class label is `correctness`.
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
                            # Info record: school, class ids, student, progress, problem,
                            # scenario, ER/ME flag, seq length, raw step trace, opt label,
                            # and the three per-KC skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                             f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                             "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                             "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                             "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
                            # proba = random.random()
                            # # if prob in first_prob_list:
                            # if proba <= 0.8:
                            #     train_file.write("\t".join(step_names_token))
                            #     train_file.write("\n")
                            #     # school, class, student id, progress, problem name, scenario,
                            #     # prefered ER or ME, total steps length,
                            #     # original seq-action-attempt-help_level-outcome
                            #     train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                            #                                f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                            #                                "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                            #     train_info.write("\n")
                            # elif proba > 0.9:
                            # # elif prob in last_prob_list:
                            #     test_file.write("\t".join(step_names_token))
                            #     test_file.write("\n")
                            #     # school, class, student id, progress, problem name, scenario,
                            #     # prefered ER or ME, total steps length,
                            #     # original seq-action-attempt-help_level-outcome
                            #     test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                            #                               f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                            #                               "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                            #     test_info.write("\n")
                            # else:
                            #     val_file.write("\t".join(step_names_token))
                            #     val_file.write("\n")
                            #     # school, class, student id, progress, problem name, scenario,
                            #     # prefered ER or ME, total steps length,
                            #     # original seq-action-attempt-help_level-outcome
                            #     val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                            #                              f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                            #                              "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
                            #     val_info.write("\n")
                    #     break
                    # break
            #     break
            # break
        # break
    # ---- Balanced sampling over the collected instances ----
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # indices_of_twos = list(np.where(overall_labels == '2')[0])
    # train_len = int(len(overall_labels) * 0.10)
    train_len = int(len(overall_labels) * float(options.per))
    sample_size = int(train_len/2)  # per-class count for the train split
    if float(options.per) == 1:
        # Fully balanced: take as many as the minority class allows.
        sample_size = min(len(indices_of_zeros), len(indices_of_ones))
    elif float(options.per) > 1:
        # per > 1 is interpreted as an absolute per-class count.
        sample_size = int(options.per)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # sampled_instances.extend(random.sample(indices_of_twos, sample_size))
    # Remove the train indices; what is left feeds the balanced test set.
    # NOTE(review): `i in sampled_instances` is O(n) per test — fine at this
    # scale, a set would make it O(1).
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
    # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]
    balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))
    # ---- Write the splits; val duplicates train (per == 1) or test (per != 1) ----
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]
        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
            if float(options.per) == 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")
                val_info.write(info)
                val_info.write("\n")
                val_label.write(label)
                val_label.write("\n")
        elif index in test_sampled_instances:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")
            if float(options.per) != 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")
                val_info.write(info)
                val_info.write("\n")
                val_label.write(label)
                val_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()
    val_file.close()
    val_info.close()
    val_label.close()
    test_file.close()
    test_info.close()
    test_label.close()
def prepare_pretraining_vocab_file(options):
    """Write the BERT-style vocabulary file used for pre-training.

    Loads the pickled collection of unique step names from
    ``{options.dataset_folder}unique_steps_list.pkl`` and writes one token per
    line to ``options.vocab_file_path``: the five special tokens first, then
    every step in sorted order.  Steps listed in ``options.opt_step1`` /
    ``options.opt_step2`` are written verbatim; every other step is expanded
    into three outcome-suffixed variants ``step-0`` / ``step-1`` / ``step-2``,
    matching the token encoding used when sequences are generated.

    Fixes vs. the previous version: the pickle file handle was leaked
    (``pickle.load(open(...))``), and both output/readback handles were
    redundantly ``close()``d after their ``with`` blocks had already closed
    them.
    """
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as pkl_file:
        steps = pickle.load(pkl_file)
    print("No of unique steps ", len(steps))
    ordered_steps = sorted(steps)
    with open(options.vocab_file_path, "w") as vb_file:
        # Special tokens expected by the tokenizer come first, in this order.
        vb_file.write("[PAD]\n")
        vb_file.write("[UNK]\n")
        vb_file.write("[MASK]\n")
        vb_file.write("[CLS]\n")
        vb_file.write("[SEP]\n")
        for step in ordered_steps:
            if step in options.opt_step1 or step in options.opt_step2:
                # Optional steps carry no outcome suffix in the sequences.
                vb_file.write(f"{step}\n")
            else:
                # One token per outcome class: -0 correct/other, -1 hint, -2 error.
                for i in range(3):
                    vb_file.write(f"{step}-{i}\n")
    # Sanity check: read the vocab back and echo what was written.
    with open(options.vocab_file_path, "r") as f:
        l = f.readlines()
        print(l, len(l))
def main(opt):
    """Entry point.

    Optionally runs the raw-dataset analyses, rewrites every ``*_path``
    option so outputs land under the workspace/school/task folder structure,
    then dispatches to the pre-training or fine-tuning preparation routine.

    ``opt`` (the parsed argparse namespace) keeps the original values; its
    deep copy ``options`` receives the redirected paths.
    """
    options = copy.deepcopy(opt)
    if opt.workspace_name:
        # Per-workspace subfolder for dataset artifacts (pickles, etc.).
        options.dataset_folder = opt.dataset_folder+opt.workspace_name+"/"
    data_processor = DataPreprocessor(input_file_path=opt.dataset)
    if opt.analyze_dataset_by_section:
        print(f"Analyzing dataset by section for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_section(opt.workspace_name)
        # Persist the unique-value inventories gathered during analysis.
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_prob_hierarchy, open(f"{options.dataset_folder}unique_hierarchy_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))
    if opt.analyze_dataset_by_school:
        print(f"Analyzing dataset of all school for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_school(opt.workspace_name)
        if not os.path.exists(options.dataset_folder):
            os.makedirs(options.dataset_folder)
        pickle.dump(data_processor.unique_schools, open(f"{options.dataset_folder}unique_schools_list.pkl", "wb"))
        pickle.dump(data_processor.unique_class, open(f"{options.dataset_folder}unique_class_list.pkl", "wb"))
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt_kcs, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_kcs.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_kcs, open(f"{options.dataset_folder}unique_new_steps_w_kcs_list.pkl", "wb"))
    if opt.workspace_name:
        # Redirect every non-empty "*path" option into
        # workspace[/sch_largest_N-coded[/school_folder]][/pretraining | /<code> | /finetuning[/<task>]]/<orig>.
        for k,v in vars(opt).items():
            if 'path' in k:
                if v:
                    redirect_path = opt.workspace_name+"/"
                    if opt.school and opt.pretrain:
                        sch = f"sch_largest_{len(opt.school)}-coded" #f"sch_largest_655"
                        redirect_path = redirect_path + sch+"/"
                        if opt.school_folder:
                            redirect_path = redirect_path + opt.school_folder+"/"
                    # else:
                    #     sch = "sch_largest_655"
                    if k != "vocab_file_path":
                        if opt.pretrain:
                            redirect_path = redirect_path + "pretraining/"
                        else:
                            if opt.code:
                                redirect_path = redirect_path + f"{opt.code}/"
                            elif opt.finetune_task:
                                # diff_val_folder keeps val files one level up,
                                # shared across fine-tuning tasks.
                                if opt.diff_val_folder and "val" in v:
                                    redirect_path = redirect_path + f"finetuning/"
                                else:
                                    redirect_path = redirect_path + f"finetuning/{opt.finetune_task}/"
                        if not os.path.exists(redirect_path):
                            os.makedirs(redirect_path)
                    else:
                        # The vocab file always lives under .../pretraining/.
                        if not os.path.exists(redirect_path+"/pretraining/"):
                            os.makedirs(redirect_path+"/pretraining/")
                    setattr(options, f"{k}", redirect_path+v)
                    # setattr(options, f"{k}", opt.workspace_name+"/check/"+v)
                    print(f"options.{k} : {getattr(options, f'{k}')}")
    if options.pretrain:
        print("Preparing vocab...")
        prepare_pretraining_vocab_file(options)
        print("Preparing pre-training dataset...")
        # old non-repeated steps
        # prepare_pretraining_files(data_processor, options)
        # coded
        # prepare_school_coded_pretraining_files(data_processor, options)
        # NOTE(review): the pretrain branch calls an "opts_intentional"
        # (fine-tuning style) preparation routine — confirm this is the
        # intended generator for pre-training data.
        prepare_school_coded_finetuning_opts_intentional_files(data_processor, options)
        # prepare_pretraining_files(data_processor, options)
        # prepare_school_pretraining_files(data_processor, options)
    # else:
    #     print("Preparing attention dataset...")
    #     prepare_school_attention_files(data_processor, options)
    else:
        print("Preparing fine-tuning dataset...")
        # _1920
        # prepare_finetuning_10per_files(data_processor, options)
        # prepare_finetuning_IS_FS_files(data_processor, options)
        # prepare_finetuning_correctness_files(data_processor, options)
        # _2223
        # prepare_school_coded_finetuning_partial_seq_files(data_processor, options)
        # prepare_school_coded_finetuning_opts_files(data_processor, options)
        prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options)
        # prepare_finetuning_IS_files(data_processor, options)
        # # prepare_finetuning_FS_files(data_processor, options)
        # prepare_finetuning_correctness_aaai_files(data_processor, options)
        # # prepare_finetuning_SL_files(data_processor, options)
        # # prepare_finetuning_effectiveness_files(data_processor, options)
        # prepare_attn_test_files(data_processor, options)
def str2bool(value):
    """Parse a boolean command-line value.

    ``argparse`` with ``type=bool`` treats every non-empty string as True —
    so ``-pretrain False`` silently enabled pre-training.  This converter
    accepts the usual spellings of true/false (case-insensitive) and rejects
    anything else, while remaining backward compatible with ``-flag True``.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset_folder', type=str, default="dataset/CL4999_1920/")
    # type=str2bool (not type=bool): bool("False") is True, which made these
    # flags impossible to disable explicitly on the command line.
    parser.add_argument('-analyze_dataset_by_section', type=str2bool, default=False)
    parser.add_argument('-analyze_dataset_by_school', type=str2bool, default=False)
    parser.add_argument('-workspace_name', type=str, default=None)
    parser.add_argument('-school', nargs='+', type=str, default=None)
    parser.add_argument('-school_folder', type=str, default=None)
    # parser.add_argument('-highGRschool', nargs='+', type=str, default=None)
    # parser.add_argument('-lowGRschool', nargs='+', type=str, default=None)
    parser.add_argument('-code', type=str, default=None)
    parser.add_argument('-finetune_task', type=str, default=None)
    parser.add_argument('-per', type=float, default=None)
    parser.add_argument("-diff_val_folder", type=str2bool, default=False, help="use for different val folder")
    parser.add_argument('-opt_step1', nargs='+', type=str, help='List of optional steps 1')
    parser.add_argument('-opt_step2', nargs='+', type=str, help='List of optional steps 2')
    parser.add_argument('-final_step', nargs='+', type=str, help='List of final step')
    parser.add_argument('-dataset', type=str, default="dataset/CL4999_1920/course2_1920_4999_students_datashop.txt")
    parser.add_argument('-pretrain', type=str2bool, default=False)
    parser.add_argument('-vocab_file_path', type=str, default="pretraining/vocab.txt") #pretraining/vocab.txt
    # Prepare for pretraining
    parser.add_argument('-train_file_path', type=str, default="train.txt") #pretraining/pretrain.txt
    parser.add_argument('-train_info_path', type=str, default="train_info.txt") #pretraining/pretrain_info.txt
    parser.add_argument('-train_label_path', type=str, default="train_label.txt") #finetuning/train_label.txt
    parser.add_argument('-val_file_path', type=str, default="val.txt") #pretraining/val.txt
    parser.add_argument('-val_info_path', type=str, default="val_info.txt") #pretraining/val_info.txt
    parser.add_argument('-val_label_path', type=str, default="val_label.txt") #finetuning/val_label.txt
    parser.add_argument('-test_file_path', type=str, default="test.txt") #pretraining/test.txt
    parser.add_argument('-test_info_path', type=str, default="test_info.txt") #pretraining/test_info.txt
    parser.add_argument('-test_label_path', type=str, default="test_label.txt") #finetuning/test_label.txt
    # parser.add_argument('-train_gt_label_path', type=str, default="finetuning/train_gt_label.txt")
    # parser.add_argument('-test_gt_label_path', type=str, default="finetuning/test_gt_label.txt")
    options = parser.parse_args()
    # Normalize the optional-step lists to [] so membership tests like
    # `step in options.opt_step1` work downstream without None checks.
    if not options.opt_step1:
        options.opt_step1 = []
    print("Optional steps 1: ", options.opt_step1)
    if not options.opt_step2:
        options.opt_step2 = []
    print("Optional steps 2: ", options.opt_step2)
    if not options.final_step:
        options.final_step = []
    print("Final steps: ", options.final_step)
    main(options)