Spaces:

AstraTeam
/

astra

Build error

App Files Files Community

astra / prepare_pretraining_input_vocab_file.py

suryadev1

fine

cecfca1 over 1 year ago

raw

history blame

259 kB

	import argparse
	import pickle
	import random
	import copy
	import pandas as pd
	import numpy as np
	from collections import Counter
	import os
	from data_preprocessor import DataPreprocessor

	def prepare_pretraining_files(data_processor, options):
	    """Write train/val/test step sequences for the ``ratio_proportion_change3`` workspace.

	    For every chunk yielded by ``data_processor.load_file_iterator()`` the rows
	    are grouped by workspace, student and problem.  A problem is kept when it
	    contains a ``Done`` action and enough distinct non-optional steps.  Each
	    kept problem becomes one tab-separated line of step names in a sequence
	    file and one comma-separated line in the matching info file.  The split is
	    drawn per problem with ``random.random()``: ``<= 0.8`` train, ``> 0.9``
	    test, otherwise validation.

	    Args:
	        data_processor: object exposing ``load_file_iterator()`` that yields
	            pandas DataFrames.
	        options: namespace providing ``train_file_path``, ``train_info_path``,
	            ``val_file_path``, ``val_info_path``, ``test_file_path``,
	            ``test_info_path``, ``opt_step1`` and ``opt_step2``.

	    Returns:
	        None.  All results are written to the six output files.
	    """
	    workspace = "ratio_proportion_change3"
	    row_columns = ['Time', 'Step Name', 'CF (Etalon)', 'Outcome', 'Help Level',
	                   'CF (Workspace Progress Status)']

	    def is_optional(step_name):
	        # A step is "optional" when it is listed in either opt-step list.
	        return step_name in options.opt_step1 or step_name in options.opt_step2

	    def etalon_is_decimal(raw):
	        # True when ``raw`` cannot be read as an int but can be read as a float.
	        # NOTE(review): a NaN etalon also lands here because float(nan) succeeds;
	        # this mirrors the previous behaviour - confirm it is intended.
	        try:
	            int(raw)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(raw)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    chunk_iterator = data_processor.load_file_iterator()

	    # The context manager closes every file even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.val_file_path, "w") as val_file, \
	            open(options.val_info_path, "w") as val_info, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info:
	        for chunk_data in chunk_iterator:
	            for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	                if section != workspace:
	                    continue
	                for student, student_groups in section_groups.groupby("Anon Student Id"):
	                    # Sort a copy instead of mutating the groupby slice in place.
	                    student_groups = student_groups.sort_values(by="Time")
	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        # A problem should be completed by a student clicking Done button.
	                        if "Done" not in list(prob_groups["Action"]):
	                            continue
	                        distinct_steps = {
	                            s for s in pd.unique(prob_groups["Step Name"])
	                            if not pd.isna(s) and not is_optional(s)
	                        }
	                        if len(distinct_steps) < 4:
	                            continue

	                        # Timestamps that follow the previous row by less than 1800.
	                        time_stamps = list(prob_groups["Time"])
	                        quick_times = {
	                            later
	                            for earlier, later in zip(time_stamps, time_stamps[1:])
	                            if (later - earlier) < 1800
	                        }

	                        step_names_token = []
	                        outcome = []
	                        help_level = []
	                        auto_complete = False
	                        means_and_extremes = False

	                        for _, row in prob_groups[row_columns].iterrows():
	                            step = row["Step Name"]
	                            # The value from the last row is the one that is written out.
	                            progress = row["CF (Workspace Progress Status)"]
	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1 and not means_and_extremes:
	                                means_and_extremes = etalon_is_decimal(row["CF (Etalon)"])
	                            if is_optional(step) and row["Time"] in quick_times:
	                                # Optional step logged right after the previous row:
	                                # flag it and leave it out of the sequence.
	                                auto_complete = True
	                                continue
	                            if not step_names_token or step != step_names_token[-1]:
	                                step_names_token.append(step)
	                                # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                                outcome.append(str(row["Outcome"]))
	                                help_level.append(str(row["Help Level"]))
	                            else:
	                                # Repeated step: fold into the previous entry.  str()
	                                # keeps a NaN outcome from raising TypeError.
	                                outcome[-1] = outcome[-1] + ":" + str(row["Outcome"])
	                                help_level[-1] = help_level[-1] + ":" + str(row["Help Level"])

	                        core_steps = {s for s in step_names_token if not is_optional(s)}
	                        # NOTE(review): the pre-filter keeps >= 4 distinct steps while
	                        # this one requires > 4; kept as in the original.
	                        if not step_names_token or len(core_steps) <= 4:
	                            continue

	                        where_opt = [
	                            "1" if s in options.opt_step1
	                            else "2" if s in options.opt_step2
	                            else "0"
	                            for s in step_names_token
	                        ]
	                        # progress, problem name, student id, auto_complete, total steps length,
	                        # er or me, outcome seq, help_level seq, encoding in steps length
	                        info_line = ",".join([
	                            str(progress), str(prob), str(student), str(auto_complete),
	                            str(len(step_names_token)),
	                            f"{1 if means_and_extremes else 0}",
	                            "\t".join(map(str, outcome)),
	                            "\t".join(map(str, help_level)),
	                            "\t".join(map(str, where_opt)),
	                        ])

	                        proba = random.random()
	                        if proba <= 0.8:
	                            seq_out, info_out = train_file, train_info
	                        elif proba > 0.9:
	                            seq_out, info_out = test_file, test_info
	                        else:
	                            seq_out, info_out = val_file, val_info
	                        seq_out.write("\t".join(step_names_token))
	                        seq_out.write("\n")
	                        info_out.write(info_line)
	                        info_out.write("\n")

	def prepare_school_pretraining_files(data_processor, options):
	    """Write train/val/test step sequences from school-level data.

	    Rows are grouped by school, class, student and problem.  For every problem
	    the non-autofilled rows are turned into ``step:action:attempt`` tokens.  A
	    problem is written only when an optional strategy was started (first entry
	    of ``opt_step1``/``opt_step2``) and later continued (a remaining entry of
	    the same list), and the final token's action is ``Done``.  The split is
	    drawn per problem with ``random.random()``: ``<= 0.8`` train, ``> 0.9``
	    test, otherwise validation.

	    Args:
	        data_processor: object exposing ``load_file_iterator(sep=",")`` that
	            yields pandas DataFrames.
	        options: namespace providing the six ``*_file_path``/``*_info_path``
	            attributes, ``school`` (falsy means every school), and the
	            non-empty sequences ``opt_step1`` and ``opt_step2``.

	    Returns:
	        None.  All results are written to the six output files.

	    Raises:
	        ValueError: if a non-null etalon does not contain exactly one ``=``.
	    """
	    row_columns = ['Time', 'Step Name', 'Action', 'Attempt At Step',
	                   'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
	                   'CF (Etalon)', 'CF (Problem Scenario Tags)']

	    def etalon_is_decimal(raw):
	        # ``raw`` looks like "{key=value}"; True when value parses as a float
	        # but not as an int.  The unpacking is deliberately outside the try.
	        key, value = raw.strip('{}').split('=')
	        try:
	            int(value)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(value)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    def base_step(token):
	        # Recover the step name from a "step:action:attempt" token.
	        parts = token.split(":")
	        return parts[0] if len(parts) == 3 else ":".join(parts[:2])

	    chunk_iterator = data_processor.load_file_iterator(sep=",")

	    # The context manager closes every file even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.val_file_path, "w") as val_file, \
	            open(options.val_info_path, "w") as val_info, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info:
	        for chunk_data in chunk_iterator:
	            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	                if options.school and school not in options.school:
	                    continue
	                for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
	                    for student, student_groups in class_group.groupby("Anon Student Id"):
	                        # sort_values returns a new frame; the result must be kept,
	                        # otherwise the rows are never put in time order.
	                        student_groups = student_groups.sort_values(by="Time")
	                        for prob, prob_groups in student_groups.groupby("Problem Name"):
	                            raw_tokens = []
	                            means_and_extremes = False
	                            for _, row in prob_groups[row_columns].iterrows():
	                                # Values from the last row are the ones written out.
	                                progress = row["CF (Workspace Progress Status)"]
	                                scenario = row['CF (Problem Scenario Tags)']
	                                step = row["Step Name"]

	                                if (not pd.isna(step) and step in options.opt_step1
	                                        and not means_and_extremes):
	                                    etalon = row["CF (Etalon)"]
	                                    if not pd.isna(etalon):
	                                        means_and_extremes = etalon_is_decimal(etalon)

	                                # Rows with a missing step are kept here (as "nan:...")
	                                # so the final-action check below can see them.
	                                if not row["CF (Is Autofilled)"]:
	                                    raw_tokens.append(
	                                        f"{step}:{row['Action']}:{row['Attempt At Step']}")

	                            if not raw_tokens:
	                                continue

	                            where_opt = []
	                            step1 = False
	                            step2 = False
	                            strategy_data = False
	                            for token in raw_tokens:
	                                step = base_step(token)
	                                if step == options.opt_step1[0]:
	                                    where_opt.append("_1")
	                                    step1 = True
	                                elif step == options.opt_step2[0]:
	                                    where_opt.append("_2")
	                                    step2 = True
	                                elif step in options.opt_step1[1:]:
	                                    where_opt.append("1")
	                                    if step1:
	                                        strategy_data = True
	                                elif step in options.opt_step2[1:]:
	                                    where_opt.append("2")
	                                    if step2:
	                                        strategy_data = True
	                                else:
	                                    where_opt.append("0")

	                            # Keep only problems whose last logged action is "Done".
	                            if not strategy_data or raw_tokens[-1].split(":")[-2] != "Done":
	                                continue

	                            proba = random.random()
	                            # Strip action/attempt, drop missing steps and collapse
	                            # consecutive repeats.
	                            step_names_token = []
	                            for token in raw_tokens:
	                                name = ":".join(token.split(":")[:-2])
	                                if name != "nan" and (
	                                        not step_names_token or name != step_names_token[-1]):
	                                    step_names_token.append(name)

	                            # school, class, student id, progress, problem name, scenario,
	                            # prefered ER or ME, total steps length, encoding in steps length
	                            # NOTE(review): where_opt has one entry per raw token, while the
	                            # length field counts the collapsed sequence.
	                            info_line = ",".join([
	                                str(school), str(class_id), str(student), str(progress),
	                                str(prob), str(scenario),
	                                f"{1 if means_and_extremes else 0}",
	                                str(len(step_names_token)),
	                                "\t".join(map(str, where_opt)),
	                            ])

	                            if proba <= 0.8:
	                                seq_out, info_out = train_file, train_info
	                            elif proba > 0.9:
	                                seq_out, info_out = test_file, test_info
	                            else:
	                                seq_out, info_out = val_file, val_info
	                            seq_out.write("\t".join(step_names_token))
	                            seq_out.write("\n")
	                            info_out.write(info_line)
	                            info_out.write("\n")

	def prepare_school_coded_pretraining_files(data_processor, options):
	    """Write train/val/test sequences of outcome-coded steps from school data.

	    Each school's rows are filtered to ``CF (Is StepByStep) == False``,
	    ``CF (Encounter) == 0`` and ``CF (Is Review Mode) == -1``.  For each
	    student, problems are then selected from the tail of the GRADUATED rows.
	    A selected problem must be GRADUATED, contain a ``Done`` action and have
	    enough non-optional steps.  Non-optional steps are suffixed with ``-2``
	    (attempt whose outcome is not ``OK``), ``-1`` (any hint action) or ``-0``
	    (anything else).  Consecutive rows of the same step collapse into one
	    token that keeps the highest suffix.  The split is drawn per problem with
	    ``random.random()``: ``<= 0.8`` train, ``> 0.9`` test, otherwise validation.

	    Args:
	        data_processor: object exposing ``load_file_iterator(sep=",")`` that
	            yields pandas DataFrames.
	        options: namespace providing the six ``*_file_path``/``*_info_path``
	            attributes, ``school`` (falsy means every school), ``opt_step1``
	            and ``opt_step2``.

	    Returns:
	        None.  All results are written to the six output files.

	    Raises:
	        ValueError: if a non-null etalon does not contain exactly one ``=``.
	    """
	    row_columns = ['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	                   'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	                   'CF (Etalon)', 'CF (Problem Scenario Tags)']

	    def is_optional(step_name):
	        return step_name in options.opt_step1 or step_name in options.opt_step2

	    def etalon_is_decimal(raw):
	        # ``raw`` looks like "{key=value}"; True when value parses as a float
	        # but not as an int.  The unpacking is deliberately outside the try.
	        key, value = raw.strip('{}').split('=')
	        try:
	            int(value)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(value)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    def coded(step_name, action, outcome):
	        # Attach the outcome code to a non-optional step name.
	        if action == "Attempt" and outcome != "OK":
	            return step_name + "-2"
	        if "Hint" in action:
	            return step_name + "-1"
	        return step_name + "-0"

	    chunk_iterator = data_processor.load_file_iterator(sep=",")

	    # The context manager closes every file even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.val_file_path, "w") as val_file, \
	            open(options.val_info_path, "w") as val_info, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info:
	        for chunk_data in chunk_iterator:
	            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	                if options.school and school not in options.school:
	                    continue
	                print(f"{school} : {school_group.shape}")
	                # Element-wise comparisons on Series, hence "== False".
	                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	                                            (school_group['CF (Encounter)'] == 0) &
	                                            (school_group['CF (Is Review Mode)'] == -1)]
	                print(f"{school} : {school_group.shape}")
	                for student, student_groups in school_group.groupby("Anon Student Id"):
	                    # Sort a copy instead of mutating the groupby slice in place.
	                    student_groups = student_groups.sort_values(by="Time")
	                    graduated = student_groups[
	                        student_groups["CF (Workspace Progress Status)"] == "GRADUATED"]
	                    prob_list = list(graduated["Problem Name"])
	                    # NOTE(review): this list has one entry per row, and a slice of
	                    # "-0" keeps the whole list; kept as in the original.
	                    selected_problems = set(prob_list[-int(len(prob_list) / 2):])
	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        if prob not in selected_problems:
	                            continue
	                        progress = list(pd.unique(prob_groups["CF (Workspace Progress Status)"]))[0]
	                        if progress != "GRADUATED":
	                            continue
	                        # A problem should be completed by a student clicking Done button.
	                        if "Done" not in list(prob_groups["Action"]):
	                            continue
	                        manual_rows = prob_groups[prob_groups["CF (Is Autofilled)"] == False]
	                        manual_steps = [
	                            s for s in pd.unique(manual_rows["Step Name"])
	                            if not pd.isna(s) and not is_optional(s)
	                        ]
	                        if len(manual_steps) < 4:
	                            continue

	                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
	                        step_names_token = []
	                        original_steps_actions_attempts_help_levels_outcomes = []
	                        original_steps = []
	                        means_and_extremes = False

	                        for _, row in prob_groups[row_columns].iterrows():
	                            step = row["Step Name"]
	                            action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	                            attempt = row["Attempt At Step"]  # number
	                            outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                            help_level = row["Help Level"]  # number
	                            # The value from the last row is the one written out.
	                            scenario = row['CF (Problem Scenario Tags)']

	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1 and not means_and_extremes:
	                                etalon = row["CF (Etalon)"]
	                                if not pd.isna(etalon):
	                                    means_and_extremes = etalon_is_decimal(etalon)
	                            if row['CF (Is Autofilled)'] == True:
	                                continue

	                            prev = step_names_token[-1] if step_names_token else ""
	                            prev_step = prev.split("-")[0]

	                            if not step_names_token or step != prev_step:
	                                step_names_token.append(
	                                    step if is_optional(step) else coded(step, action, outcome))
	                            elif not is_optional(step):
	                                new_step = coded(step, action, outcome)
	                                # String comparison: "-0" < "-1" < "-2" for the same step.
	                                if prev < new_step:
	                                    step_names_token[-1] = new_step
	                            original_steps_actions_attempts_help_levels_outcomes.append(
	                                f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	                            original_steps.append(step)

	                        # Counts rows (not distinct steps) of non-optional steps.
	                        core_rows = [s for s in original_steps if not is_optional(s)]
	                        if not step_names_token or len(core_rows) <= 4:
	                            continue

	                        # school, class, student id, progress, problem name, scenario,
	                        # prefered ER or ME, total steps length,
	                        # original seq-action-attempt-help_level-outcome
	                        info_line = ",".join([
	                            str(school),
	                            # map(str, ...) so numeric class ids do not raise TypeError.
	                            "\t".join(map(str, class_id)),
	                            str(student), str(progress), str(prob), str(scenario),
	                            f"{1 if means_and_extremes else 0}",
	                            str(len(step_names_token)),
	                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)),
	                        ])

	                        proba = random.random()
	                        if proba <= 0.8:
	                            seq_out, info_out = train_file, train_info
	                        elif proba > 0.9:
	                            seq_out, info_out = test_file, test_info
	                        else:
	                            seq_out, info_out = val_file, val_info
	                        seq_out.write("\t".join(step_names_token))
	                        seq_out.write("\n")
	                        info_out.write(info_line)
	                        info_out.write("\n")


	def prepare_school_attention_files(data_processor, options):
	    """Write strategy sequences split by workspace progress status.

	    Rows are grouped by school, class, student and problem.  For every problem
	    the non-autofilled rows are turned into ``step:action:attempt`` tokens.  A
	    problem qualifies when an optional strategy was started (first entry of
	    ``opt_step1``/``opt_step2``) and later continued (a remaining entry of the
	    same list), and the final token's action is ``Done``.  Qualifying problems
	    whose last row is ``GRADUATED`` go to the train files, ``PROMOTED`` ones
	    go to the test files, and any other status is dropped.  The validation
	    files are opened (and therefore created/truncated) but never written.

	    Args:
	        data_processor: object exposing ``load_file_iterator(sep=",")`` that
	            yields pandas DataFrames.
	        options: namespace providing the six ``*_file_path``/``*_info_path``
	            attributes, ``school`` (falsy means every school), and the
	            non-empty sequences ``opt_step1`` and ``opt_step2``.

	    Returns:
	        None.  All results are written to the output files.

	    Raises:
	        ValueError: if a non-null etalon does not contain exactly one ``=``.
	    """
	    row_columns = ['Time', 'Step Name', 'Action', 'Attempt At Step',
	                   'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
	                   'CF (Etalon)', 'CF (Problem Scenario Tags)']

	    def etalon_is_decimal(raw):
	        # ``raw`` looks like "{key=value}"; True when value parses as a float
	        # but not as an int.  The unpacking is deliberately outside the try.
	        key, value = raw.strip('{}').split('=')
	        try:
	            int(value)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(value)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    def base_step(token):
	        # Recover the step name from a "step:action:attempt" token.
	        parts = token.split(":")
	        return parts[0] if len(parts) == 3 else ":".join(parts[:2])

	    chunk_iterator = data_processor.load_file_iterator(sep=",")

	    # The context manager closes every file even if processing raises.
	    # val_file/val_info are opened only so the files exist and are emptied.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.val_file_path, "w") as val_file, \
	            open(options.val_info_path, "w") as val_info, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info:
	        for chunk_data in chunk_iterator:
	            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	                if options.school and school not in options.school:
	                    continue
	                for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
	                    for student, student_groups in class_group.groupby("Anon Student Id"):
	                        # sort_values returns a new frame; the result must be kept,
	                        # otherwise the rows are never put in time order.
	                        student_groups = student_groups.sort_values(by="Time")
	                        for prob, prob_groups in student_groups.groupby("Problem Name"):
	                            raw_tokens = []
	                            means_and_extremes = False
	                            for _, row in prob_groups[row_columns].iterrows():
	                                # Values from the last row are the ones used below.
	                                progress = row["CF (Workspace Progress Status)"]
	                                scenario = row['CF (Problem Scenario Tags)']
	                                step = row["Step Name"]

	                                if (not pd.isna(step) and step in options.opt_step1
	                                        and not means_and_extremes):
	                                    etalon = row["CF (Etalon)"]
	                                    if not pd.isna(etalon):
	                                        means_and_extremes = etalon_is_decimal(etalon)

	                                # Rows with a missing step are kept here (as "nan:...")
	                                # so the final-action check below can see them.
	                                if not row["CF (Is Autofilled)"]:
	                                    raw_tokens.append(
	                                        f"{step}:{row['Action']}:{row['Attempt At Step']}")

	                            if not raw_tokens:
	                                continue

	                            where_opt = []
	                            step1 = False
	                            step2 = False
	                            strategy_data = False
	                            for token in raw_tokens:
	                                step = base_step(token)
	                                if step == options.opt_step1[0]:
	                                    where_opt.append("_1")
	                                    step1 = True
	                                elif step == options.opt_step2[0]:
	                                    where_opt.append("_2")
	                                    step2 = True
	                                elif step in options.opt_step1[1:]:
	                                    where_opt.append("1")
	                                    if step1:
	                                        strategy_data = True
	                                elif step in options.opt_step2[1:]:
	                                    where_opt.append("2")
	                                    if step2:
	                                        strategy_data = True
	                                else:
	                                    where_opt.append("0")

	                            # Keep only problems whose last logged action is "Done".
	                            if not strategy_data or raw_tokens[-1].split(":")[-2] != "Done":
	                                continue

	                            # Strip action/attempt, drop missing steps and collapse
	                            # consecutive repeats.
	                            step_names_token = []
	                            for token in raw_tokens:
	                                name = ":".join(token.split(":")[:-2])
	                                if name != "nan" and (
	                                        not step_names_token or name != step_names_token[-1]):
	                                    step_names_token.append(name)

	                            if progress == "GRADUATED":
	                                seq_out, info_out = train_file, train_info
	                            elif progress == "PROMOTED":
	                                seq_out, info_out = test_file, test_info
	                            else:
	                                # Any other progress status is not written anywhere.
	                                continue

	                            # school, class, student id, progress, problem name, scenario,
	                            # prefered ER or ME, total steps length, encoding in steps length
	                            # NOTE(review): where_opt has one entry per raw token, while the
	                            # length field counts the collapsed sequence.
	                            info_line = ",".join([
	                                str(school), str(class_id), str(student), str(progress),
	                                str(prob), str(scenario),
	                                f"{1 if means_and_extremes else 0}",
	                                str(len(step_names_token)),
	                                "\t".join(map(str, where_opt)),
	                            ])
	                            seq_out.write("\t".join(step_names_token))
	                            seq_out.write("\n")
	                            info_out.write(info_line)
	                            info_out.write("\n")

	def prepare_finetuning_10per_files(data_processor, options):
	    """Write a small balanced train split and a balanced test split.

	    Used for L@S paper.
	    Only two strategies were defined as:
	    0: non-opt strategy
	    1: opt used strategy

	    Every qualifying problem of the ``ratio_proportion_change3`` workspace is
	    collected with its label.  The train split draws ``int(int(10% of all) / 2)``
	    instances from each label.  The test split draws ``min(#0, #1)`` instances
	    from each label; an instance already chosen for training is written to
	    the train files only.  Instances selected for neither are not written.

	    Args:
	        data_processor: object exposing ``load_file_iterator()`` that yields
	            pandas DataFrames.
	        options: namespace providing ``train_file_path``, ``train_info_path``,
	            ``train_label_path``, ``test_file_path``, ``test_info_path``,
	            ``test_label_path``, ``opt_step1`` and ``opt_step2``.

	    Returns:
	        None.  All results are written to the six output files.
	    """
	    workspace = "ratio_proportion_change3"
	    row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	                   'CF (Workspace Progress Status)', 'CF (Etalon)']

	    def is_optional(step_name):
	        return step_name in options.opt_step1 or step_name in options.opt_step2

	    def etalon_is_decimal(raw):
	        # True when ``raw`` cannot be read as an int but can be read as a float.
	        # NOTE(review): a NaN etalon also lands here because float(nan) succeeds;
	        # this mirrors the previous behaviour - confirm it is intended.
	        try:
	            int(raw)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(raw)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    chunk_iterator = data_processor.load_file_iterator()

	    # The context manager closes every file even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.train_label_path, "w") as train_label, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info, \
	            open(options.test_label_path, "w") as test_label:
	        overall_data = []
	        overall_labels = []
	        for chunk_data in chunk_iterator:
	            for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	                if section != workspace:
	                    continue
	                for student, student_groups in section_groups.groupby("Anon Student Id"):
	                    # Sort a copy instead of mutating the groupby slice in place.
	                    student_groups = student_groups.sort_values(by="Time")
	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        # A problem should be completed by a student clicking Done button.
	                        if "Done" not in list(prob_groups["Action"]):
	                            continue
	                        distinct_steps = {
	                            s for s in pd.unique(prob_groups["Step Name"])
	                            if not pd.isna(s) and not is_optional(s)
	                        }
	                        if len(distinct_steps) < 4:
	                            continue

	                        # Timestamps that follow the previous row by less than 1800.
	                        time_stamps = list(prob_groups["Time"])
	                        quick_times = {
	                            later
	                            for earlier, later in zip(time_stamps, time_stamps[1:])
	                            if (later - earlier) < 1800
	                        }

	                        step_names_token = []
	                        outcome = []
	                        help_level = []
	                        auto_complete = False
	                        means_and_extremes = False

	                        for _, row in prob_groups[row_columns].iterrows():
	                            step = row["Step Name"]
	                            # The value from the last row is the one written out.
	                            progress = row["CF (Workspace Progress Status)"]
	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1 and etalon_is_decimal(row["CF (Etalon)"]):
	                                means_and_extremes = True
	                            if is_optional(step) and row["Time"] in quick_times:
	                                # Optional step logged right after the previous row:
	                                # flag it and leave it out of the sequence.
	                                auto_complete = True
	                                continue
	                            if not step_names_token or step != step_names_token[-1]:
	                                step_names_token.append(step)
	                                # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                                outcome.append(str(row["Outcome"]))
	                                help_level.append(str(row["Help Level"]))
	                            else:
	                                # Repeated step: fold into the previous entry.  str()
	                                # keeps a NaN outcome from raising TypeError.
	                                outcome[-1] = outcome[-1] + ":" + str(row["Outcome"])
	                                help_level[-1] = help_level[-1] + ":" + str(row["Help Level"])

	                        core_steps = {s for s in step_names_token if not is_optional(s)}
	                        # NOTE(review): the pre-filter keeps >= 4 distinct steps while
	                        # this one requires > 4; kept as in the original.
	                        if not step_names_token or len(core_steps) <= 4:
	                            continue

	                        where_opt = [
	                            "1" if s in options.opt_step1
	                            else "2" if s in options.opt_step2
	                            else "0"
	                            for s in step_names_token
	                        ]

	                        # Label "1" when any non-first optional step name occurs as a
	                        # substring of a step in the sequence.
	                        label_opt = "0"
	                        for opt_steps in (options.opt_step1, options.opt_step2):
	                            if opt_steps and any(
	                                    opt in s for opt in opt_steps[1:] for s in step_names_token):
	                                label_opt = "1"

	                        # progress, problem name, student id, auto_complete, total steps length,
	                        # outcome seq, help_level seq, encoding in steps length, er or me
	                        info = ",".join([
	                            str(progress), str(prob), str(student), str(auto_complete),
	                            str(len(step_names_token)),
	                            "\t".join(map(str, outcome)),
	                            "\t".join(map(str, help_level)),
	                            "\t".join(map(str, where_opt)),
	                            f"{1 if means_and_extremes else 0}",
	                        ])
	                        overall_data.append(["\t".join(step_names_token), info])
	                        overall_labels.append(label_opt)

	        indices_of_zeros = [i for i, lab in enumerate(overall_labels) if lab == "0"]
	        indices_of_ones = [i for i, lab in enumerate(overall_labels) if lab == "1"]

	        train_len = int(len(overall_labels) * 0.10)
	        sample_size = int(train_len / 2)
	        print(f"sample_size: {sample_size}")
	        # Clamp to the class size: random.sample raises ValueError when asked
	        # for more items than the population holds.
	        train_indices = set(random.sample(
	            indices_of_zeros, min(sample_size, len(indices_of_zeros))))
	        train_indices.update(random.sample(
	            indices_of_ones, min(sample_size, len(indices_of_ones))))

	        balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
	        test_indices = set(random.sample(indices_of_zeros, balanced_test))
	        test_indices.update(random.sample(indices_of_ones, balanced_test))

	        for index, ((steps_seq, info), label) in enumerate(zip(overall_data, overall_labels)):
	            if index in train_indices:
	                seq_out, info_out, label_out = train_file, train_info, train_label
	            elif index in test_indices:
	                seq_out, info_out, label_out = test_file, test_info, test_label
	            else:
	                continue
	            seq_out.write(steps_seq)
	            seq_out.write("\n")
	            info_out.write(info)
	            info_out.write("\n")
	            label_out.write(label)
	            label_out.write("\n")

	def prepare_finetuning_IS_FS_files(data_processor, options):
	    """Write each student's first problems as train and last problems as test.

	    Used for L@S paper. This function gathers first three problems of each student.
	    Only two strategies were defined as:
	    0: non-opt strategy
	    1: opt used strategy
	    train: IS
	    test: FS

	    Students with fewer than three distinct problems in the
	    ``ratio_proportion_change3`` workspace are skipped.  A qualifying problem
	    among the student's first three (by time) goes to the train files; one
	    among the last three goes to the test files.  When the two windows
	    overlap, the train window wins.  Other problems are not written.

	    Args:
	        data_processor: object exposing ``load_file_iterator()`` that yields
	            pandas DataFrames.
	        options: namespace providing ``train_file_path``, ``train_info_path``,
	            ``train_label_path``, ``test_file_path``, ``test_info_path``,
	            ``test_label_path``, ``opt_step1`` and ``opt_step2``.

	    Returns:
	        None.  All results are written to the six output files.
	    """
	    workspace = "ratio_proportion_change3"
	    row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	                   'CF (Workspace Progress Status)', 'CF (Etalon)']
	    selected = 3  # number of problems taken from each end of a student's history

	    def is_optional(step_name):
	        return step_name in options.opt_step1 or step_name in options.opt_step2

	    def etalon_is_decimal(raw):
	        # True when ``raw`` cannot be read as an int but can be read as a float.
	        # NOTE(review): a NaN etalon also lands here because float(nan) succeeds;
	        # this mirrors the previous behaviour - confirm it is intended.
	        try:
	            int(raw)
	            return False
	        except (TypeError, ValueError, OverflowError):
	            pass
	        try:
	            float(raw)
	        except (TypeError, ValueError, OverflowError):
	            return False
	        return True

	    chunk_iterator = data_processor.load_file_iterator()

	    # The context manager closes every file even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.train_label_path, "w") as train_label, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info, \
	            open(options.test_label_path, "w") as test_label:
	        for chunk_data in chunk_iterator:
	            for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	                if section != workspace:
	                    continue
	                for student, student_groups in section_groups.groupby("Anon Student Id"):
	                    # Sort a copy instead of mutating the groupby slice in place.
	                    student_groups = student_groups.sort_values(by="Time")

	                    prob_list = list(pd.unique(student_groups["Problem Name"]))
	                    if len(prob_list) < 3:
	                        continue
	                    first_prob_list = prob_list[:selected]
	                    last_prob_list = prob_list[-selected:]

	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        # A problem should be completed by a student clicking Done button.
	                        if "Done" not in list(prob_groups["Action"]):
	                            continue
	                        distinct_steps = {
	                            s for s in pd.unique(prob_groups["Step Name"])
	                            if not pd.isna(s) and not is_optional(s)
	                        }
	                        if len(distinct_steps) < 4:
	                            continue

	                        # Timestamps that follow the previous row by less than 1800.
	                        time_stamps = list(prob_groups["Time"])
	                        quick_times = {
	                            later
	                            for earlier, later in zip(time_stamps, time_stamps[1:])
	                            if (later - earlier) < 1800
	                        }

	                        step_names_token = []
	                        outcome = []
	                        help_level = []
	                        auto_complete = False
	                        means_and_extremes = False

	                        for _, row in prob_groups[row_columns].iterrows():
	                            step = row["Step Name"]
	                            # The value from the last row is the one written out.
	                            progress = row["CF (Workspace Progress Status)"]
	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1 and etalon_is_decimal(row["CF (Etalon)"]):
	                                means_and_extremes = True
	                            if is_optional(step) and row["Time"] in quick_times:
	                                # Optional step logged right after the previous row:
	                                # flag it and leave it out of the sequence.
	                                auto_complete = True
	                                continue
	                            if not step_names_token or step != step_names_token[-1]:
	                                step_names_token.append(step)
	                                # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                                outcome.append(str(row["Outcome"]))
	                                help_level.append(str(row["Help Level"]))
	                            else:
	                                # Repeated step: fold into the previous entry.  str()
	                                # keeps a NaN outcome from raising TypeError.
	                                outcome[-1] = outcome[-1] + ":" + str(row["Outcome"])
	                                help_level[-1] = help_level[-1] + ":" + str(row["Help Level"])

	                        core_steps = {s for s in step_names_token if not is_optional(s)}
	                        # NOTE(review): the pre-filter keeps >= 4 distinct steps while
	                        # this one requires > 4; kept as in the original.
	                        if not step_names_token or len(core_steps) <= 4:
	                            continue

	                        where_opt = [
	                            "1" if s in options.opt_step1
	                            else "2" if s in options.opt_step2
	                            else "0"
	                            for s in step_names_token
	                        ]

	                        # Label "1" when any non-first optional step name occurs as a
	                        # substring of a step in the sequence.
	                        label_opt = "0"
	                        for opt_steps in (options.opt_step1, options.opt_step2):
	                            if opt_steps and any(
	                                    opt in s for opt in opt_steps[1:] for s in step_names_token):
	                                label_opt = "1"

	                        # progress, problem name, student id, auto_complete, total steps length,
	                        # outcome seq, help_level seq, encoding in steps length, er or me
	                        info = ",".join([
	                            str(progress), str(prob), str(student), str(auto_complete),
	                            str(len(step_names_token)),
	                            "\t".join(map(str, outcome)),
	                            "\t".join(map(str, help_level)),
	                            "\t".join(map(str, where_opt)),
	                            f"{1 if means_and_extremes else 0}",
	                        ])

	                        if prob in first_prob_list:
	                            seq_out, info_out, label_out = train_file, train_info, train_label
	                        elif prob in last_prob_list:
	                            seq_out, info_out, label_out = test_file, test_info, test_label
	                        else:
	                            continue
	                        seq_out.write("\t".join(step_names_token))
	                        seq_out.write("\n")
	                        info_out.write(info)
	                        info_out.write("\n")
	                        label_out.write(label_opt)
	                        label_out.write("\n")

	def _prepare_finetuning_strategy_files_old(data_processor, opts, subfolder, select_problems):
		'''
		Shared implementation of the IS/FS fine-tuning file writers (L@S paper).

		Every train*/test* path found in opts is redirected into `subfolder`. Then,
		for each student of workspace `opts.workspace_name`, the problems chosen by
		`select_problems` are converted into one tab-separated step sequence each
		and written to the five train_* files. One blank line is written after
		each student's group of sequences.

		Parameters:
			data_processor: object whose load_file_iterator() yields pandas
				DataFrame chunks.
			opts: options namespace. It is deep-copied; the caller's object is
				left untouched.
			subfolder: directory name inserted in front of the file name of every
				train*/test* path.
			select_problems: callable receiving the student's unique problem
				names (in order of first appearance once rows are sorted by
				"Time") and returning the problems to keep.

		Lines written per kept problem:
			train_file: the step names.
			train_label: "1" if any entry of opt_step1/opt_step2 other than the
				first one occurs inside a step name, else "0".
			trainr_label: "1" if correctness > 0.75, else "0".
			train_info: comma-separated metadata (progress, problem, student,
				auto-complete flag, sequence length, outcomes, help levels,
				optional-step encoding, correctness).
			train_gt_label: "1" if an opt_step1 row had an etalon that parses as
				float but not as int, else "0".
		'''
		from contextlib import ExitStack

		options = copy.deepcopy(opts)
		for k, v in vars(opts).items():
			if (k.startswith("train") or k.startswith("test")) and v:
				# Only the first three "/"-separated components of the path are used.
				f_path = v.split("/")
				f_path = f_path[0]+"/"+f_path[1]+"/"+subfolder+"/"+f_path[2]
				setattr(options, k, f_path)
				print(f"options.{k} : {getattr(options, k)}")

		chunk_iterator = data_processor.load_file_iterator()
		error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
		row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']

		# ExitStack closes every output file even when processing raises.
		with ExitStack() as stack:
			train_file = stack.enter_context(open(options.train_file_path, "w"))
			train_info = stack.enter_context(open(options.train_info_path, "w"))
			train_label = stack.enter_context(open(options.train_label_path, "w"))
			trainr_label = stack.enter_context(open(options.trainr_label_path, "w"))
			train_gt_label = stack.enter_context(open(options.train_gt_label_path, "w"))

			overall_data = []
			overall_labels = []
			for chunk_data in chunk_iterator:
				for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
					if options.workspace_name != section:
						continue
					for student, student_groups in section_groups.groupby("Anon Student Id"):
						# sort_values returns a new frame; without the assignment the
						# "first"/"last" problems would follow file order, not time order.
						student_groups = student_groups.sort_values(by="Time", kind="mergesort")
						prob_list = list(pd.unique(student_groups["Problem Name"]))
						if len(prob_list) < 3:
							continue
						selected_probs = select_problems(prob_list)

						for prob, prob_groups in student_groups.groupby("Problem Name"):
							if prob not in selected_probs:
								continue

							# Timestamps following the previous row by less than 2000 time units.
							time_stamps = list(prob_groups["Time"])
							time_stamps_list = set()
							for prev_time, next_time in zip(time_stamps, time_stamps[1:]):
								if (next_time - prev_time) < 2000:
									time_stamps_list.add(next_time)

							step_names_token = []
							progress = ""
							outcome = []
							help_level = []
							auto_complete = False
							means_and_extremes = False
							finals = len(options.final_step)
							totals = 0

							for _, row in prob_groups[row_columns].iterrows():
								step = row["Step Name"]
								etalon = row["CF (Etalon)"]
								progress = row["CF (Workspace Progress Status)"]
								if pd.isna(step):
									continue
								if step in options.opt_step1:
									try:
										int(etalon)
									except (ValueError, TypeError, OverflowError):
										try:
											float(etalon)
											means_and_extremes = True
										except (ValueError, TypeError):
											pass
								if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
									# Optional step logged right after the previous row: flagged as
									# auto-completed and left out of the sequence.
									auto_complete = True
									continue

								if not step_names_token or step != step_names_token[-1]:
									step_names_token.append(step)
									# ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
									outcome.append(row['Outcome'])
									help_level.append(str(row["Help Level"]))
									if finals == 0:
										totals += 1
								else:
									# Consecutive rows of the same step are merged with ":".
									outcome[-1] = outcome[-1]+":"+row['Outcome']
									help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

							# A step counts as erroneous when any of its attempts contains an
							# error indicator. With final steps configured only those steps
							# are inspected; otherwise every step is.
							errors = 0
							for step, out in zip(step_names_token, outcome):
								if (finals and step in options.final_step) or totals > 0:
									attempts = out.split(":")
									if any(any(ind in o for o in attempts) for ind in error_ind):
										errors += 1

							if finals:
								totals = finals

							if not step_names_token:
								continue

							where_opt = []
							for stp in step_names_token:
								if stp in options.opt_step1:
									where_opt.append("1")
								elif stp in options.opt_step2:
									where_opt.append("2")
								else:
									where_opt.append("0")

							label_opt = "0"
							for opt_steps in (options.opt_step1, options.opt_step2):
								# The first entry of each list is skipped by the [1:] slice.
								if opt_steps and any(opt in token for opt in opt_steps[1:] for token in step_names_token):
									label_opt = "1"

							correctness = 1 - errors/totals
							strat_correct = "1" if correctness > 0.75 else "0"

							# progress, problem name, student id, auto complete, total steps length,
							# outcome seq, help_level seq, encoding in steps length, correctness
							info = ",".join([
								str(progress), str(prob), str(student), str(auto_complete),
								str(len(step_names_token)),
								"\t".join(map(str, outcome)),
								"\t".join(map(str, help_level)),
								"\t".join(map(str, where_opt)),
								str(correctness),
							])

							overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
							overall_labels.append(label_opt)

						# Empty entry marks the end of this student's sequences.
						overall_data.append('')
						overall_labels.append('')

			writtenTrain = False
			for all_data, label in zip(overall_data, overall_labels):
				if all_data:
					steps_seq, strat_correct, info, me_opt = all_data
					writtenTrain = True
					train_file.write(steps_seq + "\n")
					train_label.write(label + "\n")
					trainr_label.write(strat_correct + "\n")
					train_info.write(info + "\n")
					train_gt_label.write(me_opt + "\n")
				elif writtenTrain:
					# Separator: emit one blank line, and only after something was written.
					writtenTrain = False
					for handle in (train_file, train_info, train_label, trainr_label, train_gt_label):
						handle.write("\n")


	def prepare_finetuning_IS_files_old(data_processor, opts):
		'''
		Used for L@S paper. This function gathers first three problems of each student.
		Only two strategies were defined as:
		0: non-opt strategy
		1: opt used strategy

		Output paths are the train*/test* paths of opts with an "IS" directory
		inserted before the file name. Students with fewer than three problems
		are skipped.
		'''
		_prepare_finetuning_strategy_files_old(data_processor, opts, "IS", lambda probs: probs[:3])


	def prepare_finetuning_FS_files_old(data_processor, opts):
		'''
		Used for L@S paper. This function gathers last three problems of each student.
		Only two strategies were defined as:
		0: non-opt strategy
		1: opt used strategy

		Output paths are the train*/test* paths of opts with an "FS" directory
		inserted before the file name. Students with fewer than three problems
		are skipped.
		'''
		_prepare_finetuning_strategy_files_old(data_processor, opts, "FS", lambda probs: probs[-3:])


	def prepare_finetuning_correctness_files(data_processor, options):
		'''
		Ongoing research. Student strategy learning/predicting.

		Builds one step sequence per completed problem of the
		"ratio_proportion_change3" workspace and labels it from the FinalAnswer step:
		Correct: 1 , exactly one retained row has step name "FinalAnswer"
		Incorrect: 0 , otherwise

		A problem is used only if its actions contain "Done" and it has more than
		four distinct non-optional steps. Roughly 10% of the sequences (half with
		label 0, half with label 1, limited by the rarer class) are sampled at
		random into the train files; the rest go to the test files.

		Parameters:
			data_processor: object whose load_file_iterator() yields pandas
				DataFrame chunks.
			options: namespace providing opt_step1, opt_step2 and the
				train_/test_ file, info and label paths.
		'''
		from contextlib import ExitStack

		chunk_iterator = data_processor.load_file_iterator()
		row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']

		# ExitStack closes every output file even when processing raises.
		with ExitStack() as stack:
			train_file = stack.enter_context(open(options.train_file_path, "w"))
			train_info = stack.enter_context(open(options.train_info_path, "w"))
			train_label = stack.enter_context(open(options.train_label_path, "w"))

			test_file = stack.enter_context(open(options.test_file_path, "w"))
			test_info = stack.enter_context(open(options.test_info_path, "w"))
			test_label = stack.enter_context(open(options.test_label_path, "w"))

			overall_data = []
			overall_labels = []
			for chunk_data in chunk_iterator:
				for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
					if "ratio_proportion_change3" != section:
						continue
					for student, student_groups in section_groups.groupby("Anon Student Id"):
						# Reassign instead of sorting a groupby slice in place.
						student_groups = student_groups.sort_values(by="Time", kind="mergesort")
						for prob, prob_groups in student_groups.groupby("Problem Name"):
							# A problem should be completed by a student clicking Done button.
							if "Done" not in list(prob_groups["Action"]):
								continue
							unique_steps_len = len({
								s for s in pd.unique(prob_groups["Step Name"])
								if not pd.isna(s) and s not in options.opt_step1 and s not in options.opt_step2
							})
							if unique_steps_len < 4:
								continue

							# Timestamps following the previous row by less than 1800 time units.
							time_stamps = list(prob_groups["Time"])
							time_stamps_list = set()
							for prev_time, next_time in zip(time_stamps, time_stamps[1:]):
								if (next_time - prev_time) < 1800:
									time_stamps_list.add(next_time)

							step_names_token = []
							progress = ""
							outcome = []
							help_level = []
							auto_complete = False
							means_and_extremes = False
							final_correct = 0
							for _, row in prob_groups[row_columns].iterrows():
								step = row["Step Name"]
								etalon = row["CF (Etalon)"]
								progress = row["CF (Workspace Progress Status)"]
								if pd.isna(step):
									continue
								if step in options.opt_step1:
									try:
										int(etalon)
									except (ValueError, TypeError, OverflowError):
										try:
											float(etalon)
											means_and_extremes = True
										except (ValueError, TypeError):
											pass
								if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
									# Optional step logged right after the previous row: flagged as
									# auto-completed and left out of the sequence.
									auto_complete = True
									continue

								if not step_names_token or step != step_names_token[-1]:
									step_names_token.append(step)
									# ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
									outcome.append(row['Outcome'])
									help_level.append(str(row["Help Level"]))
								else:
									# Consecutive rows of the same step are merged with ":".
									outcome[-1] = outcome[-1]+":"+row['Outcome']
									help_level[-1] = help_level[-1]+":"+str(row['Help Level'])
								# Counts every retained FinalAnswer row, i.e. each attempt.
								if step == "FinalAnswer":
									final_correct += 1

							unique_steps_len = len({
								s for s in step_names_token
								if s not in options.opt_step1 and s not in options.opt_step2
							})
							# More than 4 distinct non-optional steps in the sequence.
							if not step_names_token or unique_steps_len <= 4:
								continue

							where_opt = []
							for stp in step_names_token:
								if stp in options.opt_step1:
									where_opt.append("1")
								elif stp in options.opt_step2:
									where_opt.append("2")
								else:
									where_opt.append("0")

							label_opt = "1" if final_correct == 1 else "0"

							# progress, problem name, student id, auto complete, total steps length,
							# outcome seq, help_level seq, encoding in steps length, means-and-extremes flag
							info = ",".join([
								str(progress), str(prob), str(student), str(auto_complete),
								str(len(step_names_token)),
								"\t".join(map(str, outcome)),
								"\t".join(map(str, help_level)),
								"\t".join(map(str, where_opt)),
								f"{1 if means_and_extremes else 0}",
							])
							overall_data.append(["\t".join(step_names_token), info])
							overall_labels.append(label_opt)

			labels_arr = np.array(overall_labels, dtype=str)
			indices_of_zeros = np.flatnonzero(labels_arr == '0').tolist()
			indices_of_ones = np.flatnonzero(labels_arr == '1').tolist()

			train_len = int(len(overall_labels) * 0.10)
			# Clamp to the rarer class: random.sample raises ValueError when asked
			# for more items than the population holds.
			sample_size = min(int(train_len/2), len(indices_of_zeros), len(indices_of_ones))
			print(f"sample_size: {sample_size}")
			# A set keeps the per-record membership test below O(1).
			sampled_instances = set(random.sample(indices_of_zeros, sample_size))
			sampled_instances.update(random.sample(indices_of_ones, sample_size))

			for index, ((steps_seq, info), label) in enumerate(zip(overall_data, overall_labels)):
				if index in sampled_instances:
					out_file, out_info, out_label = train_file, train_info, train_label
				else:
					out_file, out_info, out_label = test_file, test_info, test_label
				out_file.write(steps_seq + "\n")
				out_info.write(info + "\n")
				out_label.write(label + "\n")

	def prepare_finetuning_correctness_files_old(data_processor, opts):
		'''
		Ongoing research. Student strategy learning/predicting.
		Correct, 1: correctness of final strategy > 0.75
		Incorrect, 0: else < 0.75

		Correctness is 1 - errors/totals, where totals is the number of attempts
		recorded on a step contained in opts.final_step. Problems without such a
		step are skipped.

		The train*/test* paths of opts are redirected into a "fa_correctness"
		directory placed before the file name. The workspace is
		opts.workspace_name with its last "_"-separated token removed. A balanced
		random sample (20% of the rarer class, per class) goes to the train files
		and everything else to the test files. A blank line separates students.

		Parameters:
			data_processor: object whose load_file_iterator() yields pandas
				DataFrame chunks.
			opts: options namespace. It is deep-copied; the caller's object is
				left untouched.
		'''
		from contextlib import ExitStack

		options = copy.deepcopy(opts)
		for k, v in vars(opts).items():
			if (k.startswith("train") or k.startswith("test")) and v:
				# Only the first three "/"-separated components of the path are used.
				f_path = v.split("/")
				f_path = f_path[0]+"/"+f_path[1]+"/fa_correctness/"+f_path[2]
				setattr(options, k, f_path)
				print(f"options.{k} : {getattr(options, k)}")

		chunk_iterator = data_processor.load_file_iterator()
		error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
		row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']

		# ExitStack closes every output file even when processing raises.
		with ExitStack() as stack:
			train_file = stack.enter_context(open(options.train_file_path, "w"))
			train_info = stack.enter_context(open(options.train_info_path, "w"))
			train_label = stack.enter_context(open(options.train_label_path, "w"))

			test_file = stack.enter_context(open(options.test_file_path, "w"))
			test_info = stack.enter_context(open(options.test_info_path, "w"))
			test_label = stack.enter_context(open(options.test_label_path, "w"))

			ws = "_".join(options.workspace_name.split("_")[:-1])
			print("Workspace: ", ws)

			overall_data = []
			overall_labels = []
			for chunk_data in chunk_iterator:
				for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
					if ws != section:
						continue
					for student, student_groups in section_groups.groupby("Anon Student Id"):
						# sort_values returns a new frame; the result must be kept for the
						# rows of each problem to be processed in time order.
						student_groups = student_groups.sort_values(by="Time", kind="mergesort")

						for prob, prob_groups in student_groups.groupby("Problem Name"):
							# Timestamps following the previous row by less than 2000 time units.
							time_stamps = list(prob_groups["Time"])
							time_stamps_list = set()
							for prev_time, next_time in zip(time_stamps, time_stamps[1:]):
								if (next_time - prev_time) < 2000:
									time_stamps_list.add(next_time)

							step_names_token = []
							progress = ""
							outcome = []
							help_level = []
							auto_complete = False
							means_and_extremes = False
							totals = 0

							for _, row in prob_groups[row_columns].iterrows():
								step = row["Step Name"]
								etalon = row["CF (Etalon)"]
								progress = row["CF (Workspace Progress Status)"]
								if pd.isna(step):
									continue
								if step in options.opt_step1:
									try:
										int(etalon)
									except (ValueError, TypeError, OverflowError):
										try:
											float(etalon)
											means_and_extremes = True
										except (ValueError, TypeError):
											pass
								if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
									# Optional step logged right after the previous row: flagged as
									# auto-completed and left out of the sequence.
									auto_complete = True
									continue

								if not step_names_token or step != step_names_token[-1]:
									step_names_token.append(step)
									# ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
									outcome.append(row['Outcome'])
									help_level.append(str(row["Help Level"]))
								else:
									# Consecutive rows of the same step are merged with ":".
									outcome[-1] = outcome[-1]+":"+row['Outcome']
									help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

							errors = 0
							for step, out in zip(step_names_token, outcome):
								if step in options.final_step:
									attempts = out.split(":")
									totals = len(attempts)
									# NOTE(review): this counts how many distinct error indicators
									# occur among the attempts, not how many attempts failed.
									for ind in error_ind:
										if ind in attempts:
											errors += 1

							if not (step_names_token and totals > 0):
								continue

							where_opt = []
							for stp in step_names_token:
								if stp in options.opt_step1:
									where_opt.append("1")
								elif stp in options.opt_step2:
									where_opt.append("2")
								else:
									where_opt.append("0")

							# Strategy label (0-8). The flags default to False so the combined
							# checks below stay defined when only one optional-step list is set.
							label_opt = "0"
							any_opt1 = all_opt1 = any_opt2 = all_opt2 = False
							if options.opt_step1:
								all_opt1 = all(any(opt in token for token in step_names_token) for opt in options.opt_step1)
								any_opt1 = any(any(opt in token for token in step_names_token) for opt in options.opt_step1[1:])
								if any_opt1:
									label_opt = "2"
								if all_opt1:
									label_opt = "1"

							if options.opt_step2:
								all_opt2 = all(any(opt in token for token in step_names_token) for opt in options.opt_step2)
								any_opt2 = any(any(opt in token for token in step_names_token) for opt in options.opt_step2[1:])
								if any_opt2:
									label_opt = "4"
								if all_opt2:
									label_opt = "3"
							if any_opt1 and any_opt2:
								label_opt = "5"
							if any_opt1 and all_opt2:
								label_opt = "6"
							if all_opt1 and any_opt2:
								label_opt = "7"
							if all_opt1 and all_opt2:
								label_opt = "8"

							correctness = 1 - errors/totals
							strat_correct = "1" if correctness > 0.75 else "0"

							# progress, problem name, student id, auto complete, total steps length,
							# outcome seq, help_level seq, encoding in steps length, correctness,
							# means-and-extremes flag
							info = ",".join([
								str(progress), str(prob), str(student), str(auto_complete),
								str(len(step_names_token)),
								"\t".join(map(str, outcome)),
								"\t".join(map(str, help_level)),
								"\t".join(map(str, where_opt)),
								str(correctness),
								f"{1 if means_and_extremes else 0}",
							])

							# label_opt is kept in the record although it is not written to any file.
							overall_data.append(["\t".join(step_names_token), label_opt, info])
							overall_labels.append(strat_correct)

						# Empty entry marks the end of this student's sequences.
						overall_data.append('')
						overall_labels.append('')

			labels_arr = np.array(overall_labels, dtype=str)
			indices_of_zeros = np.flatnonzero(labels_arr == '0').tolist()
			indices_of_ones = np.flatnonzero(labels_arr == '1').tolist()

			per = 0.20
			zeros_instances_size = int(per * len(indices_of_zeros))
			ones_instances_size = int(per * len(indices_of_ones))

			sample_size = min(zeros_instances_size, ones_instances_size)
			# A set keeps the per-record membership test below O(1).
			sampled_instances = set(random.sample(indices_of_zeros, sample_size))
			sampled_instances.update(random.sample(indices_of_ones, sample_size))

			writtenTrain = False
			writtenTest = False
			for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
				if all_data:
					steps_seq = all_data[0]
					info = all_data[2]
					if index in sampled_instances:
						writtenTrain = True
						train_file.write(steps_seq + "\n")
						train_label.write(label + "\n")
						train_info.write(info + "\n")
					else:
						writtenTest = True
						test_file.write(steps_seq + "\n")
						test_label.write(label + "\n")
						test_info.write(info + "\n")
				else:
					# Separator: one blank line per split, only after something was written.
					if writtenTrain:
						writtenTrain = False
						for handle in (train_file, train_info, train_label):
							handle.write("\n")
					if writtenTest:
						writtenTest = False
						for handle in (test_file, test_info, test_label):
							handle.write("\n")

	def prepare_finetuning_correctness_aaai_files(data_processor, opts):
		'''
		Ongoing research. Student strategy learning/predicting.
		Correct, 1: correctness of final strategy > 0.75
		Incorrect, 0: else < 0.75

		Correctness is 1 - errors/totals, where totals is the number of attempts
		recorded on a step contained in opts.final_step. Problems without such a
		step are skipped.

		The train*/test*/val* paths of opts are redirected into an "aaai"
		directory placed before the file name. Student and problem selections are
		read from pickle files under "<opts.workspace_name>/aaai/". Sequences of
		high performers go to the train files, mid performers to the val files
		and low performers to the test files.

		Parameters:
			data_processor: object whose load_file_iterator() yields pandas
				DataFrame chunks.
			opts: options namespace. It is deep-copied; the caller's object is
				left untouched.
		'''
		from contextlib import ExitStack

		options = copy.deepcopy(opts)
		for k, v in vars(opts).items():
			if (k.startswith("train") or k.startswith("test") or k.startswith("val")) and v:
				# Only the first three "/"-separated components of the path are used.
				f_path = v.split("/")
				f_path = f_path[0]+"/"+f_path[1]+"/aaai/"+f_path[2]
				setattr(options, k, f_path)
				print(f"options.{k} : {getattr(options, k)}")

		chunk_iterator = data_processor.load_file_iterator()
		error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
		row_columns = ['Time', 'Step Name', 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', 'CF (Etalon)']

		def _load_pickle(file_name):
			# NOTE: pickle.load can execute arbitrary code; only use trusted files.
			with open(f"{options.workspace_name}/aaai/{file_name}", "rb") as handle:
				return pickle.load(handle)

		# ExitStack closes every output file even when processing raises.
		with ExitStack() as stack:
			train_file = stack.enter_context(open(options.train_file_path, "w"))
			train_info = stack.enter_context(open(options.train_info_path, "w"))
			train_label = stack.enter_context(open(options.train_label_path, "w"))

			val_file = stack.enter_context(open(options.val_file_path, "w"))
			val_info = stack.enter_context(open(options.val_info_path, "w"))
			val_label = stack.enter_context(open(options.val_label_path, "w"))

			test_file = stack.enter_context(open(options.test_file_path, "w"))
			test_info = stack.enter_context(open(options.test_info_path, "w"))
			test_label = stack.enter_context(open(options.test_label_path, "w"))

			high_performer = _load_pickle("change3_high_performers.pkl")
			mid_performer = _load_pickle("change3_mid_performers.pkl")
			low_performer = _load_pickle("change3_low_performers.pkl")
			prob_sel_list = _load_pickle("change3_problem_list.pkl")

			ws = "_".join(options.workspace_name.split("_")[:-1])

			print(ws, len(high_performer), len(mid_performer), len(low_performer), len(prob_sel_list))

			# Each record is (step sequence, info line, student id, label).
			records = []
			for chunk_data in chunk_iterator:
				for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
					if ws != section:
						continue
					for student, student_groups in section_groups.groupby("Anon Student Id"):
						if not (student in high_performer or student in mid_performer or student in low_performer):
							continue
						# sort_values returns a new frame; the result must be kept for the
						# rows of each problem to be processed in time order.
						student_groups = student_groups.sort_values(by="Time", kind="mergesort")

						for prob, prob_groups in student_groups.groupby("Problem Name"):
							# Only the pre-selected problems are used.
							if prob not in prob_sel_list:
								continue

							# Timestamps following the previous row by less than 2000 time units.
							time_stamps = list(prob_groups["Time"])
							time_stamps_list = set()
							for prev_time, next_time in zip(time_stamps, time_stamps[1:]):
								if (next_time - prev_time) < 2000:
									time_stamps_list.add(next_time)

							step_names_token = []
							progress = ""
							outcome = []
							help_level = []
							auto_complete = False
							means_and_extremes = False
							totals = 0

							for _, row in prob_groups[row_columns].iterrows():
								step = row["Step Name"]
								etalon = row["CF (Etalon)"]
								progress = row["CF (Workspace Progress Status)"]
								if pd.isna(step):
									continue
								if step in options.opt_step1:
									try:
										int(etalon)
									except (ValueError, TypeError, OverflowError):
										try:
											float(etalon)
											means_and_extremes = True
										except (ValueError, TypeError):
											pass
								if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
									# Optional step logged right after the previous row: flagged as
									# auto-completed and left out of the sequence.
									auto_complete = True
									continue

								if not step_names_token or step != step_names_token[-1]:
									step_names_token.append(step)
									# ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
									outcome.append(row['Outcome'])
									help_level.append(str(row["Help Level"]))
								else:
									# Consecutive rows of the same step are merged with ":".
									outcome[-1] = outcome[-1]+":"+row['Outcome']
									help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

							errors = 0
							for step, out in zip(step_names_token, outcome):
								if step in options.final_step:
									attempts = out.split(":")
									totals = len(attempts)
									# NOTE(review): this counts how many distinct error indicators
									# occur among the attempts, not how many attempts failed.
									for ind in error_ind:
										if ind in attempts:
											errors += 1

							if not (step_names_token and totals > 0):
								continue

							where_opt = []
							for stp in step_names_token:
								if stp in options.opt_step1:
									where_opt.append("1")
								elif stp in options.opt_step2:
									where_opt.append("2")
								else:
									where_opt.append("0")

							correctness = 1 - errors/totals
							strat_correct = "1" if correctness > 0.75 else "0"

							# progress, correctness, means-and-extremes flag, problem name, student id,
							# auto complete, total steps length, outcome seq, help_level seq,
							# encoding in steps length
							info = ",".join([
								str(progress), str(correctness), f"{1 if means_and_extremes else 0}",
								str(prob), str(student), str(auto_complete),
								str(len(step_names_token)),
								"\t".join(map(str, outcome)),
								"\t".join(map(str, help_level)),
								"\t".join(map(str, where_opt)),
							])

							# The student id is stored with the record: re-parsing it from the
							# comma-joined info line breaks when a field contains a comma.
							records.append(("\t".join(step_names_token), info, student, strat_correct))

			for steps_seq, info, student, label in records:
				if student in high_performer:
					out_file, out_label, out_info = train_file, train_label, train_info
				elif student in mid_performer:
					out_file, out_label, out_info = val_file, val_label, val_info
				elif student in low_performer:
					out_file, out_label, out_info = test_file, test_label, test_info
				else:
					continue
				out_file.write(steps_seq + "\n")
				out_label.write(label + "\n")
				out_info.write(info + "\n")

	def prepare_finetuning_SL_files(data_processor, opts):
	    '''
	    Write the train/test files used to fine-tune on strategy labels (SL).

	    Ongoing research. Student strategy learning/predicting.
	    Every (student, problem) pair found in workspace ``opts.workspace_name``
	    yields one tab-separated step sequence (consecutive repeats of a step are
	    merged). The sequence is labelled by how the two optional step groups
	    appear in it. First letter describes ``opt_step1``, second ``opt_step2``:
	    C = every step of the group occurs, P = some step other than the group's
	    first one occurs, U = neither.
	        Notation; Label
	        UU; 0
	        CU; 1
	        PU; 2
	        UC; 3
	        UP; 4
	        PP; 5
	        PC; 6
	        CP; 7
	        CC; 8

	    Split: the same number of sequences is drawn at random from every label
	    for training (20% of the rarest label's count, rounded down); everything
	    else goes to test. A blank line closes each student's run of sequences.

	    Output paths are the ``train*``/``test*`` attributes of ``opts`` with an
	    "SL" directory inserted after the second path component, so each path is
	    expected to look like "a/b/file". ``opts`` itself is not modified.

	    Args:
	        data_processor: object whose ``load_file_iterator()`` yields pandas
	            DataFrames holding the transaction columns read below.
	        opts: namespace providing ``workspace_name``, ``opt_step1``,
	            ``opt_step2``, ``final_step`` and the five train / five test
	            output paths (file, info, label, r_label, gt_label).
	    '''
	    from contextlib import ExitStack

	    options = copy.deepcopy(opts)
	    for k, v in vars(opts).items():
	        if (k.startswith("train") or k.startswith("test")) and v:
	            f_path = v.split("/")
	            setattr(options, k, f_path[0]+"/"+f_path[1]+"/SL/"+f_path[2])
	            print(f"options.{k} : {getattr(options, k)}")

	    columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	               'CF (Workspace Progress Status)', 'CF (Etalon)']
	    error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	    # When final steps are configured, correctness is measured on them only.
	    finals = len(options.final_step)

	    overall_data = []
	    overall_labels = []
	    for chunk_data in data_processor.load_file_iterator():
	        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	            if options.workspace_name != section:
	                continue
	            for student, student_groups in section_groups.groupby("Anon Student Id"):
	                # sort_values returns a new frame; keep it so the steps of a
	                # problem are processed in time order (stable for ties).
	                student_groups = student_groups.sort_values(by="Time", kind="mergesort")

	                for prob, prob_groups in student_groups.groupby("Problem Name"):
	                    # Rows arriving less than 2000 time units after the
	                    # previous row (presumably milliseconds -- TODO confirm).
	                    time_stamps = list(prob_groups["Time"])
	                    rapid_times = {
	                        later
	                        for earlier, later in zip(time_stamps, time_stamps[1:])
	                        if later - earlier < 2000
	                    }

	                    step_names_token = []
	                    outcome = []
	                    help_level = []
	                    progress = ""
	                    auto_complete = False
	                    means_and_extremes = False
	                    totals = 0

	                    for _, row in prob_groups[columns].iterrows():
	                        step = row["Step Name"]
	                        progress = row["CF (Workspace Progress Status)"]
	                        if pd.isna(step):
	                            continue
	                        if step in options.opt_step1:
	                            # An etalon that parses as float but not as int
	                            # flags the problem as "means and extremes".
	                            etalon = row["CF (Etalon)"]
	                            try:
	                                int(etalon)
	                            except (ValueError, TypeError, OverflowError):
	                                try:
	                                    float(etalon)
	                                except (ValueError, TypeError, OverflowError):
	                                    pass
	                                else:
	                                    means_and_extremes = True
	                        if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in rapid_times:
	                            # Optional step logged right after the previous
	                            # row: treated as auto-completed and dropped.
	                            auto_complete = True
	                            continue

	                        if not step_names_token or step != step_names_token[-1]:
	                            step_names_token.append(step)
	                            # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                            outcome.append(row['Outcome'])
	                            help_level.append(str(row["Help Level"]))
	                            if finals == 0:
	                                totals += 1
	                        else:
	                            # Repeated step: fold the attempt into the last token.
	                            outcome[-1] = outcome[-1]+":"+row['Outcome']
	                            help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

	                    if not step_names_token:
	                        continue

	                    # A step counts as an error if any of its attempts carries
	                    # an error/hint outcome.
	                    errors = sum(
	                        1
	                        for step, out in zip(step_names_token, outcome)
	                        if (not finals or step in options.final_step)
	                        and any(ind in part for part in out.split(":") for ind in error_ind)
	                    )
	                    if finals:
	                        totals = finals

	                    where_opt = []
	                    for stp in step_names_token:
	                        if stp in options.opt_step1:
	                            where_opt.append("1")
	                        elif stp in options.opt_step2:
	                            where_opt.append("2")
	                        else:
	                            where_opt.append("0")

	                    # Defaults keep the combined checks below well-defined
	                    # when one of the optional groups is empty.
	                    all_opt1 = any_opt1 = all_opt2 = any_opt2 = False
	                    label_opt = "0"
	                    if options.opt_step1:
	                        all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
	                        any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
	                        if any_opt1:
	                            label_opt = "2"
	                        if all_opt1:
	                            label_opt = "1"

	                    if options.opt_step2:
	                        all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
	                        any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
	                        # Later assignments deliberately override earlier ones.
	                        if any_opt2:
	                            label_opt = "4"
	                        if all_opt2:
	                            label_opt = "3"
	                        if any_opt1 and any_opt2:
	                            label_opt = "5"
	                        if any_opt1 and all_opt2:
	                            label_opt = "6"
	                        if all_opt1 and any_opt2:
	                            label_opt = "7"
	                        if all_opt1 and all_opt2:
	                            label_opt = "8"

	                    correctness = 1 - errors/totals
	                    strat_correct = "1" if correctness > 0.75 else "0"

	                    # progress, problem name, student id, auto-complete flag, total steps length,
	                    # outcome seq, help_level seq, encoding in steps length, correctness
	                    info = ",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
	                                     "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
	                                     "\t".join(where_opt), str(correctness)])

	                    overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"])
	                    overall_labels.append(label_opt)

	                # Empty entry marks the end of this student's sequences.
	                overall_data.append('')
	                overall_labels.append('')

	    # Balanced training sample: the same number of sequences per label.
	    per = 0.20
	    class_indices = [
	        [i for i, lab in enumerate(overall_labels) if lab == str(cls)]
	        for cls in range(9)
	    ]
	    sample_size = min(int(per * len(indices)) for indices in class_indices)
	    print(f"Sample size.... {sample_size}")
	    sampled_instances = set()
	    for indices in class_indices:
	        sampled_instances.update(random.sample(indices, sample_size))

	    # Order matches the record fields written below:
	    # step sequence, strategy label, correctness label, info, M&E flag.
	    train_paths = (options.train_file_path, options.train_label_path, options.trainr_label_path,
	                   options.train_info_path, options.train_gt_label_path)
	    test_paths = (options.test_file_path, options.test_label_path, options.testr_label_path,
	                  options.test_info_path, options.test_gt_label_path)

	    with ExitStack() as stack:
	        train_outs = [stack.enter_context(open(path, "w")) for path in train_paths]
	        test_outs = [stack.enter_context(open(path, "w")) for path in test_paths]

	        written_train = False
	        written_test = False
	        for index, (record, label) in enumerate(zip(overall_data, overall_labels)):
	            if record:
	                steps_seq, strat_correct, info, me_opt = record
	                if index in sampled_instances:
	                    written_train = True
	                    outs = train_outs
	                else:
	                    written_test = True
	                    outs = test_outs
	                for handle, value in zip(outs, (steps_seq, label, strat_correct, info, me_opt)):
	                    handle.write(value)
	                    handle.write("\n")
	            else:
	                # Student boundary: separate with a blank line, but only in
	                # the split(s) that received something since the last one.
	                if written_train:
	                    written_train = False
	                    for handle in train_outs:
	                        handle.write("\n")
	                if written_test:
	                    written_test = False
	                    for handle in test_outs:
	                        handle.write("\n")

	def prepare_finetuning_effectiveness_files(data_processor, opts):
	    '''
	    Write the train/test files used to fine-tune on strategy effectiveness.

	    Ongoing research. Student strategy learning/predicting.
	    Every (student, problem) pair found in workspace ``opts.workspace_name``
	    yields one tab-separated step sequence (consecutive repeats of a step are
	    merged). A strategy code is first derived from how the two optional step
	    groups appear in it. First letter describes ``opt_step1``, second
	    ``opt_step2``: C = every step of the group occurs, P = some step other
	    than the group's first one occurs, U = neither.
	        Notation; Label
	        UU; 0
	        CU; 1
	        PU; 2
	        UC; 3
	        UP; 4
	        PP; 5
	        PC; 6
	        CP; 7
	        CC; 8

	    The binary effectiveness label written to the label files is then:
	        if UU or CU or PU and gt = ER and correct, a positive instance
	        if UU or UC or UP and gt = ME and correct, a positive instance
	        else a strategy PP, PC, CP, CC and gt = ER/ME or incorrect, a negative instance
	    where "gt = ME" is the means-and-extremes flag detected from the etalon
	    (ER being its absence) and "correct" means correctness above 0.75.

	    Split: the same number of sequences is drawn at random from both labels
	    for training (20% of the rarer label's count, rounded down); everything
	    else goes to test. A blank line closes each student's run of sequences.

	    Output paths are the ``train*``/``test*`` attributes of ``opts`` with an
	    "effectiveness" directory inserted after the second path component, so
	    each path is expected to look like "a/b/file". ``opts`` is not modified.

	    Args:
	        data_processor: object whose ``load_file_iterator()`` yields pandas
	            DataFrames holding the transaction columns read below.
	        opts: namespace providing ``workspace_name``, ``opt_step1``,
	            ``opt_step2``, ``final_step`` and the five train / five test
	            output paths (file, info, label, r_label, gt_label).
	    '''
	    from contextlib import ExitStack

	    options = copy.deepcopy(opts)
	    for k, v in vars(opts).items():
	        if (k.startswith("train") or k.startswith("test")) and v:
	            f_path = v.split("/")
	            setattr(options, k, f_path[0]+"/"+f_path[1]+"/effectiveness/"+f_path[2])
	            print(f"options.{k} : {getattr(options, k)}")

	    columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	               'CF (Workspace Progress Status)', 'CF (Etalon)']
	    error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	    # When final steps are configured, correctness is measured on them only.
	    finals = len(options.final_step)

	    overall_data = []
	    overall_labels = []
	    for chunk_data in data_processor.load_file_iterator():
	        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	            if options.workspace_name != section:
	                continue
	            for student, student_groups in section_groups.groupby("Anon Student Id"):
	                # sort_values returns a new frame; keep it so the steps of a
	                # problem are processed in time order (stable for ties).
	                student_groups = student_groups.sort_values(by="Time", kind="mergesort")

	                for prob, prob_groups in student_groups.groupby("Problem Name"):
	                    # Rows arriving less than 2000 time units after the
	                    # previous row (presumably milliseconds -- TODO confirm).
	                    time_stamps = list(prob_groups["Time"])
	                    rapid_times = {
	                        later
	                        for earlier, later in zip(time_stamps, time_stamps[1:])
	                        if later - earlier < 2000
	                    }

	                    step_names_token = []
	                    outcome = []
	                    help_level = []
	                    progress = ""
	                    auto_complete = False
	                    means_and_extremes = False
	                    totals = 0

	                    for _, row in prob_groups[columns].iterrows():
	                        step = row["Step Name"]
	                        progress = row["CF (Workspace Progress Status)"]
	                        if pd.isna(step):
	                            continue
	                        if step in options.opt_step1:
	                            # An etalon that parses as float but not as int
	                            # flags the problem as "means and extremes".
	                            etalon = row["CF (Etalon)"]
	                            try:
	                                int(etalon)
	                            except (ValueError, TypeError, OverflowError):
	                                try:
	                                    float(etalon)
	                                except (ValueError, TypeError, OverflowError):
	                                    pass
	                                else:
	                                    means_and_extremes = True
	                        if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in rapid_times:
	                            # Optional step logged right after the previous
	                            # row: treated as auto-completed and dropped.
	                            auto_complete = True
	                            continue

	                        if not step_names_token or step != step_names_token[-1]:
	                            step_names_token.append(step)
	                            # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                            outcome.append(row['Outcome'])
	                            help_level.append(str(row["Help Level"]))
	                            if finals == 0:
	                                totals += 1
	                        else:
	                            # Repeated step: fold the attempt into the last token.
	                            outcome[-1] = outcome[-1]+":"+row['Outcome']
	                            help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

	                    if not step_names_token:
	                        continue

	                    # A step counts as an error if any of its attempts carries
	                    # an error/hint outcome.
	                    errors = sum(
	                        1
	                        for step, out in zip(step_names_token, outcome)
	                        if (not finals or step in options.final_step)
	                        and any(ind in part for part in out.split(":") for ind in error_ind)
	                    )
	                    if finals:
	                        totals = finals

	                    where_opt = []
	                    for stp in step_names_token:
	                        if stp in options.opt_step1:
	                            where_opt.append("1")
	                        elif stp in options.opt_step2:
	                            where_opt.append("2")
	                        else:
	                            where_opt.append("0")

	                    # Defaults keep the combined checks below well-defined
	                    # when one of the optional groups is empty.
	                    all_opt1 = any_opt1 = all_opt2 = any_opt2 = False
	                    label_opt = "0"
	                    if options.opt_step1:
	                        all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
	                        any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
	                        if any_opt1:
	                            label_opt = "2"
	                        if all_opt1:
	                            label_opt = "1"

	                    if options.opt_step2:
	                        all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
	                        any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
	                        # Later assignments deliberately override earlier ones.
	                        if any_opt2:
	                            label_opt = "4"
	                        if all_opt2:
	                            label_opt = "3"
	                        if any_opt1 and any_opt2:
	                            label_opt = "5"
	                        if any_opt1 and all_opt2:
	                            label_opt = "6"
	                        if all_opt1 and any_opt2:
	                            label_opt = "7"
	                        if all_opt1 and all_opt2:
	                            label_opt = "8"

	                    correctness = 1 - errors/totals
	                    strat_correct = "1" if correctness > 0.75 else "0"

	                    # Positive only when a single-group strategy matches the
	                    # problem type and the attempt was correct.
	                    label_effectiveness = "0"
	                    if strat_correct == "1":
	                        if label_opt in ["0", "1", "2"] and not means_and_extremes:
	                            label_effectiveness = "1"
	                        elif label_opt in ["0", "3", "4"] and means_and_extremes:
	                            label_effectiveness = "1"

	                    me_flag = f"{1 if means_and_extremes else 0}"
	                    # progress, problem name, student id, auto-complete flag, total steps length,
	                    # outcome seq, help_level seq, encoding in steps length, correctness,
	                    # strategy code, M&E flag
	                    info = ",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
	                                     "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
	                                     "\t".join(where_opt), str(correctness), label_opt, me_flag])

	                    overall_data.append(["\t".join(step_names_token), strat_correct, info, me_flag])
	                    overall_labels.append(label_effectiveness)

	                # Empty entry marks the end of this student's sequences.
	                overall_data.append('')
	                overall_labels.append('')

	    # Balanced training sample: the same number of sequences per label.
	    per = 0.20
	    indices_of_zeros = [i for i, lab in enumerate(overall_labels) if lab == '0']
	    indices_of_ones = [i for i, lab in enumerate(overall_labels) if lab == '1']
	    sample_size = min(int(per * len(indices_of_zeros)), int(per * len(indices_of_ones)))
	    sampled_instances = set(random.sample(indices_of_zeros, sample_size))
	    sampled_instances.update(random.sample(indices_of_ones, sample_size))

	    # Order matches the record fields written below:
	    # step sequence, effectiveness label, correctness label, info, M&E flag.
	    train_paths = (options.train_file_path, options.train_label_path, options.trainr_label_path,
	                   options.train_info_path, options.train_gt_label_path)
	    test_paths = (options.test_file_path, options.test_label_path, options.testr_label_path,
	                  options.test_info_path, options.test_gt_label_path)

	    with ExitStack() as stack:
	        train_outs = [stack.enter_context(open(path, "w")) for path in train_paths]
	        test_outs = [stack.enter_context(open(path, "w")) for path in test_paths]

	        written_train = False
	        written_test = False
	        for index, (record, label) in enumerate(zip(overall_data, overall_labels)):
	            if record:
	                steps_seq, strat_correct, info, me_opt = record
	                if index in sampled_instances:
	                    written_train = True
	                    outs = train_outs
	                else:
	                    written_test = True
	                    outs = test_outs
	                for handle, value in zip(outs, (steps_seq, label, strat_correct, info, me_opt)):
	                    handle.write(value)
	                    handle.write("\n")
	            else:
	                # Student boundary: separate with a blank line, but only in
	                # the split(s) that received something since the last one.
	                if written_train:
	                    written_train = False
	                    for handle in train_outs:
	                        handle.write("\n")
	                if written_test:
	                    written_test = False
	                    for handle in test_outs:
	                        handle.write("\n")

	def prepare_attn_test_files(data_processor, opts):
	    '''
	    Write step sequences split into train/test files by ``opts.code``.

	    Every (student, problem) pair found in workspace ``opts.workspace_name``
	    yields one tab-separated step sequence (consecutive repeats of a step are
	    merged) plus one comma-separated info line. A sequence goes to the train
	    files when
	        code == "full", or
	        code == "gt"       and the problem is not means-and-extremes, or
	        code == "correct"  and correctness > 0.75, or
	        code == "progress" and the last progress status is "GRADUATED";
	    otherwise it goes to the test files. Train-side sequences whose strategy
	    code is "0" (no optional step group used) are dropped entirely.

	    When ``opts.code`` is set, the directory "<workspace_name>/<code>" is
	    created and "/<code>/" is inserted between the path components of every
	    ``train*``/``test*`` path of ``opts``; otherwise the paths are used as
	    given. No test files are opened for code == "full". ``opts`` itself is
	    not modified.

	    Args:
	        data_processor: object whose ``load_file_iterator()`` yields pandas
	            DataFrames holding the transaction columns read below.
	        opts: namespace providing ``code``, ``workspace_name``,
	            ``opt_step1``, ``opt_step2``, ``final_step`` and the
	            train/test file and info paths.
	    '''
	    from contextlib import ExitStack

	    options = copy.deepcopy(opts)
	    if options.code:
	        os.makedirs(f"{options.workspace_name}/{options.code}", exist_ok=True)
	        for k, v in vars(opts).items():
	            if (k.startswith("train") or k.startswith("test")) and v:
	                setattr(options, k, (f"/{options.code}/").join(v.split("/")))
	                print(f"options.{k} : {getattr(options, k)}")

	    columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	               'CF (Workspace Progress Status)', 'CF (Etalon)']
	    error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	    # When final steps are configured, correctness is measured on them only.
	    finals = len(options.final_step)

	    with ExitStack() as stack:
	        train_file = stack.enter_context(open(options.train_file_path, "w"))
	        train_info = stack.enter_context(open(options.train_info_path, "w"))
	        test_file = test_info = None
	        if options.code != "full":
	            test_file = stack.enter_context(open(options.test_file_path, "w"))
	            test_info = stack.enter_context(open(options.test_info_path, "w"))

	        for chunk_data in data_processor.load_file_iterator():
	            for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	                if options.workspace_name != section:
	                    continue
	                for student, student_groups in section_groups.groupby("Anon Student Id"):
	                    # sort_values returns a new frame; keep it so the steps of
	                    # a problem are processed in time order (stable for ties).
	                    student_groups = student_groups.sort_values(by="Time", kind="mergesort")

	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        # Rows arriving less than 2000 time units after the
	                        # previous row (presumably milliseconds -- TODO confirm).
	                        time_stamps = list(prob_groups["Time"])
	                        rapid_times = {
	                            later
	                            for earlier, later in zip(time_stamps, time_stamps[1:])
	                            if later - earlier < 2000
	                        }

	                        step_names_token = []
	                        outcome = []
	                        help_level = []
	                        progress = ""
	                        auto_complete = False
	                        means_and_extremes = False
	                        totals = 0

	                        for _, row in prob_groups[columns].iterrows():
	                            step = row["Step Name"]
	                            progress = row["CF (Workspace Progress Status)"]
	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1:
	                                # An etalon that parses as float but not as
	                                # int flags the problem as "means and extremes".
	                                etalon = row["CF (Etalon)"]
	                                try:
	                                    int(etalon)
	                                except (ValueError, TypeError, OverflowError):
	                                    try:
	                                        float(etalon)
	                                    except (ValueError, TypeError, OverflowError):
	                                        pass
	                                    else:
	                                        means_and_extremes = True
	                            if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in rapid_times:
	                                # Optional step logged right after the previous
	                                # row: treated as auto-completed and dropped.
	                                auto_complete = True
	                                continue

	                            if not step_names_token or step != step_names_token[-1]:
	                                step_names_token.append(step)
	                                # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                                outcome.append(row['Outcome'])
	                                help_level.append(str(row["Help Level"]))
	                                if finals == 0:
	                                    totals += 1
	                            else:
	                                # Repeated step: fold the attempt into the last token.
	                                outcome[-1] = outcome[-1]+":"+row['Outcome']
	                                help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

	                        if not step_names_token:
	                            continue

	                        # A step counts as an error if any of its attempts
	                        # carries an error/hint outcome.
	                        errors = sum(
	                            1
	                            for step, out in zip(step_names_token, outcome)
	                            if (not finals or step in options.final_step)
	                            and any(ind in part for part in out.split(":") for ind in error_ind)
	                        )
	                        if finals:
	                            totals = finals

	                        where_opt = []
	                        for stp in step_names_token:
	                            if stp in options.opt_step1:
	                                where_opt.append("1")
	                            elif stp in options.opt_step2:
	                                where_opt.append("2")
	                            else:
	                                where_opt.append("0")

	                        # Defaults keep the combined checks below well-defined
	                        # when one of the optional groups is empty.
	                        all_opt1 = any_opt1 = all_opt2 = any_opt2 = False
	                        label_opt = "0"
	                        if options.opt_step1:
	                            all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
	                            any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
	                            if any_opt1:
	                                label_opt = "2"
	                            if all_opt1:
	                                label_opt = "1"

	                        if options.opt_step2:
	                            all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
	                            any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
	                            # Later assignments deliberately override earlier ones.
	                            if any_opt2:
	                                label_opt = "4"
	                            if all_opt2:
	                                label_opt = "3"
	                            if any_opt1 and any_opt2:
	                                label_opt = "5"
	                            if any_opt1 and all_opt2:
	                                label_opt = "6"
	                            if all_opt1 and any_opt2:
	                                label_opt = "7"
	                            if all_opt1 and all_opt2:
	                                label_opt = "8"

	                        correctness = 1 - errors/totals
	                        opt_correct = "1" if correctness > 0.75 else "0"

	                        goes_to_train = (
	                            options.code == "full"
	                            or (options.code == "gt" and not means_and_extremes)
	                            or (options.code == "correct" and opt_correct == "1")
	                            or (options.code == "progress" and progress == "GRADUATED")
	                        )
	                        if goes_to_train:
	                            if label_opt == "0":
	                                continue
	                            seq_out, info_out = train_file, train_info
	                        else:
	                            seq_out, info_out = test_file, test_info

	                        seq_out.write("\t".join(step_names_token))
	                        seq_out.write("\n")
	                        # progress, problem name, student id, auto-complete flag, total steps length,
	                        # outcome seq, help_level seq, encoding in steps length, correctness,
	                        # M&E flag, strategy code
	                        info_out.write(",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
	                                                 "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(where_opt),
	                                                 str(correctness), f"{1 if means_and_extremes else 0}", label_opt]))
	                        info_out.write("\n")

	def prepare_finetuning_future_files(data_processor, opts):
	    '''
	    Write fine-tuning files split by problem type (means-and-extremes or not).

	    Every (student, problem) pair found in workspace ``opts.workspace_name``
	    yields one tab-separated step sequence (consecutive repeats of a step are
	    merged). Problems not flagged as means-and-extremes go to the train files,
	    flagged ones to the test files. Alongside each sequence are written:
	      * label file:   strategy code "0".."8" describing how the two optional
	                      step groups appear (first opt_step1, then opt_step2;
	                      1/3 = every step of a single group, 2/4 = partial,
	                      5..8 = combinations of both groups);
	      * r_label file: correctness quartile -- "0" below 0.25, "1" below 0.5,
	                      "2" below 0.75, "3" otherwise, where correctness is the
	                      share of steps without any error/hint outcome;
	      * info file:    comma-separated details of the attempt.
	    A blank line is written after each student in every split that has
	    received a sequence from that student or an earlier one.

	    Output paths are the ``train*``/``test*`` attributes of ``opts`` with
	    "/effectiveness/" inserted between their path components. ``opts``
	    itself is not modified.

	    Args:
	        data_processor: object whose ``load_file_iterator()`` yields pandas
	            DataFrames holding the transaction columns read below.
	        opts: namespace providing ``workspace_name``, ``opt_step1``,
	            ``opt_step2`` and the four train / four test output paths
	            (file, info, label, r_label).
	    '''
	    from contextlib import ExitStack

	    options = copy.deepcopy(opts)
	    for k, v in vars(opts).items():
	        if (k.startswith("train") or k.startswith("test")) and v:
	            setattr(options, k, ("/effectiveness/").join(v.split("/")))
	            print(f"options.{k} : {getattr(options, k)}")

	    columns = ['Time', 'Step Name', 'Outcome', 'Help Level',
	               'CF (Workspace Progress Status)', 'CF (Etalon)']
	    error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']

	    # Order matches the record fields written below:
	    # step sequence, strategy label, correctness label, info.
	    train_paths = (options.train_file_path, options.train_label_path,
	                   options.trainr_label_path, options.train_info_path)
	    test_paths = (options.test_file_path, options.test_label_path,
	                  options.testr_label_path, options.test_info_path)

	    with ExitStack() as stack:
	        train_outs = [stack.enter_context(open(path, "w")) for path in train_paths]
	        test_outs = [stack.enter_context(open(path, "w")) for path in test_paths]

	        for chunk_data in data_processor.load_file_iterator():
	            for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
	                if options.workspace_name != section:
	                    continue
	                for student, student_groups in section_groups.groupby("Anon Student Id"):
	                    written_train = False
	                    written_test = False

	                    # sort_values returns a new frame; keep it so the steps of
	                    # a problem are processed in time order (stable for ties).
	                    student_groups = student_groups.sort_values(by="Time", kind="mergesort")

	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        # Rows arriving less than 2000 time units after the
	                        # previous row (presumably milliseconds -- TODO confirm).
	                        time_stamps = list(prob_groups["Time"])
	                        rapid_times = {
	                            later
	                            for earlier, later in zip(time_stamps, time_stamps[1:])
	                            if later - earlier < 2000
	                        }

	                        step_names_token = []
	                        outcome = []
	                        help_level = []
	                        progress = ""
	                        auto_complete = False
	                        means_and_extremes = False

	                        for _, row in prob_groups[columns].iterrows():
	                            step = row["Step Name"]
	                            progress = row["CF (Workspace Progress Status)"]
	                            if pd.isna(step):
	                                continue
	                            if step in options.opt_step1:
	                                # An etalon that parses as float but not as
	                                # int flags the problem as "means and extremes".
	                                etalon = row["CF (Etalon)"]
	                                try:
	                                    int(etalon)
	                                except (ValueError, TypeError, OverflowError):
	                                    try:
	                                        float(etalon)
	                                    except (ValueError, TypeError, OverflowError):
	                                        pass
	                                    else:
	                                        means_and_extremes = True
	                            if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in rapid_times:
	                                # Optional step logged right after the previous
	                                # row: treated as auto-completed and dropped.
	                                auto_complete = True
	                                continue

	                            if not step_names_token or step != step_names_token[-1]:
	                                step_names_token.append(step)
	                                # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	                                outcome.append(row['Outcome'])
	                                help_level.append(str(row["Help Level"]))
	                            else:
	                                # Repeated step: fold the attempt into the last token.
	                                outcome[-1] = outcome[-1]+":"+row['Outcome']
	                                help_level[-1] = help_level[-1]+":"+str(row['Help Level'])

	                        if not step_names_token:
	                            continue

	                        # Every distinct step counts; a step is an error if any
	                        # of its attempts carries an error/hint outcome.
	                        totals = len(step_names_token)
	                        errors = sum(
	                            1
	                            for out in outcome
	                            if any(ind in part for part in out.split(":") for ind in error_ind)
	                        )

	                        where_opt = []
	                        for stp in step_names_token:
	                            if stp in options.opt_step1:
	                                where_opt.append("1")
	                            elif stp in options.opt_step2:
	                                where_opt.append("2")
	                            else:
	                                where_opt.append("0")

	                        # Defaults keep the combined checks below well-defined
	                        # when one of the optional groups is empty.
	                        all_opt1 = any_opt1 = all_opt2 = any_opt2 = False
	                        label_opt = "0"
	                        if options.opt_step1:
	                            all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1)
	                            any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:])
	                            if any_opt1:
	                                label_opt = "2"
	                            if all_opt1:
	                                label_opt = "1"

	                        if options.opt_step2:
	                            all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2)
	                            any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:])
	                            # Later assignments deliberately override earlier ones.
	                            if any_opt2:
	                                label_opt = "4"
	                            if all_opt2:
	                                label_opt = "3"
	                            if any_opt1 and any_opt2:
	                                label_opt = "5"
	                            if any_opt1 and all_opt2:
	                                label_opt = "6"
	                            if all_opt1 and any_opt2:
	                                label_opt = "7"
	                            if all_opt1 and all_opt2:
	                                label_opt = "8"

	                        correctness = 1 - errors/totals
	                        if correctness < 0.25:
	                            opt_correct = "0"
	                        elif correctness < 0.5:
	                            opt_correct = "1"
	                        elif correctness < 0.75:
	                            opt_correct = "2"
	                        else:
	                            opt_correct = "3"

	                        # progress, problem name, student id, auto-complete flag, total steps length,
	                        # outcome seq, help_level seq, encoding in steps length
	                        info = ",".join([str(progress), str(prob), str(student), str(auto_complete), str(len(step_names_token)),
	                                         "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(where_opt)])

	                        if means_and_extremes:
	                            written_test = True
	                            outs = test_outs
	                        else:
	                            written_train = True
	                            outs = train_outs
	                        for handle, value in zip(outs, ("\t".join(step_names_token), label_opt, opt_correct, info)):
	                            handle.write(value)
	                            handle.write("\n")

	                    # Indicates actions of next student
	                    if written_train:
	                        for handle in train_outs:
	                            handle.write("\n")
	                    if written_test:
	                        for handle in test_outs:
	                            handle.write("\n")

	def prepare_school_coded_finetuning_partial_seq_files(data_processor, options):
	'''
	Ongoing research.
	FinalAnswer step correctness
	Correct: 0 if attempt at step>1
	1 if attempt at step==1
	'''
	chunk_iterator = data_processor.load_file_iterator(sep=",")

	train_file = open(options.train_file_path, "w")
	train_info = open(options.train_info_path, "w")
	train_label = open(options.train_label_path, "w")

	# val_file = open(options.val_file_path, "w")
	# val_info = open(options.val_info_path, "w")
	# val_label = open(options.val_label_path, "w")

	test_file = open(options.test_file_path, "w")
	test_info = open(options.test_info_path, "w")
	test_label = open(options.test_label_path, "w")

	overall_data = []
	overall_labels = []
	# kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
	# kcs = [kc if not pd.isna(kc) for kc in kcs]
	for chunk_data in chunk_iterator:
	for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	if not options.school or school in options.school:
	print(f"{school} : {school_group.shape}")
	school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	(school_group['CF (Encounter)'] == 0) &
	(school_group['CF (Is Review Mode)'] == -1) ]
	print(f"{school} : {school_group.shape}")
	# for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'):
	for student, student_groups in school_group.groupby("Anon Student Id"):
	student_groups.sort_values(by="Time", inplace=True)
	# prob_list = list(pd.unique(student_groups["Problem Name"]))
	for prob, prob_groups in student_groups.groupby("Problem Name"):

	actions = list(prob_groups["Action"])
	# A problem should be completed by a student clicking Done button.
	if not "Done" in actions:
	continue
	unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
	unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
	if unique_steps_len < 4:
	continue
	unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
	if unique_opt_steps_len < 2:
	continue

	class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
	step_names_token = []
	original_steps_actions_attempts_help_levels_outcomes = []
	original_steps = []
	means_and_extremes = False
	correctness = "0"
	opt_used = False
	for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	step = row["Step Name"]
	action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	attempt = row["Attempt At Step"] # number
	outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	help_level = row["Help Level"] # number
	progress = row["CF (Workspace Progress Status)"]
	scenario = row['CF (Problem Scenario Tags)']

	if not pd.isna(step):
	if step in options.opt_step1 and not means_and_extremes:
	etalon = row["CF (Etalon)"]
	if not pd.isna(etalon):
	etalon = etalon.strip('{}')
	key, value = etalon.split('=')
	etalon = value
	try:
	etalon = int(etalon)
	except Exception as e:
	try:
	etalon = float(etalon)
	means_and_extremes = True
	except Exception as e:
	pass
	if row['CF (Is Autofilled)'] == True:
	continue
	prev = step_names_token[-1] if step_names_token else ""
	prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

	if not step_names_token or step != prev_step:
	if step in options.opt_step1 or step in options.opt_step2:
	new_step = step
	opt_used = True
	else:
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	if step != "FinalAnswer":
	step_names_token.append(new_step)
	else:
	step_names_token.append("FinalAnswer")
	else:
	if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	if prev < new_step:
	step_names_token[-1] = new_step

	if step == "FinalAnswer" and opt_used:
	if attempt == 1 and outcome == "OK":
	correctness = "1"
	else:
	correctness = "0"
	original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	original_steps.append(step)

	unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
	if step_names_token and unique_steps_len > 4:
	info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	"\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])
	overall_data.append(["\t".join(step_names_token), info])
	overall_labels.append(correctness)
	# proba = random.random()
	# # if prob in first_prob_list:
	# if proba <= 0.8:
	# train_file.write("\t".join(step_names_token))
	# train_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# train_info.write("\n")

	# elif proba > 0.9:
	# # elif prob in last_prob_list:
	# test_file.write("\t".join(step_names_token))
	# test_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# test_info.write("\n")

	# else:
	# val_file.write("\t".join(step_names_token))
	# val_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# val_info.write("\n")
	# break
	# break
	# break
	# break
	# break
	overall_labels = np.array(overall_labels)
	indices_of_zeros = list(np.where(overall_labels == '0')[0])
	indices_of_ones = list(np.where(overall_labels == '1')[0])

	train_len = int(len(overall_labels) * 0.10)
	sample_size = int(train_len/2)
	print(f"sample_size: {sample_size}")
	sampled_instances = random.sample(indices_of_zeros, sample_size)
	sampled_instances.extend(random.sample(indices_of_ones, sample_size))

	indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
	indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]

	balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
	print(f"balanced_test: {balanced_test}")
	test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
	test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))

	for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

	steps_seq = all_data[0]
	info = all_data[1]

	if index in sampled_instances:
	train_file.write(steps_seq)
	train_file.write("\n")

	train_info.write(info)
	train_info.write("\n")

	train_label.write(label)
	train_label.write("\n")
	elif index in test_sampled_instances:
	# proba = random.random()
	# if proba <0.5:
	test_file.write(steps_seq)
	test_file.write("\n")

	test_info.write(info)
	test_info.write("\n")

	test_label.write(label)
	test_label.write("\n")
	# else:
	# val_file.write(steps_seq)
	# val_file.write("\n")

	# val_info.write(info)
	# val_info.write("\n")

	# val_label.write(label)
	# val_label.write("\n")


	train_file.close()
	train_info.close()
	train_label.close()

	# val_file.close()
	# val_info.close()
	# val_label.close()

	test_file.close()
	test_info.close()
	test_label.close()

	def prepare_school_coded_finetuning_opts_files(data_processor, options):
	    '''
	    Ongoing research.
	    Write class-balanced train/test fine-tuning files whose label records
	    which optional strategy steps were used while solving a problem.

	    Labels:
	        0 - Opt 1
	        1 - Opt 2
	        2 - Both Opt

	    Each chunk returned by ``data_processor.load_file_iterator(sep=",")`` is
	    grouped by 'CF (Anon School Id)', 'Anon Student Id' and 'Problem Name'.
	    Only rows with 'CF (Is StepByStep)' == False, 'CF (Encounter)' == 0 and
	    'CF (Is Review Mode)' == -1 are considered. Every kept problem becomes one
	    sample (see ``_opts_encode_problem`` for the per-problem filters).

	    Roughly 10% of the samples go to the train files, the same number for
	    every label; the test files receive an equal number per label of what is
	    left. Samples picked for neither split are not written anywhere.

	    Args:
	        data_processor: object exposing ``load_file_iterator(sep=...)``; each
	            yielded chunk is used as a pandas DataFrame.
	        options: namespace providing ``school``, ``opt_step1``, ``opt_step2``
	            and the output paths ``train_file_path``, ``train_info_path``,
	            ``train_label_path``, ``test_file_path``, ``test_info_path`` and
	            ``test_label_path``.
	    '''
	    chunk_iterator = data_processor.load_file_iterator(sep=",")
	    samples = []  # one (label, tab-joined step tokens, info line) per kept problem

	    # The outputs are opened before the data pass so that a bad path fails
	    # immediately, and inside `with` so they are closed even if processing raises.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.train_label_path, "w") as train_label, \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info, \
	            open(options.test_label_path, "w") as test_label:

	        for chunk_data in chunk_iterator:
	            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	                # An empty options.school means "every school".
	                if options.school and school not in options.school:
	                    continue
	                print(f"{school} : {school_group.shape}")
	                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	                                            (school_group['CF (Encounter)'] == 0) &
	                                            (school_group['CF (Is Review Mode)'] == -1)]
	                print(f"{school} : {school_group.shape}")
	                for student, student_groups in school_group.groupby("Anon Student Id"):
	                    # Sort a copy instead of sorting the group slice in place.
	                    student_groups = student_groups.sort_values(by="Time")
	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        encoded = _opts_encode_problem(prob_groups, options)
	                        if encoded is None:
	                            continue
	                        tokens, events, means_and_extremes, progress, scenario, label = encoded
	                        class_id = pd.unique(prob_groups["CF (Anon Class Id)"])
	                        # school, class, student id, progress, problem name, scenario,
	                        # prefered ER or ME, total steps length,
	                        # original seq-action-attempt-help_level-outcome
	                        info = ",".join([str(school), "\t".join(map(str, class_id)), str(student),
	                                         str(progress), str(prob), str(scenario),
	                                         f"{1 if means_and_extremes else 0}", str(len(tokens)),
	                                         "\t".join(events)])
	                        samples.append((label, "\t".join(tokens), info))

	        labels = [sample[0] for sample in samples]
	        train_ids, test_ids = _opts_balanced_split(labels, ("0", "1", "2"))

	        for index, (label, steps_seq, info) in enumerate(samples):
	            if index in train_ids:
	                targets = (train_file, train_info, train_label)
	            elif index in test_ids:
	                targets = (test_file, test_info, test_label)
	            else:
	                continue  # sampled for neither split
	            for handle, text in zip(targets, (steps_seq, info, label)):
	                handle.write(text + "\n")


	def _opts_step_token(step, action, outcome):
	    '''Suffix a non-optional step: "-2" failed attempt, "-1" hint, "-0" otherwise.'''
	    if action == "Attempt" and outcome != "OK":
	        return step + "-2"
	    if "Hint" in action:
	        return step + "-1"
	    return step + "-0"


	def _opts_etalon_is_fractional(etalon):
	    '''Return True when the value of a "{key=value}" etalon parses as float but not as int.'''
	    if pd.isna(etalon):
	        return False
	    _, value = etalon.strip('{}').split('=')
	    try:
	        int(value)
	    except ValueError:
	        try:
	            float(value)
	        except ValueError:
	            return False
	        return True
	    return False


	def _opts_encode_problem(prob_groups, options):
	    '''
	    Encode the time-ordered log of one student on one problem.

	    Returns None when the problem is filtered out, otherwise the tuple
	    (step tokens, raw "step-action-attempt-help_level-outcome" strings,
	    means_and_extremes flag, progress, scenario, label). Progress and scenario
	    are taken from the last row of the log.
	    '''
	    opt_step1, opt_step2 = options.opt_step1, options.opt_step2

	    # A problem should be completed by a student clicking Done button.
	    if "Done" not in list(prob_groups["Action"]):
	        return None

	    attempted = prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]
	    unique_steps = list(pd.unique(attempted))
	    named_steps = [s for s in unique_steps if not pd.isna(s)]
	    required_steps = [s for s in named_steps if s not in opt_step1 and s not in opt_step2]
	    if len(required_steps) < 4:
	        return None
	    # The first entry of each optional list is excluded from this count.
	    optional_steps = [s for s in named_steps if s in opt_step1[1:] or s in opt_step2[1:]]
	    if len(optional_steps) < 2:
	        return None
	    print(unique_steps, len(optional_steps))

	    tokens = []
	    events = []
	    original_steps = []
	    means_and_extremes = False
	    opt1_used = False
	    opt2_used = False
	    progress = scenario = None
	    for _, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	                               'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	                               'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	        step = row["Step Name"]
	        action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	        attempt = row["Attempt At Step"]  # number
	        outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	        help_level = row["Help Level"]  # number
	        progress = row["CF (Workspace Progress Status)"]
	        scenario = row['CF (Problem Scenario Tags)']

	        if pd.isna(step):
	            continue
	        # The etalon is inspected before the autofill check, so autofilled
	        # optional rows can still set the flag.
	        if step in opt_step1 and not means_and_extremes:
	            means_and_extremes = _opts_etalon_is_fractional(row["CF (Etalon)"])
	        if row['CF (Is Autofilled)'] == True:
	            continue

	        prev = tokens[-1] if tokens else ""
	        prev_step = prev.split("-")[0]
	        is_optional = step in opt_step1 or step in opt_step2

	        if not tokens or step != prev_step:
	            if is_optional:
	                new_step = step
	                if step in opt_step1[1:]:
	                    opt1_used = True
	                # NOTE(review): the filter above slices opt_step2 with [1:] but
	                # this check uses [2:]; kept as found - confirm which is intended.
	                elif step in opt_step2[2:]:
	                    opt2_used = True
	            else:
	                new_step = _opts_step_token(step, action, outcome)
	            tokens.append(new_step)
	        elif not (is_optional or step == "FinalAnswer"):
	            # Repeated rows of one step collapse into a single token that
	            # keeps the highest suffix seen (string comparison).
	            new_step = _opts_step_token(step, action, outcome)
	            if prev < new_step:
	                tokens[-1] = new_step

	        events.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	        original_steps.append(step)

	    if (not opt1_used) and (not opt2_used):
	        return None
	    # Counts rows (not distinct steps) that are outside both optional lists.
	    non_optional_rows = len([s for s in original_steps if s not in opt_step1 and s not in opt_step2])
	    if not tokens or non_optional_rows <= 4:
	        return None

	    if opt1_used and opt2_used:
	        label = "2"
	    elif opt2_used:
	        label = "1"
	    else:
	        label = "0"
	    print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
	    return tokens, events, means_and_extremes, progress, scenario, label


	def _opts_balanced_split(labels, classes, train_fraction=0.10):
	    '''
	    Return (train, test) sets of indices into ``labels`` with the same number
	    of samples for every class in ``classes``.
	    '''
	    by_class = [[i for i, lab in enumerate(labels) if lab == cls] for cls in classes]

	    train_len = int(len(labels) * train_fraction)
	    sample_size = int(train_len / len(classes))
	    # random.sample raises ValueError when asked for more items than a class
	    # holds, so the quota is capped by the smallest class (the same rule the
	    # test split uses below).
	    sample_size = min([sample_size] + [len(ids) for ids in by_class])
	    print(f"sample_size: {sample_size}")
	    train_ids = set()
	    for ids in by_class:
	        train_ids.update(random.sample(ids, sample_size))

	    leftover = [[i for i in ids if i not in train_ids] for ids in by_class]
	    balanced_test = min(len(ids) for ids in leftover)
	    print(f"balanced_test: {balanced_test}")
	    test_ids = set()
	    for ids in leftover:
	        test_ids.update(random.sample(ids, balanced_test))
	    return train_ids, test_ids

	def prepare_school_coded_finetuning_opts_intentional_files(data_processor, options):
	    '''
	    Ongoing research.
	    Encode the problems of students that have at least one row with
	    'CF (Workspace Progress Status)' == "GRADUATED" and split them at random
	    into train, validation and test files.

	    Each chunk returned by ``data_processor.load_file_iterator(sep=",")`` is
	    grouped by 'CF (Anon School Id)', 'Anon Student Id' and 'Problem Name'.
	    Only rows with 'CF (Is StepByStep)' == False, 'CF (Encounter)' == 0 and
	    'CF (Is Review Mode)' == -1 are considered. For every kept problem (see
	    ``_intentional_encode_problem``) one ``random.random()`` draw picks the
	    split: <= 0.8 train, > 0.9 test, otherwise validation.

	    No label is computed by this function: the three ``*_label_path`` files
	    are created (truncated) but nothing is written to them.

	    Args:
	        data_processor: object exposing ``load_file_iterator(sep=...)``; each
	            yielded chunk is used as a pandas DataFrame.
	        options: namespace providing ``school``, ``opt_step1``, ``opt_step2``
	            and, for each of train/val/test, the ``*_file_path``,
	            ``*_info_path`` and ``*_label_path`` output paths.
	    '''
	    chunk_iterator = data_processor.load_file_iterator(sep=",")

	    # `with` guarantees every output is closed even if processing raises.
	    # The label files are opened only so that they exist (empty) afterwards.
	    with open(options.train_file_path, "w") as train_file, \
	            open(options.train_info_path, "w") as train_info, \
	            open(options.train_label_path, "w"), \
	            open(options.val_file_path, "w") as val_file, \
	            open(options.val_info_path, "w") as val_info, \
	            open(options.val_label_path, "w"), \
	            open(options.test_file_path, "w") as test_file, \
	            open(options.test_info_path, "w") as test_info, \
	            open(options.test_label_path, "w"):

	        for chunk_data in chunk_iterator:
	            for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	                # An empty options.school means "every school".
	                if options.school and school not in options.school:
	                    continue
	                print(f"{school} : {school_group.shape}")
	                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	                                            (school_group['CF (Encounter)'] == 0) &
	                                            (school_group['CF (Is Review Mode)'] == -1)]
	                print(f"{school} : {school_group.shape}")
	                for student, student_groups in school_group.groupby("Anon Student Id"):
	                    # Sort a copy instead of sorting the group slice in place.
	                    student_groups = student_groups.sort_values(by="Time")
	                    # Skip students without a single GRADUATED row.
	                    graduated = student_groups["CF (Workspace Progress Status)"] == "GRADUATED"
	                    if not graduated.any():
	                        continue
	                    for prob, prob_groups in student_groups.groupby("Problem Name"):
	                        encoded = _intentional_encode_problem(prob_groups, options)
	                        if encoded is None:
	                            continue
	                        tokens, events, means_and_extremes, progress, scenario = encoded
	                        class_id = pd.unique(prob_groups["CF (Anon Class Id)"])
	                        # school, class, student id, progress, problem name, scenario,
	                        # prefered ER or ME, total steps length,
	                        # original seq-action-attempt-help_level-outcome
	                        info = ",".join([str(school), "\t".join(map(str, class_id)), str(student),
	                                         str(progress), str(prob), str(scenario),
	                                         f"{1 if means_and_extremes else 0}", str(len(tokens)),
	                                         "\t".join(events)])

	                        proba = random.random()
	                        if proba <= 0.8:
	                            seq_out, info_out = train_file, train_info
	                        elif proba > 0.9:
	                            seq_out, info_out = test_file, test_info
	                        else:
	                            seq_out, info_out = val_file, val_info
	                        seq_out.write("\t".join(tokens) + "\n")
	                        info_out.write(info + "\n")


	def _intentional_step_token(step, action, outcome):
	    '''Suffix a non-optional step: "-2" failed attempt, "-1" hint, "-0" otherwise.'''
	    if action == "Attempt" and outcome != "OK":
	        return step + "-2"
	    if "Hint" in action:
	        return step + "-1"
	    return step + "-0"


	def _intentional_etalon_is_fractional(etalon):
	    '''Return True when the value of a "{key=value}" etalon parses as float but not as int.'''
	    if pd.isna(etalon):
	        return False
	    _, value = etalon.strip('{}').split('=')
	    try:
	        int(value)
	    except ValueError:
	        try:
	            float(value)
	        except ValueError:
	            return False
	        return True
	    return False


	def _intentional_encode_problem(prob_groups, options):
	    '''
	    Encode the time-ordered log of one student on one problem.

	    Returns None when the problem is filtered out, otherwise the tuple
	    (step tokens, raw "step-action-attempt-help_level-outcome" strings,
	    means_and_extremes flag, progress, scenario). Progress and scenario are
	    taken from the last row of the log.
	    '''
	    opt_step1, opt_step2 = options.opt_step1, options.opt_step2

	    # A problem should be completed by a student clicking Done button.
	    if "Done" not in list(prob_groups["Action"]):
	        return None

	    attempted = prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]
	    named_steps = [s for s in pd.unique(attempted) if not pd.isna(s)]
	    required_steps = [s for s in named_steps if s not in opt_step1 and s not in opt_step2]
	    if len(required_steps) < 4:
	        return None
	    # The first entry of each optional list is excluded from this count.
	    optional_steps = [s for s in named_steps if s in opt_step1[1:] or s in opt_step2[1:]]
	    if len(optional_steps) < 2:
	        return None

	    tokens = []
	    events = []
	    original_steps = []
	    means_and_extremes = False
	    progress = scenario = None
	    for _, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	                               'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	                               'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	        step = row["Step Name"]
	        action = row["Action"]  # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	        attempt = row["Attempt At Step"]  # number
	        outcome = row["Outcome"]  # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	        help_level = row["Help Level"]  # number
	        progress = row["CF (Workspace Progress Status)"]
	        scenario = row['CF (Problem Scenario Tags)']

	        if pd.isna(step):
	            continue
	        # The etalon is inspected before the autofill check, so autofilled
	        # optional rows can still set the flag.
	        if step in opt_step1 and not means_and_extremes:
	            means_and_extremes = _intentional_etalon_is_fractional(row["CF (Etalon)"])
	        if row['CF (Is Autofilled)'] == True:
	            continue

	        prev = tokens[-1] if tokens else ""
	        prev_step = prev.split("-")[0]
	        is_optional = step in opt_step1 or step in opt_step2

	        if not tokens or step != prev_step:
	            # Optional steps are emitted without a suffix.
	            tokens.append(step if is_optional else _intentional_step_token(step, action, outcome))
	        elif not (is_optional or step == "FinalAnswer"):
	            # Repeated rows of one step collapse into a single token that
	            # keeps the highest suffix seen (string comparison).
	            new_step = _intentional_step_token(step, action, outcome)
	            if prev < new_step:
	                tokens[-1] = new_step

	        events.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	        original_steps.append(step)

	    # Counts rows (not distinct steps) that are outside both optional lists.
	    non_optional_rows = len([s for s in original_steps if s not in opt_step1 and s not in opt_step2])
	    if not tokens or non_optional_rows <= 4:
	        return None
	    return tokens, events, means_and_extremes, progress, scenario

	def prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options):
	'''
	Ongoing research.
	FinalAnswer step correctness
	Correctness after opts:
	0 if attempt at step>1
	1 if attempt at step==1
	'''
	kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
	kcs = [kc for kc in kcs if not pd.isna(kc)]
	kcs = np.array(sorted(list(kcs)))
	print(kcs, type(kcs))
	print(f"KCs: {kcs}")
	chunk_iterator = data_processor.load_file_iterator(sep=",")

	train_file = open(options.train_file_path, "w")
	train_info = open(options.train_info_path, "w")
	train_label = open(options.train_label_path, "w")

	# val_file = open(options.val_file_path, "w")
	# val_info = open(options.val_info_path, "w")
	# val_label = open(options.val_label_path, "w")

	test_file = open(options.test_file_path, "w")
	test_info = open(options.test_info_path, "w")
	test_label = open(options.test_label_path, "w")

	overall_data = []
	overall_labels = []
	# kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
	# kcs = [kc if not pd.isna(kc) for kc in kcs]
	for chunk_data in chunk_iterator:
	for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	if not options.school or school in options.school:
	print(f"{school} : {school_group.shape}")
	school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	(school_group['CF (Encounter)'] == 0) &
	(school_group['CF (Is Review Mode)'] == -1) ]
	print(f"{school} : {school_group.shape}")
	# for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'):
	for student, student_groups in school_group.groupby("Anon Student Id"):
	student_groups.sort_values(by="Time", inplace=True)
	# prob_list = list(pd.unique(student_groups["Problem Name"]))
	# prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])
	# prob_list = prob_list[-int(len(prob_list)/2):]
	for prob, prob_groups in student_groups.groupby("Problem Name"):
	# if not prob in prob_list:
	# continue
	actions = list(prob_groups["Action"])
	# A problem should be completed by a student clicking Done button.
	if not "Done" in actions:
	continue
	unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
	unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
	if unique_steps_len < 4:
	continue
	unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
	if unique_opt_steps_len < 2:
	continue
	# print(unique_steps, unique_opt_steps_len)
	class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
	step_names_token = []
	original_steps_actions_attempts_help_levels_outcomes = []
	original_steps = []
	means_and_extremes = False
	opt1_used = False
	opt2_used = False
	final_after_opts = False
	correctness = "0"
	kcs_skills = [0 for i in kcs]
	diff_skills = [0 for i in kcs]
	finalanswer_skill = [0 for i in kcs]
	for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
	'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	step = row["Step Name"]
	action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	attempt = row["Attempt At Step"] # number
	outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	help_level = row["Help Level"] # number
	progress = row["CF (Workspace Progress Status)"]
	scenario = row['CF (Problem Scenario Tags)']
	kc = row['KC Model(MATHia)']
	prev_skill = row['CF (Skill Previous p-Known)']
	curr_skill = row['CF (Skill New p-Known)']
	# print(kc, prev_skill)
	if not pd.isna(step):
	if step in options.opt_step1 and not means_and_extremes:
	etalon = row["CF (Etalon)"]
	if not pd.isna(etalon):
	etalon = etalon.strip('{}')
	key, value = etalon.split('=')
	etalon = value
	try:
	etalon = int(etalon)
	except Exception as e:
	try:
	etalon = float(etalon)
	means_and_extremes = True
	except Exception as e:
	pass
	if row['CF (Is Autofilled)'] == True:
	continue
	prev = step_names_token[-1] if step_names_token else ""
	prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

	if not step_names_token or step != prev_step:
	if step in options.opt_step1 or step in options.opt_step2:
	new_step = step
	if step in options.opt_step1[1:]:
	opt1_used = True
	elif step in options.opt_step2[2:]:
	opt2_used = True
	else:
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
	final_after_opts = True
	if outcome == "OK":
	correctness = "1"
	step_names_token.append(new_step)

	else:
	if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	if prev < new_step:
	step_names_token[-1] = new_step
	if not pd.isna(kc):
	index = np.argwhere(kcs==kc).flatten()[0]
	# print(index, type(index))
	kcs_skills[index] = prev_skill
	diff_skills[index] = prev_skill - curr_skill
	if step == "FinalAnswer":
	finalanswer_skill[index] = prev_skill

	original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	original_steps.append(step)
	if (not opt1_used) and (not opt2_used):
	continue
	unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
	if step_names_token and unique_steps_len > 4:
	label = None
	if opt1_used and opt2_used:
	label = "2"
	if (not opt1_used) and opt2_used:
	label = "1"
	if opt1_used and (not opt2_used):
	label = "0"
	# print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
	info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	"\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
	"\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
	"\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
	overall_data.append(["\t".join(step_names_token), info])
	overall_labels.append(correctness)
	# proba = random.random()
	# # if prob in first_prob_list:
	# if proba <= 0.8:
	# train_file.write("\t".join(step_names_token))
	# train_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# train_info.write("\n")

	# elif proba > 0.9:
	# # elif prob in last_prob_list:
	# test_file.write("\t".join(step_names_token))
	# test_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# test_info.write("\n")

	# else:
	# val_file.write("\t".join(step_names_token))
	# val_file.write("\n")
	# # school, class, student id, progress, problem name, scenario,
	# # prefered ER or ME, total steps length,
	# # original seq-action-attempt-help_level-outcome
	# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	# f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]))
	# val_info.write("\n")
	# break
	# break
	# break
	# break
	# break
	overall_labels = np.array(overall_labels)
	indices_of_zeros = list(np.where(overall_labels == '0')[0])
	indices_of_ones = list(np.where(overall_labels == '1')[0])
	# indices_of_twos = list(np.where(overall_labels == '2')[0])

	train_len = int(len(overall_labels) * 0.10)
	sample_size = int(train_len/2)
	print(f"sample_size: {sample_size}")
	sampled_instances = random.sample(indices_of_zeros, sample_size)
	sampled_instances.extend(random.sample(indices_of_ones, sample_size))
	# sampled_instances.extend(random.sample(indices_of_twos, sample_size))

	indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
	indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
	# indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

	balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
	print(f"balanced_test: {balanced_test}")
	test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
	test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
	# test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

	for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

	steps_seq = all_data[0]
	info = all_data[1]

	if index in sampled_instances:
	train_file.write(steps_seq)
	train_file.write("\n")

	train_info.write(info)
	train_info.write("\n")

	train_label.write(label)
	train_label.write("\n")
	elif index in test_sampled_instances:
	# proba = random.random()
	# if proba <0.5:
	test_file.write(steps_seq)
	test_file.write("\n")

	test_info.write(info)
	test_info.write("\n")

	test_label.write(label)
	test_label.write("\n")
	# else:
	# val_file.write(steps_seq)
	# val_file.write("\n")

	# val_info.write(info)
	# val_info.write("\n")

	# val_label.write(label)
	# val_label.write("\n")


	train_file.close()
	train_info.close()
	train_label.close()

	# val_file.close()
	# val_info.close()
	# val_label.close()

	test_file.close()
	test_info.close()
	test_label.close()

	def prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options):
	'''
	Write train/test fine-tuning files, splitting the examples by student.

	Ongoing research.

	Every completed (student, problem) group that passes the step-count
	filters below becomes one example made of three parallel lines:
	* a tab-separated sequence of step tokens (train/test *_file_path),
	* a comma-separated info record (train/test *_info_path),
	* a correctness label (train/test *_label_path).

	Correctness after opts (FinalAnswer step correctness):
	"1" if the first FinalAnswer token appended after an optional step was
	used comes from a row whose Outcome is "OK", otherwise "0".

	Each student group is sent to train or test by one random draw
	(test when the draw is < 0.5); no label balancing is applied.

	Parameters:
	data_processor: must provide load_file_iterator(sep=...); every item it
	yields is grouped with groupby(...) calls.
	options: the attributes read here are train_file_path, train_info_path,
	train_label_path, test_file_path, test_info_path, test_label_path,
	school, opt_step1 and opt_step2.
	'''
	# Sorted array of the non-NaN KCs; a KC's position in it is its slot in the
	# kcs_skills / diff_skills / finalanswer_skill vectors built per problem.
	# NOTE(review): the pickle path is hard-coded rather than derived from
	# options, and the handle returned by open() is never closed.
	kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
	kcs = [kc for kc in kcs if not pd.isna(kc)]
	kcs = np.array(sorted(list(kcs)))
	print(kcs, type(kcs))
	print(f"KCs: {kcs}")
	chunk_iterator = data_processor.load_file_iterator(sep=",")

	# Output files are truncated on open ("w") and closed explicitly after the
	# write loops at the end of the function; an exception raised in between
	# leaves them open.
	train_file = open(options.train_file_path, "w")
	train_info = open(options.train_info_path, "w")
	train_label = open(options.train_label_path, "w")

	# No validation files are written by this variant.
	# val_file = open(options.val_file_path, "w")
	# val_info = open(options.val_info_path, "w")
	# val_label = open(options.val_label_path, "w")

	test_file = open(options.test_file_path, "w")
	test_info = open(options.test_info_path, "w")
	test_label = open(options.test_label_path, "w")

	# Examples are buffered in memory as [token sequence, info] pairs plus a
	# parallel label list, and written out once all chunks have been processed.
	train_data = []
	train_labels = []

	test_data = []
	test_labels = []
	for chunk_data in chunk_iterator:
	for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	# An empty/None options.school selects every school.
	if not options.school or school in options.school:
	print(f"{school} : {school_group.shape}")
	# Keep only rows with Is StepByStep == False, Encounter == 0 and
	# Is Review Mode == -1; the shape is printed before and after the filter.
	school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	(school_group['CF (Encounter)'] == 0) &
	(school_group['CF (Is Review Mode)'] == -1) ]
	print(f"{school} : {school_group.shape}")
	# for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'):
	for student, student_groups in school_group.groupby("Anon Student Id"):
	student_groups.sort_values(by="Time", inplace=True)
	# One random draw per student group decides the split:
	# < 0.5 -> test, otherwise train.
	# NOTE(review): the draw happens inside the chunk loop, so a student whose
	# rows span several chunks could be assigned to both splits — confirm.
	train = True
	proba = random.random()
	if proba < 0.5:
	train = False
	# prob_list = list(pd.unique(student_groups["Problem Name"]))
	# prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])
	# prob_list = prob_list[-int(len(prob_list)/2):]
	# Last "previous p-Known" value stored per KC, carried from one problem of
	# this student group to the next (presumably reset once per student).
	prev_kcs_skills = [0 for i in kcs]
	# NOTE(review): pandas groupby sorts by key by default, so problems (and
	# therefore pi) follow Problem Name order rather than the Time order set
	# above; row order inside each problem group is preserved — confirm intended.
	for pi, (prob, prob_groups) in enumerate(student_groups.groupby("Problem Name")):
	# if not prob in prob_list:
	# continue
	actions = list(prob_groups["Action"])
	# A problem should be completed by a student clicking Done button.
	if not "Done" in actions:
	continue
	# Distinct step names among the rows that were not autofilled.
	unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
	# Require at least 4 distinct non-NaN steps outside both optional-step lists ...
	unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
	if unique_steps_len < 4:
	continue
	# ... and at least 2 distinct optional steps, not counting the first entry
	# of either optional-step list.
	unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
	if unique_opt_steps_len < 2:
	continue
	# print(unique_steps, unique_opt_steps_len)
	class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
	# Per-problem accumulators; the three skill vectors have one slot per KC.
	step_names_token = []
	original_steps_actions_attempts_help_levels_outcomes = []
	original_steps = []
	means_and_extremes = False
	opt1_used = False
	opt2_used = False
	final_after_opts = False
	correctness = "0"
	kcs_skills = [0 for i in kcs]
	diff_skills = [0 for i in kcs]
	finalanswer_skill = [0 for i in kcs]
	for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
	'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	step = row["Step Name"]
	action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	attempt = row["Attempt At Step"] # number
	outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	help_level = row["Help Level"] # number
	progress = row["CF (Workspace Progress Status)"]
	scenario = row['CF (Problem Scenario Tags)']
	kc = row['KC Model(MATHia)']
	prev_skill = row['CF (Skill Previous p-Known)']
	# curr_skill is read but not used in this variant.
	curr_skill = row['CF (Skill New p-Known)']
	# print(kc, prev_skill)
	if not pd.isna(step):
	# Until the flag is set, inspect the etalon of opt_step1 rows: braces are
	# stripped and the text after '=' is parsed; a value that fails int() but
	# parses with float() marks the problem as means-and-extremes.
	# split('=') must yield exactly two parts or the unpacking raises ValueError.
	if step in options.opt_step1 and not means_and_extremes:
	etalon = row["CF (Etalon)"]
	if not pd.isna(etalon):
	etalon = etalon.strip('{}')
	key, value = etalon.split('=')
	etalon = value
	try:
	etalon = int(etalon)
	except Exception as e:
	try:
	etalon = float(etalon)
	means_and_extremes = True
	except Exception as e:
	# Neither int nor float: leave means_and_extremes unchanged.
	pass
	# Autofilled rows contribute nothing below.
	if row['CF (Is Autofilled)'] == True:
	continue
	# prev is the last emitted token; prev_step is the part of it before the first "-".
	prev = step_names_token[-1] if step_names_token else ""
	prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

	# The step differs from the previous token's step (or nothing was emitted
	# yet): append a new token.
	if not step_names_token or step != prev_step:
	if step in options.opt_step1 or step in options.opt_step2:
	# Optional steps are tokenized by their bare name.
	new_step = step
	# NOTE(review): opt_step1 ignores its first entry ([1:]) while opt_step2
	# ignores its first two ([2:]), although the filter above uses
	# opt_step2[1:] — confirm this asymmetry is intended.
	if step in options.opt_step1[1:]:
	opt1_used = True
	elif step in options.opt_step2[2:]:
	opt2_used = True
	else:
	# Suffix codes: "-2" attempt whose outcome is not OK, "-1" any hint
	# action, "-0" everything else.
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	# The label is decided once, by the first FinalAnswer token appended after
	# an optional step has been used.
	if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
	final_after_opts = True
	if outcome == "OK":
	correctness = "1"
	step_names_token.append(new_step)

	else:
	# Repeated row for the step of the last token. Optional steps and
	# FinalAnswer are never re-coded; any other step is re-coded from this row.
	if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	# Plain string comparison; with a shared step prefix it orders the
	# suffixes "-0" < "-1" < "-2", so the last token only moves to a higher code.
	# Presumably nested under the check above (otherwise new_step could be
	# stale from an earlier row) — confirm against the indented original.
	if prev < new_step:
	step_names_token[-1] = new_step
	# Store the row's previous p-Known in the slot of its KC.
	# NOTE: this rebinds the iterrows loop variable `index`; [0] raises
	# IndexError when kc is not present in kcs.
	if not pd.isna(kc):
	index = np.argwhere(kcs==kc).flatten()[0]
	# print(index, type(index))
	kcs_skills[index] = prev_skill
	# For every problem but the first enumerated one, record the change
	# relative to the value last stored for this KC.
	# NOTE(review): whether the prev_kcs_skills update below is inside the
	# `pi != 0` branch cannot be determined without indentation — confirm.
	if pi != 0:
	diff_skills[index] = prev_skill - prev_kcs_skills[index]
	prev_kcs_skills[index] = prev_skill
	if step == "FinalAnswer":
	finalanswer_skill[index] = prev_skill

	original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	original_steps.append(step)
	# Problems in which no optional step was flagged as used are dropped.
	if (not opt1_used) and (not opt2_used):
	continue
	# original_steps can contain repeats, so despite the variable name this
	# counts recorded non-optional rows, not distinct steps.
	unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
	if step_names_token and unique_steps_len > 4:
	# Optional-step usage label stored in the info record:
	# "2" both groups used, "1" only opt2, "0" only opt1.
	label = None
	if opt1_used and opt2_used:
	label = "2"
	if (not opt1_used) and opt2_used:
	label = "1"
	if opt1_used and (not opt2_used):
	label = "0"
	# print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
	# Info fields, comma-separated (multi-valued fields are tab-joined):
	# school, class id(s), student id, progress, problem name, scenario,
	# means-and-extremes flag (1/0), number of tokens,
	# original step-action-attempt-help_level-outcome records, opt-usage label,
	# kcs_skills, diff_skills, finalanswer_skill.
	info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	"\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
	"\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
	"\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
	if train:
	train_data.append(["\t".join(step_names_token), info])
	train_labels.append(correctness)
	else:
	test_data.append(["\t".join(step_names_token), info])
	test_labels.append(correctness)
	# Dead code summarized: an earlier version wrote each example immediately
	# using a per-example random draw (<= 0.8 train, > 0.9 test, otherwise val),
	# and a later one drew a label-balanced 10% train sample plus a balanced
	# test sample by index. Neither is active in this variant.

	# Write the buffered train examples: sequence, info and label go to three
	# parallel files, one line per example.
	for index, (all_data, label) in enumerate(zip(train_data, train_labels)):
	steps_seq = all_data[0]
	info = all_data[1]

	train_file.write(steps_seq)
	train_file.write("\n")

	train_info.write(info)
	train_info.write("\n")

	train_label.write(label)
	train_label.write("\n")
	train_file.close()
	train_info.close()
	train_label.close()

	# Same layout for the test examples.
	for index, (all_data, label) in enumerate(zip(test_data, test_labels)):
	steps_seq = all_data[0]
	info = all_data[1]

	test_file.write(steps_seq)
	test_file.write("\n")

	test_info.write(info)
	test_info.write("\n")

	test_label.write(label)
	test_label.write("\n")
	test_file.close()
	test_info.close()
	test_label.close()

	def prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options):
	'''
	Write label-balanced train/test/val fine-tuning files sized by options.per.

	Ongoing research.

	Every completed (student, problem) group that passes the step-count
	filters below becomes one example made of three parallel lines:
	a tab-separated step-token sequence, a comma-separated info record and a
	correctness label.

	Correctness after opts (FinalAnswer step correctness):
	"1" if the first FinalAnswer token appended after an optional step was
	used comes from a row whose Outcome is "OK", otherwise "0".

	Sampling, with N the number of examples and per = float(options.per):
	* train draws sample_size examples from each of the "0" and "1" pools,
	where sample_size = int(int(N * per) / 2), replaced by the size of the
	smaller pool when per == 1, or by int(per) when per > 1;
	* test draws, from what is left, as many examples per label as the
	smaller remaining pool holds;
	* the val files are opened in append mode and receive the train examples
	when per == 1, otherwise the test examples.

	Parameters:
	data_processor: must provide load_file_iterator(sep=...); every item it
	yields is grouped with groupby(...) calls.
	options: the attributes read here are the train/val/test *_file_path,
	*_info_path and *_label_path, school, opt_step1, opt_step2 and per.
	'''
	# Sorted array of the non-NaN KCs; a KC's position in it is its slot in the
	# kcs_skills / diff_skills / finalanswer_skill vectors built per problem.
	# NOTE(review): the pickle path is hard-coded rather than derived from
	# options, and the handle returned by open() is never closed.
	kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
	kcs = [kc for kc in kcs if not pd.isna(kc)]
	kcs = np.array(sorted(list(kcs)))
	print(kcs, type(kcs))
	print(f"KCs: {kcs}")
	chunk_iterator = data_processor.load_file_iterator(sep=",")

	# Train and test files are truncated on open ("w").
	train_file = open(options.train_file_path, "w")
	train_info = open(options.train_info_path, "w")
	train_label = open(options.train_label_path, "w")

	# Val files are opened in append mode ("a"): existing content is kept and
	# repeated runs accumulate lines.
	val_file = open(options.val_file_path, "a")
	val_info = open(options.val_info_path, "a")
	val_label = open(options.val_label_path, "a")

	test_file = open(options.test_file_path, "w")
	test_info = open(options.test_info_path, "w")
	test_label = open(options.test_label_path, "w")

	# All examples are buffered as [token sequence, info] pairs with a parallel
	# label list; the split is decided after every chunk has been read.
	overall_data = []
	overall_labels = []
	for chunk_data in chunk_iterator:
	for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
	# An empty/None options.school selects every school.
	if not options.school or school in options.school:
	print(f"{school} : {school_group.shape}")
	# Keep only rows with Is StepByStep == False, Encounter == 0 and
	# Is Review Mode == -1; the shape is printed before and after the filter.
	school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
	(school_group['CF (Encounter)'] == 0) &
	(school_group['CF (Is Review Mode)'] == -1) ]
	print(f"{school} : {school_group.shape}")
	# for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'):
	for student, student_groups in school_group.groupby("Anon Student Id"):
	student_groups.sort_values(by="Time", inplace=True)
	# prob_list = list(pd.unique(student_groups["Problem Name"]))
	# prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])
	# prob_list = prob_list[-int(len(prob_list)/2):]
	# NOTE(review): pandas groupby sorts by key by default, so problems are
	# visited in Problem Name order; row order inside each group is preserved.
	for prob, prob_groups in student_groups.groupby("Problem Name"):
	# if not prob in prob_list:
	# continue
	actions = list(prob_groups["Action"])
	# A problem should be completed by a student clicking Done button.
	if not "Done" in actions:
	continue
	# Distinct step names among the rows that were not autofilled.
	unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
	# Require at least 4 distinct non-NaN steps outside both optional-step lists ...
	unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
	if unique_steps_len < 4:
	continue
	# ... and at least 2 distinct optional steps, not counting the first entry
	# of either optional-step list.
	unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
	if unique_opt_steps_len < 2:
	continue
	# print(unique_steps, unique_opt_steps_len)
	class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
	# Per-problem accumulators; the three skill vectors have one slot per KC.
	step_names_token = []
	original_steps_actions_attempts_help_levels_outcomes = []
	original_steps = []
	means_and_extremes = False
	opt1_used = False
	opt2_used = False
	final_after_opts = False
	correctness = "0"
	kcs_skills = [0 for i in kcs]
	diff_skills = [0 for i in kcs]
	finalanswer_skill = [0 for i in kcs]
	for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
	'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
	'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
	'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
	step = row["Step Name"]
	action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
	attempt = row["Attempt At Step"] # number
	outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
	help_level = row["Help Level"] # number
	progress = row["CF (Workspace Progress Status)"]
	scenario = row['CF (Problem Scenario Tags)']
	kc = row['KC Model(MATHia)']
	prev_skill = row['CF (Skill Previous p-Known)']
	curr_skill = row['CF (Skill New p-Known)']
	# print(kc, prev_skill)
	if not pd.isna(step):
	# Until the flag is set, inspect the etalon of opt_step1 rows: braces are
	# stripped and the text after '=' is parsed; a value that fails int() but
	# parses with float() marks the problem as means-and-extremes.
	# split('=') must yield exactly two parts or the unpacking raises ValueError.
	if step in options.opt_step1 and not means_and_extremes:
	etalon = row["CF (Etalon)"]
	if not pd.isna(etalon):
	etalon = etalon.strip('{}')
	key, value = etalon.split('=')
	etalon = value
	try:
	etalon = int(etalon)
	except Exception as e:
	try:
	etalon = float(etalon)
	means_and_extremes = True
	except Exception as e:
	# Neither int nor float: leave means_and_extremes unchanged.
	pass
	# Autofilled rows contribute nothing below.
	if row['CF (Is Autofilled)'] == True:
	continue
	# prev is the last emitted token; prev_step is the part of it before the first "-".
	prev = step_names_token[-1] if step_names_token else ""
	prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

	# The step differs from the previous token's step (or nothing was emitted
	# yet): append a new token.
	if not step_names_token or step != prev_step:
	if step in options.opt_step1 or step in options.opt_step2:
	# Optional steps are tokenized by their bare name.
	new_step = step
	# NOTE(review): opt_step1 ignores its first entry ([1:]) while opt_step2
	# ignores its first two ([2:]), although the filter above uses
	# opt_step2[1:] — confirm this asymmetry is intended.
	if step in options.opt_step1[1:]:
	opt1_used = True
	elif step in options.opt_step2[2:]:
	opt2_used = True
	else:
	# Suffix codes: "-2" attempt whose outcome is not OK, "-1" any hint
	# action, "-0" everything else.
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	# The label is decided once, by the first FinalAnswer token appended after
	# an optional step has been used.
	if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
	final_after_opts = True
	if outcome == "OK":
	correctness = "1"
	step_names_token.append(new_step)

	else:
	# Repeated row for the step of the last token. Optional steps and
	# FinalAnswer are never re-coded; any other step is re-coded from this row.
	if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
	if action == "Attempt" and outcome != "OK":
	new_step = step+"-2"
	elif "Hint" in action:
	new_step = step+"-1"
	else:
	new_step = step+"-0"

	# Plain string comparison; with a shared step prefix it orders the
	# suffixes "-0" < "-1" < "-2", so the last token only moves to a higher code.
	# Presumably nested under the check above (otherwise new_step could be
	# stale from an earlier row) — confirm against the indented original.
	if prev < new_step:
	step_names_token[-1] = new_step
	# Store the row's previous p-Known in the slot of its KC, together with
	# previous-minus-new p-Known for the same row.
	# NOTE: this rebinds the iterrows loop variable `index`; [0] raises
	# IndexError when kc is not present in kcs.
	if not pd.isna(kc):
	index = np.argwhere(kcs==kc).flatten()[0]
	# print(index, type(index))
	kcs_skills[index] = prev_skill
	diff_skills[index] = prev_skill - curr_skill
	if step == "FinalAnswer":
	finalanswer_skill[index] = prev_skill

	original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
	original_steps.append(step)
	# Problems in which no optional step was flagged as used are dropped.
	if (not opt1_used) and (not opt2_used):
	continue
	# original_steps can contain repeats, so despite the variable name this
	# counts recorded non-optional rows, not distinct steps.
	unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
	if step_names_token and unique_steps_len > 4:
	# Optional-step usage label stored in the info record:
	# "2" both groups used, "1" only opt2, "0" only opt1.
	label = None
	if opt1_used and opt2_used:
	label = "2"
	if (not opt1_used) and opt2_used:
	label = "1"
	if opt1_used and (not opt2_used):
	label = "0"
	# print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}")
	# Info fields, comma-separated (multi-valued fields are tab-joined):
	# school, class id(s), student id, progress, problem name, scenario,
	# means-and-extremes flag (1/0), number of tokens,
	# original step-action-attempt-help_level-outcome records, opt-usage label,
	# kcs_skills, diff_skills, finalanswer_skill.
	info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
	f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
	"\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
	"\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
	"\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
	overall_data.append(["\t".join(step_names_token), info])
	overall_labels.append(correctness)
	# Dead code summarized: an earlier version wrote each example immediately
	# using a per-example random draw (<= 0.8 train, > 0.9 test, otherwise val).
	# Example indices grouped by correctness label.
	overall_labels = np.array(overall_labels)
	indices_of_zeros = list(np.where(overall_labels == '0')[0])
	indices_of_ones = list(np.where(overall_labels == '1')[0])
	# indices_of_twos = list(np.where(overall_labels == '2')[0])

	# NOTE(review): the -per command-line argument defaults to None and
	# float(None) raises TypeError, so -per has to be supplied for this function.
	# train_len = int(len(overall_labels) * 0.10)
	train_len = int(len(overall_labels) * float(options.per))

	# Per-label train sample size: half of the requested fraction by default,
	# the smaller label pool when per == 1, an absolute count when per > 1.
	sample_size = int(train_len/2)
	if float(options.per) == 1:
	sample_size = min(len(indices_of_zeros), len(indices_of_ones))
	elif float(options.per) > 1:
	sample_size = int(options.per)
	print(f"sample_size: {sample_size}")
	# random.sample raises ValueError when sample_size exceeds the pool size.
	sampled_instances = random.sample(indices_of_zeros, sample_size)
	sampled_instances.extend(random.sample(indices_of_ones, sample_size))
	# sampled_instances.extend(random.sample(indices_of_twos, sample_size))

	# Remove the train picks from both pools before drawing the test set.
	indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
	indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
	# indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

	# Test takes the same number of examples per label: the smaller remaining pool size.
	balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
	print(f"balanced_test: {balanced_test}")
	test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
	test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
	# test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

	# Route every buffered example by its index; examples picked for neither
	# train nor test are not written anywhere.
	for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

	steps_seq = all_data[0]
	info = all_data[1]

	if index in sampled_instances:
	train_file.write(steps_seq)
	train_file.write("\n")

	train_info.write(info)
	train_info.write("\n")

	train_label.write(label)
	train_label.write("\n")
	# With per == 1 the train examples are also appended to the val files
	# (presumably nested inside the train branch — confirm).
	if float(options.per) == 1.0:
	val_file.write(steps_seq)
	val_file.write("\n")

	val_info.write(info)
	val_info.write("\n")

	val_label.write(label)
	val_label.write("\n")

	elif index in test_sampled_instances:
	# proba = random.random()
	# if proba <0.5:
	test_file.write(steps_seq)
	test_file.write("\n")

	test_info.write(info)
	test_info.write("\n")

	test_label.write(label)
	test_label.write("\n")

	# With any other per value the test examples are appended to the val files instead.
	if float(options.per) != 1.0:
	val_file.write(steps_seq)
	val_file.write("\n")

	val_info.write(info)
	val_info.write("\n")

	val_label.write(label)
	val_label.write("\n")


	train_file.close()
	train_info.close()
	train_label.close()

	val_file.close()
	val_info.close()
	val_label.close()

	test_file.close()
	test_info.close()
	test_label.close()



	def prepare_pretraining_vocab_file(options):
	    '''
	    Write the vocabulary file used for pre-training and echo it to stdout.

	    The vocabulary starts with the five special tokens [PAD], [UNK], [MASK],
	    [CLS] and [SEP], followed by the unique step names in sorted order.
	    A step listed in options.opt_step1 or options.opt_step2 is written once
	    under its bare name; every other step is written three times with the
	    suffixes -0, -1 and -2.

	    Parameters:
	        options: the attributes read here are dataset_folder (prefix of
	            "unique_steps_list.pkl"), vocab_file_path, opt_step1 and
	            opt_step2.

	    Raises:
	        OSError: if the pickle cannot be read or the vocab file cannot be
	            written.
	    '''
	    steps_path = f"{options.dataset_folder}unique_steps_list.pkl"
	    # NOTE: pickle must only be used on trusted files; this one is expected
	    # to be the list dumped by this script's own dataset analysis.
	    # The context manager closes the handle, which the former
	    # pickle.load(open(...)) call never did.
	    with open(steps_path, "rb") as steps_file:
	        steps = pickle.load(steps_file)

	    print("No of unique steps ", len(steps))

	    special_tokens = ("[PAD]", "[UNK]", "[MASK]", "[CLS]", "[SEP]")
	    vocab_lines = [f"{token}\n" for token in special_tokens]
	    for step in sorted(steps):
	        if step in options.opt_step1 or step in options.opt_step2:
	            # Optional steps carry no outcome suffix.
	            vocab_lines.append(f"{step}\n")
	        else:
	            vocab_lines.extend(f"{step}-{i}\n" for i in range(3))

	    with open(options.vocab_file_path, "w") as vb_file:
	        vb_file.writelines(vocab_lines)

	    # Read the file back so that what is printed is what was actually stored.
	    with open(options.vocab_file_path, "r") as vb_file:
	        written_lines = vb_file.readlines()
	    print(written_lines, len(written_lines))


	def main(opt):
	'''
	Run the dataset-preparation steps selected by the parsed options.

	Works on a deep copy of opt, so the redirected paths are set on the copy
	and not on the caller's object. In order:
	1. optionally analyzes the dataset by section and/or by school and pickles
	the collected unique-value lists into options.dataset_folder;
	2. when a workspace name is given, rewrites every non-empty option whose
	name contains 'path' so that it points into a workspace-specific folder,
	creating the folders as needed;
	3. builds either the vocab file plus the pre-training files
	(options.pretrain truthy) or the fine-tuning files.

	Parameters:
	opt: parsed command-line options, read through attributes and vars().
	'''
	options = copy.deepcopy(opt)
	# With a workspace, the pickles live in <dataset_folder><workspace_name>/.
	if opt.workspace_name:
	options.dataset_folder = opt.dataset_folder+opt.workspace_name+"/"

	data_processor = DataPreprocessor(input_file_path=opt.dataset)

	if opt.analyze_dataset_by_section:
	print(f"Analyzing dataset by section for workspace: {opt.workspace_name}")
	data_processor.analyze_dataset_by_section(opt.workspace_name)

	# NOTE(review): unlike the by-school branch below, nothing here creates
	# options.dataset_folder before writing into it, and none of the handles
	# passed to pickle.dump are closed explicitly.
	pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
	pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
	pickle.dump(data_processor.unique_prob_hierarchy, open(f"{options.dataset_folder}unique_hierarchy_list.pkl", "wb"))
	pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
	pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))

	if opt.analyze_dataset_by_school:
	print(f"Analyzing dataset of all school for workspace: {opt.workspace_name}")
	data_processor.analyze_dataset_by_school(opt.workspace_name)

	if not os.path.exists(options.dataset_folder):
	os.makedirs(options.dataset_folder)
	pickle.dump(data_processor.unique_schools, open(f"{options.dataset_folder}unique_schools_list.pkl", "wb"))
	pickle.dump(data_processor.unique_class, open(f"{options.dataset_folder}unique_class_list.pkl", "wb"))
	pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
	pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
	pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
	pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))
	pickle.dump(data_processor.unique_new_steps_w_action_attempt, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl", "wb"))
	pickle.dump(data_processor.unique_new_steps_w_action_attempt_kcs, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_kcs.pkl", "wb"))
	pickle.dump(data_processor.unique_new_steps_w_kcs, open(f"{options.dataset_folder}unique_new_steps_w_kcs_list.pkl", "wb"))

	# Redirect every non-empty "*path*" option into a workspace-specific folder.
	# The prefix is built from: <workspace_name>/, then
	# sch_largest_<number of schools>-coded/ when schools are given while
	# pre-training, then <school_folder>/ when set. For every option except
	# vocab_file_path one more component follows: pretraining/ when pre-training,
	# otherwise <code>/ if set, otherwise finetuning/<finetune_task>/ (plain
	# finetuning/ for "val" paths when diff_val_folder is set).
	# NOTE(review): the exact nesting of the school_folder check, the makedirs
	# calls and the final else cannot be determined without indentation — the
	# description above is the presumed reading; confirm against the indented original.
	if opt.workspace_name:
	for k,v in vars(opt).items():
	if 'path' in k:
	if v:
	redirect_path = opt.workspace_name+"/"
	if opt.school and opt.pretrain:
	sch = f"sch_largest_{len(opt.school)}-coded" #f"sch_largest_655"
	redirect_path = redirect_path + sch+"/"
	if opt.school_folder:
	redirect_path = redirect_path + opt.school_folder+"/"
	# else:
	# sch = "sch_largest_655"
	if k != "vocab_file_path":
	if opt.pretrain:
	redirect_path = redirect_path + "pretraining/"
	else:
	if opt.code:
	redirect_path = redirect_path + f"{opt.code}/"
	elif opt.finetune_task:
	if opt.diff_val_folder and "val" in v:
	redirect_path = redirect_path + f"finetuning/"
	else:
	redirect_path = redirect_path + f"finetuning/{opt.finetune_task}/"
	if not os.path.exists(redirect_path):
	os.makedirs(redirect_path)
	else:
	# redirect_path already ends with "/", so this checks and creates
	# "<redirect_path>//pretraining/" (double separator).
	if not os.path.exists(redirect_path+"/pretraining/"):
	os.makedirs(redirect_path+"/pretraining/")
	# The redirected value is stored on the copy only; opt keeps the original.
	setattr(options, f"{k}", redirect_path+v)
	# setattr(options, f"{k}", opt.workspace_name+"/check/"+v)
	print(f"options.{k} : {getattr(options, f'{k}')}")



	# The active preparation routine is chosen by commenting lines in and out;
	# the commented calls below are the alternatives.
	if options.pretrain:
	print("Preparing vocab...")
	prepare_pretraining_vocab_file(options)
	print("Preparing pre-training dataset...")
	# old non-repeated steps
	# prepare_pretraining_files(data_processor, options)
	# coded
	# prepare_school_coded_pretraining_files(data_processor, options)
	prepare_school_coded_finetuning_opts_intentional_files(data_processor, options)
	# prepare_pretraining_files(data_processor, options)
	# prepare_school_pretraining_files(data_processor, options)
	# else:
	# print("Preparing attention dataset...")
	# prepare_school_attention_files(data_processor, options)
	else:
	print("Preparing fine-tuning dataset...")
	# _1920
	# prepare_finetuning_10per_files(data_processor, options)
	# prepare_finetuning_IS_FS_files(data_processor, options)
	# prepare_finetuning_correctness_files(data_processor, options)

	# _2223
	# prepare_school_coded_finetuning_partial_seq_files(data_processor, options)
	# prepare_school_coded_finetuning_opts_files(data_processor, options)
	prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options)
	# prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options)
	# prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options)
	# prepare_finetuning_IS_files(data_processor, options)
	# # prepare_finetuning_FS_files(data_processor, options)
	# prepare_finetuning_correctness_aaai_files(data_processor, options)
	# # prepare_finetuning_SL_files(data_processor, options)
	# # prepare_finetuning_effectiveness_files(data_processor, options)
	# prepare_attn_test_files(data_processor, options)


	if __name__ == "__main__":
	    # Script entry point: declare the command-line options, parse them,
	    # normalise the optional list-valued options, and hand off to main().

	    def _str2bool(value):
	        """Convert a command-line string into a real boolean.

	        ``type=bool`` must not be used with argparse: ``bool("False")`` is
	        ``True`` because every non-empty string is truthy, so the flag could
	        never be switched off from the command line. The empty string still
	        maps to ``False``, as it did under ``bool``.

	        Raises:
	            argparse.ArgumentTypeError: if the text is not a recognised
	                true/false spelling.
	        """
	        if isinstance(value, bool):
	            return value
	        lowered = value.strip().lower()
	        if lowered in ("true", "t", "yes", "y", "1", "on"):
	            return True
	        if lowered in ("false", "f", "no", "n", "0", "off", ""):
	            return False
	        raise argparse.ArgumentTypeError(
	            f"expected a boolean value (true/false), got {value!r}"
	        )

	    parser = argparse.ArgumentParser()
	    parser.add_argument('-dataset_folder', type=str, default="dataset/CL4999_1920/")

	    parser.add_argument('-analyze_dataset_by_section', type=_str2bool, default=False)
	    parser.add_argument('-analyze_dataset_by_school', type=_str2bool, default=False)
	    parser.add_argument('-workspace_name', type=str, default=None)
	    parser.add_argument('-school', nargs='+', type=str, default=None)
	    parser.add_argument('-school_folder', type=str, default=None)

	    # parser.add_argument('-highGRschool', nargs='+', type=str, default=None)
	    # parser.add_argument('-lowGRschool', nargs='+', type=str, default=None)

	    parser.add_argument('-code', type=str, default=None)
	    parser.add_argument('-finetune_task', type=str, default=None)

	    parser.add_argument('-per', type=float, default=None)
	    parser.add_argument("-diff_val_folder", type=_str2bool, default=False, help="use for different val folder")

	    parser.add_argument('-opt_step1', nargs='+', type=str, help='List of optional steps 1')
	    parser.add_argument('-opt_step2', nargs='+', type=str, help='List of optional steps 2')
	    parser.add_argument('-final_step', nargs='+', type=str, help='List of final step')

	    parser.add_argument('-dataset', type=str, default="dataset/CL4999_1920/course2_1920_4999_students_datashop.txt")

	    parser.add_argument('-pretrain', type=_str2bool, default=False)
	    parser.add_argument('-vocab_file_path', type=str, default="pretraining/vocab.txt")  # pretraining/vocab.txt

	    # Output locations for the train / validation / test splits. The trailing
	    # comments are the path notes carried over from the original source.
	    parser.add_argument('-train_file_path', type=str, default="train.txt")  # pretraining/pretrain.txt
	    parser.add_argument('-train_info_path', type=str, default="train_info.txt")  # pretraining/pretrain_info.txt
	    parser.add_argument('-train_label_path', type=str, default="train_label.txt")  # finetuning/train_label.txt

	    parser.add_argument('-val_file_path', type=str, default="val.txt")  # pretraining/val.txt
	    parser.add_argument('-val_info_path', type=str, default="val_info.txt")  # pretraining/val_info.txt
	    parser.add_argument('-val_label_path', type=str, default="val_label.txt")  # finetuning/val_label.txt

	    parser.add_argument('-test_file_path', type=str, default="test.txt")  # pretraining/test.txt
	    parser.add_argument('-test_info_path', type=str, default="test_info.txt")  # pretraining/test_info.txt
	    parser.add_argument('-test_label_path', type=str, default="test_label.txt")  # finetuning/test_label.txt

	    # parser.add_argument('-train_gt_label_path', type=str, default="finetuning/train_gt_label.txt")
	    # parser.add_argument('-test_gt_label_path', type=str, default="finetuning/test_gt_label.txt")

	    options = parser.parse_args()

	    # The step-list options are None when omitted; replace any falsy value
	    # with an empty list so each attribute is always a list.
	    options.opt_step1 = options.opt_step1 or []
	    print("Optional steps 1: ", options.opt_step1)

	    options.opt_step2 = options.opt_step2 or []
	    print("Optional steps 2: ", options.opt_step2)

	    options.final_step = options.final_step or []
	    print("Final steps: ", options.final_step)

	    main(options)