Spaces:

OdiaGenAI
/

Olive_Farm

Running

App Files Files Community

Olive_Farm / open_instruct /reformat_data.py

sam2ai

Synced repo using the 'sync_with_huggingface' GitHub Action

11fa0f1 over 2 years ago

raw

history blame contribute delete

23.5 kB

	#!/usr/bin/env python
	# coding=utf-8
	'''
	This script is used to reformat the downloaded datasets into the format that can be used by the model.
	Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows:
	{
	"dataset": "dataset_name",
	"id": "unique_id",
	"messages": [
	{"role": "system", "content": "message_text"}, # optional
	{"role": "user", "content": "message_text"},
	{"role": "assistant", "content": "message_text"},
	{"role": "user", "content": "message_text"},
	{"role": "assistant", "content": "message_text"},
	...
	]
	}
	'''

	import json
	import random
	import re
	import os
	import pandas as pd
	import argparse
	from instruction_encode_templates import encode_instruction_example, encode_few_shot_example


	def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
	    '''
	    Convert Super-NaturalInstructions tasks into chat-style jsonl records.

	    Task names are read from <data_dir>/splits/xlingual/train_tasks.txt, each task is
	    loaded from <data_dir>/tasks/<task>.json, and the encoded examples are written to
	    <output_dir>/super_ni_data.jsonl (one user/assistant exchange per line).

	    Args:
	        data_dir: root folder of the downloaded data.
	        output_dir: folder that receives the jsonl file (created if missing).
	        zero_shot_examples_per_task: number of instances per task encoded with
	            encode_instruction_example (no exemplars).
	        few_shot_examples_per_task: number of instances per task encoded with
	            encode_few_shot_example.
	        n_few_shot: maximum number of "Positive Examples" passed as exemplars.

	    If a task has no more instances than the two counts combined, all of its instances
	    are used in file order, the first zero_shot_examples_per_task of them as zero-shot.
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    examples_per_task = zero_shot_examples_per_task + few_shot_examples_per_task
	    train_tasks = []
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r", encoding="utf-8") as fin:
	        for line in fin:
	            task = line.strip()
	            # skip blank lines, and skip mmlu to avoid test leakage
	            if task and "_mmmlu_" not in task:
	                train_tasks.append(task)
	    with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w", encoding="utf-8") as fout:
	        def write_record(instance, encoded_example):
	            # Shared by the zero-shot and few-shot paths: one jsonl line per instance.
	            fout.write(json.dumps({
	                "dataset": "super_ni",
	                "id": f"super_ni_{instance['id']}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }) + "\n")

	        for task in train_tasks:
	            with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r", encoding="utf-8") as fin:
	                task_data = json.load(fin)
	            instruction = task_data["Definition"][0]
	            instances = task_data["Instances"]
	            if examples_per_task < len(instances):
	                instances = random.sample(instances, k=examples_per_task)
	            for instance in instances[:zero_shot_examples_per_task]:
	                write_record(instance, encode_instruction_example(
	                    instruction=instruction,
	                    input=instance["input"],
	                    output=instance["output"][0],
	                    random_template=True,
	                    eos_token=None
	                ))
	            for instance in instances[zero_shot_examples_per_task:]:
	                # exemplars are re-drawn for every few-shot instance
	                if n_few_shot < len(task_data["Positive Examples"]):
	                    examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
	                else:
	                    examplars = task_data["Positive Examples"]
	                write_record(instance, encode_few_shot_example(
	                    instruction=instruction,
	                    examplars=examplars,
	                    input=instance["input"],
	                    output=instance["output"][0],
	                    eos_token=None
	                ))


	def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
	    '''
	    Convert the CoT zero-shot / few-shot jsonl files into chat-style jsonl records.

	    Reads <data_dir>/cot_zsopt.jsonl (zero-shot) and <data_dir>/cot_fsopt.jsonl (few-shot),
	    keeps at most the requested number of examples from each (randomly sampled when the
	    file holds more), and writes <output_dir>/cot_data.jsonl.

	    Args:
	        data_dir: folder holding the two source files.
	        output_dir: folder that receives the jsonl file (created if missing).
	        num_zero_shot_examples: cap for the zero-shot file; <= 0 skips the file entirely.
	        num_few_shot_examples: cap for the few-shot file; <= 0 skips the file entirely.
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    examples = []
	    # Each source is gated by its OWN count (the zero-shot file used to be gated by the
	    # few-shot count, which dropped all zero-shot data when few-shot was disabled).
	    sources = [
	        ("cot_zsopt.jsonl", num_zero_shot_examples),
	        ("cot_fsopt.jsonl", num_few_shot_examples),
	    ]
	    for file_name, num_examples in sources:
	        if num_examples <= 0:
	            continue
	        # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as fin:
	            loaded = [json.loads(line) for line in fin]
	        if num_examples < len(loaded):
	            loaded = random.sample(loaded, k=num_examples)
	        examples.extend(loaded)
	    output_path = os.path.join(output_dir, "cot_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            prompt = example["inputs"]
	            # terminate the prompt with a newline unless it already ends with one or with ":"
	            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
	                prompt += "\n"
	            completion = example["targets"]
	            fout.write(json.dumps({
	                "dataset": "cot",
	                "id": f"cot_{idx}",
	                "messages": [
	                    {"role": "user", "content": prompt},
	                    {"role": "assistant", "content": completion},
	                ]
	            }) + "\n")


	def convert_flan_v2_data(data_dir, output_dir):
	    '''
	    Convert flan_v2_resampled_100k.jsonl into chat-style jsonl records.

	    Each input line supplies "inputs" (the prompt) and "targets" (the completion); the
	    result is written to <output_dir>/flan_v2_data.jsonl with ids flan_v2_<line index>.

	    Args:
	        data_dir: folder holding flan_v2_resampled_100k.jsonl.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r", encoding="utf-8") as fin:
	        examples = [json.loads(line) for line in fin]
	    output_path = os.path.join(output_dir, "flan_v2_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            prompt = example["inputs"]
	            # terminate the prompt with a newline unless it already ends with one or with ":"
	            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
	                prompt += "\n"
	            record = {
	                "dataset": "flan_v2",
	                "id": f"flan_v2_{idx}",
	                "messages": [
	                    {"role": "user", "content": prompt},
	                    {"role": "assistant", "content": example["targets"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_dolly_data(data_dir, output_dir):
	    '''
	    Convert databricks-dolly-15k.jsonl into chat-style jsonl records.

	    The "instruction", "context" and "response" fields of every input line are passed to
	    encode_instruction_example (random_template=True) and the resulting prompt/completion
	    pair is written as one user/assistant exchange to <output_dir>/dolly_data.jsonl.

	    Args:
	        data_dir: folder holding databricks-dolly-15k.jsonl.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r", encoding="utf-8") as fin:
	        examples = [json.loads(line) for line in fin]
	    output_path = os.path.join(output_dir, "dolly_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            encoded_example = encode_instruction_example(
	                instruction=example["instruction"],
	                input=example["context"],
	                output=example["response"],
	                random_template=True,
	                eos_token=None
	            )
	            record = {
	                "dataset": "dolly",
	                "id": f"dolly_{idx}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_self_instruct_data(data_dir, output_dir):
	    '''
	    Convert all_instances_82K.jsonl (Self-Instruct) into chat-style jsonl records.

	    The "instruction", "input" and "output" fields of every input line are passed to
	    encode_instruction_example (random_template=True) and the resulting prompt/completion
	    pair is written as one user/assistant exchange to <output_dir>/self_instruct_data.jsonl.

	    Args:
	        data_dir: folder holding all_instances_82K.jsonl.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r", encoding="utf-8") as fin:
	        examples = [json.loads(line) for line in fin]
	    output_path = os.path.join(output_dir, "self_instruct_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            encoded_example = encode_instruction_example(
	                instruction=example["instruction"],
	                input=example["input"],
	                output=example["output"],
	                random_template=True,
	                eos_token=None
	            )
	            record = {
	                "dataset": "self_instruct",
	                "id": f"self_instruct_{idx}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_unnatural_instructions_data(data_dir, output_dir):
	    '''
	    Convert core_data.jsonl (Unnatural Instructions) into chat-style jsonl records.

	    Every input line is a task with an "instruction" and a list of "instances". For each
	    instance, a non-empty "constraints" value other than "none" / "none." (case-insensitive)
	    is appended to the instruction on a new line. The instance is then encoded with
	    encode_instruction_example (random_template=True) and written to
	    <output_dir>/unnatural_instructions_data.jsonl. Ids count instances across all tasks.

	    Args:
	        data_dir: folder holding core_data.jsonl.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    input_path = os.path.join(data_dir, "core_data.jsonl")
	    output_path = os.path.join(output_dir, "unnatural_instructions_data.jsonl")
	    instance_cnt = 0
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
	        for line in fin:
	            task_data = json.loads(line)
	            instruction = task_data["instruction"]
	            for instance in task_data["instances"]:
	                constraints = instance["constraints"]
	                if constraints and constraints.lower() not in ["none", "none."]:
	                    instance_instruction = instruction + "\n" + constraints
	                else:
	                    instance_instruction = instruction
	                encoded_example = encode_instruction_example(
	                    instruction=instance_instruction,
	                    input=instance["input"],
	                    output=instance["output"],
	                    random_template=True,
	                    eos_token=None
	                )
	                record = {
	                    "dataset": "unnatural_instructions",
	                    "id": f"unnatural_instructions_{instance_cnt}",
	                    "messages": [
	                        {"role": "user", "content": encoded_example["prompt"]},
	                        {"role": "assistant", "content": encoded_example["completion"]},
	                    ]
	                }
	                fout.write(json.dumps(record) + "\n")
	                instance_cnt += 1


	def convert_stanford_alpaca_data(data_dir, output_dir):
	    '''
	    Convert alpaca_data.json (Stanford Alpaca) into chat-style jsonl records.

	    The file is a JSON list whose items supply "instruction", "input" and "output"; each
	    item is encoded with encode_instruction_example (random_template=True) and written as
	    one user/assistant exchange to <output_dir>/stanford_alpaca_data.jsonl.

	    Args:
	        data_dir: folder holding alpaca_data.json.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    examples = []
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "alpaca_data.json"), "r", encoding="utf-8") as fin:
	        examples.extend(json.load(fin))
	    output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            encoded_example = encode_instruction_example(
	                instruction=example["instruction"],
	                input=example["input"],
	                output=example["output"],
	                random_template=True,
	                eos_token=None
	            )
	            record = {
	                "dataset": "stanford_alpaca",
	                "id": f"stanford_alpaca_{idx}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_code_alpaca_data(data_dir, output_dir):
	    '''
	    Convert code_alpaca_20k.json (Code Alpaca) into chat-style jsonl records.

	    The file is a JSON list whose items supply "instruction", "input" and "output"; each
	    item is encoded with encode_instruction_example (random_template=True) and written as
	    one user/assistant exchange to <output_dir>/code_alpaca_data.jsonl.

	    Args:
	        data_dir: folder holding code_alpaca_20k.json.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    examples = []
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r", encoding="utf-8") as fin:
	        examples.extend(json.load(fin))
	    output_path = os.path.join(output_dir, "code_alpaca_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            encoded_example = encode_instruction_example(
	                instruction=example["instruction"],
	                input=example["input"],
	                output=example["output"],
	                random_template=True,
	                eos_token=None
	            )
	            record = {
	                "dataset": "code_alpaca",
	                "id": f"code_alpaca_{idx}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
	    '''
	    Convert the GPT-4 Alpaca JSON files into chat-style jsonl records.

	    Loads alpaca_gpt4_data.json when load_en is true and alpaca_gpt4_data_zh.json when
	    load_zh is true (in that order), encodes every item with encode_instruction_example
	    (random_template=True) and writes <output_dir>/gpt4_alpaca_data.jsonl.

	    Args:
	        data_dir: folder holding the source JSON files.
	        output_dir: folder that receives the jsonl file (created if missing).
	        load_en: include alpaca_gpt4_data.json.
	        load_zh: include alpaca_gpt4_data_zh.json.
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    wanted_files = []
	    if load_en:
	        wanted_files.append("alpaca_gpt4_data.json")
	    if load_zh:
	        wanted_files.append("alpaca_gpt4_data_zh.json")
	    examples = []
	    for file_name in wanted_files:
	        # Read as UTF-8 explicitly: with the platform default encoding, non-ASCII text
	        # (e.g. the zh file) can fail to decode or be garbled.
	        with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as fin:
	            examples.extend(json.load(fin))
	    output_path = os.path.join(output_dir, "gpt4_alpaca_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            encoded_example = encode_instruction_example(
	                instruction=example["instruction"],
	                input=example["input"],
	                output=example["output"],
	                random_template=True,
	                eos_token=None
	            )
	            record = {
	                "dataset": "gpt4_alpaca",
	                "id": f"gpt4_alpaca_{idx}",
	                "messages": [
	                    {"role": "user", "content": encoded_example["prompt"]},
	                    {"role": "assistant", "content": encoded_example["completion"]},
	                ]
	            }
	            fout.write(json.dumps(record) + "\n")


	def convert_sharegpt_data(data_dir, output_dir):
	    '''
	    Convert sharegpt_html_cleaned_and_split.json into chat-style jsonl records.

	    Senders "human"/"user" become role "user" and "gpt"/"chatgpt" become role "assistant".
	    A conversation containing a "system" or "bing" message is counted as invalid and
	    skipped entirely; conversations without any message are skipped as well. The number of
	    invalid conversations is printed at the end.

	    Args:
	        data_dir: folder holding sharegpt_html_cleaned_and_split.json.
	        output_dir: folder that receives sharegpt_data.jsonl (created if missing).

	    Raises:
	        ValueError: if a message has any other sender.
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    role_by_sender = {
	        "human": "user",
	        "user": "user",
	        "gpt": "assistant",
	        "chatgpt": "assistant",
	    }
	    rejected_senders = {"system", "bing"}
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r", encoding="utf-8") as fin:
	        examples = json.load(fin)

	    output_path = os.path.join(output_dir, "sharegpt_data.jsonl")
	    invalid_cnt = 0
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for example in examples:
	            messages = []
	            valid = True
	            for message in example["conversations"]:
	                sender = message["from"]
	                if sender in role_by_sender:
	                    messages.append({
	                        "role": role_by_sender[sender],
	                        "content": message["value"]
	                    })
	                elif sender in rejected_senders:
	                    # one rejected message invalidates the whole conversation
	                    valid = False
	                    invalid_cnt += 1
	                    break
	                else:
	                    raise ValueError(f"Unknown message sender: {message['from']}")
	            if messages and valid:
	                fout.write(json.dumps({
	                    "dataset": "sharegpt",
	                    "id": f"sharegpt_{example['id']}",
	                    "messages": messages
	                }) + "\n")
	    print(f"# of invalid examples in sharegpt data: {invalid_cnt}")


	def convert_baize_data(data_dir, output_dir):
	    '''
	    Convert the Baize chat files into chat-style jsonl records.

	    Loads <source>_chat_data.json for the sources alpaca, medical, quora and stackoverflow.
	    Each example's "input" is one transcript in which turns are introduced by the markers
	    [|Human|] and [|AI|]; it is split into alternating user/assistant messages and written
	    to <output_dir>/baize_data.jsonl. Text before the first [|Human|] marker is discarded,
	    and a human turn without an [|AI|] answer is dropped.

	    Args:
	        data_dir: folder holding the four *_chat_data.json files.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    # Plain markers: the previous "[\|Human\|]" spelling kept the backslashes in the string
	    # (invalid escape sequence), so it never matched the transcripts.
	    human_marker = "[|Human|]"
	    ai_marker = "[|AI|]"
	    os.makedirs(output_dir, exist_ok=True)
	    examples = []
	    for source in ["alpaca", "medical", "quora", "stackoverflow"]:
	        # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	        with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r", encoding="utf-8") as fin:
	            examples.extend(json.load(fin))

	    output_path = os.path.join(output_dir, "baize_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            # split example["input"] by [|Human|] and [|AI|]
	            messages = []
	            rounds = example["input"].split(human_marker)[1:]
	            for chat_round in rounds:
	                if not chat_round.strip() or ai_marker not in chat_round:
	                    continue
	                human, assistant = chat_round.split(ai_marker)
	                messages.append({
	                    "role": "user",
	                    "content": human.strip()
	                })
	                messages.append({
	                    "role": "assistant",
	                    "content": assistant.strip()
	                })
	            fout.write(json.dumps({
	                "dataset": "baize",
	                "id": f"baize_{idx}",
	                "messages": messages
	            }) + "\n")


	def convert_oasst1_data(data_dir, output_dir):
	    '''
	    Convert the OASST1 conversation trees into chat-style jsonl records.

	    OASST1 is tree structured: every message can receive several replies. Each tree is
	    walked depth-first and one instance is saved for every path that runs from the root
	    prompt to an assistant reply that has no further replies (a leaf). Paths sharing a
	    prefix therefore repeat the shared messages across instances.
	    Be careful when using this dataset for training. Ideally, you should only minimize the
	    loss of the last message in each path.

	    NOTE(review): an earlier description said paths ending at intermediate assistant nodes
	    are saved too; the traversal only saves paths ending at leaf nodes -- confirm which is intended.

	    A message that mentions the creators (see filter_strings) is dropped together with its
	    whole subtree.

	    Args:
	        data_dir: folder holding 2023-04-12_oasst_ready.trees.jsonl.
	        output_dir: folder that receives oasst1_data.jsonl (created if missing).

	    Raises:
	        ValueError: if a visited node has a role other than "assistant" or "prompter".
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r", encoding="utf-8") as fin:
	        conversations = [json.loads(line) for line in fin]

	    output_path = os.path.join(output_dir, "oasst1_data.jsonl")

	    # we filter out the sequences that mention the creator information
	    filter_strings = [
	        "LAION",
	        "Open Assistant",
	        "Open Asssistant",  # original misspelling, kept so previously filtered texts stay filtered
	        "OpenAssistant",
	    ]

	    # traverse the conversation tree, and collect all valid sequences
	    def dfs(reply, messages, valid_sequences):
	        text = reply["text"]
	        if any(filter_string in text for filter_string in filter_strings):
	            return
	        if reply["role"] == "assistant":
	            chat_role = "assistant"
	        elif reply["role"] == "prompter":
	            chat_role = "user"
	        else:
	            raise ValueError(f"Unknown role: {reply['role']}")
	        messages.append({"role": chat_role, "content": text})
	        if chat_role == "assistant" and not reply["replies"]:  # leaf node
	            # copy: `messages` is the shared path buffer and keeps changing
	            valid_sequences.append(messages[:])
	        else:
	            for child in reply["replies"]:
	                dfs(child, messages, valid_sequences)
	        messages.pop()

	    with open(output_path, "w", encoding="utf-8") as fout:
	        example_cnt = 0
	        for conversation in conversations:
	            valid_sequences = []
	            dfs(conversation["prompt"], [], valid_sequences)
	            for sequence in valid_sequences:
	                fout.write(json.dumps({
	                    "dataset": "oasst1",
	                    "id": f"oasst1_{example_cnt}",
	                    "messages": sequence
	                }) + "\n")
	                example_cnt += 1


	def convert_lima_data(data_dir, output_dir):
	    '''
	    Convert LIMA's train.jsonl into chat-style jsonl records.

	    Every input line holds a "conversations" list of message texts that alternate between
	    user and assistant, starting with the user. If the list has an odd length, a warning is
	    printed and the last message is cut off. Output goes to <output_dir>/lima_data.jsonl.

	    Args:
	        data_dir: folder holding train.jsonl.
	        output_dir: folder that receives the jsonl file (created if missing).
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "train.jsonl"), "r", encoding="utf-8") as fin:
	        examples = [json.loads(line) for line in fin]
	    output_path = os.path.join(output_dir, "lima_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for idx, example in enumerate(examples):
	            turns = example["conversations"]
	            if len(turns) % 2 != 0:
	                print(f"Warning: example {idx} in LIMA has odd number of messages. Cutting off the last message.")
	                turns = turns[:-1]

	            messages = []
	            for i in range(0, len(turns), 2):
	                messages.append({
	                    "role": "user",
	                    "content": turns[i]
	                })
	                messages.append({
	                    "role": "assistant",
	                    "content": turns[i + 1]
	                })
	            fout.write(json.dumps({
	                "dataset": "lima",
	                "id": f"lima_{idx}",
	                "messages": messages,
	            }) + "\n")


	def convert_wizardlm_data(data_dir, output_dir):
	    '''
	    Convert WizardLM_evol_instruct_V2_143k.json into chat-style jsonl records.

	    Every example holds a "conversations" list of {"from", "value"} messages that must
	    alternate "human" then "gpt"; they become user/assistant messages. The record id is
	    built from the example's own "idx" field. Output goes to <output_dir>/wizardlm_data.jsonl.

	    Args:
	        data_dir: folder holding WizardLM_evol_instruct_V2_143k.json.
	        output_dir: folder that receives the jsonl file (created if missing).

	    Raises:
	        ValueError: if a conversation has an odd number of messages or does not strictly
	            alternate "human" / "gpt". (Explicit raises instead of assert statements, so
	            the validation still runs under ``python -O``.)
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
	    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r", encoding="utf-8") as fin:
	        examples = json.load(fin)

	    output_path = os.path.join(output_dir, "wizardlm_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for position, example in enumerate(examples):
	            conversations = example["conversations"]
	            if len(conversations) % 2 != 0:
	                raise ValueError(f"wizardlm example at position {position} has an odd number of messages")
	            messages = []
	            for i in range(0, len(conversations), 2):
	                human_turn = conversations[i]
	                gpt_turn = conversations[i + 1]
	                if human_turn["from"] != "human" or gpt_turn["from"] != "gpt":
	                    raise ValueError(
	                        f"wizardlm example at position {position} does not alternate human/gpt: "
	                        f"{human_turn['from']!r}, {gpt_turn['from']!r}"
	                    )
	                messages.append({
	                    "role": "user",
	                    "content": human_turn["value"]
	                })
	                messages.append({
	                    "role": "assistant",
	                    "content": gpt_turn["value"]
	                })
	            fout.write(json.dumps({
	                "dataset": "wizardlm",
	                "id": f"wizardlm_{example['idx']}",
	                "messages": messages,
	            }) + "\n")


	def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
	    '''
	    Convert the OpenOrca parquet files into chat-style jsonl records.

	    Takes num_gpt4_examples shuffled rows from 1M-GPT4-Augmented.parquet followed by
	    num_gpt35_examples shuffled rows from 3_5M-GPT3_5-Augmented.parquet. Each row becomes a
	    system / user / assistant exchange built from its "system_prompt", "question" and
	    "response" columns, with an id built from its "id" column. Output goes to
	    <output_dir>/open_orca_data.jsonl.

	    Args:
	        data_dir: folder holding the parquet files.
	        output_dir: folder that receives the jsonl file (created if missing).
	        num_gpt4_examples: number of GPT-4 rows to keep; <= 0 skips that file.
	        num_gpt35_examples: number of GPT-3.5 rows to keep; <= 0 skips that file.
	    '''
	    os.makedirs(output_dir, exist_ok=True)
	    examples = []

	    sources = [
	        ("1M-GPT4-Augmented.parquet", num_gpt4_examples),
	        ("3_5M-GPT3_5-Augmented.parquet", num_gpt35_examples),
	    ]
	    for file_name, num_examples in sources:
	        # A file that contributes nothing is not loaded at all: reading, materialising and
	        # shuffling millions of rows only to discard them is pure overhead. Note that a
	        # skipped file also no longer advances the `random` state.
	        if num_examples <= 0:
	            continue
	        df = pd.read_parquet(os.path.join(data_dir, file_name))
	        rows = [row.to_dict() for _, row in df.iterrows()]
	        random.shuffle(rows)
	        examples.extend(rows[:num_examples])

	    output_path = os.path.join(output_dir, "open_orca_data.jsonl")
	    with open(output_path, "w", encoding="utf-8") as fout:
	        for example in examples:
	            messages = [
	                {"role": "system", "content": example["system_prompt"]},
	                {"role": "user", "content": example["question"]},
	                {"role": "assistant", "content": example["response"]}
	            ]
	            fout.write(json.dumps({
	                "dataset": "open_orca",
	                "id": f"open_orca_{example['id']}",
	                "messages": messages,
	            }) + "\n")


	if __name__ == "__main__":
	    # Command-line entry point: every sub-folder of --raw_data_dir whose name matches a
	    # convert_<name>_data function in this module is converted into --output_dir/<name>.
	    arg_parser = argparse.ArgumentParser()
	    arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
	    arg_parser.add_argument("--output_dir", type=str, default="data/processed")
	    arg_parser.add_argument("--seed", type=int, default=42)
	    args = arg_parser.parse_args()
	    random.seed(args.seed)

	    # get the subfolder names in raw_data_dir; sorted because os.listdir order is arbitrary
	    # and the converters share one seeded random state, so the processing order must be
	    # fixed for --seed to give reproducible output
	    subfolders = sorted(
	        f for f in os.listdir(args.raw_data_dir)
	        if os.path.isdir(os.path.join(args.raw_data_dir, f))
	    )

	    # all supported datasets: the <name> part of every callable named convert_<name>_data
	    # (snapshot of globals() because the loop variables themselves become globals)
	    supported_datasets = []
	    for global_name, global_value in list(globals().items()):
	        name_match = re.fullmatch(r"convert_(.+)_data", global_name)
	        if name_match and callable(global_value):
	            supported_datasets.append(name_match.group(1))

	    # check if the subfolder names are supported datasets
	    valid_subfolders = []
	    for subfolder in subfolders:
	        if subfolder not in supported_datasets:
	            print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
	        else:
	            valid_subfolders.append(subfolder)

	    # prepare data for each dataset
	    for subfolder in valid_subfolders:
	        print(f"Processing {subfolder} data...")
	        globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))