Spaces:
Running
Running
| #!/usr/bin/env python | |
| # coding=utf-8 | |
| ''' | |
| This script is used to reformat the downloaded datasets into the format that can be used by the model. | |
| Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows: | |
| { | |
| "dataset": "dataset_name", | |
| "id": "unique_id", | |
| "messages": [ | |
| {"role": "system", "content": "message_text"}, # optional | |
| {"role": "user", "content": "message_text"}, | |
| {"role": "assistant", "content": "message_text"}, | |
| {"role": "user", "content": "message_text"}, | |
| {"role": "assistant", "content": "message_text"}, | |
| ... | |
| ], | |
| } | |
| ''' | |
| import json | |
| import random | |
| import re | |
| import os | |
| import pandas as pd | |
| import argparse | |
| from instruction_encode_templates import encode_instruction_example, encode_few_shot_example | |
def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
    '''
    Convert the Super-NaturalInstructions training tasks into the jsonl chat format.

    Task names are read from `splits/xlingual/train_tasks.txt` under `data_dir`; every line that
    contains "_mmmlu_" is dropped. For each remaining task, `tasks/<task>.json` is loaded and at most
    `zero_shot_examples_per_task + few_shot_examples_per_task` instances are used (randomly sampled
    when the task has more, otherwise all of them in file order). The first
    `zero_shot_examples_per_task` instances are encoded with `encode_instruction_example`; the rest
    are encoded with `encode_few_shot_example` together with up to `n_few_shot` of the task's
    "Positive Examples". Only the first reference output of each instance is used.

    All records are written to `super_ni_data.jsonl` inside `output_dir`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    split_path = os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt")
    with open(split_path, "r") as fin:
        # skip mmlu to avoid test leakage
        train_tasks = [line.strip() for line in fin if "_mmmlu_" not in line]

    total_per_task = zero_shot_examples_per_task + few_shot_examples_per_task

    def serialize(instance, encoded_example):
        # One jsonl line: a single user/assistant exchange tagged with the instance id.
        record = {
            "dataset": "super_ni",
            "id": f"super_ni_{instance['id']}",
            "messages": [
                {"role": "user", "content": encoded_example["prompt"]},
                {"role": "assistant", "content": encoded_example["completion"]},
            ],
        }
        return json.dumps(record) + "\n"

    with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout:
        for task in train_tasks:
            task_path = os.path.join(data_dir, "tasks", f"{task}.json")
            with open(task_path, "r") as fin:
                task_data = json.load(fin)
            instruction = task_data["Definition"][0]

            # Subsample only when the task holds more instances than requested.
            if len(task_data["Instances"]) > total_per_task:
                instances = random.sample(task_data["Instances"], k=total_per_task)
            else:
                instances = task_data["Instances"]
            zero_shot_instances = instances[:zero_shot_examples_per_task]
            few_shot_instances = instances[zero_shot_examples_per_task:]

            for instance in zero_shot_instances:
                encoded_example = encode_instruction_example(
                    instruction=instruction,
                    input=instance["input"],
                    output=instance["output"][0],
                    random_template=True,
                    eos_token=None,
                )
                fout.write(serialize(instance, encoded_example))

            for instance in few_shot_instances:
                # Draw a fresh set of demonstrations for every few-shot instance.
                if len(task_data["Positive Examples"]) > n_few_shot:
                    examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
                else:
                    examplars = task_data["Positive Examples"]
                encoded_example = encode_few_shot_example(
                    instruction=instruction,
                    examplars=examplars,
                    input=instance["input"],
                    output=instance["output"][0],
                    eos_token=None,
                )
                fout.write(serialize(instance, encoded_example))
def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
    '''
    Convert the chain-of-thought data into the jsonl chat format.

    Reads `cot_zsopt.jsonl` (zero-shot) and `cot_fsopt.jsonl` (few-shot) from `data_dir`. Each file is
    only read when its requested number of examples is positive, and is randomly subsampled when it
    holds more examples than requested. Zero-shot examples are written first, followed by few-shot
    examples, to `cot_data.jsonl` inside `output_dir`; ids are `cot_<position>`.

    Each input line must be a json object with "inputs" (prompt) and "targets" (completion).
    '''
    os.makedirs(output_dir, exist_ok=True)
    examples = []

    # The zero-shot file is controlled by the zero-shot count (this guard used to test the few-shot
    # count, which dropped all zero-shot data whenever few-shot examples were disabled).
    if num_zero_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
            zero_shot_examples = [json.loads(line) for line in fin]
        if num_zero_shot_examples < len(zero_shot_examples):
            zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples)
        examples.extend(zero_shot_examples)

    if num_few_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin:
            few_shot_examples = [json.loads(line) for line in fin]
        if num_few_shot_examples < len(few_shot_examples):
            few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples)
        examples.extend(few_shot_examples)

    output_path = os.path.join(output_dir, "cot_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # Separate prompt and answer with a newline unless the prompt already ends with one
            # or ends with a colon (e.g. "Answer:"), where the answer continues on the same line.
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "cot",
                "id": f"cot_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")
def convert_flan_v2_data(data_dir, output_dir):
    '''
    Convert `flan_v2_resampled_100k.jsonl` from `data_dir` into the jsonl chat format.

    Every input line is a json object whose "inputs" field becomes the user message and whose
    "targets" field becomes the assistant message. Records are written to `flan_v2_data.jsonl`
    inside `output_dir` with ids `flan_v2_<line index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    source_path = os.path.join(data_dir, "flan_v2_resampled_100k.jsonl")
    with open(source_path, "r") as fin:
        examples = [json.loads(line) for line in fin]

    with open(os.path.join(output_dir, "flan_v2_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # Append a newline unless the prompt already ends with one or ends with a colon.
            if not (prompt.endswith("\n") or prompt.rstrip().endswith(":")):
                prompt = prompt + "\n"
            record = {
                "dataset": "flan_v2",
                "id": f"flan_v2_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": example["targets"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_dolly_data(data_dir, output_dir):
    '''
    Convert `databricks-dolly-15k.jsonl` from `data_dir` into the jsonl chat format.

    Each input line is a json object; its "instruction", "context" and "response" fields are passed
    to `encode_instruction_example` (with a random template), and the resulting prompt/completion
    pair becomes one user/assistant exchange. Records are written to `dolly_data.jsonl` inside
    `output_dir` with ids `dolly_<line index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    source_path = os.path.join(data_dir, "databricks-dolly-15k.jsonl")
    with open(source_path, "r") as fin:
        examples = [json.loads(line) for line in fin]

    with open(os.path.join(output_dir, "dolly_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["context"],
                output=example["response"],
                random_template=True,
                eos_token=None,
            )
            record = {
                "dataset": "dolly",
                "id": f"dolly_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_self_instruct_data(data_dir, output_dir):
    '''
    Convert `all_instances_82K.jsonl` from `data_dir` into the jsonl chat format.

    Each input line is a json object; its "instruction", "input" and "output" fields are passed to
    `encode_instruction_example` (with a random template), and the resulting prompt/completion pair
    becomes one user/assistant exchange. Records are written to `self_instruct_data.jsonl` inside
    `output_dir` with ids `self_instruct_<line index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    source_path = os.path.join(data_dir, "all_instances_82K.jsonl")
    with open(source_path, "r") as fin:
        examples = [json.loads(line) for line in fin]

    with open(os.path.join(output_dir, "self_instruct_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None,
            )
            record = {
                "dataset": "self_instruct",
                "id": f"self_instruct_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_unnatural_instructions_data(data_dir, output_dir):
    '''
    Convert `core_data.jsonl` from `data_dir` into the jsonl chat format.

    Each input line is a task with an "instruction" and a list of "instances". For every instance the
    task instruction is extended with the instance's "constraints" on a new line, unless the
    constraints are empty or equal "none" / "none." (case-insensitive). The instance is then encoded
    with `encode_instruction_example` (random template) and written as one user/assistant exchange
    to `unnatural_instructions_data.jsonl` inside `output_dir`. Ids count instances across all tasks.
    '''
    os.makedirs(output_dir, exist_ok=True)

    source_path = os.path.join(data_dir, "core_data.jsonl")
    target_path = os.path.join(output_dir, "unnatural_instructions_data.jsonl")
    written = 0  # running instance index over the whole file, used for the record id
    with open(source_path, "r") as fin, open(target_path, "w") as fout:
        for line in fin:
            task_data = json.loads(line)
            instruction = task_data["instruction"]
            for instance in task_data["instances"]:
                instance_instruction = instruction
                # Only real constraints are appended; placeholders such as "None." are ignored.
                if instance["constraints"] and instance["constraints"].lower() not in ["none", "none."]:
                    instance_instruction = instruction + "\n" + instance["constraints"]
                encoded_example = encode_instruction_example(
                    instruction=instance_instruction,
                    input=instance["input"],
                    output=instance["output"],
                    random_template=True,
                    eos_token=None,
                )
                record = {
                    "dataset": "unnatural_instructions",
                    "id": f"unnatural_instructions_{written}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ],
                }
                fout.write(json.dumps(record) + "\n")
                written += 1
def convert_stanford_alpaca_data(data_dir, output_dir):
    '''
    Convert `alpaca_data.json` from `data_dir` into the jsonl chat format.

    The input file is a single json array. Each element's "instruction", "input" and "output" fields
    are passed to `encode_instruction_example` (with a random template), and the resulting
    prompt/completion pair becomes one user/assistant exchange. Records are written to
    `stanford_alpaca_data.jsonl` inside `output_dir` with ids `stanford_alpaca_<index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin:
        examples = list(json.load(fin))

    with open(os.path.join(output_dir, "stanford_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None,
            )
            record = {
                "dataset": "stanford_alpaca",
                "id": f"stanford_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_code_alpaca_data(data_dir, output_dir):
    '''
    Convert `code_alpaca_20k.json` from `data_dir` into the jsonl chat format.

    The input file is a single json array. Each element's "instruction", "input" and "output" fields
    are passed to `encode_instruction_example` (with a random template), and the resulting
    prompt/completion pair becomes one user/assistant exchange. Records are written to
    `code_alpaca_data.jsonl` inside `output_dir` with ids `code_alpaca_<index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin:
        examples = list(json.load(fin))

    with open(os.path.join(output_dir, "code_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None,
            )
            record = {
                "dataset": "code_alpaca",
                "id": f"code_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
    '''
    Convert the GPT-4 Alpaca json files from `data_dir` into the jsonl chat format.

    `alpaca_gpt4_data.json` is loaded when `load_en` is true and `alpaca_gpt4_data_zh.json` when
    `load_zh` is true (in that order). Each element's "instruction", "input" and "output" fields are
    passed to `encode_instruction_example` (with a random template), and the resulting
    prompt/completion pair becomes one user/assistant exchange. Records are written to
    `gpt4_alpaca_data.jsonl` inside `output_dir` with ids `gpt4_alpaca_<index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    # (enabled flag, file name) pairs, loaded in this order
    sources = [
        (load_en, "alpaca_gpt4_data.json"),
        (load_zh, "alpaca_gpt4_data_zh.json"),
    ]
    examples = []
    for enabled, filename in sources:
        if not enabled:
            continue
        with open(os.path.join(data_dir, filename), "r") as fin:
            examples.extend(json.load(fin))

    with open(os.path.join(output_dir, "gpt4_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            encoded_example = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None,
            )
            record = {
                "dataset": "gpt4_alpaca",
                "id": f"gpt4_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded_example["prompt"]},
                    {"role": "assistant", "content": encoded_example["completion"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
def convert_sharegpt_data(data_dir, output_dir):
    '''
    Convert `sharegpt_html_cleaned_and_split.json` from `data_dir` into the jsonl chat format.

    Messages sent by "human"/"user" become user turns and messages sent by "gpt"/"chatgpt" become
    assistant turns. A conversation containing a "system" or "bing" message is counted as invalid
    and dropped; conversations without any message are skipped silently. Any other sender raises
    ValueError. Records are written to `sharegpt_data.jsonl` inside `output_dir` with ids
    `sharegpt_<example id>`, and the number of invalid conversations is printed at the end.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin:
        examples = list(json.load(fin))

    user_senders = ("human", "user")
    assistant_senders = ("gpt", "chatgpt")
    rejected_senders = ("system", "bing")  # their presence invalidates the whole conversation

    with open(os.path.join(output_dir, "sharegpt_data.jsonl"), "w") as fout:
        invalid_cnt = 0
        for example in examples:
            messages = []
            valid = True
            for message in example["conversations"]:
                sender = message["from"]
                if sender in user_senders:
                    role = "user"
                elif sender in assistant_senders:
                    role = "assistant"
                elif sender in rejected_senders:
                    valid = False
                    invalid_cnt += 1
                    break
                else:
                    raise ValueError(f"Unknown message sender: {message['from']}")
                messages.append({"role": role, "content": message["value"]})
            if not (messages and valid):
                continue
            record = {
                "dataset": "sharegpt",
                "id": f"sharegpt_{example['id']}",
                "messages": messages,
            }
            fout.write(json.dumps(record) + "\n")
        print(f"# of invalid examples in sharegpt data: {invalid_cnt}")
def convert_baize_data(data_dir, output_dir):
    '''
    Convert the Baize chat files from `data_dir` into the jsonl chat format.

    Loads `<source>_chat_data.json` for the sources alpaca, medical, quora and stackoverflow (in that
    order). Each example's "input" string is split on the "[|Human|]" marker; the text before the
    first marker is discarded. Every remaining chunk that contains "[|AI|]" is split on that marker
    into a stripped user message and a stripped assistant message; blank chunks and chunks without
    "[|AI|]" are skipped. One record per example is written to `baize_data.jsonl` inside
    `output_dir` with ids `baize_<index>`, even when no turn could be extracted.
    '''
    os.makedirs(output_dir, exist_ok=True)

    examples = []
    for source in ("alpaca", "medical", "quora", "stackoverflow"):
        source_path = os.path.join(data_dir, f"{source}_chat_data.json")
        with open(source_path, "r") as fin:
            examples.extend(json.load(fin))

    with open(os.path.join(output_dir, "baize_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # split example["input"] by [|Human|] and [|AI|]
            messages = []
            chunks = example["input"].split("[|Human|]")
            for turn in chunks[1:]:
                if turn.strip() and "[|AI|]" in turn:
                    # exactly one "[|AI|]" marker is expected per chunk
                    human, assistant = turn.split("[|AI|]")
                    messages.append({"role": "user", "content": human.strip()})
                    messages.append({"role": "assistant", "content": assistant.strip()})
            record = {
                "dataset": "baize",
                "id": f"baize_{idx}",
                "messages": messages,
            }
            fout.write(json.dumps(record) + "\n")
def convert_oasst1_data(data_dir, output_dir):
    '''
    Convert the OASST1 conversation trees into the jsonl chat format.

    OASST1 is stored as trees: every message can receive several replies. One record is written for
    every path that runs from the root prompt down to an assistant message without further replies
    (a leaf). Paths that end in a user message produce no record, and any message that mentions the
    dataset creators is dropped together with its whole subtree.

    Because paths share their prefix, messages close to the root are duplicated across records.
    Be careful when using this dataset for training. Ideally, you should only minimize the loss of
    the last message in each path.

    Reads `2023-04-12_oasst_ready.trees.jsonl` from `data_dir` and writes `oasst1_data.jsonl` to
    `output_dir`, with ids `oasst1_<running index>`.

    Raises:
        ValueError: if a (non-filtered) message has a role other than "assistant" or "prompter".
    '''
    os.makedirs(output_dir, exist_ok=True)
    conversations = []
    with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin:
        for line in fin:
            conversations.append(json.loads(line))

    output_path = os.path.join(output_dir, "oasst1_data.jsonl")

    # we filter out the sequences that mention the creator information
    filter_strings = [
        "LAION",
        "Open Assistant",
        # misspelled variant that was matched before the spelling above was added; kept so that
        # nothing that used to be filtered slips through
        "Open Asssistant",
        "OpenAssistant",
    ]

    # source role -> role name used in the output messages
    role_names = {"assistant": "assistant", "prompter": "user"}

    # traverse the conversation tree, and collect all valid sequences
    def dfs(reply, messages, valid_sequences):
        text = reply["text"]
        if any(filter_string in text for filter_string in filter_strings):
            return  # prune this message and everything below it
        role = reply["role"]
        if role not in role_names:
            raise ValueError(f"Unknown role: {reply['role']}")
        messages.append({"role": role_names[role], "content": text})
        children = reply["replies"]
        if children:
            for child in children:
                dfs(child, messages, valid_sequences)
        elif role == "assistant":
            # leaf node: snapshot the current root-to-leaf path
            valid_sequences.append(messages[:])
        messages.pop()  # backtrack before the caller visits the next sibling

    with open(output_path, "w") as fout:
        example_cnt = 0
        for conversation in conversations:
            valid_sequences = []
            dfs(conversation["prompt"], [], valid_sequences)
            for sequence in valid_sequences:
                fout.write(json.dumps({
                    "dataset": "oasst1",
                    "id": f"oasst1_{example_cnt}",
                    "messages": sequence
                }) + "\n")
                example_cnt += 1
def convert_lima_data(data_dir, output_dir):
    '''
    Convert LIMA's `train.jsonl` from `data_dir` into the jsonl chat format.

    Each input line holds a "conversations" list of plain strings that alternate between user and
    assistant, starting with the user. When the list has an odd length a warning is printed and the
    last message is dropped. Records are written to `lima_data.jsonl` inside `output_dir` with ids
    `lima_<line index>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(data_dir, "train.jsonl"), "r") as fin:
        examples = [json.loads(line) for line in fin]

    with open(os.path.join(output_dir, "lima_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            if len(example["conversations"]) % 2 != 0:
                print(f"Waring: example {idx} in LIMA has odd number of messages. Cutting off the last message.")
                example["conversations"] = example["conversations"][:-1]
            turns = example["conversations"]
            messages = []
            # even positions are user turns, odd positions the assistant replies
            for pos in range(0, len(turns), 2):
                messages.append({"role": "user", "content": turns[pos]})
                messages.append({"role": "assistant", "content": turns[pos + 1]})
            record = {
                "dataset": "lima",
                "id": f"lima_{idx}",
                "messages": messages,
            }
            fout.write(json.dumps(record) + "\n")
def convert_wizardlm_data(data_dir, output_dir):
    '''
    Convert `WizardLM_evol_instruct_V2_143k.json` from `data_dir` into the jsonl chat format.

    The input is a json array; each element has an "idx" and a "conversations" list whose entries
    carry "from" and "value". Conversations must consist of strictly alternating "human" / "gpt"
    turns (checked with assertions). Records are written to `wizardlm_data.jsonl` inside
    `output_dir` with ids `wizardlm_<idx>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
        examples = json.load(fin)

    with open(os.path.join(output_dir, "wizardlm_data.jsonl"), "w") as fout:
        for _, example in enumerate(examples):
            turns = example["conversations"]
            assert len(turns) % 2 == 0
            messages = []
            for pos in range(0, len(turns), 2):
                human_turn = turns[pos]
                assert human_turn["from"] == "human"
                gpt_turn = turns[pos + 1]
                assert gpt_turn["from"] == "gpt"
                messages.append({"role": "user", "content": human_turn["value"]})
                messages.append({"role": "assistant", "content": gpt_turn["value"]})
            record = {
                "dataset": "wizardlm",
                "id": f"wizardlm_{example['idx']}",
                "messages": messages,
            }
            fout.write(json.dumps(record) + "\n")
def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
    '''
    Convert the OpenOrca parquet files from `data_dir` into the jsonl chat format.

    Up to `num_gpt4_examples` randomly chosen rows are taken from `1M-GPT4-Augmented.parquet`,
    followed by up to `num_gpt35_examples` randomly chosen rows from
    `3_5M-GPT3_5-Augmented.parquet`. A file is only read when its requested count is positive, so
    with the default arguments the GPT-3.5 file is neither required nor loaded (and no random
    numbers are drawn for it).

    Each row's "system_prompt", "question" and "response" columns become the system, user and
    assistant messages. Records are written to `open_orca_data.jsonl` inside `output_dir` with ids
    `open_orca_<row id>`.
    '''
    os.makedirs(output_dir, exist_ok=True)

    # (file name, number of rows to keep), processed in this order
    sources = [
        ("1M-GPT4-Augmented.parquet", num_gpt4_examples),
        ("3_5M-GPT3_5-Augmented.parquet", num_gpt35_examples),
    ]
    examples = []
    for filename, num_examples in sources:
        if num_examples <= 0:
            # Nothing requested from this file: skip reading and shuffling millions of rows.
            continue
        df = pd.read_parquet(os.path.join(data_dir, filename))
        rows = [row.to_dict() for _, row in df.iterrows()]
        random.shuffle(rows)
        examples.extend(rows[:num_examples])

    output_path = os.path.join(output_dir, "open_orca_data.jsonl")
    with open(output_path, "w") as fout:
        for example in examples:
            messages = [
                {"role": "system", "content": example["system_prompt"]},
                {"role": "user", "content": example["question"]},
                {"role": "assistant", "content": example["response"]}
            ]
            fout.write(json.dumps({
                "dataset": "open_orca",
                "id": f"open_orca_{example['id']}",
                "messages": messages,
            }) + "\n")
if __name__ == "__main__":
    # Command line entry point: converts every supported dataset found under --raw_data_dir.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
    arg_parser.add_argument("--output_dir", type=str, default="data/processed")
    arg_parser.add_argument("--seed", type=int, default=42)
    args = arg_parser.parse_args()
    random.seed(args.seed)

    # get the subfolder names in raw_data_dir
    # os.listdir returns entries in arbitrary order. All converters draw from the one RNG seeded
    # above, so the datasets are processed in sorted order to make a given seed reproducible.
    subfolders = sorted(
        f for f in os.listdir(args.raw_data_dir)
        if os.path.isdir(os.path.join(args.raw_data_dir, f))
    )

    # all supported datasets: every module-level callable named convert_<dataset>_data
    # (iterate over a snapshot, because the loop variables below are themselves added to globals())
    converters = {}
    for global_name, global_value in list(globals().items()):
        name_match = re.fullmatch(r"convert_(.+)_data", global_name)
        if name_match and callable(global_value):
            converters[name_match.group(1)] = global_value

    # check if the subfolder names are supported datasets
    valid_subfolders = []
    for subfolder in subfolders:
        if subfolder not in converters:
            print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
        else:
            valid_subfolders.append(subfolder)

    # prepare data for each dataset
    for subfolder in valid_subfolders:
        print(f"Processing {subfolder} data...")
        converters[subfolder](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))