| import argparse |
| import json |
| import pathlib |
|
|
| |
# Prompt templates keyed by whether an example carries extra "input" context.
# The {instruction} and {input} placeholders are filled with str.format_map
# from each example dict.
PROMPT_DICT = {
    # Template with an "### Input:" section between instruction and response.
    "prompt_input": (
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    # Template with only the instruction and response sections.
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
}
|
|
|
|
def main(args):
    """Convert an instruction dataset into conversation-style JSON.

    Reads a JSON list of example dicts from ``args.data_path``. Each example
    must provide ``"instruction"`` and ``"output"``; ``"input"`` is optional.
    Every example becomes one record with a 1-based string ``"id"`` and a
    two-turn ``"conversations"`` list: the formatted prompt from ``"human"``
    and the example's output from ``"gpt"``. The records are written to
    ``args.output_path`` as JSON indented by two spaces.

    Args:
        args: Object exposing ``data_path`` and ``output_path`` attributes
            (for example an ``argparse.Namespace``).

    Raises:
        KeyError: If an example lacks ``"output"`` or a key required by the
            selected prompt template.
    """
    data_path = pathlib.Path(args.data_path)
    # JSON is UTF-8 by specification; do not depend on the locale's default.
    with data_path.open(encoding="utf-8") as f:
        data = json.load(f)

    prompt_input = PROMPT_DICT["prompt_input"]
    prompt_no_input = PROMPT_DICT["prompt_no_input"]

    new_data = []
    for idx, example in enumerate(data, start=1):
        # Only a missing or empty-string "input" selects the no-input template.
        if example.get("input", "") != "":
            template = prompt_input
        else:
            template = prompt_no_input
        new_data.append(
            {
                "id": str(idx),
                "conversations": [
                    {"from": "human", "value": template.format_map(example)},
                    {"from": "gpt", "value": example["output"]},
                ],
            }
        )

    # The context manager guarantees the output is flushed and closed.
    with open(args.output_path, "w", encoding="utf-8") as f:
        json.dump(new_data, f, indent=2)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both paths are optional and fall back to the
    # default file names below.
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument(
        "--data_path",
        type=str,
        default="alpaca-data.json",
    )
    cli_parser.add_argument(
        "--output_path",
        type=str,
        default="alpaca-data-conversation.json",
    )
    args = cli_parser.parse_args()
    main(args)
|
|