import argparse
import json
import uuid
from typing import Any, Dict, Iterable, List, Optional, Sequence

import jsonlines
from tqdm import tqdm


def dedupe_to_sharegpt(records: Iterable[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Alpaca-style records to ShareGPT-style entries, dropping duplicates.

    Each record must provide the keys ``"instruction"``, ``"input"`` and
    ``"output"``. Two records are duplicates when both their instruction and
    their input match; only the first occurrence is kept, in input order.

    Args:
        records: Iterable of mappings read from the input JSONL file.

    Returns:
        A list of dicts with a fresh random ``"id"``, the fixed ``"bot"`` name,
        the instruction stored under ``"training"``, and a two-turn
        ``"conversations"`` list (human input, gpt output).

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Key on the (instruction, input) pair rather than their concatenation:
    # "ab" + "c" and "a" + "bc" are different questions but equal strings.
    seen = set()
    converted = []
    for record in records:
        key = (record["instruction"], record["input"])
        if key in seen:
            continue
        seen.add(key)
        converted.append(
            {
                "id": f"{uuid.uuid4()}",
                "bot": "dolphin",
                "training": record["instruction"],
                "conversations": [
                    {"from": "human", "value": record["input"]},
                    {"from": "gpt", "value": record["output"]},
                ],
            }
        )
    return converted


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Read the input JSONL file, de-duplicate it, and write ShareGPT JSON.

    Args:
        argv: Command-line arguments; ``None`` means use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, default="flan1m-alpaca-uncensored.jsonl")
    parser.add_argument("--out-file", type=str, default="flan1m-sharegpt-deduped.json")
    args = parser.parse_args(argv)

    with jsonlines.open(args.in_file) as reader:
        out = dedupe_to_sharegpt(tqdm(reader))

    # Open the output only after the input has been fully processed, so a
    # missing or malformed input file cannot leave a truncated output behind.
    with open(args.out_file, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False)


if __name__ == "__main__":
    main()