|
|
import argparse |
|
|
import json |
|
|
from pathlib import Path |
|
|
|
|
|
def merge_json_dataset(dataset_dir: str, output_name: str = "ChemQA") -> None:
    """Merge the data of every JSON file in ``dataset_dir`` into one file.

    The ``*.json`` files directly inside ``dataset_dir`` are read in sorted
    path order; a file located at the output path itself is skipped so that
    re-running the merge does not fold a previous result back in. Every
    item's ``"index"`` is shifted by the number of items merged from the
    preceding files, and the combined list is written to
    ``<dataset_dir>/<output_name>.json``.

    Args:
        dataset_dir: Directory that holds the ``.json`` files.
        output_name: Name of the merged dataset, without the ``.json``
            suffix (defaults to ``"ChemQA"``).

    Raises:
        FileNotFoundError: If no input JSON file is found in ``dataset_dir``.
        ValueError: If the top level of an input file is not a JSON array.
        KeyError: If an item has no ``"index"`` key.
    """
    dataset_path = Path(dataset_dir)
    output_json = dataset_path / f"{output_name}.json"

    # Exclude a previous merge result from the inputs.
    json_files = [
        path for path in sorted(dataset_path.glob("*.json"))
        if path != output_json
    ]
    if not json_files:
        raise FileNotFoundError("未找到任何JSON文件")

    merged_data = []
    global_offset = 0

    for json_file in json_files:
        with open(json_file, 'r', encoding='utf-8') as f:
            part_data = json.load(f)

        # Fail early with the offending file name instead of an opaque
        # TypeError (or a silent skip for an empty object) further down.
        if not isinstance(part_data, list):
            raise ValueError(
                f"{json_file.name} 的顶层结构必须是JSON数组,"
                f"实际为 {type(part_data).__name__}"
            )

        # NOTE(review): the offset arithmetic presumably relies on each
        # file numbering its own items contiguously - confirm with the
        # dataset producers.
        for item in part_data:
            item["index"] = global_offset + item["index"]

        merged_data.extend(part_data)
        global_offset += len(part_data)

    # The output is written only after every input was processed, so a
    # failure above never leaves a partial result behind.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    print(f"\n合并完成!共处理 {len(json_files)} 个JSON文件")
    print(f"生成数据集: {output_name}.json")
    print(f"- 总条目数: {len(merged_data)} 条")
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the dataset directory and the output
    # dataset name from the arguments, then run the merge.
    cli = argparse.ArgumentParser(description="合并JSON格式的问答数据集")
    cli.add_argument(
        '-i',
        dest="dataset_dir",
        required=True,
        help="JSON文件所在的目录路径",
    )
    cli.add_argument(
        '-o',
        "--output_name",
        default="ChemQA",
        help="输出数据集名称(默认为ChemQA)",
    )

    options = cli.parse_args()

    merge_json_dataset(options.dataset_dir, options.output_name)