# tools
#
# File size: 2,005 Bytes
#
# 1c980b1

import argparse
import json
from pathlib import Path

def merge_json_dataset(dataset_dir: str, output_name: str = "ChemQA") -> None:
    """Merge the contents of every JSON file in a directory into one file.

    All ``*.json`` files directly inside ``dataset_dir`` are read in sorted
    path order, except the output file itself. The ``"index"`` value of each
    entry is shifted by the number of entries already merged from earlier
    files. The combined list is then written to
    ``<dataset_dir>/<output_name>.json`` and a short summary is printed.

    Args:
        dataset_dir: Directory that holds the ``.json`` files to merge.
        output_name: Name of the merged dataset, without the ``.json``
            extension (defaults to ``"ChemQA"``).

    Raises:
        FileNotFoundError: If the directory has no JSON file to merge.

    NOTE(review): presumably every input file holds a list of dicts that
    each carry a numeric "index" key -- confirm against the data producers.
    """
    root = Path(dataset_dir)
    target = root / f"{output_name}.json"

    # Skip the output file so that a re-run does not merge its own result.
    sources = [path for path in sorted(root.glob("*.json")) if path != target]
    if not sources:
        raise FileNotFoundError("未找到任何JSON文件")

    combined = []
    offset = 0  # number of entries merged so far; added to each new index

    for source in sources:
        entries = json.loads(source.read_text(encoding='utf-8'))

        # Re-base this file's indices on top of what was merged before it.
        for entry in entries:
            entry["index"] = offset + entry["index"]

        combined.extend(entries)
        offset = len(combined)

    # Persist the merged dataset, keeping non-ASCII characters readable.
    with open(target, 'w', encoding='utf-8') as sink:
        json.dump(combined, sink, indent=2, ensure_ascii=False)

    print(f"\n合并完成！共处理 {len(sources)} 个JSON文件")
    print(f"生成数据集: {output_name}.json")
    print(f"- 总条目数: {len(combined)} 条")

if __name__ == "__main__":
    # Command-line entry point: read the input directory and the optional
    # output name, then run the merge.
    cli = argparse.ArgumentParser(description="合并JSON格式的问答数据集")
    cli.add_argument(
        '-i',
        dest="dataset_dir",
        required=True,
        help="JSON文件所在的目录路径",
    )
    cli.add_argument(
        '-o',
        "--output_name",
        default="ChemQA",
        help="输出数据集名称（默认为ChemQA）",
    )

    options = cli.parse_args()
    merge_json_dataset(options.dataset_dir, options.output_name)