Mickey25
/

liangyi_LLaMA_Factory

Model card Files Files and versions

Metrics Training metrics Community

liangyi_LLaMA_Factory / data /dataset /preprocess_dara /merge_json_files.py

Mickey25's picture

Upload folder using huggingface_hub

46b244e verified 3 months ago

history blame contribute delete

2.92 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	JSON文件合并工具
	支持合并多个JSON文件到一个文件中
	"""

	import json
	import os
	import argparse
	from typing import List, Dict, Any
	import glob

	def load_json_file(file_path: str) -> List[Dict[str, Any]]:
	    """Load a JSON file whose top-level value is a list of records.

	    Args:
	        file_path: Path of the UTF-8 encoded JSON file to read.

	    Returns:
	        The parsed list of records. An empty list is returned, after a
	        message is printed, when the file cannot be opened, is not valid
	        UTF-8/JSON, or when its top-level value is not a list.
	    """
	    try:
	        with open(file_path, 'r', encoding='utf-8') as f:
	            data = json.load(f)
	    except (OSError, ValueError) as e:
	        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
	        print(f"加载文件 {file_path} 时出错: {e}")
	        return []

	    if not isinstance(data, list):
	        # Extending a list with a dict would add only its keys, and scalars
	        # have no length at all, so anything but a list is rejected here.
	        print(f"加载文件 {file_path} 时出错: 顶层JSON不是列表 (实际类型: {type(data).__name__})")
	        return []

	    print(f"成功加载文件: {file_path}, 包含 {len(data)} 条记录")
	    return data

	def merge_json_files(file_paths: List[str], output_path: str) -> None:
	    """Merge the records of several JSON files into a single JSON file.

	    Files are read in the order given. A path that does not exist is
	    reported and skipped; a file that yields no records contributes
	    nothing. The concatenated records are written to ``output_path`` as
	    an indented, non-ASCII-escaped JSON list.

	    Args:
	        file_paths: Paths of the JSON files to merge.
	        output_path: Destination file. Missing parent directories are
	            created.

	    Returns:
	        None. Progress and errors are reported with ``print``; a failure
	        to write the output is reported rather than raised.
	    """
	    merged_data = []
	    merged_files = 0  # only files that actually contributed records

	    for file_path in file_paths:
	        if not os.path.exists(file_path):
	            print(f"警告: 文件不存在 {file_path}")
	            continue
	        data = load_json_file(file_path)
	        if data:
	            merged_data.extend(data)
	            merged_files += 1

	    # Save the merged data.
	    try:
	        out_dir = os.path.dirname(output_path)
	        if out_dir:
	            # Writing into a directory that does not exist yet would fail.
	            os.makedirs(out_dir, exist_ok=True)
	        with open(output_path, 'w', encoding='utf-8') as f:
	            json.dump(merged_data, f, ensure_ascii=False, indent=2)
	    except (OSError, ValueError) as e:
	        # ValueError covers UnicodeEncodeError, e.g. lone surrogates that
	        # cannot be encoded as UTF-8 when ensure_ascii is False.
	        print(f"保存合并文件时出错: {e}")
	        return

	    print(f"成功合并 {merged_files} 个文件到 {output_path}")
	    print(f"总共包含 {len(merged_data)} 条记录")

	def main():
	    """Command-line entry point: collect the input files and merge them.

	    The input files come from ``--pattern`` (a glob expression) when it
	    is given, otherwise from the explicit ``--input`` list. At least one
	    of the two must be supplied; ``--output`` is always required. When no
	    file is found, a message is printed and nothing is written.
	    """
	    parser = argparse.ArgumentParser(description='合并多个JSON文件')
	    # Not marked required: --pattern alone is a valid way to select inputs.
	    parser.add_argument('--input', '-i', nargs='+',
	                        help='输入文件路径列表')
	    parser.add_argument('--output', '-o', required=True,
	                        help='输出文件路径')
	    parser.add_argument('--pattern', '-p',
	                        help='使用glob模式匹配文件 (例如: "*.json")')

	    args = parser.parse_args()

	    if not args.pattern and not args.input:
	        # Prints the usage text and exits with status 2.
	        parser.error('必须指定 --input 或 --pattern')

	    if args.pattern:
	        # Glob matching takes precedence over --input. The matches are
	        # sorted because glob.glob returns them in arbitrary order, which
	        # would make the record order of the merged file unpredictable.
	        file_paths = sorted(glob.glob(args.pattern))
	        print(f"使用模式 '{args.pattern}' 找到 {len(file_paths)} 个文件")
	    else:
	        # Use the explicitly listed files, in the order given.
	        file_paths = args.input

	    if not file_paths:
	        print("没有找到要合并的文件")
	        return

	    merge_json_files(file_paths, args.output)

	# Run the command-line interface only when this file is executed as a script.
	if __name__ == "__main__":
	main()
	# Example invocation, kept as a bare string literal (it has no effect at runtime).
	'''
	python /home/ziqiang/LLaMA-Factory/data/dataset/preprocess_dara/merge_json_files.py --input /home/ziqiang/LLaMA-Factory/data/function_call_data/function_call_context_audit_page_irrelevant.json /home/ziqiang/LLaMA-Factory/data/function_call_data/function_call_context_audit.json /home/ziqiang/LLaMA-Factory/data/function_call_data/function_call_context_order.json /home/ziqiang/LLaMA-Factory/data/dataset/9_17/mixed_training_data_9_17.json --output /home/ziqiang/LLaMA-Factory/data/dataset/9_29/merged_dataset.json
	'''