import os


def split_jsonl_by_size(input_path, output_dir, target_size_mb, *,
                        max_line_size_mb=1, prefix='vlm_requests'):
    """Split a JSONL file into numbered part files of bounded size.

    Lines are copied, in order, into ``<prefix>_1.jsonl``,
    ``<prefix>_2.jsonl``, ... inside ``output_dir``. A new part is started
    whenever appending the next line would push the current part above the
    target size, so a line is never split across two parts.

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to read.
        output_dir: Directory that receives the part files; created if it
            does not exist.
        target_size_mb: Maximum size of each part, in MiB
            (1 MiB = 1024 * 1024 bytes).
        max_line_size_mb: Maximum allowed size of a single line, in MiB.
            Defaults to 1.
        prefix: File-name prefix of the part files. Defaults to
            ``'vlm_requests'``.

    Raises:
        ValueError: If a line is larger than ``max_line_size_mb``. Part
            files written before the offending line remain on disk.
    """
    target_size = target_size_mb * 1024 * 1024  # convert MiB to bytes
    max_line_size = max_line_size_mb * 1024 * 1024
    os.makedirs(output_dir, exist_ok=True)

    part_num = 1
    current_size = 0
    current_file = None

    try:
        # The input is read in text mode so that invalid UTF-8 is rejected;
        # sizes are measured on the re-encoded bytes that are actually
        # written to the part files.
        with open(input_path, 'r', encoding='utf-8') as infile:
            for line_number, line in enumerate(infile, start=1):
                line_bytes = line.encode('utf-8')
                line_size = len(line_bytes)

                # Reject any single line that exceeds the per-line limit.
                if line_size > max_line_size:
                    raise ValueError(
                        f"JSON object at line {line_number} exceeds "
                        f"{max_line_size_mb:g}MB limit "
                        f"(actual size: {line_size / 1024 / 1024:.2f}MB)"
                    )

                # Roll over to the next part when this line would not fit.
                if (current_file is not None
                        and current_size + line_size > target_size):
                    current_file.close()
                    current_file = None
                    part_num += 1

                # Part files are opened lazily so that an empty input
                # produces no output files at all.
                if current_file is None:
                    output_path = os.path.join(
                        output_dir, f'{prefix}_{part_num}.jsonl')
                    current_file = open(output_path, 'wb')
                    current_size = 0

                current_file.write(line_bytes)
                current_size += line_size
    finally:
        # Close the open part even when an error interrupts the loop.
        if current_file is not None:
            current_file.close()


# Example invocation; guarded so that importing this module has no side effects.
if __name__ == "__main__":
    split_jsonl_by_size(
        '/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl',
        '/mnt/data/users/zys/proj/vlm_reasoning/upload/vlm',
        450,
    )