import os
def split_jsonl_by_size(input_path, output_dir, target_size_mb, max_line_size_mb=1):
    """Split a JSONL file into numbered part files of bounded size.

    Lines are copied in order into ``vlm_requests_<n>.jsonl`` files inside
    ``output_dir`` (numbered from 1). A new part is started whenever adding
    the next line would push the current part past ``target_size_mb``. Lines
    are never split, so a part holding a single line larger than the target
    can exceed it.

    The input is read in text mode as UTF-8, so sizes are measured on the
    re-encoded UTF-8 bytes of each line (including its line terminator).

    Args:
        input_path: Path of the JSONL file to split.
        output_dir: Directory that receives the part files; created if it
            does not exist.
        target_size_mb: Maximum size of each part, in MiB (may be fractional).
        max_line_size_mb: Maximum allowed size of a single line, in MiB.
            Defaults to 1.

    Raises:
        ValueError: If any line is larger than ``max_line_size_mb``. Parts
            written before the offending line are left on disk.
    """
    target_size = target_size_mb * 1024 * 1024  # convert MiB to bytes
    max_line_size = max_line_size_mb * 1024 * 1024
    os.makedirs(output_dir, exist_ok=True)

    part_num = 1
    current_size = 0
    current_file = None
    try:
        with open(input_path, 'r', encoding='utf-8') as infile:
            for line_number, line in enumerate(infile, start=1):
                line_bytes = line.encode('utf-8')
                line_size = len(line_bytes)

                # Reject any single record that exceeds the per-line limit.
                if line_size > max_line_size:
                    raise ValueError(
                        f"JSON object at line {line_number} exceeds "
                        f"{max_line_size_mb:g}MB limit "
                        f"(actual size: {line_size / 1024 / 1024:.2f}MB)"
                    )

                # Roll over to the next part when this line would not fit.
                # A freshly opened (empty) part always accepts the line.
                if current_file is not None and current_size + line_size > target_size:
                    current_file.close()
                    current_file = None
                    part_num += 1

                if current_file is None:
                    output_path = os.path.join(
                        output_dir, f'vlm_requests_{part_num}.jsonl'
                    )
                    current_file = open(output_path, 'wb')
                    current_size = 0

                current_file.write(line_bytes)
                current_size += line_size
    finally:
        # Close (and flush) the open part even when an error interrupts the
        # loop, so no file handle leaks and written data reaches disk.
        if current_file is not None:
            current_file.close()
# Example invocation: split the batch-request file into parts with a 450 MB target size.
split_jsonl_by_size(
    input_path='/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl',
    output_dir='/mnt/data/users/zys/proj/vlm_reasoning/upload/vlm',
    target_size_mb=450,
)