# tools/utils/upload/jsonl_split.py
# Uploaded by Adinosaur using huggingface_hub (commit 1c980b1, verified)
import os
def split_jsonl_by_size(input_path, output_dir, target_size_mb, max_line_size_mb=1):
    """Split a JSONL file into sequentially numbered parts of bounded size.

    Lines are copied in order into ``vlm_requests_<n>.jsonl`` files inside
    ``output_dir`` (numbering starts at 1). A new part is started whenever
    appending the next line would push the current part past the target
    size, so a line is never split across two parts.

    Args:
        input_path: Path of the UTF-8 encoded JSONL file to split.
        output_dir: Directory that receives the parts; created if missing.
            Existing part files with the same names are overwritten.
        target_size_mb: Maximum size of one part, in MiB (1024 * 1024 bytes).
        max_line_size_mb: Maximum allowed size of a single line, in MiB.
            Defaults to 1.

    Raises:
        ValueError: If a line's UTF-8 encoding (including its trailing
            newline) is larger than ``max_line_size_mb``. Parts written
            before the offending line are left on disk.

    Note:
        The input is read in text mode, so line endings are normalized to
        ``"\\n"`` in the output parts.
    """
    target_size = target_size_mb * 1024 * 1024  # MiB -> bytes
    max_line_size = max_line_size_mb * 1024 * 1024  # MiB -> bytes
    os.makedirs(output_dir, exist_ok=True)

    part_num = 1
    current_size = 0
    current_file = None
    try:
        with open(input_path, 'r', encoding='utf-8') as infile:
            for line_number, line in enumerate(infile, start=1):
                line_bytes = line.encode('utf-8')
                line_size = len(line_bytes)

                # Reject any single record that exceeds the per-line limit.
                if line_size > max_line_size:
                    raise ValueError(
                        f"JSON object at line {line_number} exceeds {max_line_size_mb}MB limit "
                        f"(actual size: {line_size / 1024 / 1024:.2f}MB)"
                    )

                # Roll over to the next part when this line would not fit.
                if current_file is not None and current_size + line_size > target_size:
                    current_file.close()
                    current_file = None
                    part_num += 1

                # Lazily open the part so an empty input creates no files.
                if current_file is None:
                    output_path = os.path.join(output_dir, f'vlm_requests_{part_num}.jsonl')
                    current_file = open(output_path, 'wb')
                    current_size = 0

                current_file.write(line_bytes)
                current_size += line_size
    finally:
        # Close the open part on every exit path, including the ValueError
        # above and any I/O or decoding error, so no file handle leaks.
        if current_file is not None:
            current_file.close()
# Example invocation: split the batch request file into parts of at most 450 MB.
split_jsonl_by_size(
    input_path='/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl',
    output_dir='/mnt/data/users/zys/proj/vlm_reasoning/upload/vlm',
    target_size_mb=450,
)