tools / utils /upload /jsonl_otest.py
Adinosaur's picture
Upload folder using huggingface_hub
1c980b1 verified
def analyze_jsonl_size(input_path, limit_bytes=1024 * 1024):
    """Report how many lines of a JSONL file exceed a byte-size limit.

    The file is read as UTF-8 text, one line per object. A line's size is
    the length of its UTF-8 re-encoding, which includes the trailing
    newline. Because the file is opened in text mode with default newline
    handling, a ``\\r\\n`` terminator is counted as a single byte.

    A short summary is printed to stdout: the number of oversized lines
    and, if there are any, their average and largest size in MB
    (1 MB = 1024 * 1024 bytes here).

    Args:
        input_path: Path of the JSONL file to scan.
        limit_bytes: Size threshold in bytes; only lines strictly larger
            than this are counted. Defaults to 1 MiB.

    Returns:
        A dict with the raw statistics, all in bytes:
        ``over_count`` (number of oversized lines), ``total_over_size``
        (sum of their sizes) and ``max_over_size`` (largest oversized
        line, 0 when there is none).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    bytes_per_mb = 1024 * 1024
    over_count = 0
    total_over_size = 0  # combined size of oversized lines, in bytes
    max_over_size = 0  # largest oversized line, in bytes

    with open(input_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Byte size of the current line once encoded as UTF-8.
            current_size = len(line.encode('utf-8'))
            if current_size > limit_bytes:
                over_count += 1
                total_over_size += current_size
                max_over_size = max(max_over_size, current_size)

    # ":g" drops the trailing ".0", so the default limit renders as "1MB".
    limit_label = f"{limit_bytes / bytes_per_mb:g}MB"

    # Print the summary.
    print(f"Number of objects exceeding {limit_label}: {over_count}")
    if over_count > 0:
        avg_size_mb = (total_over_size / over_count) / bytes_per_mb
        max_size_mb = max_over_size / bytes_per_mb
        print(f"Average size of oversized objects: {avg_size_mb:.2f} MB")
        print(f"Largest object size: {max_size_mb:.2f} MB")
    else:
        print(f"No objects exceed the {limit_label} limit.")

    return {
        "over_count": over_count,
        "total_over_size": total_over_size,
        "max_over_size": max_over_size,
    }
# Example call: scan the batch-request JSONL file and print the size summary.
analyze_jsonl_size(
    input_path='/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl',
)