craffel's picture
download
raw
2.31 kB
#!/usr/bin/env python3
"""
Fast parallel JSONL file splitter
Usage: python3 fast_split.py <input_file> <output_prefix> <num_chunks>
"""
import sys
import os
from multiprocessing import Pool
from pathlib import Path
def split_chunk(args):
    """Copy the lines that start inside one byte range into an output file.

    A line belongs to this chunk when its first byte lies in the half-open
    range ``[start_pos, end_pos)``. A line that starts inside the range is
    copied whole, even if it extends past ``end_pos``. Adjacent ranges
    therefore cover every line exactly once.

    Args:
        args: Tuple ``(input_file, output_file, start_pos, end_pos,
            chunk_id)``. The values are packed into a single argument so the
            function can be mapped over a list of tasks.

    Returns:
        The number of lines written to ``output_file``.
    """
    input_file, output_file, start_pos, end_pos, chunk_id = args
    lines_written = 0

    with open(input_file, 'rb') as infile, open(output_file, 'wb') as outfile:
        if start_pos > 0:
            # Step back one byte before discarding up to the next newline.
            # If start_pos is exactly the start of a line, the byte before it
            # is that newline, so only the newline is consumed and the line
            # at start_pos is kept. Seeking straight to start_pos would throw
            # that line away, and the chunk before this one does not write it
            # either (it stops as soon as tell() reaches its end_pos).
            infile.seek(start_pos - 1)
            infile.readline()

        # Copy every line whose first byte is before end_pos.
        while infile.tell() < end_pos:
            line = infile.readline()
            if not line:
                # End of file reached before end_pos.
                break
            outfile.write(line)
            lines_written += 1

    print(f"Chunk {chunk_id:02d} complete: {lines_written} lines written")
    return lines_written
def split_file_parallel(input_file, output_prefix, num_chunks=8):
    """Split a file into ``num_chunks`` output files using a process pool.

    The input is divided into ``num_chunks`` byte ranges of equal size, with
    the last range extended to the end of the file. Each range is handed to
    ``split_chunk`` in a worker process. Output files are named
    ``f"{output_prefix}{i:02d}.jsonl"`` for ``i`` in ``range(num_chunks)``.

    Args:
        input_file: Path of the file to split.
        output_prefix: Prefix prepended to each output file name.
        num_chunks: Number of output files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_chunks`` is less than 1.
    """
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")

    file_size = os.path.getsize(input_file)
    chunk_size = file_size // num_chunks

    # Prepare arguments for parallel processing
    tasks = []
    for i in range(num_chunks):
        start_pos = i * chunk_size
        # The last chunk absorbs the remainder left by the integer division.
        end_pos = (i + 1) * chunk_size if i < num_chunks - 1 else file_size
        output_file = f"{output_prefix}{i:02d}.jsonl"
        tasks.append((input_file, output_file, start_pos, end_pos, i))

    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker instead of failing inside min().
    worker_count = min(num_chunks, os.cpu_count() or 1)

    # Process chunks in parallel
    with Pool(processes=worker_count) as pool:
        results = pool.map(split_chunk, tasks)

    total_lines = sum(results)
    print(f"Split complete: {total_lines} total lines across {num_chunks} files")
if __name__ == "__main__":
    # Command-line entry point: expects exactly three positional arguments
    # after the script name.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python3 fast_split.py <input_file> <output_prefix> <num_chunks>")
        sys.exit(1)

    input_file, output_prefix, raw_chunk_count = cli_args
    # The chunk count arrives as text and must be converted before use.
    num_chunks = int(raw_chunk_count)
    split_file_parallel(input_file, output_prefix, num_chunks)

Xet Storage Details

Size:
2.31 kB
·
Xet hash:
d4c7d9e6dcfc7dbdcb36ffb47e9146020499b0268303cb0641759753bfcc3db0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.