Buckets:
| #!/usr/bin/env python3 | |
| """ | |
| Fast parallel JSONL file splitter | |
| Usage: python3 fast_split.py <input_file> <output_prefix> <num_chunks> | |
| """ | |
| import sys | |
| import os | |
| from multiprocessing import Pool | |
| from pathlib import Path | |
def split_chunk(args):
    """Copy the lines that begin inside one byte range into an output file.

    A line belongs to this chunk when its first byte lies in the half-open
    range ``[start_pos, end_pos)``. A line that starts inside the range is
    copied whole even if it extends past ``end_pos``, so adjacent ranges
    neither drop nor duplicate lines.

    Args:
        args: Tuple ``(input_file, output_file, start_pos, end_pos,
            chunk_id)``, packed into a single value so the function can be
            passed to ``Pool.map``.

    Returns:
        The number of lines written to ``output_file``.
    """
    input_file, output_file, start_pos, end_pos, chunk_id = args
    lines_written = 0

    with open(input_file, 'rb') as infile, open(output_file, 'wb') as outfile:
        if start_pos > 0:
            # Begin one byte early and discard up to the next newline. If
            # start_pos is exactly the first byte of a line, only the
            # preceding newline is discarded and that line is kept; seeking
            # to start_pos itself would throw the whole line away while the
            # previous chunk has already stopped before it.
            infile.seek(start_pos - 1)
            infile.readline()

        # Track the offset by hand rather than calling tell() per line.
        position = infile.tell()
        while position < end_pos:
            line = infile.readline()
            if not line:
                # Reached end of file before end_pos.
                break
            outfile.write(line)
            lines_written += 1
            position += len(line)

    print(f"Chunk {chunk_id:02d} complete: {lines_written} lines written")
    return lines_written
def split_file_parallel(input_file, output_prefix, num_chunks=8):
    """Split a file into ``num_chunks`` line-aligned pieces in parallel.

    The file is divided into byte ranges of equal size (the last range
    absorbs the remainder) and each range is handed to ``split_chunk`` in a
    worker process. Piece ``i`` is written to
    ``f"{output_prefix}{i:02d}.jsonl"``.

    Args:
        input_file: Path of the file to split.
        output_prefix: Prefix used to build each output file name.
        num_chunks: Number of output files to produce; must be at least 1.

    Returns:
        The total number of lines written across all output files.

    Raises:
        ValueError: If ``num_chunks`` is less than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError below
    # or an error from Pool about the process count.
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")

    file_size = os.path.getsize(input_file)
    chunk_size = file_size // num_chunks

    # Prepare one argument tuple per chunk for parallel processing.
    tasks = []
    for i in range(num_chunks):
        start_pos = i * chunk_size
        # The last chunk runs to the end of the file to absorb the remainder.
        end_pos = (i + 1) * chunk_size if i < num_chunks - 1 else file_size
        output_file = f"{output_prefix}{i:02d}.jsonl"
        tasks.append((input_file, output_file, start_pos, end_pos, i))

    # os.cpu_count() can return None when the count is undetermined; fall
    # back to a single worker rather than failing inside min().
    worker_count = min(num_chunks, os.cpu_count() or 1)
    with Pool(processes=worker_count) as pool:
        results = pool.map(split_chunk, tasks)

    total_lines = sum(results)
    print(f"Split complete: {total_lines} total lines across {num_chunks} files")
    return total_lines
if __name__ == "__main__":
    # Command-line entry point: expects exactly three arguments after the
    # script name -- input path, output prefix and chunk count.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("Usage: python3 fast_split.py <input_file> <output_prefix> <num_chunks>")
        sys.exit(1)

    source_path, prefix, chunk_count_text = cli_args
    # The chunk count arrives as text and is converted to an integer here.
    split_file_parallel(source_path, prefix, int(chunk_count_text))
Xet Storage Details
- Size: 2.31 kB
- Xet hash: d4c7d9e6dcfc7dbdcb36ffb47e9146020499b0268303cb0641759753bfcc3db0
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.