deepspeed / filter_jsonl.py
xingzhikb's picture
init
002bd9b
import os
import json
import shutil
def process_jsonl(input_jsonl: str, root_dir: str, output_dir: str) -> None:
    """Extract the ``mimic_cxr`` records of a JSON Lines file and their files.

    Every non-blank line of *input_jsonl* is parsed as a JSON object. The
    value of its ``source`` key (when truthy) is collected into a set of
    unique sources. For records whose ``source`` is ``'mimic_cxr'`` and whose
    ``file_name`` resolves to an existing file under *root_dir*, the file is
    copied into *output_dir* and the record is appended as one JSON line to
    ``output.jsonl`` inside *output_dir*.

    When the whole input has been read, the number of unique sources and the
    set itself are printed to stdout.

    Args:
        input_jsonl: Path of the JSON Lines file to read.
        root_dir: Directory that the ``file_name`` values are relative to.
        output_dir: Destination directory; created if it does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the input cannot be read or a copy/write fails.
    """
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, 'output.jsonl')
    unique_sources = set()
    # Opened on the first match only, so a run without matches does not
    # leave an empty output.jsonl behind.
    outfile = None

    try:
        with open(input_jsonl, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # Blank lines would make json.loads raise.
                    continue

                data = json.loads(line)
                source = data.get('source')
                if source:
                    unique_sources.add(source)
                if source != 'mimic_cxr':
                    continue

                file_name = data.get('file_name')
                if not file_name:
                    continue

                file_path = os.path.join(root_dir, file_name)
                # isfile (rather than exists) skips directories, which
                # shutil.copy cannot copy.
                if not os.path.isfile(file_path):
                    continue

                # NOTE(review): copying into a directory keeps only the base
                # name, so a file_name with sub-directories is flattened while
                # the record written below still carries the original
                # relative path -- confirm this is intended.
                shutil.copy(file_path, output_dir)

                if outfile is None:
                    # Append mode: records from earlier runs are preserved.
                    outfile = open(output_file, 'a', encoding='utf-8')
                json.dump(data, outfile)
                outfile.write('\n')
    finally:
        if outfile is not None:
            outfile.close()

    # Report which sources were seen in the input.
    print(f"Number of unique sources: {len(unique_sources)}")
    print(f"Unique sources: {unique_sources}")
# Example usage: read the metadata under /workspace/part_left and collect
# the matching records and files in /workspace/mimic_cxr.
root_dir = '/workspace/part_left'
output_dir = '/workspace/mimic_cxr'
input_jsonl = '/workspace/part_left/metadata.jsonl'
process_jsonl(
    input_jsonl=input_jsonl,
    root_dir=root_dir,
    output_dir=output_dir,
)