|
|
import os |
|
|
import json |
|
|
import shutil |
|
|
|
|
|
def process_jsonl(input_jsonl, root_dir, output_dir):
    """Copy the files of all ``mimic_cxr`` records into *output_dir*.

    Reads *input_jsonl* one JSON object per line. For every record whose
    ``source`` is ``'mimic_cxr'`` and whose ``file_name`` resolves to an
    existing path under *root_dir*, the file is copied into *output_dir* and
    the record is appended to ``<output_dir>/output.jsonl``. Every truthy
    ``source`` value seen is collected and a summary is printed at the end.

    Args:
        input_jsonl: Path to the JSON Lines manifest to read.
        root_dir: Directory that the records' ``file_name`` values are
            joined onto.
        output_dir: Destination directory; created if it does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the manifest cannot be read or a copy/write fails.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, 'output.jsonl')
    unique_sources = set()

    # Opened lazily on the first matching record so that no empty
    # output.jsonl is created when nothing matches, and kept open for the
    # whole run instead of being re-opened once per record.
    outfile = None
    try:
        with open(input_jsonl, 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records;
                    # json.loads('') would raise and abort the whole run.
                    continue

                data = json.loads(line)
                source = data.get('source')
                if source:
                    unique_sources.add(source)

                if source != 'mimic_cxr':
                    continue

                file_name = data.get('file_name')
                if not file_name:
                    continue

                file_path = os.path.join(root_dir, file_name)
                if not os.path.exists(file_path):
                    continue

                shutil.copy(file_path, output_dir)

                if outfile is None:
                    # Append mode: records from earlier runs are preserved.
                    outfile = open(output_file, 'a', encoding='utf-8')
                json.dump(data, outfile)
                outfile.write('\n')
    finally:
        if outfile is not None:
            outfile.close()

    print(f"Number of unique sources: {len(unique_sources)}")
    print(f"Unique sources: {unique_sources}")
|
|
|
|
|
|
|
|
# Default locations for the extraction run: the manifest lives inside the
# directory that its records' file names are resolved against.
input_jsonl = '/workspace/part_left/metadata.jsonl'
root_dir = '/workspace/part_left'
output_dir = '/workspace/mimic_cxr'

# Only run the copy when executed as a script, so importing this module
# does not touch the filesystem.
if __name__ == '__main__':
    process_jsonl(input_jsonl, root_dir, output_dir)
|
|
|