import argparse
import multiprocessing
import shutil
from copy import deepcopy
from multiprocessing import Pool
from typing import List, Union, Tuple

import numpy as np
from batchgenerators.utilities.file_and_folder_operations import load_json, join, subfiles, \
    maybe_mkdir_p, isdir, save_pickle, load_pickle, isfile

from nnunetv2.configuration import default_num_processes
from nnunetv2.imageio.base_reader_writer import BaseReaderWriter
from nnunetv2.utilities.label_handling.label_handling import LabelManager
from nnunetv2.utilities.plans_handling.plans_handler import PlansManager


def average_probabilities(list_of_files: List[str]) -> np.ndarray:
    assert len(list_of_files), 'At least one file must be given in list_of_files'
    avg = None
    for f in list_of_files:
        if avg is None:
            avg = np.load(f)['probabilities']
            # maybe increase precision to prevent rounding errors
            if avg.dtype != np.float32:
                avg = avg.astype(np.float32)
        else:
            avg += np.load(f)['probabilities']
    avg /= len(list_of_files)
    return avg
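
# Illustrative sketch (not part of the original module) of how average_probabilities could be
# exercised in isolation. It assumes each .npz file stores a 'probabilities' array of identical
# shape, which is what nnU-Net writes when predictions are exported with probabilities enabled.
# The file paths below are hypothetical.
#
#   import numpy as np
#   np.savez_compressed('/tmp/pred_a.npz', probabilities=np.random.rand(3, 8, 8, 8).astype(np.float16))
#   np.savez_compressed('/tmp/pred_b.npz', probabilities=np.random.rand(3, 8, 8, 8).astype(np.float16))
#   avg = average_probabilities(['/tmp/pred_a.npz', '/tmp/pred_b.npz'])
#   assert avg.dtype == np.float32 and avg.shape == (3, 8, 8, 8)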


def merge_files(list_of_files,
                output_filename_truncated: str,
                output_file_ending: str,
                image_reader_writer: BaseReaderWriter,
                label_manager: LabelManager,
                save_probabilities: bool = False):
    # load the pkl file associated with the first file in list_of_files
    properties = load_pickle(list_of_files[0][:-4] + '.pkl')
    # load and average predictions
    probabilities = average_probabilities(list_of_files)
    segmentation = label_manager.convert_logits_to_segmentation(probabilities)
    image_reader_writer.write_seg(segmentation, output_filename_truncated + output_file_ending, properties)

    if save_probabilities:
        np.savez_compressed(output_filename_truncated + '.npz', probabilities=probabilities)
        save_pickle(probabilities, output_filename_truncated + '.pkl')
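
# Illustrative only: merge_files expects every 'case.npz' in list_of_files to have a sibling
# 'case.pkl' next to it (the properties dict written during prediction), because the spatial
# metadata used for export is read from the first entry. A hypothetical call with objects
# obtained from a PlansManager might look like this (paths and file ending are assumptions):
#
#   merge_files(['/preds/a/case_01.npz', '/preds/b/case_01.npz'],
#               '/out/case_01', '.nii.gz',
#               image_reader_writer=plans_manager.image_reader_writer_class(),
#               label_manager=plans_manager.get_label_manager(dataset_json),
#               save_probabilities=False)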


def ensemble_folders(list_of_input_folders: List[str],
                     output_folder: str,
                     save_merged_probabilities: bool = False,
                     num_processes: int = default_num_processes,
                     dataset_json_file_or_dict: str = None,
                     plans_json_file_or_dict: str = None):
"""we need too much shit for this function. Problem is that we now have to support region-based training plus
multiple input/output formats so there isn't really a way around this.
If plans and dataset json are not specified, we assume each of the folders has a corresponding plans.json
and/or dataset.json in it. These are usually copied into those folders by nnU-Net during prediction.
We just pick the dataset.json and plans.json from the first of the folders and we DONT check whether the 5
folders contain the same plans etc! This can be a feature if results from different datasets are to be merged (only
works if label dict in dataset.json is the same between these datasets!!!)"""
    if dataset_json_file_or_dict is not None:
        if isinstance(dataset_json_file_or_dict, str):
            dataset_json = load_json(dataset_json_file_or_dict)
        else:
            dataset_json = dataset_json_file_or_dict
    else:
        dataset_json = load_json(join(list_of_input_folders[0], 'dataset.json'))

    if plans_json_file_or_dict is not None:
        if isinstance(plans_json_file_or_dict, str):
            plans = load_json(plans_json_file_or_dict)
        else:
            plans = plans_json_file_or_dict
    else:
        plans = load_json(join(list_of_input_folders[0], 'plans.json'))

    plans_manager = PlansManager(plans)

    # now collect the files in each of the folders and enforce that all files are present in all folders
    files_per_folder = [set(subfiles(i, suffix='.npz', join=False)) for i in list_of_input_folders]

    # first build a set with all files
    s = deepcopy(files_per_folder[0])
    for f in files_per_folder[1:]:
        s.update(f)
    for f in files_per_folder:
        assert len(s.difference(f)) == 0, "Not all folders contain the same files for ensembling. Please only " \
                                          "provide folders that contain the predictions"

    lists_of_lists_of_files = [[join(fl, fi) for fl in list_of_input_folders] for fi in s]
    output_files_truncated = [join(output_folder, fi[:-4]) for fi in s]

    image_reader_writer = plans_manager.image_reader_writer_class()
    label_manager = plans_manager.get_label_manager(dataset_json)

    maybe_mkdir_p(output_folder)
    shutil.copy(join(list_of_input_folders[0], 'dataset.json'), output_folder)

    with multiprocessing.get_context("spawn").Pool(num_processes) as pool:
        num_preds = len(s)
        _ = pool.starmap(
            merge_files,
            zip(
                lists_of_lists_of_files,
                output_files_truncated,
                [dataset_json['file_ending']] * num_preds,
                [image_reader_writer] * num_preds,
                [label_manager] * num_preds,
                [save_merged_probabilities] * num_preds
            )
        )
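
# Illustrative sketch of a direct Python call (the folder paths are hypothetical). Each input
# folder is expected to contain the .npz/.pkl prediction files plus the dataset.json and
# plans.json that nnU-Net copies there during prediction:
#
#   ensemble_folders(['/preds/3d_fullres', '/preds/3d_lowres'],
#                    '/preds/ensembled',
#                    save_merged_probabilities=False,
#                    num_processes=8)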


def entry_point_ensemble_folders():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', nargs='+', type=str, required=True,
                        help='list of input folders')
    parser.add_argument('-o', type=str, required=True, help='output folder')
    parser.add_argument('-np', type=int, required=False, default=default_num_processes,
                        help=f"Number of processes used for ensembling. Default: {default_num_processes}")
    parser.add_argument('--save_npz', action='store_true', required=False,
                        help='Set this flag to store output probabilities in separate .npz files')
    args = parser.parse_args()
    ensemble_folders(args.i, args.o, args.save_npz, args.np)
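
# If this entry point is exposed as a console script (in nnU-Net it is registered as
# nnUNetv2_ensemble), a typical invocation using the flags defined above would be:
#
#   nnUNetv2_ensemble -i FOLDER1 FOLDER2 -o OUTPUT_FOLDER -np 8 --save_npz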


def ensemble_crossvalidations(list_of_trained_model_folders: List[str],
                              output_folder: str,
                              folds: Union[Tuple[int, ...], List[int]] = (0, 1, 2, 3, 4),
                              num_processes: int = default_num_processes,
                              overwrite: bool = True) -> None:
    """
    Feature: different configurations can now have different splits
    """
    dataset_json = load_json(join(list_of_trained_model_folders[0], 'dataset.json'))
    plans_manager = PlansManager(join(list_of_trained_model_folders[0], 'plans.json'))

    # first collect all unique filenames
    files_per_folder = {}
    unique_filenames = set()
    for tr in list_of_trained_model_folders:
        files_per_folder[tr] = {}
        for f in folds:
            if not isdir(join(tr, f'fold_{f}', 'validation')):
                raise RuntimeError(f'Expected model output directory does not exist. You must train all requested '
                                   f'folds of the specified model.\nModel: {tr}\nFold: {f}')
            files_here = subfiles(join(tr, f'fold_{f}', 'validation'), suffix='.npz', join=False)
            if len(files_here) == 0:
                raise RuntimeError(f"No .npz files found in folder {join(tr, f'fold_{f}', 'validation')}. Rerun your "
                                   f"validation with the --npz flag. Use nnUNetv2_train [...] --val --npz.")
            files_per_folder[tr][f] = files_here
            unique_filenames.update(files_per_folder[tr][f])

    # verify that all trained_model_folders have all predictions
    ok = True
    for tr, fi in files_per_folder.items():
        all_files_here = set()
        for f in folds:
            all_files_here.update(fi[f])
        diff = unique_filenames.difference(all_files_here)
        if len(diff) > 0:
            ok = False
            print(f'model {tr} does not seem to contain all predictions. Missing: {diff}')
    if not ok:
        raise RuntimeError('There were missing files, see print statements above this one')

    # now we need to collect where these files are
    file_mapping = []
    for tr in list_of_trained_model_folders:
        file_mapping.append({})
        for f in folds:
            for fi in files_per_folder[tr][f]:
                # check for duplicates
                assert fi not in file_mapping[-1].keys(), f"Duplicate detected. Case {fi} is present in more than " \
                                                          f"one fold of model {tr}."
                file_mapping[-1][fi] = join(tr, f'fold_{f}', 'validation', fi)

    lists_of_lists_of_files = [[fm[i] for fm in file_mapping] for i in unique_filenames]
    output_files_truncated = [join(output_folder, fi[:-4]) for fi in unique_filenames]

    image_reader_writer = plans_manager.image_reader_writer_class()
    maybe_mkdir_p(output_folder)
    label_manager = plans_manager.get_label_manager(dataset_json)

    if not overwrite:
        tmp = [isfile(i + dataset_json['file_ending']) for i in output_files_truncated]
        lists_of_lists_of_files = [lists_of_lists_of_files[i] for i in range(len(tmp)) if not tmp[i]]
        output_files_truncated = [output_files_truncated[i] for i in range(len(tmp)) if not tmp[i]]

    with multiprocessing.get_context("spawn").Pool(num_processes) as pool:
        num_preds = len(lists_of_lists_of_files)
        _ = pool.starmap(
            merge_files,
            zip(
                lists_of_lists_of_files,
                output_files_truncated,
                [dataset_json['file_ending']] * num_preds,
                [image_reader_writer] * num_preds,
                [label_manager] * num_preds,
                [False] * num_preds
            )
        )

    shutil.copy(join(list_of_trained_model_folders[0], 'plans.json'), join(output_folder, 'plans.json'))
    shutil.copy(join(list_of_trained_model_folders[0], 'dataset.json'), join(output_folder, 'dataset.json'))
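
# Illustrative sketch of ensembling the cross-validation predictions of two trained
# configurations. The folder names are hypothetical but follow nnU-Net's results layout, where
# each fold_X/validation directory must contain the .npz files produced with the --npz flag:
#
#   ensemble_crossvalidations(
#       ['/results/Dataset001/nnUNetTrainer__nnUNetPlans__3d_fullres',
#        '/results/Dataset001/nnUNetTrainer__nnUNetPlans__2d'],
#       '/results/Dataset001/ensemble_3d_fullres_2d',
#       folds=(0, 1, 2, 3, 4),
#       num_processes=8
#   )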