import argparse
import multiprocessing
import shutil
from copy import deepcopy
from multiprocessing import Pool
from typing import List, Union, Tuple

import numpy as np
from batchgenerators.utilities.file_and_folder_operations import load_json, join, subfiles, \
    maybe_mkdir_p, isdir, save_pickle, load_pickle, isfile

from nnunetv2.configuration import default_num_processes
from nnunetv2.imageio.base_reader_writer import BaseReaderWriter
from nnunetv2.utilities.label_handling.label_handling import LabelManager
from nnunetv2.utilities.plans_handling.plans_handler import PlansManager


def average_probabilities(list_of_files: List[str]) -> np.ndarray:
    assert len(list_of_files), 'At least one file must be given in list_of_files'
    avg = None
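    # running sum of the predicted probabilities; divided by the number of files at the end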
    for f in list_of_files:
        if avg is None:
            avg = np.load(f)['probabilities']
            # make sure we accumulate in float32 (the stored probabilities may be float16)
            if avg.dtype != np.float32:
                avg = avg.astype(np.float32)
        else:
            avg += np.load(f)['probabilities']
    avg /= len(list_of_files)
    return avg


def merge_files(list_of_files,
                output_filename_truncated: str,
                output_file_ending: str,
                image_reader_writer: BaseReaderWriter,
                label_manager: LabelManager,
                save_probabilities: bool = False):
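    # the .pkl saved next to each .npz prediction holds the image properties needed for exporting the segmentation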
    properties = load_pickle(list_of_files[0][:-4] + '.pkl')

    # average the predictions of all models/folds and convert them to a segmentation map
    probabilities = average_probabilities(list_of_files)
    segmentation = label_manager.convert_probabilities_to_segmentation(probabilities)
    image_reader_writer.write_seg(segmentation, output_filename_truncated + output_file_ending, properties)
    if save_probabilities:
        np.savez_compressed(output_filename_truncated + '.npz', probabilities=probabilities)
        # store the properties (not the probabilities) in the .pkl so that the ensembled output looks like a regular
        # nnU-Net prediction (probabilities in .npz, properties in .pkl)
        save_pickle(properties, output_filename_truncated + '.pkl')


def ensemble_folders(list_of_input_folders: List[str],
                     output_folder: str,
                     save_merged_probabilities: bool = False,
                     num_processes: int = default_num_processes,
                     dataset_json_file_or_dict: str = None,
                     plans_json_file_or_dict: str = None):
"""we need too much shit for this function. Problem is that we now have to support region-based training plus |
|
|
multiple input/output formats so there isn't really a way around this. |
|
|
|
|
|
If plans and dataset json are not specified, we assume each of the folders has a corresponding plans.json |
|
|
and/or dataset.json in it. These are usually copied into those folders by nnU-Net during prediction. |
|
|
We just pick the dataset.json and plans.json from the first of the folders and we DONT check whether the 5 |
|
|
folders contain the same plans etc! This can be a feature if results from different datasets are to be merged (only |
|
|
works if label dict in dataset.json is the same between these datasets!!!)""" |
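    # Typical call (hypothetical paths): ensemble the predictions of two models into one output folder:
    #   ensemble_folders(['/predictions/model_a', '/predictions/model_b'], '/predictions/ensemble')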
    if dataset_json_file_or_dict is not None:
        if isinstance(dataset_json_file_or_dict, str):
            dataset_json = load_json(dataset_json_file_or_dict)
        else:
            dataset_json = dataset_json_file_or_dict
    else:
        dataset_json = load_json(join(list_of_input_folders[0], 'dataset.json'))

    if plans_json_file_or_dict is not None:
        if isinstance(plans_json_file_or_dict, str):
            plans = load_json(plans_json_file_or_dict)
        else:
            plans = plans_json_file_or_dict
    else:
        plans = load_json(join(list_of_input_folders[0], 'plans.json'))

    plans_manager = PlansManager(plans)

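    # collect the .npz prediction files in each input folder and check that all folders contain the same cases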
    files_per_folder = [set(subfiles(i, suffix='.npz', join=False)) for i in list_of_input_folders]

    s = deepcopy(files_per_folder[0])
    for f in files_per_folder[1:]:
        s.update(f)
    for f in files_per_folder:
        assert len(s.difference(f)) == 0, "Not all folders contain the same files for ensembling. Please only " \
                                          "provide folders that contain the predictions"

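    # for each case, build the list of files to be merged (one per input folder) and the truncated output file name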
    lists_of_lists_of_files = [[join(fl, fi) for fl in list_of_input_folders] for fi in s]
    output_files_truncated = [join(output_folder, fi[:-4]) for fi in s]

    image_reader_writer = plans_manager.image_reader_writer_class()
    label_manager = plans_manager.get_label_manager(dataset_json)

    maybe_mkdir_p(output_folder)
    shutil.copy(join(list_of_input_folders[0], 'dataset.json'), output_folder)

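    # merge the predictions for all cases in parallel using a spawn-based process pool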
    with multiprocessing.get_context("spawn").Pool(num_processes) as pool:
        num_preds = len(s)
        _ = pool.starmap(
            merge_files,
            zip(
                lists_of_lists_of_files,
                output_files_truncated,
                [dataset_json['file_ending']] * num_preds,
                [image_reader_writer] * num_preds,
                [label_manager] * num_preds,
                [save_merged_probabilities] * num_preds
            )
        )


def entry_point_ensemble_folders():
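    # command line interface for ensemble_folders; typically exposed as the nnUNetv2_ensemble command, e.g.:
    #   nnUNetv2_ensemble -i FOLDER_MODEL_A FOLDER_MODEL_B -o OUTPUT_FOLDER -np 8 --save_npz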
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', nargs='+', type=str, required=True,
                        help='list of input folders')
    parser.add_argument('-o', type=str, required=True, help='output folder')
    parser.add_argument('-np', type=int, required=False, default=default_num_processes,
                        help=f"Number of processes used for ensembling. Default: {default_num_processes}")
    parser.add_argument('--save_npz', action='store_true', required=False, help='Set this flag to store output '
                                                                                 'probabilities in separate .npz files')

    args = parser.parse_args()
    ensemble_folders(args.i, args.o, args.save_npz, args.np)


def ensemble_crossvalidations(list_of_trained_model_folders: List[str],
                              output_folder: str,
                              folds: Union[Tuple[int, ...], List[int]] = (0, 1, 2, 3, 4),
                              num_processes: int = default_num_processes,
                              overwrite: bool = True) -> None:
    """
    Ensembles the validation set predictions (.npz files) of the requested folds across several trained models.
    Feature: different configurations can now have different splits.
    """
    dataset_json = load_json(join(list_of_trained_model_folders[0], 'dataset.json'))
    plans_manager = PlansManager(join(list_of_trained_model_folders[0], 'plans.json'))

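    # first collect the validation set predictions (.npz) of every requested fold for each trained model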
    files_per_folder = {}
    unique_filenames = set()
    for tr in list_of_trained_model_folders:
        files_per_folder[tr] = {}
        for f in folds:
            if not isdir(join(tr, f'fold_{f}', 'validation')):
                raise RuntimeError(f'Expected model output directory does not exist. You must train all requested '
                                   f'folds of the specified model.\nModel: {tr}\nFold: {f}')
            files_here = subfiles(join(tr, f'fold_{f}', 'validation'), suffix='.npz', join=False)
            if len(files_here) == 0:
                raise RuntimeError(f"No .npz files found in folder {join(tr, f'fold_{f}', 'validation')}. Rerun your "
                                   f"validation with the --npz flag. Use nnUNetv2_train [...] --val --npz.")
            files_per_folder[tr][f] = files_here
            unique_filenames.update(files_per_folder[tr][f])

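    # verify that, across its folds, every model produced a prediction for every case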
    ok = True
    for tr, fi in files_per_folder.items():
        all_files_here = set()
        for f in folds:
            all_files_here.update(fi[f])
        diff = unique_filenames.difference(all_files_here)
        if len(diff) > 0:
            ok = False
            print(f'model {tr} does not seem to contain all predictions. Missing: {diff}')
    if not ok:
        raise RuntimeError('There were missing files, see print statements above this one')

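    # for each model, map every case to the validation prediction of the (single) fold that produced it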
    file_mapping = []
    for tr in list_of_trained_model_folders:
        file_mapping.append({})
        for f in folds:
            for fi in files_per_folder[tr][f]:
                assert fi not in file_mapping[-1].keys(), f"Duplicate detected. Case {fi} is present in more than " \
                                                          f"one fold of model {tr}."
                file_mapping[-1][fi] = join(tr, f'fold_{f}', 'validation', fi)

    lists_of_lists_of_files = [[fm[i] for fm in file_mapping] for i in unique_filenames]
    output_files_truncated = [join(output_folder, fi[:-4]) for fi in unique_filenames]

    image_reader_writer = plans_manager.image_reader_writer_class()
    maybe_mkdir_p(output_folder)
    label_manager = plans_manager.get_label_manager(dataset_json)

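    # if overwrite is disabled, skip all cases whose ensembled segmentation already exists in the output folder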
    if not overwrite:
        tmp = [isfile(i + dataset_json['file_ending']) for i in output_files_truncated]
        lists_of_lists_of_files = [lists_of_lists_of_files[i] for i in range(len(tmp)) if not tmp[i]]
        output_files_truncated = [output_files_truncated[i] for i in range(len(tmp)) if not tmp[i]]

    with multiprocessing.get_context("spawn").Pool(num_processes) as pool:
        num_preds = len(lists_of_lists_of_files)
        _ = pool.starmap(
            merge_files,
            zip(
                lists_of_lists_of_files,
                output_files_truncated,
                [dataset_json['file_ending']] * num_preds,
                [image_reader_writer] * num_preds,
                [label_manager] * num_preds,
                [False] * num_preds
            )
        )

    shutil.copy(join(list_of_trained_model_folders[0], 'plans.json'), join(output_folder, 'plans.json'))
    shutil.copy(join(list_of_trained_model_folders[0], 'dataset.json'), join(output_folder, 'dataset.json'))