import os import slideio import numpy as np from PIL import Image from tqdm import tqdm from scipy.io import loadmat from pycocotools.coco import COCO from matplotlib import pyplot as plt import shutil from xml.dom import minidom from skimage.draw import polygon from tifffile import imread def preprocess_camelyon(): data_directory = "/vol/data/histo_datasets/CAMELYON/CAMELYON17/" mask_list = os.listdir(data_directory + "masks/") for m in tqdm(mask_list): slide = slideio.open_slide(data_directory + "masks/" + m) image_slide = slideio.open_slide(data_directory + "images/" + m[:-9] + ".tif") scene = slide.get_scene(0) image_scene = image_slide.get_scene(0) dim0 = int(np.ceil(scene.size[0] / 1024)) dim1 = int(np.ceil(scene.size[1] / 1024)) resolutions = np.ceil(np.log2(max(dim0,dim1))) for r in range(int(resolutions) + 1): res = 2**r * 1024 dim0 = int(np.ceil(scene.size[0] / res)) dim1 = int(np.ceil(scene.size[1] / res)) last_dim = (int(scene.size[0] % res), int(scene.size[1] % res)) if last_dim[0] == 0 and last_dim[1] == 0: last_dim = (res, res) elif last_dim[0] == 0: last_dim = (res, last_dim[1]) elif last_dim[1] == 0: last_dim = (last_dim[0], res) for i in range(dim0): for j in range(dim1): if i == dim0-1 and j == dim1-1: width = last_dim[0] height = last_dim[1] elif i == dim0-1: width = last_dim[0] height = res elif j == dim1-1: width = res height = last_dim[1] else: width = res height = res mask = scene.read_block((i*res,j*res, width, height), (width // (2**r), height // (2**r))) mask = np.where(mask == 2, 1, 0).astype(np.uint8) if (np.max(mask) == 1): image = image_scene.read_block((i*res,j*res, width, height), (width // (2**r), height // (2**r))) # Save image and mask # Save image Image.fromarray(image).save(data_directory + "images_patches/" + m[:-9] + "_{}_{}_{}_{}_{}_{}.png".format(i*res,j*res,width, height, width // (2**r), height // (2**r))) # Save mask Image.fromarray(mask).save(data_directory + "masks_patches/" + m[:-9] + "_{}_{}_{}_{}_{}_{}.png".format(i*res,j*res,width, height, width // (2**r), height // (2**r))) def preprocess_conic(data_directory="/vol/data/histo_datasets/CoNIC/"): images = np.load(data_directory+"images.npy") masks = np.load(data_directory+"labels.npy") for i in tqdm(range(len(images))): mask = masks[i,:,:,0] if np.unique(mask).shape[0] > 1: mask = mask.astype(np.int32) Image.fromarray(images[i]).save(data_directory + "images_png/" + str(i).zfill(4) + ".png") Image.fromarray(mask).save(data_directory + "labels_png/" + str(i).zfill(4) + ".png") def preprocess_cpm(data_directory = "/vol/data/histo_datasets/CPM_15_and_17/"): dir_list = ["cpm15/", "cpm17/test/", "cpm17/train/"] for d in dir_list: mask_list = os.listdir(data_directory + d + "Labels/") for i in range(len(mask_list)): mask = loadmat(data_directory + d + "Labels/" + mask_list[i])["inst_map"] mask = mask.astype(np.int32) os.makedirs(data_directory + d + "Labels_png/", exist_ok=True) Image.fromarray(mask).save(data_directory + d + "Labels_png/" + mask_list[i][:-4] + ".png") def preprocess_crag(data_directory): datasets = ["train2017", "val2017"] for d in datasets: os.makedirs(data_directory + "cell_CRAG/" + d + "/labels/", exist_ok=True) coco = COCO(data_directory+'cell_CRAG/annotations/instances_' + d + '.json') cat_ids = coco.getCatIds() for image_id in range(len(coco.imgs)): img = coco.imgs[image_id+1] if "aug" in img['file_name']: continue anns_ids = coco.getAnnIds(imgIds=image_id+1, catIds=cat_ids, iscrowd=None) anns = coco.loadAnns(anns_ids) mask = coco.annToMask(anns[0]) for i in range(len(anns)): mask += (i+2)*coco.annToMask(anns[i]) mask = mask.astype(np.uint8) Image.fromarray(mask).save(data_directory + "cell_CRAG/" + d + "/labels/" + img['file_name']) def preprocess_icia2018(): data_directory = "/home/ubuntu/thesis/data/ICIA2018/ICIAR2018_BACH_Challenge/WSI/" mask_list = ["A"+str(i+1).zfill(2)+".npy" for i in range(10)] for m in tqdm(mask_list): scene = np.load(data_directory + m).transpose() image_slide = slideio.open_slide(data_directory + m[:-4] + ".svs") image_scene = image_slide.get_scene(0) dim0 = int(np.ceil(scene.shape[0] / 1024)) dim1 = int(np.ceil(scene.shape[1] / 1024)) resolutions = np.ceil(np.log2(max(dim0,dim1))) for r in range(int(resolutions) + 1): res = 2**r * 1024 dim0 = int(np.ceil(scene.shape[0] / res)) dim1 = int(np.ceil(scene.shape[1] / res)) last_dim = (int(scene.shape[0] % res), int(scene.shape[1] % res)) if last_dim[0] == 0 and last_dim[1] == 0: last_dim = (res, res) elif last_dim[0] == 0: last_dim = (res, last_dim[1]) elif last_dim[1] == 0: last_dim = (last_dim[0], res) for i in range(dim0): for j in range(dim1): if i == dim0-1 and j == dim1-1: width = last_dim[0] height = last_dim[1] elif i == dim0-1: width = last_dim[0] height = res elif j == dim1-1: width = res height = last_dim[1] else: width = res height = res #mask = scene.read_block((i*res,j*res, width, height), (width // (2**r), height // (2**r))) mask = scene[i*res:i*res+width:2**r, j*res:j*res+height:2**r] #mask = np.where(mask == 2, 1, 0).astype(np.uint8) if (np.max(mask) > 0): image = image_scene.read_block((i*res,j*res, width, height), (width // (2**r), height // (2**r))) # Save image and mask # Save image Image.fromarray(image).save(data_directory + "images_patches/" + m[:-9] + "_{}_{}_{}_{}_{}_{}.png".format(i*res,j*res,width, height, width // (2**r), height // (2**r))) # Save mask Image.fromarray(mask).save(data_directory + "masks_patches/" + m[:-9] + "_{}_{}_{}_{}_{}_{}.png".format(i*res,j*res,width, height, width // (2**r), height // (2**r))) def preprocess_kumar(data_directory): dirs = ["train/", "test_same/", "test_diff/"] for d in dirs: dir_list = os.listdir(data_directory + d + "Labels/") for i in dir_list: masks = loadmat(data_directory + d + "Labels/" + i)["inst_map"] os.makedirs(data_directory + d + "Labels_png_new/", exist_ok=True) Image.fromarray(masks).save(data_directory + d + "Labels_png_new/" + i[:-4] + ".png") def preprocess_monusac(data_directory = '/home/ubuntu/thesis/data/MoNuSAC/'): image_source = data_directory + 'MoNuSAC_images_and_annotations/' mask_source = data_directory + 'MoNuSAC_masks/' image_destination = data_directory + 'images/' mask_destination = data_directory + 'masks/' image_list = os.listdir(image_source) # Training data for i in image_list: subimage_list = os.listdir(image_source + i) for j in subimage_list: if j.endswith('.tif'): target_image_file = image_destination + j shutil.copyfile(image_source + i + '/' + j, target_image_file) types = ["Epithelial", "Lymphocyte", "Macrophage", "Neutrophil"] for t in types: if os.path.exists(mask_source + i + '/' + j[:-4] + '/' + t): mask_list = os.listdir(mask_source + i + '/' + j[:-4] + '/' + t) for m in mask_list: target_mask_file = mask_destination + j[:-4] + '_' + t + '.png' slide = slideio.open_slide(mask_source + i + '/' + j[:-4] + '/' + t + '/' + m, "GDAL") scene = slide.get_scene(0) mask = scene.read_block((0,0, scene.size[0], scene.size[1])) Image.fromarray(mask.astype(np.uint8)).save(target_mask_file) # Testing data source = data_directory + 'MoNuSAC_Testing_Color_Coded_Masks/' image_list = os.listdir(source) for i in tqdm(image_list): subimage_list = os.listdir(source + i) for j in subimage_list: if j.endswith('.png'): target_image_file = image_destination + j shutil.copyfile(source + i + '/' + j, target_image_file) else: target_mask_file = mask_destination + j[:-17] + '.png' mask_array = np.array(Image.open(source + i + '/' + j)) binary_arr = np.zeros((mask_array.shape[0], mask_array.shape[1])) for x in range(mask_array.shape[0]): for y in range(mask_array.shape[1]): if mask_array[x, y, 0] == 255 or mask_array[x, y, 1] == 255 or mask_array[x, y, 2] == 255: binary_arr[x][y] = 1 Image.fromarray(binary_arr.astype(np.uint8)).save(target_mask_file) def preprocess_monuseg(data_directory = "/home/ubuntu/thesis/data/MoNuSeg/MoNuSeg 2018 Training Data/"): def he_to_binary_mask(filename): im_file = data_directory + "Tissue Images/" + filename + '.tif' xml_file = data_directory + "Annotations/" + filename + '.xml' # Parse the XML file xDoc = minidom.parse(xml_file) Regions = xDoc.getElementsByTagName('Region') xy = [] for regioni in range(Regions.length): Region = Regions.item(regioni) verticies = Region.getElementsByTagName('Vertex') xy_region = np.zeros((verticies.length, 2)) for vertexi in range(verticies.length): x = float(verticies.item(vertexi).getAttribute('X')) y = float(verticies.item(vertexi).getAttribute('Y')) xy_region[vertexi] = [x, y] xy.append(xy_region) arr = imread(im_file) # Get image information im_info = { 'Height': arr.shape[0], 'Width': arr.shape[1] } binary_mask = np.zeros((im_info['Height'], im_info['Width'])) color_mask = np.zeros((im_info['Height'], im_info['Width'], 3)) for zz, region in enumerate(xy): print(f'Processing object # {zz + 1}') smaller_x = region[:, 0] smaller_y = region[:, 1] # Create binary and color masks polygon_mask = polygon(smaller_y, smaller_x, (im_info['Height'], im_info['Width'])) binary_mask[polygon_mask] += zz + 1 color_mask[polygon_mask] += np.random.rand(3) return binary_mask, color_mask image_list = os.listdir(data_directory + "Tissue Images/") for i in image_list: binary_mask, color_mask = he_to_binary_mask(i[:-4]) values = np.unique(binary_mask) masks = np.zeros(binary_mask.shape) for k in range(len(values)): masks = np.where(binary_mask == values[k], k, masks) os.makedirs(data_directory + "Masks_new/", exist_ok=True) Image.fromarray(masks.astype(np.int32)).save(data_directory + "Masks_new/" + i[:-4] + ".png") def preprocess_nuclick(data_directory = "/vol/data/histo_datasets/NuClick/IHC_nuclick/IHC/"): splits = ["Train", "Validation"] for s in splits: masks_list = os.listdir(data_directory + "masks/" + s) for m in masks_list: mask = np.load(data_directory + "masks/" + s + "/" + m) mask = mask.astype(np.uint8) if np.unique(mask).shape[0] > 1: Image.fromarray(mask).save(data_directory + "masks_png/" + s + "/" + m[:-4] + ".png") def preprocess_pannuke(data_source="/vol/data/histo_datasets/PanNuke/"): folds = [("Fold 1/", "fold1/"), ("Fold 2/", "fold2/"), ("Fold 3/", "fold3/")] os.makedirs(data_source + "images_png/", exist_ok=True) os.makedirs(data_source + "masks_png/", exist_ok=True) counter = 0 for f in folds: images = np.load(data_source + f[0] + "images/" + f[1] + "images.npy") masks = np.load(data_source + f[0] + "masks/" + f[1] + "masks.npy") for i in tqdm(range(images.shape[0])): output = np.zeros((masks.shape[1], masks.shape[2]), dtype=np.int32) k = 0 for j in range(masks.shape[3]-1): values = np.unique(masks[i, :, :, j]) for v in values: if v != 0: output[masks[i, :, :, j] == v] = k k += 1 if np.unique(output).shape[0] > 1: Image.fromarray(images[i].astype(np.uint8)).save(data_source + "images_png/" + str(counter).zfill(4) + ".png") Image.fromarray(output).save(data_source + "masks_png/" + str(counter).zfill(4) + ".png") counter += 1 del images del masks def preprocess_segpc(data_source = "/vol/data/histo_datasets/SegPC/TCIA_SegPC_dataset/"): splits = ["train", "validation"] for s in splits: images_list = os.listdir(data_source + s + "/x/") masks_list = os.listdir(data_source + s + "/y/") os.makedirs(data_source + s + "/masks_png/", exist_ok=True) for i in images_list: short_masks_list = [m for m in masks_list if i[:-4] == m[:-6]] j = 0 for m in short_masks_list: if j == 0: mask = np.array(Image.open(data_source + s + "/y/" + m)) if len(mask.shape) == 3: mask = mask[:, :, 0] mask = np.where(mask == 20, 1, mask) mask = np.where(mask == 40, 2, mask) else: additional_mask = np.array(Image.open(data_source + s + "/y/" + m)) if len(additional_mask.shape) == 3: additional_mask = additional_mask[:, :, 0] mask = np.where(additional_mask == 20, mask + 2*j -1, mask) mask = np.where(additional_mask == 40, mask + 2*j, mask) j += 1 Image.fromarray(mask).save(data_source + s + "/masks_png/" + m[:-6] + ".png") def preprocess_wsss4luad(data_directory = '/home/ubuntu/thesis/data/WSSS4LUAD/2.validation/'): source_directory = data_directory + 'mask/' output_directory = data_directory + 'masks_relabeled/' os.makedirs(output_directory, exist_ok=True) dir_list = os.listdir(source_directory) for d in dir_list: arr = np.array(Image.open(source_directory + d)) arr[arr == 3] = 255 arr = arr + 1 Image.fromarray(arr).save(output_directory + d) preprocess_pannuke()