# Spaces: philippendres / CellPilot
# No application file
# File size: 15,891 Bytes
# Commit: 907462b

import os
import slideio
import numpy as np
from PIL import Image
from tqdm import tqdm
from scipy.io import loadmat
from pycocotools.coco import COCO
from matplotlib import pyplot as plt
import shutil
from xml.dom import minidom
from skimage.draw import polygon
from tifffile import imread

def preprocess_camelyon(data_directory="/vol/data/histo_datasets/CAMELYON/CAMELYON17/"):
    """Cut CAMELYON17 slides into multi-resolution image/mask patch pairs.

    For every file in ``<data_directory>masks/`` the slide
    ``<data_directory>images/<stem>.tif`` is opened, where ``<stem>`` is the
    mask file name without its last nine characters.  The slide is tiled at
    levels ``r = 0 .. ceil(log2(n_tiles))``; at level ``r`` a tile covers
    ``1024 * 2**r`` source pixels per side (less at the right/bottom border)
    and is read downscaled by ``2**r``.  Mask pixels equal to 2 are mapped to
    1, all others to 0; a tile is written only if its mask contains a 1.

    Patches are saved to ``images_patches/`` and ``masks_patches/`` as
    ``<stem>_<x>_<y>_<w>_<h>_<out_w>_<out_h>.png``.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    image_out = data_directory + "images_patches/"
    mask_out = data_directory + "masks_patches/"
    # The original assumed these already existed and failed on the first save.
    os.makedirs(image_out, exist_ok=True)
    os.makedirs(mask_out, exist_ok=True)

    for m in tqdm(os.listdir(data_directory + "masks/")):
        stem = m[:-9]
        slide = slideio.open_slide(data_directory + "masks/" + m)
        image_slide = slideio.open_slide(data_directory + "images/" + stem + ".tif")
        scene = slide.get_scene(0)
        image_scene = image_slide.get_scene(0)
        size_x, size_y = int(scene.size[0]), int(scene.size[1])

        # Number of levels needed until a single tile covers the whole slide.
        tiles_x = int(np.ceil(size_x / 1024))
        tiles_y = int(np.ceil(size_y / 1024))
        levels = int(np.ceil(np.log2(max(tiles_x, tiles_y))))

        for r in range(levels + 1):
            scale = 2 ** r
            res = scale * 1024
            for x0 in range(0, size_x, res):
                # Border tiles are clipped to the slide extent.
                width = min(res, size_x - x0)
                for y0 in range(0, size_y, res):
                    height = min(res, size_y - y0)
                    # NOTE(review): for border strips narrower than `scale`
                    # the requested output size is 0 -- confirm how slideio
                    # interprets a zero dimension here.
                    out_size = (width // scale, height // scale)

                    mask = scene.read_block((x0, y0, width, height), out_size)
                    mask = np.where(mask == 2, 1, 0).astype(np.uint8)
                    if not mask.any():
                        continue

                    image = image_scene.read_block((x0, y0, width, height), out_size)
                    suffix = "_{}_{}_{}_{}_{}_{}.png".format(
                        x0, y0, width, height, out_size[0], out_size[1]
                    )
                    Image.fromarray(image).save(image_out + stem + suffix)
                    Image.fromarray(mask).save(mask_out + stem + suffix)
        
def preprocess_conic(data_directory="/vol/data/histo_datasets/CoNIC/"):
    """Export CoNIC patches from ``images.npy``/``labels.npy`` to PNG files.

    For each index ``i`` channel 0 of ``labels[i]`` is taken as the mask.
    Samples whose mask holds a single distinct value are skipped.  The rest
    are written as ``images_png/<i:04d>.png`` and ``labels_png/<i:04d>.png``
    (mask cast to int32), keeping the original array index in the file name.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    images = np.load(data_directory + "images.npy")
    masks = np.load(data_directory + "labels.npy")

    image_out = data_directory + "images_png/"
    label_out = data_directory + "labels_png/"
    # Create the targets up front; previously a fresh dataset folder made the
    # first save fail with FileNotFoundError.
    os.makedirs(image_out, exist_ok=True)
    os.makedirs(label_out, exist_ok=True)

    for i in tqdm(range(len(images))):
        mask = masks[i, :, :, 0]
        if np.unique(mask).shape[0] <= 1:
            continue
        name = str(i).zfill(4) + ".png"
        Image.fromarray(images[i]).save(image_out + name)
        Image.fromarray(mask.astype(np.int32)).save(label_out + name)

def preprocess_cpm(data_directory = "/vol/data/histo_datasets/CPM_15_and_17/"):
    """Convert the CPM-15/CPM-17 ``inst_map`` label files to PNG.

    Every file in ``<subset>Labels/`` is loaded with ``loadmat``, its
    ``inst_map`` entry is cast to int32 and saved under ``<subset>Labels_png/``
    with the last four characters of the file name replaced by ``.png``.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    for subset in ("cpm15/", "cpm17/test/", "cpm17/train/"):
        label_dir = data_directory + subset + "Labels/"
        png_dir = data_directory + subset + "Labels_png/"
        for mat_name in os.listdir(label_dir):
            inst_map = loadmat(label_dir + mat_name)["inst_map"].astype(np.int32)
            # Created lazily, so a subset without label files gets no folder.
            os.makedirs(png_dir, exist_ok=True)
            Image.fromarray(inst_map).save(png_dir + mat_name[:-4] + ".png")

def preprocess_crag(data_directory):
    """Rasterise the CRAG COCO annotations into per-image instance label maps.

    For the ``train2017`` and ``val2017`` splits the annotation file
    ``cell_CRAG/annotations/instances_<split>.json`` is read and, for every
    image whose file name does not contain ``"aug"``, a label map is written
    to ``cell_CRAG/<split>/labels/<file_name>``.

    Pixels of the k-th annotation (1-based) get the value ``k``; background
    is 0.  Where annotations overlap, the later one wins.  The previous
    implementation added the first annotation twice, which gave the first two
    instances the same value, and summed overlapping annotations.

    Images without annotations are skipped (no label file is written).

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    for d in ("train2017", "val2017"):
        label_dir = data_directory + "cell_CRAG/" + d + "/labels/"
        os.makedirs(label_dir, exist_ok=True)
        coco = COCO(data_directory + 'cell_CRAG/annotations/instances_' + d + '.json')
        cat_ids = coco.getCatIds()

        # Iterate over the ids actually present instead of assuming 1..N.
        for image_id, img in coco.imgs.items():
            if "aug" in img['file_name']:
                continue
            anns_ids = coco.getAnnIds(imgIds=image_id, catIds=cat_ids, iscrowd=None)
            anns = coco.loadAnns(anns_ids)
            if not anns:
                continue

            label_map = None
            for label, ann in enumerate(anns, start=1):
                binary = coco.annToMask(ann)
                if label_map is None:
                    label_map = np.zeros(binary.shape, dtype=np.int32)
                label_map[binary > 0] = label

            # uint8 holds at most 255 instance ids; widen only when required.
            out_dtype = np.uint8 if len(anns) <= 255 else np.uint16
            Image.fromarray(label_map.astype(out_dtype)).save(label_dir + img['file_name'])

def preprocess_icia2018(data_directory="/home/ubuntu/thesis/data/ICIA2018/ICIAR2018_BACH_Challenge/WSI/"):
    """Cut the ICIAR 2018 BACH whole-slide images into multi-resolution patches.

    For ``A01`` .. ``A10`` the annotation array ``<id>.npy`` is loaded and
    transposed, and the slide ``<id>.svs`` is opened.  The slide is tiled at
    levels ``r = 0 .. ceil(log2(n_tiles))``; at level ``r`` a tile covers
    ``1024 * 2**r`` source pixels per side (less at the border).  The mask is
    subsampled by taking every ``2**r``-th element, the image is read
    downscaled by ``2**r``.  A tile is written only if its mask has a value
    greater than 0.

    Patches are saved to ``images_patches/`` and ``masks_patches/`` as
    ``<id>_<x>_<y>_<w>_<h>_<out_w>_<out_h>.png``.

    Args:
        data_directory: Folder with the ``.npy``/``.svs`` pairs, including
            the trailing slash.
    """
    image_out = data_directory + "images_patches/"
    mask_out = data_directory + "masks_patches/"
    os.makedirs(image_out, exist_ok=True)
    os.makedirs(mask_out, exist_ok=True)

    mask_list = ["A" + str(i + 1).zfill(2) + ".npy" for i in range(10)]
    for m in tqdm(mask_list):
        # Use the full stem ("A01").  Slicing nine characters off a
        # seven-character name gave an empty prefix, so patches from
        # different slides shared file names and overwrote each other.
        stem = os.path.splitext(m)[0]
        scene = np.load(data_directory + m).transpose()
        image_slide = slideio.open_slide(data_directory + stem + ".svs")
        image_scene = image_slide.get_scene(0)
        size_x, size_y = scene.shape[0], scene.shape[1]

        # Number of levels needed until a single tile covers the whole slide.
        tiles_x = int(np.ceil(size_x / 1024))
        tiles_y = int(np.ceil(size_y / 1024))
        levels = int(np.ceil(np.log2(max(tiles_x, tiles_y))))

        for r in range(levels + 1):
            scale = 2 ** r
            res = scale * 1024
            for x0 in range(0, size_x, res):
                # Border tiles are clipped to the array extent.
                width = min(res, size_x - x0)
                for y0 in range(0, size_y, res):
                    height = min(res, size_y - y0)
                    # NOTE(review): the strided slice yields ceil(w / scale)
                    # samples while the image is requested at w // scale, and
                    # the mask is indexed [x, y] whereas PIL treats axis 0 as
                    # rows -- confirm saved masks line up with the images.
                    mask = scene[x0:x0 + width:scale, y0:y0 + height:scale]
                    if not np.max(mask) > 0:
                        continue

                    out_size = (width // scale, height // scale)
                    image = image_scene.read_block((x0, y0, width, height), out_size)
                    suffix = "_{}_{}_{}_{}_{}_{}.png".format(
                        x0, y0, width, height, out_size[0], out_size[1]
                    )
                    Image.fromarray(image).save(image_out + stem + suffix)
                    Image.fromarray(mask).save(mask_out + stem + suffix)
        
def preprocess_kumar(data_directory):
    """Convert the Kumar ``inst_map`` label files to PNG.

    For the ``train/``, ``test_same/`` and ``test_diff/`` subsets every file
    in ``Labels/`` is loaded with ``loadmat`` and its ``inst_map`` entry is
    saved to ``Labels_png_new/`` with the last four characters of the file
    name replaced by ``.png``.

    Floating-point maps are cast to int32 first, because PNG cannot store
    floating-point images; integer maps are written with their dtype
    unchanged.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    for d in ("train/", "test_same/", "test_diff/"):
        label_dir = data_directory + d + "Labels/"
        png_dir = data_directory + d + "Labels_png_new/"
        label_files = os.listdir(label_dir)
        os.makedirs(png_dir, exist_ok=True)
        for name in label_files:
            inst_map = loadmat(label_dir + name)["inst_map"]
            if np.issubdtype(inst_map.dtype, np.floating):
                inst_map = inst_map.astype(np.int32)
            Image.fromarray(inst_map).save(png_dir + name[:-4] + ".png")

def preprocess_monusac(data_directory = '/home/ubuntu/thesis/data/MoNuSAC/'):
    """Collect MoNuSAC images and masks into flat ``images/`` and ``masks/`` folders.

    Training data: every ``.tif`` below ``MoNuSAC_images_and_annotations/`` is
    copied to ``images/``.  For each of the four cell types, all mask files
    found in ``MoNuSAC_masks/<case>/<image>/<type>/`` are read, cast to uint8
    and merged with an element-wise maximum into
    ``masks/<image>_<type>.png``.  (Previously each file overwrote the
    previous one, so only the last file in ``os.listdir`` order survived.)

    Testing data: below ``MoNuSAC_Testing_Color_Coded_Masks/`` files ending in
    ``.png`` are copied to ``images/``; every other file is read as a colour
    image and turned into a 0/1 mask that is 1 wherever any of the first three
    channels equals 255.  It is saved as ``masks/<name minus 17 chars>.png``.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    image_source = data_directory + 'MoNuSAC_images_and_annotations/'
    mask_source = data_directory + 'MoNuSAC_masks/'
    image_destination = data_directory + 'images/'
    mask_destination = data_directory + 'masks/'
    os.makedirs(image_destination, exist_ok=True)
    os.makedirs(mask_destination, exist_ok=True)

    types = ("Epithelial", "Lymphocyte", "Macrophage", "Neutrophil")

    # Training data
    for i in os.listdir(image_source):
        for j in os.listdir(image_source + i):
            if not j.endswith('.tif'):
                continue
            shutil.copyfile(image_source + i + '/' + j, image_destination + j)
            stem = j[:-4]
            for t in types:
                type_dir = mask_source + i + '/' + stem + '/' + t
                if not os.path.exists(type_dir):
                    continue
                combined = None
                # Sorted so the result does not depend on directory order.
                for m in sorted(os.listdir(type_dir)):
                    slide = slideio.open_slide(type_dir + '/' + m, "GDAL")
                    scene = slide.get_scene(0)
                    mask = scene.read_block((0, 0, scene.size[0], scene.size[1])).astype(np.uint8)
                    combined = mask if combined is None else np.maximum(combined, mask)
                if combined is not None:
                    Image.fromarray(combined).save(mask_destination + stem + '_' + t + '.png')

    # Testing data
    source = data_directory + 'MoNuSAC_Testing_Color_Coded_Masks/'
    for i in tqdm(os.listdir(source)):
        for j in os.listdir(source + i):
            if j.endswith('.png'):
                shutil.copyfile(source + i + '/' + j, image_destination + j)
            else:
                mask_array = np.array(Image.open(source + i + '/' + j))
                # Vectorised form of the former per-pixel loop.
                binary_arr = (mask_array[:, :, :3] == 255).any(axis=2).astype(np.uint8)
                Image.fromarray(binary_arr).save(mask_destination + j[:-17] + '.png')

def preprocess_monuseg(data_directory = "/home/ubuntu/thesis/data/MoNuSeg/MoNuSeg 2018 Training Data/"):
    """Rasterise the MoNuSeg XML annotations into instance label PNGs.

    For every file in ``Tissue Images/`` the annotation
    ``Annotations/<stem>.xml`` is parsed, each ``Region`` polygon is filled
    into a label image, and the distinct values are renumbered to
    ``0 .. K-1`` in ascending order.  The result is saved as int32 to
    ``Masks_new/<stem>.png``.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    def he_to_binary_mask(filename):
        """Return ``(label_image, color_image)`` for one annotated image.

        Region ``k`` (1-based, in document order) adds ``k`` to every pixel
        inside its polygon, so overlapping regions produce summed values.
        The colour image adds one random RGB triple per region.
        """
        im_file = data_directory + "Tissue Images/" + filename + '.tif'
        xml_file = data_directory + "Annotations/" + filename + '.xml'

        # One (n_vertices, 2) array of [x, y] coordinates per region.
        xDoc = minidom.parse(xml_file)
        xy = []
        for region_node in xDoc.getElementsByTagName('Region'):
            vertices = region_node.getElementsByTagName('Vertex')
            coords = [
                [float(v.getAttribute('X')), float(v.getAttribute('Y'))]
                for v in vertices
            ]
            xy.append(np.array(coords, dtype=float).reshape(-1, 2))

        arr = imread(im_file)
        height, width = arr.shape[0], arr.shape[1]
        binary_mask = np.zeros((height, width))
        color_mask = np.zeros((height, width, 3))

        for zz, region in enumerate(xy):
            print(f'Processing object # {zz + 1}')
            # skimage expects (rows, cols), i.e. (y, x).
            polygon_mask = polygon(region[:, 1], region[:, 0], (height, width))
            binary_mask[polygon_mask] += zz + 1
            color_mask[polygon_mask] += np.random.rand(3)

        return binary_mask, color_mask

    image_list = os.listdir(data_directory + "Tissue Images/")
    out_dir = data_directory + "Masks_new/"
    os.makedirs(out_dir, exist_ok=True)
    for i in image_list:
        binary_mask, _ = he_to_binary_mask(i[:-4])
        # The inverse index of the sorted unique values is the rank of each
        # pixel's value.  This replaces one full-image np.where per label.
        _, inverse = np.unique(binary_mask, return_inverse=True)
        masks = inverse.reshape(binary_mask.shape)
        Image.fromarray(masks.astype(np.int32)).save(out_dir + i[:-4] + ".png")

def preprocess_nuclick(data_directory = "/vol/data/histo_datasets/NuClick/IHC_nuclick/IHC/"):
    """Convert the NuClick ``.npy`` masks of both splits to PNG.

    Every file in ``masks/<split>/`` is loaded, cast to uint8 and, if it
    holds more than one distinct value, saved to ``masks_png/<split>/`` with
    the last four characters of the file name replaced by ``.png``.

    Args:
        data_directory: Dataset root, including the trailing slash.
    """
    for s in ("Train", "Validation"):
        source_dir = data_directory + "masks/" + s + "/"
        target_dir = data_directory + "masks_png/" + s + "/"
        masks_list = os.listdir(source_dir)
        # Previously missing: saving failed unless the folder already existed.
        os.makedirs(target_dir, exist_ok=True)
        for m in masks_list:
            mask = np.load(source_dir + m).astype(np.uint8)
            if np.unique(mask).shape[0] > 1:
                Image.fromarray(mask).save(target_dir + m[:-4] + ".png")

def preprocess_pannuke(data_source="/vol/data/histo_datasets/PanNuke/"):
    """Export the three PanNuke folds to numbered image/mask PNG pairs.

    For every sample, all mask channels except the last are scanned and each
    distinct non-zero value in a channel becomes one instance.  Instances are
    numbered consecutively from 1 across channels; 0 is background.  Samples
    without any instance are skipped.  Output files are
    ``images_png/<n:04d>.png`` and ``masks_png/<n:04d>.png`` (int32), with
    ``n`` counting written samples across all folds.

    Numbering previously started at 0, which wrote the first instance of
    every sample as background and dropped samples with a single instance.

    Args:
        data_source: Dataset root, including the trailing slash.
    """
    folds = [("Fold 1/", "fold1/"), ("Fold 2/", "fold2/"), ("Fold 3/", "fold3/")]
    os.makedirs(data_source + "images_png/", exist_ok=True)
    os.makedirs(data_source + "masks_png/", exist_ok=True)
    counter = 0
    for fold_dir, fold_name in folds:
        images = np.load(data_source + fold_dir + "images/" + fold_name + "images.npy")
        masks = np.load(data_source + fold_dir + "masks/" + fold_name + "masks.npy")
        for i in tqdm(range(images.shape[0])):
            output = np.zeros((masks.shape[1], masks.shape[2]), dtype=np.int32)
            next_label = 1  # 0 is reserved for background
            # The last channel is skipped -- presumably the background
            # channel; TODO confirm against the dataset description.
            for j in range(masks.shape[3] - 1):
                channel = masks[i, :, :, j]
                for v in np.unique(channel):
                    if v != 0:
                        output[channel == v] = next_label
                        next_label += 1
            if output.any():
                name = str(counter).zfill(4) + ".png"
                Image.fromarray(images[i].astype(np.uint8)).save(data_source + "images_png/" + name)
                Image.fromarray(output).save(data_source + "masks_png/" + name)
                counter += 1
        # Release the fold before loading the next one to limit peak memory.
        del images
        del masks

def preprocess_segpc(data_source = "/vol/data/histo_datasets/SegPC/TCIA_SegPC_dataset/"):
    """Merge the per-cell SegPC masks of each image into one label PNG.

    For the ``train`` and ``validation`` splits, the masks of an image are
    the files in ``y/`` whose name minus its last six characters equals the
    image name minus its last four (so suffixes longer than six characters
    are not matched).  Masks are processed in sorted name order.  In the j-th
    mask (0-based) value 20 is relabelled to ``2*j + 1`` and value 40 to
    ``2*j + 2``; for ``j > 0`` the label is added to whatever the merged mask
    already holds at that pixel.  The result is saved as
    ``masks_png/<image stem>.png``.

    Changes from the previous implementation: the second mask used labels 1
    and 2, colliding with the first; and an image without masks either raised
    or re-saved the previous image's mask.  Such images are now skipped.

    Args:
        data_source: Dataset root, including the trailing slash.
    """
    for s in ("train", "validation"):
        image_dir = data_source + s + "/x/"
        mask_dir = data_source + s + "/y/"
        out_dir = data_source + s + "/masks_png/"
        images_list = os.listdir(image_dir)
        masks_list = os.listdir(mask_dir)
        os.makedirs(out_dir, exist_ok=True)
        for i in images_list:
            stem = i[:-4]
            # Sorted so label numbers do not depend on directory order.
            short_masks_list = sorted(m for m in masks_list if m[:-6] == stem)
            if not short_masks_list:
                continue
            mask = None
            for j, m in enumerate(short_masks_list):
                cell = np.array(Image.open(mask_dir + m))
                if cell.ndim == 3:
                    cell = cell[:, :, 0]
                if j == 0:
                    # The first mask is relabelled in place; other raw values
                    # are kept as they are.
                    mask = np.where(cell == 20, 1, cell)
                    mask = np.where(mask == 40, 2, mask)
                else:
                    mask = np.where(cell == 20, mask + (2 * j + 1), mask)
                    mask = np.where(cell == 40, mask + (2 * j + 2), mask)
            Image.fromarray(mask).save(out_dir + stem + ".png")

def preprocess_wsss4luad(data_directory = '/home/ubuntu/thesis/data/WSSS4LUAD/2.validation/'):
    """Write relabelled copies of the WSSS4LUAD masks.

    Every file in ``mask/`` is loaded, pixels equal to 3 are set to 255, and
    1 is added to all pixels.  The result is saved under the same file name
    in ``masks_relabeled/``.

    NOTE(review): for 8-bit masks 255 + 1 wraps around to 0, so label 3 would
    end up as 0 and labels 0..2 as 1..3 -- presumably the intent; confirm the
    mask dtype.

    Args:
        data_directory: Split folder, including the trailing slash.
    """
    mask_root = data_directory + 'mask/'
    relabeled_root = data_directory + 'masks_relabeled/'
    os.makedirs(relabeled_root, exist_ok=True)
    for file_name in os.listdir(mask_root):
        labels = np.array(Image.open(mask_root + file_name))
        labels[labels == 3] = 255
        shifted = labels + 1
        Image.fromarray(shifted).save(relabeled_root + file_name)

if __name__ == "__main__":
    # Run the conversion only when executed as a script, so importing this
    # module for one of the other preprocess_* functions has no side effects.
    preprocess_pannuke()