import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import openslide
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
import math
import random
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as F
from scipy.ndimage.morphology import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from concurrent.futures import ProcessPoolExecutor
import tqdm
class SlideProcessor:
    """Filter, extract, and save tissue tiles from whole-slide images (WSI).

    Tiles are produced with OpenSlide's ``DeepZoomGenerator`` and kept only
    when two independent tissue masks (Canny-edge based and optical-density
    based) each cover at least ``tissue_threshold`` of the tile area.
    """

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
        """
        Args:
            tile_size: Side length in pixels of the square tiles to extract.
            overlap: Pixel overlap between adjacent tiles.
            tissue_threshold: Minimum fraction of a tile that must be tissue
                for the tile to be kept.
            max_workers: Process count used by :meth:`parallel_process`.
        """
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers

    def optical_density(self, tile):
        """Convert an RGB tile to optical density: ``OD = -log((I + 1) / 240)``.

        The ``+1`` avoids ``log(0)``; 240 approximates the white
        (background) intensity level.
        """
        tile = tile.astype(np.float64)
        return -np.log((tile + 1) / 240)

    def keep_tile(self, tile, tissue_threshold=None):
        """Return True when ``tile`` is mostly tissue.

        Two masks must BOTH exceed ``tissue_threshold``:
          1. Canny edges of the inverted grayscale image, then
             closed / dilated / hole-filled.
          2. Optical density above ``beta`` in every channel, cleaned up
             the same way.
        Tiles whose spatial shape is not exactly
        ``(tile_size, tile_size)`` (i.e. partial edge tiles) are rejected.
        """
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold
        # Only full-size tiles are considered; partial edge tiles are dropped.
        if tile.shape[0:2] != (self.tile_size, self.tile_size):
            return False

        tile_orig = tile
        # Check 1: edge-based tissue mask.
        mask = canny(1 - rgb2gray(tile))
        mask = binary_closing(mask, disk(10))
        mask = binary_dilation(mask, disk(10))
        mask = binary_fill_holes(mask)
        check1 = mask.mean() >= tissue_threshold

        # Check 2: optical-density mask — tissue absorbs light in all channels.
        beta = 0.15
        od = self.optical_density(tile_orig)
        mask = np.min(od, axis=2) >= beta
        mask = binary_closing(mask, disk(2))
        mask = binary_dilation(mask, disk(2))
        mask = binary_fill_holes(mask)
        check2 = mask.mean() >= tissue_threshold

        return check1 and check2

    def filter_tiles(self, tile_indices, generator):
        """Return ``(col, row)`` for every tile index that passes :meth:`keep_tile`."""
        kept = []
        for _, _, zoom_level, col, row in tile_indices:
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                kept.append((col, row))
        return kept

    def get_tiles(self, samples, tile_indices, generator):
        """Fetch the tiles at the positions of ``tile_indices`` listed in ``samples``.

        Returns:
            List of ``(index, tile_array)`` pairs.
        """
        tiles = []
        for i in samples:
            _, _, zoom_level, col, row = tile_indices[i]
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            tiles.append((i, tile))
        return tiles

    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        """Write ``(index, tile)`` pairs to ``loc`` as ``<slide_num>_<index>.jpeg``."""
        for i, tile in sample_tiles:
            im = Image.fromarray(tile)
            file_path = os.path.join(loc, f"{slide_num}_{i}.jpeg")
            im.save(file_path)

    def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc=None):
        """Fetch tiles at ``(col, row)`` coordinates ``samples`` and save them as JPEGs.

        Args:
            samples: Iterable of ``(col, row)`` tile coordinates.
            tile_indices: Full index list; used only to recover the zoom level.
            slide_num: Slide identifier embedded in the output file names.
            generator: ``DeepZoomGenerator`` for the slide.
            file: Subdirectory name used to build the default output location.
            loc: Output directory; defaults to ``.../tiles_1024/<file>``.
        """
        if loc is None:
            loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
        if len(samples) == 0:
            return
        # All entries of tile_indices share one zoom level. The original
        # indexed tile_indices by the enumeration position of samples, which
        # only worked by accident — read the level once instead.
        zoom_level = tile_indices[0][2]
        for x, y in samples:
            tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
            im = Image.fromarray(tile)
            file_path = os.path.join(loc, f"{slide_num}_{x}_{y}.jpeg")
            im.save(file_path)

    def process_one_slide(self, file, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Filter and save all tissue tiles for one ``.svs`` slide.

        Caches the filtered tile list as ``<output_dir>/<file>_info.npy`` and
        skips re-saving when the tile directory already holds approximately
        the expected number of JPEGs.

        Returns:
            ``file``, so the parallel driver can report completion.
        """
        f2p = os.path.join(base_dir, f'{file}.svs')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        img1 = openslide.open_slide(f2p)
        generator = DeepZoomGenerator(img1, tile_size=self.tile_size,
                                      overlap=self.overlap, limit_bounds=True)
        highest_zoom_level = generator.level_count - 1
        try:
            # Pick the DeepZoom level closest to 20x magnification.
            mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
            offset = math.floor((mag / 20) / 2)
            level = highest_zoom_level - offset
        except (ValueError, KeyError):
            # Magnification metadata missing or malformed; use full resolution.
            level = highest_zoom_level
        zoom_level = level
        cols, rows = generator.level_tiles[zoom_level]
        tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row)
                        for col in range(cols) for row in range(rows)]

        filter_sname = os.path.join(output_dir, f'{file}_info.npy')
        filtered_tiles = None
        if os.path.exists(filter_sname):
            try:
                filtered_tiles = np.load(filter_sname)
                print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
            # Narrowed from a bare except: np.load raises these for
            # corrupt/truncated cache files; rebuild the cache below.
            except (OSError, ValueError, EOFError):
                print(f"Error reading {filter_sname}, re-filtering tiles.")
        else:
            print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")
        if filtered_tiles is None:
            filtered_tiles = self.filter_tiles(tile_indices, generator)
            np.save(filter_sname, filtered_tiles)

        directory = os.path.join(output_dir, file)
        if not os.path.exists(directory):
            os.makedirs(directory)
        existing_files_count = len([f for f in os.listdir(directory) if f.endswith('.jpeg')])
        filtered_tiles_count = len(filtered_tiles)
        threshold = 5
        if abs(existing_files_count - filtered_tiles_count) <= threshold:
            print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
        else:
            print('Now going to save tiles')
            # BUG FIX: the original passed ``directory`` positionally into the
            # ``file`` parameter, so the default ``loc`` became a doubled,
            # non-existent path and every save failed. Pass ``loc`` explicitly.
            self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, loc=directory)
        return file

    def parallel_process(self, files, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Run :meth:`process_one_slide` over ``files`` in a process pool with a progress bar."""
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(tqdm.tqdm(
                executor.map(self.process_one_slide, files,
                             [base_dir] * len(files), [output_dir] * len(files)),
                total=len(files)))
        return results