| import numpy as np | |
| import tensorflow as tf | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import os | |
| import openslide | |
| from PIL import Image | |
| from openslide import OpenSlideError | |
| from openslide.deepzoom import DeepZoomGenerator | |
| import math | |
| import random | |
| from pyspark.ml.linalg import Vectors | |
| import pyspark.sql.functions as F | |
| from scipy.ndimage.morphology import binary_fill_holes | |
| from skimage.color import rgb2gray | |
| from skimage.feature import canny | |
| from skimage.morphology import binary_closing, binary_dilation, disk | |
| from concurrent.futures import ProcessPoolExecutor | |
| import tqdm | |
class SlideProcessor:
    """Filter and extract tissue-bearing tiles from whole-slide images (.svs).

    Pipeline per slide: open with OpenSlide, tile with DeepZoomGenerator,
    keep only tiles whose tissue content passes two independent checks
    (Canny-edge density and optical-density threshold), then save the
    survivors as JPEGs. Filtering results are cached as an ``_info.npy``
    file so re-runs skip the expensive filtering pass.
    """

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
        # tile_size: edge length (px) of the square DeepZoom tiles.
        # overlap: pixel overlap between adjacent tiles.
        # tissue_threshold: min fraction of tissue pixels for a tile to be kept.
        # max_workers: process count for parallel_process().
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers

    def optical_density(self, tile):
        """Convert an RGB tile to optical-density space.

        OD = -log((I + 1) / 240). The +1 avoids log(0) on black pixels;
        240 approximates the white (background) transmission intensity.

        Parameters
        ----------
        tile : np.ndarray
            RGB pixel array; converted to float64 before the transform.

        Returns
        -------
        np.ndarray of the same shape, in optical-density units.
        """
        tile = tile.astype(np.float64)
        return -np.log((tile + 1) / 240)

    def keep_tile(self, tile, tissue_threshold=None):
        """Decide whether a tile contains enough tissue to keep.

        Two independent checks must BOTH pass against ``tissue_threshold``:
        1. fraction of pixels inside Canny-edge-derived tissue regions;
        2. fraction of pixels whose minimum-channel optical density
           exceeds beta = 0.15 (i.e. stained, light-absorbing tissue).

        Tiles that are not exactly (tile_size, tile_size) — partial tiles
        at the slide border — are always rejected.

        Returns
        -------
        bool
        """
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold
        # Guard clause: reject partial border tiles outright.
        if tile.shape[0:2] != (self.tile_size, self.tile_size):
            return False

        # Check 1: edge density on the inverted grayscale image.
        gray = 1 - rgb2gray(tile)
        mask = canny(gray)
        mask = binary_closing(mask, disk(10))
        mask = binary_dilation(mask, disk(10))
        mask = binary_fill_holes(mask)
        check1 = mask.mean() >= tissue_threshold

        # Check 2: optical-density threshold (stain absorbance).
        beta = 0.15  # minimum OD for a pixel to count as tissue
        od = self.optical_density(tile)
        od_mask = np.min(od, axis=2) >= beta
        od_mask = binary_closing(od_mask, disk(2))
        od_mask = binary_dilation(od_mask, disk(2))
        od_mask = binary_fill_holes(od_mask)
        check2 = od_mask.mean() >= tissue_threshold

        return check1 and check2

    def filter_tiles(self, tile_indices, generator):
        """Return the (col, row) coordinates of every tile passing keep_tile.

        Parameters
        ----------
        tile_indices : list of (tile_size, overlap, zoom_level, col, row)
        generator : DeepZoomGenerator
        """
        filtered_tiles = []
        for tile_size, overlap, zoom_level, col, row in tile_indices:
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                filtered_tiles.append((col, row))
        return filtered_tiles

    def get_tiles(self, samples, tile_indices, generator):
        """Fetch the tiles at the given tile_indices positions.

        Parameters
        ----------
        samples : iterable of int
            Indices into ``tile_indices``.

        Returns
        -------
        list of (index, np.ndarray) pairs.
        """
        tiles = []
        for i in samples:
            _, _, zoom_level, col, row = tile_indices[i]
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            tiles.append((i, tile))
        return tiles

    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        """Save (index, tile-array) pairs as ``<slide_num>_<i>.jpeg`` under loc."""
        os.makedirs(loc, exist_ok=True)  # robustness: create target dir if absent
        for i, tile in sample_tiles:
            im = Image.fromarray(tile)
            im.save(os.path.join(loc, f"{slide_num}_{i}.jpeg"))

    def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc=None):
        """Fetch each (col, row) in ``samples`` and save it as a JPEG under loc.

        Files are named ``<slide_num>_<x>_<y>.jpeg``. When ``loc`` is None a
        default path derived from ``file`` is used.

        Fix: the original indexed ``tile_indices[i]`` with ``i`` enumerating
        ``samples`` — positions do not correspond; it only worked because
        every entry shares the same zoom level. The zoom level is now read
        once from the first entry.
        """
        if loc is None:
            loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
        os.makedirs(loc, exist_ok=True)
        if len(tile_indices) == 0:
            return  # nothing to save and no zoom level to read
        zoom_level = tile_indices[0][2]  # identical across all entries
        for x, y in samples:
            tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
            im = Image.fromarray(tile)
            im.save(os.path.join(loc, f"{slide_num}_{x}_{y}.jpeg"))

    def process_one_slide(self, file, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Filter and save all tissue tiles for one slide; returns ``file``.

        Steps: open ``<base_dir>/<file>.svs``, pick a zoom level targeting
        ~20x magnification (falling back to the highest level when the
        objective-power property is missing/unreadable), filter tiles
        (cached in ``<output_dir>/<file>_info.npy``), and save JPEGs into
        ``<output_dir>/<file>/`` unless roughly that many already exist.
        """
        f2p = os.path.join(base_dir, f'{file}.svs')
        os.makedirs(output_dir, exist_ok=True)
        img1 = openslide.open_slide(f2p)
        generator = DeepZoomGenerator(img1, tile_size=self.tile_size,
                                      overlap=self.overlap, limit_bounds=True)
        highest_zoom_level = generator.level_count - 1
        try:
            mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
            # Step down from the native magnification toward 20x:
            # each DeepZoom level halves the resolution.
            offset = math.floor((mag / 20) / 2)
            level = highest_zoom_level - offset
        except (ValueError, KeyError):
            # Objective power missing or non-numeric: use the native level.
            level = highest_zoom_level
        zoom_level = level
        cols, rows = generator.level_tiles[zoom_level]
        tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row)
                        for col in range(cols) for row in range(rows)]

        filter_sname = os.path.join(output_dir, f'{file}_info.npy')
        if os.path.exists(filter_sname):
            try:
                filtered_tiles = np.load(filter_sname)
                print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
            except (OSError, ValueError):  # corrupt/unreadable cache: re-filter
                print(f"Error reading {filter_sname}, re-filtering tiles.")
                filtered_tiles = self.filter_tiles(tile_indices, generator)
                np.save(filter_sname, filtered_tiles)
        else:
            print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")
            filtered_tiles = self.filter_tiles(tile_indices, generator)
            np.save(filter_sname, filtered_tiles)

        directory = os.path.join(output_dir, file)
        os.makedirs(directory, exist_ok=True)
        existing_files_count = len([f for f in os.listdir(directory) if f.endswith('.jpeg')])
        filtered_tiles_count = len(filtered_tiles)
        threshold = 5  # tolerate a few missing/extra files before re-saving
        if abs(existing_files_count - filtered_tiles_count) <= threshold:
            print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
        else:
            print('Now going to save tiles')
            # Fix: the original passed ``directory`` positionally into the
            # ``file`` parameter, leaving loc=None and producing a broken
            # concatenated default path. Pass the save directory as ``loc``.
            self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, loc=directory)
        return file

    def parallel_process(self, files, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Process many slides in parallel with a process pool; returns their names.

        Uses max_workers processes; progress is reported via tqdm.
        """
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            results = list(tqdm.tqdm(
                executor.map(self.process_one_slide, files,
                             [base_dir] * len(files), [output_dir] * len(files)),
                total=len(files)))
        return results