File size: 6,308 Bytes
8381e8e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import openslide
from PIL import Image
from openslide import OpenSlideError
from openslide.deepzoom import DeepZoomGenerator
import math
import random
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as F
from scipy.ndimage.morphology import binary_fill_holes
from skimage.color import rgb2gray
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from concurrent.futures import ProcessPoolExecutor
import tqdm

class SlideProcessor:
    """Cut whole-slide images into Deep Zoom tiles and keep the tissue-rich ones.

    Attributes:
        tile_size: Edge length, in pixels, requested from ``DeepZoomGenerator``;
            tiles whose first two dimensions differ from it are rejected.
        overlap: Overlap passed to ``DeepZoomGenerator``.
        tissue_threshold: Minimum fraction of a tile's pixels that must be
            flagged as tissue for the tile to be kept.
        max_workers: Number of worker processes used by ``parallel_process``.
    """

    def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
        self.tile_size = tile_size
        self.overlap = overlap
        self.tissue_threshold = tissue_threshold
        self.max_workers = max_workers

    def optical_density(self, tile):
        """Return ``-log((tile + 1) / 240)`` computed element-wise in float64.

        The cast to float64 happens before the ``+ 1`` so integer inputs
        (e.g. uint8 value 255) cannot wrap around.
        """
        tile = tile.astype(np.float64)
        return -np.log((tile + 1) / 240)

    def keep_tile(self, tile, tissue_threshold=None):
        """Decide whether ``tile`` contains enough tissue to be kept.

        Two independent masks are built and both must cover at least
        ``tissue_threshold`` of the tile:

        1. Canny edges of the inverted grayscale image, closed and dilated
           with ``disk(10)`` and hole-filled.
        2. Pixels whose minimum optical density over axis 2 is at least
           0.15, closed and dilated with ``disk(2)`` and hole-filled.

        Args:
            tile: Image array; its first two dimensions must equal
                ``(self.tile_size, self.tile_size)``.
            tissue_threshold: Required mask coverage; defaults to
                ``self.tissue_threshold`` when ``None``.

        Returns:
            ``True`` only if both checks pass; ``False`` otherwise, including
            when the tile does not have the expected size.
        """
        if tissue_threshold is None:
            tissue_threshold = self.tissue_threshold

        # Tiles of a different size are rejected outright.
        if tile.shape[0:2] != (self.tile_size, self.tile_size):
            return False

        # Check 1: edge-based tissue mask.
        edge_mask = canny(1 - rgb2gray(tile))
        edge_mask = binary_closing(edge_mask, disk(10))
        edge_mask = binary_dilation(edge_mask, disk(10))
        edge_mask = binary_fill_holes(edge_mask)
        if not edge_mask.mean() >= tissue_threshold:
            # Both checks are required, so the second (equally costly) mask
            # is skipped once the first one has failed.
            return False

        # Check 2: optical-density tissue mask.
        beta = 0.15
        od_mask = np.min(self.optical_density(tile), axis=2) >= beta
        od_mask = binary_closing(od_mask, disk(2))
        od_mask = binary_dilation(od_mask, disk(2))
        od_mask = binary_fill_holes(od_mask)
        return bool(od_mask.mean() >= tissue_threshold)

    def filter_tiles(self, tile_indices, generator):
        """Return the ``(col, row)`` addresses of tiles accepted by ``keep_tile``.

        Args:
            tile_indices: Iterable of
                ``(tile_size, overlap, zoom_level, col, row)`` tuples.
            generator: Object exposing ``get_tile(zoom_level, (col, row))``.

        Returns:
            List of ``(col, row)`` tuples, in input order.
        """
        filtered_tiles = []
        for _tile_size, _overlap, zoom_level, col, row in tile_indices:
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            if self.keep_tile(tile, self.tissue_threshold):
                filtered_tiles.append((col, row))
        return filtered_tiles

    def get_tiles(self, samples, tile_indices, generator):
        """Fetch the tiles at positions ``samples`` of ``tile_indices``.

        Args:
            samples: Iterable of integer positions into ``tile_indices``.
            tile_indices: Sequence of
                ``(tile_size, overlap, zoom_level, col, row)`` tuples.
            generator: Object exposing ``get_tile(zoom_level, (col, row))``.

        Returns:
            List of ``(position, tile_array)`` pairs.
        """
        tiles = []
        for i in samples:
            _tile_size, _overlap, zoom_level, col, row = tile_indices[i]
            tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
            tiles.append((i, tile))
        return tiles

    def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
        """Write ``(index, tile_array)`` pairs to ``loc`` as JPEG files.

        Files are named ``{slide_num}_{index}.jpeg``. ``loc`` is created if
        it does not exist yet.
        """
        os.makedirs(loc, exist_ok=True)
        for i, tile in sample_tiles:
            im = Image.fromarray(tile)
            fname = f"{slide_num}_{i}"
            file_path = os.path.join(loc, f"{fname}.jpeg")
            im.save(file_path)

    def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc=None):
        """Fetch the tiles at the ``(col, row)`` addresses in ``samples`` and save them.

        Args:
            samples: Iterable of ``(col, row)`` tile addresses.
            tile_indices: Sequence of
                ``(tile_size, overlap, zoom_level, col, row)`` tuples; only
                the zoom level of the entry at the same position as the
                sample is read.
            slide_num: Prefix used in the file names
                (``{slide_num}_{col}_{row}.jpeg``).
            generator: Object exposing ``get_tile(zoom_level, (col, row))``.
            file: Slide name, used only to build the default ``loc``.
            loc: Target directory; defaults to
                ``/home/gp7/ml_pni/Dataset/tiles_1024/{file}``. Created if
                missing.
        """
        if loc is None:
            loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
        os.makedirs(loc, exist_ok=True)

        for i, cord in enumerate(samples):
            x, y = cord
            # Addresses loaded from a cached .npy file arrive as NumPy
            # scalars; plain ints keep the tile address and file name stable.
            x, y = int(x), int(y)
            zoom_level = tile_indices[i][2]
            tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
            im = Image.fromarray(tile)
            fname = f"{slide_num}_{x}_{y}"
            file_path = os.path.join(loc, f"{fname}.jpeg")
            im.save(file_path)

    def _select_zoom_level(self, slide, generator):
        """Pick the Deep Zoom level to tile at from the slide's objective power.

        The level is ``highest - floor(mag / 40)`` (0 below for a 20x slide,
        1 below for a 40x slide). If the objective-power property is missing
        or is not an integer string, the highest level is used.
        """
        highest_zoom_level = generator.level_count - 1
        try:
            mag = int(slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
        except (ValueError, KeyError):
            return highest_zoom_level
        return highest_zoom_level - math.floor((mag / 20) / 2)

    def _load_or_filter_tiles(self, cache_path, file, tile_indices, generator):
        """Load cached tile addresses from ``cache_path`` or compute and cache them."""
        if os.path.exists(cache_path):
            try:
                filtered_tiles = np.load(cache_path)
            except (OSError, ValueError, EOFError):
                # Unreadable/corrupt cache: fall through and rebuild it.
                print(f"Error reading {cache_path}, re-filtering tiles.")
            else:
                print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
                return filtered_tiles
        else:
            print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")

        filtered_tiles = self.filter_tiles(tile_indices, generator)
        np.save(cache_path, filtered_tiles)
        return filtered_tiles

    def process_one_slide(self, file, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Filter one slide's tiles and save the kept ones as JPEGs.

        The slide is read from ``{base_dir}/{file}.svs``. Kept tile addresses
        are cached in ``{output_dir}/{file}_info.npy`` and reused on later
        runs. Tiles are written to ``{output_dir}/{file}/`` unless that
        directory already holds roughly as many ``.jpeg`` files (within 5)
        as there are kept tiles.

        Args:
            file: Slide name without the ``.svs`` extension.
            base_dir: Directory containing the slide file.
            output_dir: Directory receiving the cache file and tile folder.

        Returns:
            ``file``, unchanged.
        """
        f2p = os.path.join(base_dir, f'{file}.svs')

        # exist_ok avoids a race when several workers share output_dir.
        os.makedirs(output_dir, exist_ok=True)

        slide = openslide.open_slide(f2p)
        try:
            generator = DeepZoomGenerator(
                slide, tile_size=self.tile_size, overlap=self.overlap, limit_bounds=True
            )
            zoom_level = self._select_zoom_level(slide, generator)
            cols, rows = generator.level_tiles[zoom_level]
            tile_indices = [
                (self.tile_size, self.overlap, zoom_level, col, row)
                for col in range(cols)
                for row in range(rows)
            ]

            filter_sname = os.path.join(output_dir, f'{file}_info.npy')
            filtered_tiles = self._load_or_filter_tiles(
                filter_sname, file, tile_indices, generator
            )

            directory = os.path.join(output_dir, file)
            os.makedirs(directory, exist_ok=True)

            existing_files_count = sum(
                1 for f in os.listdir(directory) if f.endswith('.jpeg')
            )
            filtered_tiles_count = len(filtered_tiles)
            threshold = 5
            close_enough = abs(existing_files_count - filtered_tiles_count) <= threshold
            # A slide with only a handful of kept tiles and nothing on disk
            # is also "within threshold"; it must still be saved.
            nothing_saved_yet = filtered_tiles_count > 0 and existing_files_count == 0
            if close_enough and not nothing_saved_yet:
                print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
            else:
                print('Now going to save tiles')
                # Pass the directory as `loc`; positionally it would land in
                # the `file` parameter and be appended to the default prefix.
                self.get_save_tiles(
                    filtered_tiles, tile_indices, file, generator, file, loc=directory
                )
        finally:
            # Release the slide handle even if filtering or saving failed.
            slide.close()

        return file

    def parallel_process(self, files, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
        """Run ``process_one_slide`` over ``files`` in a process pool.

        Args:
            files: Iterable of slide names (without extension).
            base_dir: Forwarded to ``process_one_slide``.
            output_dir: Forwarded to ``process_one_slide``.

        Returns:
            List of ``process_one_slide`` results, in input order.
        """
        # Materialise once so len() works for any iterable.
        files = list(files)
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            jobs = executor.map(
                self.process_one_slide,
                files,
                [base_dir] * len(files),
                [output_dir] * len(files),
            )
            results = list(tqdm.tqdm(jobs, total=len(files)))
        return results