# NOTE(review): the three lines below are Hugging Face web-UI artifacts
# (author avatar caption, commit message, commit hash) that were captured
# into the source file; commented out so the module remains importable.
# st24hour's picture
# Upload folder using huggingface_hub
# e101805 verified
import argparse
import numpy as np
import openslide
import os
import pandas as pd
import time
import h5py
from PIL import Image
# Remove PIL's image size limit to handle very large WSIs
Image.MAX_IMAGE_PIXELS = None
# NOTE: We add a robust MPP inference helper & CLI fallback (assumed_mpp) to handle slides without explicit mpp metadata.
from .wsi_core.WholeSlideImage import WholeSlideImage
from .wsi_core.wsi_utils import StitchCoords
from .wsi_core.batch_process_utils import initialize_df
# Parse common boolean strings to python bool
def str_to_bool(v):
    """Convert a CLI-style truthy/falsy string into a bool.

    True for '1'/'true'/'t'/'yes'/'y'/'on', False for '0'/'false'/'f'/'no'/
    'n'/'off' (case-insensitive, surrounding whitespace ignored). Actual
    bool values pass through unchanged.

    Raises:
        argparse.ArgumentTypeError: for None or any unrecognized string.
    """
    if isinstance(v, bool):
        return v
    if v is None:
        raise argparse.ArgumentTypeError('Boolean value expected.')
    token = str(v).strip().lower()
    truthy = {'1', 'true', 't', 'yes', 'y', 'on'}
    falsy = {'0', 'false', 'f', 'no', 'n', 'off'}
    if token in truthy:
        return True
    if token in falsy:
        return False
    raise argparse.ArgumentTypeError("Boolean value expected (e.g., 'true' or 'false').")
def stitching(file_path, wsi_object, downscale = 64):
    """Stitch saved patch coordinates back into a downscaled overview image.

    Delegates to StitchCoords with a black background, no alpha blending,
    and no grid overlay. Returns (heatmap image, elapsed seconds).
    """
    t0 = time.time()
    heatmap = StitchCoords(file_path, wsi_object, downscale=downscale,
                           bg_color=(0,0,0), alpha=-1, draw_grid=False)
    return heatmap, time.time() - t0
def segment(WSI_object, seg_params = None, filter_params = None, mask_file = None):
    """Run (or load) tissue segmentation on a slide and time it.

    Args:
        WSI_object: slide wrapper exposing initSegmentation / segmentTissue.
        seg_params: kwargs forwarded to segmentTissue; may be None.
        filter_params: contour filtering parameters forwarded as-is.
        mask_file: if given, load a precomputed segmentation instead of
            computing one.

    Returns:
        (WSI_object, seconds elapsed).
    """
    ### Start Seg Timer
    start_time = time.time()
    # Use segmentation file
    if mask_file is not None:
        WSI_object.initSegmentation(mask_file)
    # Segment
    else:
        # Bug fix: the default seg_params is None, and `**None` raises
        # TypeError; treat a missing dict as "no extra kwargs".
        WSI_object.segmentTissue(**(seg_params or {}), filter_params=filter_params)
    ### Stop Seg Timers
    seg_time_elapsed = time.time() - start_time
    return WSI_object, seg_time_elapsed
def patching(WSI_object, patient_id, **kwargs):
    """Extract patch coordinates from the slide's contours and time it.

    All keyword arguments are forwarded to process_contours. Returns
    (path of the written coordinate file, elapsed seconds).
    """
    t0 = time.time()
    out_path = WSI_object.process_contours(patient_id, **kwargs)
    return out_path, time.time() - t0
def _h5_is_complete(h5_path: str) -> bool:
    """Return True only for a readable HDF5 file carrying a truthy
    'complete' attribute and a non-empty 'coords' dataset.

    Partially written or corrupt files (e.g. from interrupted runs) and
    missing paths all count as incomplete so they get regenerated.
    """
    if not os.path.isfile(h5_path):
        return False
    try:
        with h5py.File(h5_path, 'r') as handle:
            finished = bool(handle.attrs.get('complete', False))
            coords_ok = 'coords' in handle and len(handle['coords']) > 0
    except Exception:
        # Unreadable/corrupt files are treated as incomplete.
        return False
    return finished and coords_ok
def _get_mpp_from_properties(wsi) -> tuple:
    """Infer microns-per-pixel (MPP) from a slide's metadata properties.

    Sources are tried in order:
      1) openslide.mpp-x (standard key)
      2) aperio.MPP (Aperio-specific)
      3) tiff.XResolution with tiff.ResolutionUnit (converted to microns)
      4) openslide.objective-power heuristic (10x=1.0, 20x=0.5, 40x=0.25)

    Returns:
        (mpp or None, source label describing where the value came from).
    """
    try:
        props = getattr(wsi, 'properties', {})
    except Exception:
        props = {}

    # 1) Standard OpenSlide key, with legacy sentinel/outlier handling.
    try:
        raw = props.get(openslide.PROPERTY_NAME_MPP_X) or props.get('openslide.mpp-x')
        if raw is not None:
            candidate = float(raw)
            if candidate == 100:
                # Some pipelines encode sentinel 100 -> treat as 0.5 um/px.
                candidate = 0.5
            # Reject non-positive and absurdly large (>= 1000 um/px) values,
            # falling through to the remaining sources instead.
            if 0 < candidate < 1000.0:
                return candidate, 'openslide.mpp-x'
    except Exception:
        pass

    # 2) Aperio-specific key.
    try:
        raw = props.get('aperio.MPP')
        if raw is not None:
            candidate = float(raw)
            if candidate > 0:
                return candidate, 'aperio.MPP'
    except Exception:
        pass

    # 3) TIFF resolution tag plus unit (2=inch, 3=centimeter per TIFF spec).
    try:
        if 'tiff.XResolution' in props:
            xres = float(props['tiff.XResolution'])
            unit = str(props.get('tiff.ResolutionUnit', '')).strip().lower()
            if unit in {'2', 'inch', 'inches'}:
                candidate = 25400.0 / xres  # microns per pixel
            else:
                # Centimeter units and the historical fallback use the same factor.
                candidate = 10000.0 / xres
            if candidate > 0:
                return candidate, 'tiff.XResolution'
    except Exception:
        pass

    # 4) Objective-power heuristic (10x -> 1.0 um/px, 20x -> 0.5, 40x -> 0.25).
    try:
        power = props.get(openslide.PROPERTY_NAME_OBJECTIVE_POWER) or props.get('openslide.objective-power')
        if power is not None:
            power = float(power)
            if power > 0:
                candidate = 10.0 / power
                if candidate > 0:
                    return candidate, 'objective-power-heuristic'
    except Exception:
        pass

    return None, 'unknown'
def _infer_or_assume_mpp(wsi, assumed_mpp: float = 0.5) -> tuple:
    """Resolve a usable MPP for *wsi*, falling back to *assumed_mpp*.

    Delegates to _get_mpp_from_properties; a missing, non-finite, or
    non-positive result is replaced by the assumed value (source 'assumed').
    Centralizing this keeps patch sizing and coord scaling consistent.
    """
    inferred, source = _get_mpp_from_properties(wsi)
    usable = inferred is not None and np.isfinite(inferred) and inferred > 0
    if not usable:
        return float(assumed_mpp), 'assumed'
    return float(inferred), source
def _extract_patient_wsi_ids(slide_id: str):
    """Split a relative slide id (path without extension) into (patient_id, wsi_id).

    Flat ids (no '/') map to (slide_id, slide_id), matching existing
    behavior. Nested ids keep the entire parent directory chain as the
    patient id (e.g. 'AML/C3N-00466-43' -> ('AML', 'C3N-00466-43')), so
    subtype folders are preserved.
    """
    if '/' not in slide_id:
        return slide_id, slide_id
    parent_dirs, _, leaf = slide_id.rpartition('/')
    return parent_dirs, leaf
def seg_and_patch(source, save_dir, patch_save_dir, mask_save_dir, stitch_save_dir,
                  patch_size = 256, step_size = 256,
                  seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
                                'keep_ids': 'none', 'exclude_ids': 'none', 'seg_downsample': 1.0},
                  filter_params = {'a_t':100, 'a_h': 16, 'max_n_holes':8},
                  vis_params = {'vis_level': -1, 'line_thickness': 500},
                  patch_params = {'use_padding': True, 'contour_fn': 'four_pt'},
                  patch_level = 0,
                  use_default_params = False,
                  seg = False, save_mask = True,
                  stitch= False,
                  patch = False, auto_skip=True, process_list = None, rank=0, world_size=1,
                  assumed_mpp: float = 0.5):
    """Discover WSI files under ``source`` and run segmentation/patching/stitching.

    Per slide: optional tissue segmentation, mask image export, MPP-normalized
    coordinate patching into HDF5, and a stitched QC heatmap. Progress is
    checkpointed to a per-rank CSV in ``save_dir`` after every slide, so an
    interrupted run loses at most the slide in flight. When ``world_size > 1``
    candidates are sharded round-robin across ranks.

    Args:
        source: root folder scanned recursively for slide files.
        save_dir: where the progress CSV is written.
        patch_save_dir / mask_save_dir / stitch_save_dir: stage output roots.
        patch_size, step_size: base geometry at the 0.5 um/px reference MPP.
        seg_params, filter_params, vis_params, patch_params: per-stage defaults;
            per-slide CSV values override them unless ``use_default_params``.
        patch_level: pyramid level at which patches are extracted.
        seg / save_mask / stitch / patch: stage toggles.
        auto_skip: skip slides whose complete .h5 output already exists.
        process_list: optional CSV with per-slide parameters/status.
        rank, world_size: shard index / total shard count for parallel jobs.
        assumed_mpp: fallback microns-per-pixel when metadata lacks MPP.

    Returns:
        (average segmentation time, average patching time) in seconds/slide
        for this rank's shard (0.0 when the shard is empty).

    NOTE(review): the dict defaults above are shared mutable defaults -- safe
    only while nothing mutates them between calls; confirm before refactoring.
    """
    # Build slide list robustly across different layouts
    # - Some datasets (e.g., GTEx) may store files like "*.svs.cloud_transfer.***" where the final extension is not the WSI type
    # - Avoid treating files as directories (prevents NotADirectoryError)
    # NOTE: Recursive discovery (os.walk) supports arbitrarily nested folder trees.
    # Allowed WSI suffixes (strict) to detect supported slide files
    # Note: Keep common OpenSlide formats only; exclude generic image types like .jpg/.jpeg
    # Using endswith to exclude sidecar files like "*.svs.cloud_transfer..."
    allowed_suffixes = ('.svs', '.tif', '.tiff', '.ndpi', '.mrxs', '.bif')
    def is_wsi_candidate(name: str) -> bool:
        """Return True if the filename ends with an allowed WSI suffix (WSI formats only).
        Strict endswith prevents including non-WSI sidecar files (e.g., *.svs.cloud_transfer.*) and excludes JPGs.
        """
        lname = name.lower()
        return lname.endswith(allowed_suffixes)
    slides = []
    for root, _dirs, files in os.walk(source):
        # Sort to keep deterministic order across nodes
        for file in sorted(files):
            if is_wsi_candidate(file):
                full_path = os.path.join(root, file)
                # Store relative path so later os.path.join(source, slide) still works
                rel_path = os.path.relpath(full_path, start=source)
                # Normalize to POSIX-style separators for downstream split logic
                slides.append(rel_path.replace('\\', '/'))
    ### for debug####
    # NOTE(review): dead string literal below kept as-is; it is a disabled
    # hard-coded debug slide list with no runtime effect.
    """
    slides = ['6a0ea716-a5f2-47f3-880b-537a5cdc2324/TCGA-86-8074-01Z-00-DX1.0c34b434-8701-4060-a4ea-08a72371ee1e.svs',
              '083fea50-313e-42dc-bd80-4b70fec04ddf/TCGA-86-8279-01Z-00-DX1.fd12b60e-d181-454b-a655-298a973a849d.svs',
              '64cc65c6-2076-40cb-8ec7-91f29ca8cd1d/TCGA-86-8076-01Z-00-DX1.e7378b2f-e20e-4d2f-a86c-3a8ead08a385.svs',
              'c9e28a52-98a1-40f0-91c8-c8ea09a87a36/TCGA-86-8075-01Z-00-DX1.171bd8bd-af24-4770-b24b-732e675efd75.svs',
              '1ac67634-7c99-445e-abb0-7ed7f801a080/TCGA-86-7953-01Z-00-DX1.847865ce-df59-4677-bac2-ee88a258fe4e.svs',
              'ce057ce4-d194-4189-a91a-981962a7e354/TCGA-86-7714-01Z-00-DX1.8ee03c3b-013e-4dd5-aa0e-9c1a771cbdc5.svs',
              'c7aad125-c092-4451-8480-993a3d982879/TCGA-86-8056-01Z-00-DX1.eee7e03a-842c-4a44-bff8-f7b906725605.svs',
              '855d4a48-b3d4-4088-a6a0-bca7d75ccefa/TCGA-86-7711-01Z-00-DX1.f64dd9d8-b9ca-4d1f-9783-d1042979132d.svs',
              'c6e36f24-46f3-4e20-8154-970e749aa0ca/TCGA-86-8278-01Z-00-DX1.6500afef-f0f0-4e7c-a6c0-60da81993641.svs',
              '5192c4d5-69df-400e-af31-c964c9512bd7/TCGA-86-8054-01Z-00-DX1.2c4c08f6-be1c-46d1-a719-5983cade0c54.svs',
              '0edfc16d-25cc-403d-8862-38ec9c90060a/TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bddd-e89f02ee5eb6.svs',
              'b5506234-c969-4b27-89f5-5dffead8683a/TCGA-95-7043-01Z-00-DX1.AE0FD8AA-9B88-45FE-B247-402BED1285EF.svs',
              '94dfeae0-aaa6-405f-bf37-1c17d207f0ca/TCGA-86-7954-01Z-00-DX1.0a063ee6-18b7-4b66-9726-04fb452156cb.svs',
              '432c8861-64dd-4587-b597-93fb748cb4cd/TCGA-50-5044-01Z-00-DX1.7E0E651F-411B-4784-B0FA-6EB612527430.svs',
              '325bc10e-b478-44d8-92f7-ddfd6e2daf2b/TCGA-86-8280-01Z-00-DX1.52627b97-1cfa-4382-819b-949b58c0f995.svs',
              'd3519b7a-b09e-4900-a0ba-dbd44909a43f/TCGA-86-7955-01Z-00-DX1.ef4f4d94-5efb-4a07-97cf-b0ed69085827.svs',
              '48db62d6-8fc8-4059-8553-59c88ee46fa2/TCGA-86-7713-01Z-00-DX1.23f9e213-b566-47bb-beb8-7d12b2f0508b.svs',
              'b53d1202-bb6f-4494-9e01-4684e09486ba/TCGA-86-8055-01Z-00-DX1.546dc42e-3742-4da5-8f9b-80732180ce76.svs',
              '035fdf1f-b813-4ea6-9395-f2ea38ccaee6/TCGA-86-6851-01Z-00-DX1.0b13e600-fd7b-44a2-9ec2-26e9938fb7bc.svs',
              '3cde1c79-65c1-4d3b-8b0b-f8e81a512865/TCGA-69-7980-01Z-00-DX1.8bbf8cc0-eca7-49e5-a022-c22e3e6ed6dc.svs',
              'f8e5a0da-e46f-4ac9-be5e-ae11327e5a26/TCGA-86-8073-01Z-00-DX1.33c016fc-5c9e-4ad6-8de2-a7f8521d205c.svs',
              '82b362c4-c5a0-40c7-889b-a2694e5cb6ce/TCGA-50-5931-01Z-00-DX1.34261ED6-7815-487C-A50C-2DAD587187B9.svs',
              '78b2104d-3173-441e-8e33-46ab57f3ef42/TCGA-86-7701-01Z-00-DX1.a8a6e71e-9fa9-42c6-a186-0ac7526e9960.svs',
              '1c27c0a1-5e0f-4143-a6c2-a22af744830b/TCGA-50-6593-01Z-00-DX1.a63e298c-cbe7-44e8-8d8e-34ebb93530ca.svs',
              ]
    """
    #slides = [slide for slide in slides if os.path.isfile(os.path.join(source, slide))]
    def remove_elements_with_string(file_list, string):
        # Drop every path containing the given substring (substring blacklist).
        return [file for file in file_list if string not in file]
    # Hard-coded blacklist of slides known to be broken/unreadable in past runs.
    file_to_remove = '18ed4f5f-5f1c-4f87-a79c-e8d2b6f167b0/TCGA-19-1389-01Z-00-DX1.bd925898-9fb1-4c7c-81b6-9b492e956ca1.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '035a89f8-a30c-4039-8424-0c38a32546ff/TCGA-06-1086-01Z-00-DX2.e1961f1f-a823-4775-acf7-04a46f05e15e.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'patient_103_node_1.tif'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '11327473-2921-4916-a111-89d0eda6be8a/TCGA-A8-A06U-01A-01-TS1.63824040-373f-4c6c-a74e-881c127567a6.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '9c393d1e-c0ef-453f-882c-cd5541fa07fd/TCGA-CV-7242-11A-01-TS1.1838afb1-9eee-4a70-9ae3-50e3ab45e242.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'AML/C3L-06352-41.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'AML/C3L-05870-42.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'CM/C3L-02622-28.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'LG-0252_B.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '3790f55cad63053e956fb73027179707.tiff' # unreadable (fails to open)
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'Angiomatous meningioma/a1982bd3-357f-11eb-aec7-001a7dda7111.ndpi'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'a1982bd3-357f-11eb-aec7-001a7dda7111.ndpi' # unreadable (fails to open)
    slides = remove_elements_with_string(slides, file_to_remove)
    #"""
    #slides = slides[:1000]
    # Determine base list filename, respecting a provided --process_list for clarity
    if process_list is None:
        base_list_file_name = 'process_list_autogen.csv'
    else:
        base_list_file_name = os.path.basename(process_list)
    if process_list is None:
        full_df = initialize_df(slides, seg_params, filter_params, vis_params, patch_params)
    else:
        full_df = pd.read_csv(process_list)
        full_df = initialize_df(full_df, seg_params, filter_params, vis_params, patch_params)
    rank = 0 if rank is None else int(rank)
    world_size = int(world_size) if world_size else 1
    if world_size < 1:
        world_size = 1
    # Candidate selection must be strict: only 'tbp' stays as a candidate.
    # Exclude everything else including processed, already_exist, and any failed statuses.
    if 'status' in full_df.columns:
        candidate_mask = full_df['status'].astype(str).eq('tbp')
        # Keep 'process' updated for downstream visibility, but don't use it to decide candidates.
        full_df['process'] = 0
        full_df.loc[candidate_mask, 'process'] = 1
    else:
        # Fall back: if no status, use process when present; otherwise consider all as candidates
        if 'process' in full_df.columns:
            candidate_mask = full_df['process'] == 1
        else:
            candidate_mask = pd.Series([True] * len(full_df), index=full_df.index)
    process_candidates = full_df[candidate_mask]
    total_candidates = len(process_candidates)
    print("total:", total_candidates)
    candidate_indices = list(process_candidates.index)
    # Round-robin sharding: candidate k is handled by rank k % world_size.
    if rank < 0 or rank >= world_size:
        assigned_indices = []
    elif world_size > 1:
        assigned_indices = [idx for order, idx in enumerate(candidate_indices) if order % world_size == rank]
    else:
        assigned_indices = candidate_indices
    assigned_count = len(assigned_indices)
    print(f"rank {rank}/{world_size} assigned slides: {assigned_count}")
    df = full_df.loc[assigned_indices].copy()
    process_stack = df[df['process'] == 1]
    total = len(process_stack)
    base_stem, base_ext = os.path.splitext(base_list_file_name)
    if not base_ext:
        base_ext = '.csv'
    if world_size > 1:
        tentative_name = f'{base_stem}_rank{rank:04d}{base_ext}'
    else:
        tentative_name = f'{base_stem}{base_ext}'
    # Avoid overwriting an existing progress CSV; if exists, append a timestamp suffix
    list_file_path = os.path.join(save_dir, tentative_name)
    if os.path.exists(list_file_path):
        ts = time.strftime('%Y%m%d_%H%M%S')
        if world_size > 1:
            list_file_name = f'{base_stem}_rank{rank:04d}_{ts}{base_ext}'
        else:
            list_file_name = f'{base_stem}_{ts}{base_ext}'
    else:
        list_file_name = tentative_name
    if total == 0:
        print(f"rank {rank} has no slides to process.")
    # Legacy CSVs used a single 'a' area column; expand it into the current schema.
    legacy_support = 'a' in df.keys()
    if legacy_support:
        print('detected legacy segmentation csv file, legacy support enabled')
        df = df.assign(**{'a_t': np.full((len(df)), int(filter_params['a_t']), dtype=np.uint32),
                          'a_h': np.full((len(df)), int(filter_params['a_h']), dtype=np.uint32),
                          'max_n_holes': np.full((len(df)), int(filter_params['max_n_holes']), dtype=np.uint32),
                          'line_thickness': np.full((len(df)), int(vis_params['line_thickness']), dtype=np.uint32),
                          'contour_fn': np.full((len(df)), patch_params['contour_fn'])})
    # Accumulated per-stage wall-clock totals; converted to averages at the end.
    seg_times = 0.
    patch_times = 0.
    stitch_times = 0.
    ###### patching start#######
    for i in range(total):
        # Checkpoint the progress CSV before each slide so a crash loses at most one.
        df.to_csv(os.path.join(save_dir, list_file_name), index=False)
        idx = process_stack.index[i]
        slide = process_stack.loc[idx, 'slide_id']
        print("\n\nprogress: {:.2f}, {}/{}".format(i/total, i, total))
        print('processing {}'.format(slide))
        # Known problematic slides: mark as failed_seg and skip processing
        # This prevents repeated crashes while keeping a record in the CSV
        skip_as_failed_seg = {
            'a7a607ab-7b7a-469d-bffb-6003cce2d6ae/TCGA-UZ-A9PQ-01Z-00-DX1.C2CB0E94-2548-4399-BCAB-E4D556D533EF.svs'
        }
        if slide in skip_as_failed_seg:
            print(f"{slide} is known to fail; marking as failed_seg and skipping.")
            df.loc[idx, 'status'] = 'failed_seg'
            df.loc[idx, 'process'] = 0
            continue
        df.loc[idx, 'process'] = 0
        slide_id, _ = os.path.splitext(slide)
        if auto_skip:
            # Two possible layouts are checked for backwards compatibility
            legacy_path = os.path.join(patch_save_dir, slide_id + '.h5')
            nested_path = os.path.join(patch_save_dir, slide_id, slide_id + '.h5')
            # Newer code writes to patch_save_dir/<patient_id>/<wsi_id>.h5
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            canonical_path = os.path.join(patch_save_dir, patient_id, wsi_id + '.h5')
            existing_paths = [p for p in [canonical_path, legacy_path, nested_path] if os.path.isfile(p)]
            if existing_paths:
                # Prefer canonical path for inspection
                candidate = canonical_path if os.path.isfile(canonical_path) else existing_paths[0]
                if _h5_is_complete(candidate):
                    print('{} already exist in destination location (complete), skipped'.format(slide_id))
                    df.loc[idx, 'status'] = 'already_exist'
                    continue
                else:
                    # Incomplete leftovers from an interrupted run are deleted and regenerated.
                    print(f"found incomplete h5 for {slide_id}: {candidate} -> removing and regenerating")
                    try:
                        os.remove(candidate)
                    except Exception as e:
                        print(f"warning: failed to remove incomplete file {candidate}: {e}")
        # Initialize WSI
        full_path = os.path.join(source, slide)
        WSI_object = WholeSlideImage(full_path)
        if use_default_params:
            current_vis_params = vis_params.copy()
            current_filter_params = filter_params.copy()
            current_seg_params = seg_params.copy()
            current_patch_params = patch_params.copy()
        else:
            # Pull per-slide parameters from the CSV row (with legacy-column translation).
            current_vis_params = {}
            current_filter_params = {}
            current_seg_params = {}
            current_patch_params = {}
            for key in vis_params.keys():
                if legacy_support and key == 'vis_level':
                    df.loc[idx, key] = -1
                current_vis_params.update({key: df.loc[idx, key]})
            for key in filter_params.keys():
                if legacy_support and key == 'a_t':
                    # Legacy CSVs stored area at a fixed 512x512 reference; rescale to seg level.
                    old_area = df.loc[idx, 'a']
                    seg_level = df.loc[idx, 'seg_level']
                    scale = WSI_object.level_downsamples[seg_level]
                    adjusted_area = int(old_area * (scale[0] * scale[1]) / (512 * 512))
                    current_filter_params.update({key: adjusted_area})
                    df.loc[idx, key] = adjusted_area
                current_filter_params.update({key: df.loc[idx, key]})
            for key in seg_params.keys():
                if legacy_support and key == 'seg_level':
                    df.loc[idx, key] = -1
                current_seg_params.update({key: df.loc[idx, key]})
            for key in patch_params.keys():
                current_patch_params.update({key: df.loc[idx, key]})
        # Resolve auto (-1) visualization/segmentation levels to the pyramid
        # level closest to a 64x downsample.
        if current_vis_params['vis_level'] < 0:
            if len(WSI_object.level_dim) == 1:
                current_vis_params['vis_level'] = 0
            else:
                wsi = WSI_object.getOpenSlide()
                best_level = wsi.get_best_level_for_downsample(64)
                current_vis_params['vis_level'] = best_level
        if current_seg_params['seg_level'] < 0:
            if len(WSI_object.level_dim) == 1:
                current_seg_params['seg_level'] = 0
            else:
                wsi = WSI_object.getOpenSlide()
                best_level = wsi.get_best_level_for_downsample(64)
                current_seg_params['seg_level'] = best_level
        # Parse comma-separated contour-id filter strings into int arrays ('none' -> []).
        keep_ids = str(current_seg_params['keep_ids'])
        if keep_ids != 'none' and len(keep_ids) > 0:
            str_ids = current_seg_params['keep_ids']
            current_seg_params['keep_ids'] = np.array(str_ids.split(',')).astype(int)
        else:
            current_seg_params['keep_ids'] = []
        exclude_ids = str(current_seg_params['exclude_ids'])
        if exclude_ids != 'none' and len(exclude_ids) > 0:
            str_ids = current_seg_params['exclude_ids']
            current_seg_params['exclude_ids'] = np.array(str_ids.split(',')).astype(int)
        else:
            current_seg_params['exclude_ids'] = []
        w, h = WSI_object.level_dim[current_seg_params['seg_level']]
        l = current_seg_params['seg_level']
        print('seg_level {}, level_dim {} x {}'.format(l, w, h))
        # Safety valve: refuse to segment absurdly large level dimensions (> 4e10 px).
        if w * h > 4e10:
            print('level_dim {} x {} is likely too large for successful segmentation, aborting'.format(w, h))
            df.loc[idx, 'status'] = 'failed_seg'
            continue
        df.loc[idx, 'vis_level'] = current_vis_params['vis_level']
        df.loc[idx, 'seg_level'] = current_seg_params['seg_level']
        seg_time_elapsed = -1
        if seg:
            try:
                WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params)
            except Exception as e:
                # NOTE(review): on any segmentation failure this retries once at a
                # hard-coded seg_level=2 -- assumes the slide has at least 3 pyramid
                # levels; confirm for single-level formats.
                print(str(e))
                current_seg_params['seg_level']=2
                WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params)
            if len(WSI_object.contours_tissue)==0:
                print('failed to extract contours')
                df.loc[idx, 'status'] = 'failed_seg'
                continue
        if save_mask:
            mask = WSI_object.visWSI(**current_vis_params)
            seg_ds = current_seg_params.get('seg_downsample', 1)
            try:
                seg_ds = float(seg_ds)
            except (TypeError, ValueError):
                seg_ds = 1
            if seg_ds > 1:
                new_w = max(1, int(mask.width / seg_ds))
                new_h = max(1, int(mask.height / seg_ds))
                # Downsample the saved mask to match seg_downsample for lightweight outputs
                mask = mask.resize((new_w, new_h), resample=Image.Resampling.BILINEAR)
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            os.makedirs(os.path.join(mask_save_dir, patient_id), exist_ok=True)
            # Try JPG first for smaller file size, fallback to PNG if size exceeds JPEG limit
            try:
                mask_path = os.path.join(mask_save_dir, patient_id, wsi_id+'.jpg')
                mask.save(mask_path)
            except (OSError, ValueError) as e:
                print(f"JPG save failed ({e}), using PNG instead for {wsi_id}")
                mask_path = os.path.join(mask_save_dir, patient_id, wsi_id+'.png')
                mask.save(mask_path)
        patch_time_elapsed = -1 # Default time
        if patch:
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            os.makedirs(os.path.join(patch_save_dir, patient_id), exist_ok=True)
            ### add for patching based on x20 256px patching
            print(WSI_object.wsi.properties)
            # NOTE(review): output version is inferred from the save_dir name
            # containing 'ver-'; brittle convention inherited from prior runs.
            if 'ver-' in save_dir: #patch gen version 0.1 or 0.2 or 0.3
                # Unified MPP inference with float crop factor
                this_mpp, mpp_src = _infer_or_assume_mpp(WSI_object.wsi, assumed_mpp=assumed_mpp)
                print(f"mpp: {this_mpp} (source={mpp_src})")
                standard_mpp = 0.5
                crop_factor = standard_mpp / this_mpp
                new_patch_size = int(patch_size * crop_factor)
                new_step_size = int(step_size * crop_factor)
                print("patch_size: ", new_patch_size )
            else: #init version
                # Unified MPP inference with rounded crop factor (to preserve original behavior)
                this_mpp, mpp_src = _infer_or_assume_mpp(WSI_object.wsi, assumed_mpp=assumed_mpp)
                print(f"mpp: {this_mpp} (source={mpp_src})")
                standard_mpp = 0.5
                crop_factor = round(standard_mpp / this_mpp)
                if crop_factor <= 0:
                    crop_factor = 1
                new_patch_size = int(patch_size * crop_factor)
                new_step_size = int(step_size * crop_factor)
                print("patch_size: ", new_patch_size )
            # Historical objective-power-based sizing, kept for reference:
            #if openslide.PROPERTY_NAME_OBJECTIVE_POWER in WSI_object.wsi.properties:
            #    if int(WSI_object.wsi.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER]) == 40 : #x40
            #        new_patch_size = patch_size*2
            #        new_step_size = step_size*2
            #    else: #x20
            #        new_patch_size = patch_size
            #        new_step_size = step_size
            #========================================
            current_patch_params.update({'patch_level': patch_level, 'patch_size': new_patch_size, 'step_size': new_step_size,
                                         'save_path': patch_save_dir})
            file_path, patch_time_elapsed = patching(WSI_object = WSI_object, patient_id=patient_id, **current_patch_params,)
            # Optional: sanity check completion marker
            if file_path and not _h5_is_complete(file_path):
                print(f"warning: generated file missing completion marker: {file_path}")
        stitch_time_elapsed = -1
        if stitch:
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            file_path = os.path.join(patch_save_dir, patient_id, wsi_id+'.h5')
            if os.path.isfile(file_path):
                heatmap, stitch_time_elapsed = stitching(file_path, WSI_object, downscale=64)
                #patient_id, wsi_id = slide_id.split('/')
                os.makedirs(os.path.join(stitch_save_dir, patient_id), exist_ok=True)
                # Try JPG first for smaller file size, fallback to PNG if size exceeds JPEG limit
                try:
                    stitch_path = os.path.join(stitch_save_dir, patient_id, wsi_id+'.jpg')
                    heatmap.save(stitch_path)
                except (OSError, ValueError) as e:
                    print(f"JPG save failed ({e}), using PNG instead for {wsi_id}")
                    stitch_path = os.path.join(stitch_save_dir, patient_id, wsi_id+'.png')
                    heatmap.save(stitch_path)
        print("segmentation took {} seconds".format(seg_time_elapsed))
        print("patching took {} seconds".format(patch_time_elapsed))
        print("stitching took {} seconds".format(stitch_time_elapsed))
        df.loc[idx, 'status'] = 'processed'
        seg_times += seg_time_elapsed
        patch_times += patch_time_elapsed
        stitch_times += stitch_time_elapsed
    # Convert accumulated totals into per-slide averages (guard empty shard).
    if total > 0:
        seg_times /= total
        patch_times /= total
        stitch_times /= total
    else:
        seg_times = 0.0
        patch_times = 0.0
        stitch_times = 0.0
    # Final checkpoint of the progress CSV after the loop completes.
    df.to_csv(os.path.join(save_dir, list_file_name), index=False)
    print("average segmentation time in s per slide: {}".format(seg_times))
    print("average patching time in s per slide: {}".format(patch_times))
    print("average stiching time in s per slide: {}".format(stitch_times))
    return seg_times, patch_times
def get_args_parser():
    """Build the argument parser for the seg-and-patch CLI.

    Constructed with add_help=False so it can be attached as a parent
    parser by the launching script.
    """
    p = argparse.ArgumentParser(description='seg and patch', add_help=False)
    p.add_argument('--source', type=str,
                   help='path to folder containing raw wsi image files')
    p.add_argument('--step_size', type=int, default=256,
                   help='step_size')
    p.add_argument('--patch_size', type=int, default=256,
                   help='patch_size')
    p.add_argument('--patch', type=str_to_bool, default=True,
                   help="whether to run patching; accepts true/false")
    p.add_argument('--seg', type=str_to_bool, default=True,
                   help="whether to run segmentation; accepts true/false")
    p.add_argument('--stitch', type=str_to_bool, default=False,
                   help="whether to run stitching; accepts true/false")
    p.add_argument('--auto_skip', type=str_to_bool, default=False,
                   help="if true, auto-skip is enabled (skip slides with existing outputs); if false, reprocess even if outputs exist")
    p.add_argument('--save_dir', type=str,
                   help='directory to save processed data')
    p.add_argument('--preset', default=None, type=str,
                   help='predefined profile of default segmentation and filter parameters (.csv)')
    p.add_argument('--patch_level', type=int, default=0,
                   help='downsample level at which to patch')
    p.add_argument('--process_list', type=str, default=None,
                   help='name of list of images to process with parameters (.csv)')
    p.add_argument('--rank', type=int, default=0,
                   help='Process rank used to shard slides across parallel jobs')
    p.add_argument('--world_size', type=int, default=1,
                   help='Total number of parallel jobs that split the slide list')
    p.add_argument('--assumed_mpp', type=float, default=0.5,
                   help='Fallback microns-per-pixel to use when slide metadata lacks MPP information (default: 0.5).')
    p.add_argument('--seg_downsample', type=float, default=1.0,
                   help='Optional extra downsample factor (>1) applied before segmentation to speed up single-level slides.')
    return p
def main(args):
    """
    Entrypoint for running seg-and-patch from code or CLI.

    Derives the patches/masks/stitches output folders under args.save_dir
    and creates them, assembles the per-stage parameter dicts (optionally
    overridden by a preset CSV looked up in ./presets), then delegates to
    seg_and_patch. Returns (avg seg time, avg patch time) per slide.
    """
    patch_save_dir = os.path.join(args.save_dir, 'patches')
    mask_save_dir = os.path.join(args.save_dir, 'masks')
    stitch_save_dir = os.path.join(args.save_dir, 'stitches')
    if args.process_list:
        # The process list CSV is resolved relative to save_dir.
        process_list = os.path.join(args.save_dir, args.process_list)
    else:
        process_list = None
    print('source: ', args.source)
    print('patch_save_dir: ', patch_save_dir)
    print('mask_save_dir: ', mask_save_dir)
    print('stitch_save_dir: ', stitch_save_dir)
    directories = {'source': args.source,
                   'save_dir': args.save_dir,
                   'patch_save_dir': patch_save_dir,
                   'mask_save_dir' : mask_save_dir,
                   'stitch_save_dir': stitch_save_dir}
    # Create every output directory (the source folder is only read).
    for key, val in directories.items():
        print("{} : {}".format(key, val))
        if key not in ['source']:
            os.makedirs(val, exist_ok=True)
    seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
                  'keep_ids': 'none', 'exclude_ids': 'none', 'seg_downsample': args.seg_downsample}
    #mod_v0.2
    filter_params = {'a_t':1, 'a_h': 1, 'max_n_holes':100}
    #filter_params = {'a_t':100, 'a_h': 16, 'max_n_holes':8}
    vis_params = {'vis_level': -1, 'line_thickness': 100}
    patch_params = {'use_padding': True, 'contour_fn': 'four_pt'}
    if args.preset:
        # A preset CSV's first row overrides any matching keys in the four dicts.
        preset_df = pd.read_csv(os.path.join('presets', args.preset))
        for key in seg_params.keys():
            if key in preset_df.columns:
                seg_params[key] = preset_df.loc[0, key]
        for key in filter_params.keys():
            if key in preset_df.columns:
                filter_params[key] = preset_df.loc[0, key]
        for key in vis_params.keys():
            if key in preset_df.columns:
                vis_params[key] = preset_df.loc[0, key]
        for key in patch_params.keys():
            if key in preset_df.columns:
                patch_params[key] = preset_df.loc[0, key]
    parameters = {'seg_params': seg_params,
                  'filter_params': filter_params,
                  'patch_params': patch_params,
                  'vis_params': vis_params}
    print(parameters)
    # getattr with defaults keeps this callable from code paths whose args
    # namespace lacks the sharding attributes.
    rank = getattr(args, 'rank', 0)
    world_size = getattr(args, 'world_size', 1)
    print(f"rank: {rank}, world_size: {world_size}")
    seg_times, patch_times = seg_and_patch(**directories, **parameters,
                                           patch_size = args.patch_size, step_size=args.step_size,
                                           seg = args.seg, use_default_params=False, save_mask = True,
                                           stitch= args.stitch,
                                           patch_level=args.patch_level, patch = args.patch,
                                           process_list = process_list, auto_skip=args.auto_skip,
                                           rank=rank, world_size=world_size,
                                           assumed_mpp=args.assumed_mpp)
    return seg_times, patch_times
if __name__ == '__main__':
    # CLI entry: wrap the shared parser (built with add_help=False) so that
    # -h/--help works here, then run the pipeline.
    parser = argparse.ArgumentParser('create_patches', parents=[get_args_parser()])
    args = parser.parse_args()
    main(args)