# NOTE(review): the three lines below are Hugging Face web-UI artifacts
# (author avatar caption, commit message, commit hash) that were captured
# into the source file; commented out so the module remains importable.
# st24hour's picture
# Upload folder using huggingface_hub
# e101805 verified
import argparse
import numpy as np
import openslide
import os
import pandas as pd
import time
import h5py
from PIL import Image
# Remove PIL's image size limit to handle very large WSIs
Image.MAX_IMAGE_PIXELS = None
# NOTE: We add a robust MPP inference helper & CLI fallback (assumed_mpp) to handle slides without explicit mpp metadata.
from .wsi_core.WholeSlideImage import WholeSlideImage
from .wsi_core.wsi_utils import StitchCoords
from .wsi_core.batch_process_utils import initialize_df
# Parse common boolean strings to python bool
def str_to_bool(v):
    """Convert a CLI-style truthy/falsy string into a bool.

    True for '1'/'true'/'t'/'yes'/'y'/'on', False for '0'/'false'/'f'/'no'/
    'n'/'off' (case-insensitive, surrounding whitespace ignored). Actual
    bool values pass through unchanged.

    Raises:
        argparse.ArgumentTypeError: for None or any unrecognized string.
    """
    if isinstance(v, bool):
        return v
    if v is None:
        raise argparse.ArgumentTypeError('Boolean value expected.')
    token = str(v).strip().lower()
    truthy = {'1', 'true', 't', 'yes', 'y', 'on'}
    falsy = {'0', 'false', 'f', 'no', 'n', 'off'}
    if token in truthy:
        return True
    if token in falsy:
        return False
    raise argparse.ArgumentTypeError("Boolean value expected (e.g., 'true' or 'false').")
def stitching(file_path, wsi_object, downscale = 64):
    """Stitch saved patch coordinates back into a downscaled overview image.

    Delegates to StitchCoords with a black background, no alpha blending,
    and no grid overlay. Returns (heatmap image, elapsed seconds).
    """
    t0 = time.time()
    heatmap = StitchCoords(file_path, wsi_object, downscale=downscale,
                           bg_color=(0,0,0), alpha=-1, draw_grid=False)
    return heatmap, time.time() - t0
def segment(WSI_object, seg_params = None, filter_params = None, mask_file = None):
    """Run (or load) tissue segmentation on a slide and time it.

    Args:
        WSI_object: slide wrapper exposing initSegmentation / segmentTissue.
        seg_params: kwargs forwarded to segmentTissue; may be None.
        filter_params: contour filtering parameters forwarded as-is.
        mask_file: if given, load a precomputed segmentation instead of
            computing one.

    Returns:
        (WSI_object, seconds elapsed).
    """
    ### Start Seg Timer
    start_time = time.time()
    # Use segmentation file
    if mask_file is not None:
        WSI_object.initSegmentation(mask_file)
    # Segment
    else:
        # Bug fix: the default seg_params is None, and `**None` raises
        # TypeError; treat a missing dict as "no extra kwargs".
        WSI_object.segmentTissue(**(seg_params or {}), filter_params=filter_params)
    ### Stop Seg Timers
    seg_time_elapsed = time.time() - start_time
    return WSI_object, seg_time_elapsed
def patching(WSI_object, patient_id, **kwargs):
    """Extract patch coordinates from the slide's contours and time it.

    All keyword arguments are forwarded to process_contours. Returns
    (path of the written coordinate file, elapsed seconds).
    """
    t0 = time.time()
    out_path = WSI_object.process_contours(patient_id, **kwargs)
    return out_path, time.time() - t0
def _h5_is_complete(h5_path: str) -> bool:
    """Return True only for a readable HDF5 file carrying a truthy
    'complete' attribute and a non-empty 'coords' dataset.

    Partially written or corrupt files (e.g. from interrupted runs) and
    missing paths all count as incomplete so they get regenerated.
    """
    if not os.path.isfile(h5_path):
        return False
    try:
        with h5py.File(h5_path, 'r') as handle:
            finished = bool(handle.attrs.get('complete', False))
            coords_ok = 'coords' in handle and len(handle['coords']) > 0
    except Exception:
        # Unreadable/corrupt files are treated as incomplete.
        return False
    return finished and coords_ok
def _get_mpp_from_properties(wsi) -> tuple:
    """Infer microns-per-pixel (MPP) from a slide's metadata properties.

    Sources are tried in order:
      1) openslide.mpp-x (standard key)
      2) aperio.MPP (Aperio-specific)
      3) tiff.XResolution with tiff.ResolutionUnit (converted to microns)
      4) openslide.objective-power heuristic (10x=1.0, 20x=0.5, 40x=0.25)

    Returns:
        (mpp or None, source label describing where the value came from).
    """
    try:
        props = getattr(wsi, 'properties', {})
    except Exception:
        props = {}

    # 1) Standard OpenSlide key, with legacy sentinel/outlier handling.
    try:
        raw = props.get(openslide.PROPERTY_NAME_MPP_X) or props.get('openslide.mpp-x')
        if raw is not None:
            candidate = float(raw)
            if candidate == 100:
                # Some pipelines encode sentinel 100 -> treat as 0.5 um/px.
                candidate = 0.5
            # Reject non-positive and absurdly large (>= 1000 um/px) values,
            # falling through to the remaining sources instead.
            if 0 < candidate < 1000.0:
                return candidate, 'openslide.mpp-x'
    except Exception:
        pass

    # 2) Aperio-specific key.
    try:
        raw = props.get('aperio.MPP')
        if raw is not None:
            candidate = float(raw)
            if candidate > 0:
                return candidate, 'aperio.MPP'
    except Exception:
        pass

    # 3) TIFF resolution tag plus unit (2=inch, 3=centimeter per TIFF spec).
    try:
        if 'tiff.XResolution' in props:
            xres = float(props['tiff.XResolution'])
            unit = str(props.get('tiff.ResolutionUnit', '')).strip().lower()
            if unit in {'2', 'inch', 'inches'}:
                candidate = 25400.0 / xres  # microns per pixel
            else:
                # Centimeter units and the historical fallback use the same factor.
                candidate = 10000.0 / xres
            if candidate > 0:
                return candidate, 'tiff.XResolution'
    except Exception:
        pass

    # 4) Objective-power heuristic (10x -> 1.0 um/px, 20x -> 0.5, 40x -> 0.25).
    try:
        power = props.get(openslide.PROPERTY_NAME_OBJECTIVE_POWER) or props.get('openslide.objective-power')
        if power is not None:
            power = float(power)
            if power > 0:
                candidate = 10.0 / power
                if candidate > 0:
                    return candidate, 'objective-power-heuristic'
    except Exception:
        pass

    return None, 'unknown'
def _infer_or_assume_mpp(wsi, assumed_mpp: float = 0.5) -> tuple:
    """Resolve a usable MPP for *wsi*, falling back to *assumed_mpp*.

    Delegates to _get_mpp_from_properties; a missing, non-finite, or
    non-positive result is replaced by the assumed value (source 'assumed').
    Centralizing this keeps patch sizing and coord scaling consistent.
    """
    inferred, source = _get_mpp_from_properties(wsi)
    usable = inferred is not None and np.isfinite(inferred) and inferred > 0
    if not usable:
        return float(assumed_mpp), 'assumed'
    return float(inferred), source
def _extract_patient_wsi_ids(slide_id: str):
    """Split a relative slide id (path without extension) into (patient_id, wsi_id).

    Flat ids (no '/') map to (slide_id, slide_id), matching existing
    behavior. Nested ids keep the entire parent directory chain as the
    patient id (e.g. 'AML/C3N-00466-43' -> ('AML', 'C3N-00466-43')), so
    subtype folders are preserved.
    """
    if '/' not in slide_id:
        return slide_id, slide_id
    parent_dirs, _, leaf = slide_id.rpartition('/')
    return parent_dirs, leaf
def seg_and_patch(source, save_dir, patch_save_dir, mask_save_dir, stitch_save_dir,
                  patch_size = 256, step_size = 256,
                  seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
                                'keep_ids': 'none', 'exclude_ids': 'none', 'seg_downsample': 1.0},
                  filter_params = {'a_t':100, 'a_h': 16, 'max_n_holes':8},
                  vis_params = {'vis_level': -1, 'line_thickness': 500},
                  patch_params = {'use_padding': True, 'contour_fn': 'four_pt'},
                  patch_level = 0,
                  use_default_params = False,
                  seg = False, save_mask = True,
                  stitch= False,
                  patch = False, auto_skip=True, process_list = None, rank=0, world_size=1,
                  assumed_mpp: float = 0.5):
    """Discover WSI files under ``source`` and run segmentation/patching/stitching.

    Per slide: optional tissue segmentation, mask image export, MPP-normalized
    coordinate patching into HDF5, and a stitched QC heatmap. Progress is
    checkpointed to a per-rank CSV in ``save_dir`` after every slide, so an
    interrupted run loses at most the slide in flight. When ``world_size > 1``
    candidates are sharded round-robin across ranks.

    Args:
        source: root folder scanned recursively for slide files.
        save_dir: where the progress CSV is written.
        patch_save_dir / mask_save_dir / stitch_save_dir: stage output roots.
        patch_size, step_size: base geometry at the 0.5 um/px reference MPP.
        seg_params, filter_params, vis_params, patch_params: per-stage defaults;
            per-slide CSV values override them unless ``use_default_params``.
        patch_level: pyramid level at which patches are extracted.
        seg / save_mask / stitch / patch: stage toggles.
        auto_skip: skip slides whose complete .h5 output already exists.
        process_list: optional CSV with per-slide parameters/status.
        rank, world_size: shard index / total shard count for parallel jobs.
        assumed_mpp: fallback microns-per-pixel when metadata lacks MPP.

    Returns:
        (average segmentation time, average patching time) in seconds/slide
        for this rank's shard (0.0 when the shard is empty).

    NOTE(review): the dict defaults above are shared mutable defaults -- safe
    only while nothing mutates them between calls; confirm before refactoring.
    """
    # Build slide list robustly across different layouts
    # - Some datasets (e.g., GTEx) may store files like "*.svs.cloud_transfer.***" where the final extension is not the WSI type
    # - Avoid treating files as directories (prevents NotADirectoryError)
    # NOTE: Recursive discovery (os.walk) supports arbitrarily nested folder trees.
    # Allowed WSI suffixes (strict) to detect supported slide files
    # Note: Keep common OpenSlide formats only; exclude generic image types like .jpg/.jpeg
    # Using endswith to exclude sidecar files like "*.svs.cloud_transfer..."
    allowed_suffixes = ('.svs', '.tif', '.tiff', '.ndpi', '.mrxs', '.bif')
    def is_wsi_candidate(name: str) -> bool:
        """Return True if the filename ends with an allowed WSI suffix (WSI formats only).
        Strict endswith prevents including non-WSI sidecar files (e.g., *.svs.cloud_transfer.*) and excludes JPGs.
        """
        lname = name.lower()
        return lname.endswith(allowed_suffixes)
    slides = []
    for root, _dirs, files in os.walk(source):
        # Sort to keep deterministic order across nodes
        for file in sorted(files):
            if is_wsi_candidate(file):
                full_path = os.path.join(root, file)
                # Store relative path so later os.path.join(source, slide) still works
                rel_path = os.path.relpath(full_path, start=source)
                # Normalize to POSIX-style separators for downstream split logic
                slides.append(rel_path.replace('\\', '/'))
    ### for debug####
    # NOTE(review): dead string literal below kept as-is; it is a disabled
    # hard-coded debug slide list with no runtime effect.
    """
    slides = ['6a0ea716-a5f2-47f3-880b-537a5cdc2324/TCGA-86-8074-01Z-00-DX1.0c34b434-8701-4060-a4ea-08a72371ee1e.svs',
              '083fea50-313e-42dc-bd80-4b70fec04ddf/TCGA-86-8279-01Z-00-DX1.fd12b60e-d181-454b-a655-298a973a849d.svs',
              '64cc65c6-2076-40cb-8ec7-91f29ca8cd1d/TCGA-86-8076-01Z-00-DX1.e7378b2f-e20e-4d2f-a86c-3a8ead08a385.svs',
              'c9e28a52-98a1-40f0-91c8-c8ea09a87a36/TCGA-86-8075-01Z-00-DX1.171bd8bd-af24-4770-b24b-732e675efd75.svs',
              '1ac67634-7c99-445e-abb0-7ed7f801a080/TCGA-86-7953-01Z-00-DX1.847865ce-df59-4677-bac2-ee88a258fe4e.svs',
              'ce057ce4-d194-4189-a91a-981962a7e354/TCGA-86-7714-01Z-00-DX1.8ee03c3b-013e-4dd5-aa0e-9c1a771cbdc5.svs',
              'c7aad125-c092-4451-8480-993a3d982879/TCGA-86-8056-01Z-00-DX1.eee7e03a-842c-4a44-bff8-f7b906725605.svs',
              '855d4a48-b3d4-4088-a6a0-bca7d75ccefa/TCGA-86-7711-01Z-00-DX1.f64dd9d8-b9ca-4d1f-9783-d1042979132d.svs',
              'c6e36f24-46f3-4e20-8154-970e749aa0ca/TCGA-86-8278-01Z-00-DX1.6500afef-f0f0-4e7c-a6c0-60da81993641.svs',
              '5192c4d5-69df-400e-af31-c964c9512bd7/TCGA-86-8054-01Z-00-DX1.2c4c08f6-be1c-46d1-a719-5983cade0c54.svs',
              '0edfc16d-25cc-403d-8862-38ec9c90060a/TCGA-50-6597-01Z-00-DX1.ec7fc0b2-78a1-4384-bddd-e89f02ee5eb6.svs',
              'b5506234-c969-4b27-89f5-5dffead8683a/TCGA-95-7043-01Z-00-DX1.AE0FD8AA-9B88-45FE-B247-402BED1285EF.svs',
              '94dfeae0-aaa6-405f-bf37-1c17d207f0ca/TCGA-86-7954-01Z-00-DX1.0a063ee6-18b7-4b66-9726-04fb452156cb.svs',
              '432c8861-64dd-4587-b597-93fb748cb4cd/TCGA-50-5044-01Z-00-DX1.7E0E651F-411B-4784-B0FA-6EB612527430.svs',
              '325bc10e-b478-44d8-92f7-ddfd6e2daf2b/TCGA-86-8280-01Z-00-DX1.52627b97-1cfa-4382-819b-949b58c0f995.svs',
              'd3519b7a-b09e-4900-a0ba-dbd44909a43f/TCGA-86-7955-01Z-00-DX1.ef4f4d94-5efb-4a07-97cf-b0ed69085827.svs',
              '48db62d6-8fc8-4059-8553-59c88ee46fa2/TCGA-86-7713-01Z-00-DX1.23f9e213-b566-47bb-beb8-7d12b2f0508b.svs',
              'b53d1202-bb6f-4494-9e01-4684e09486ba/TCGA-86-8055-01Z-00-DX1.546dc42e-3742-4da5-8f9b-80732180ce76.svs',
              '035fdf1f-b813-4ea6-9395-f2ea38ccaee6/TCGA-86-6851-01Z-00-DX1.0b13e600-fd7b-44a2-9ec2-26e9938fb7bc.svs',
              '3cde1c79-65c1-4d3b-8b0b-f8e81a512865/TCGA-69-7980-01Z-00-DX1.8bbf8cc0-eca7-49e5-a022-c22e3e6ed6dc.svs',
              'f8e5a0da-e46f-4ac9-be5e-ae11327e5a26/TCGA-86-8073-01Z-00-DX1.33c016fc-5c9e-4ad6-8de2-a7f8521d205c.svs',
              '82b362c4-c5a0-40c7-889b-a2694e5cb6ce/TCGA-50-5931-01Z-00-DX1.34261ED6-7815-487C-A50C-2DAD587187B9.svs',
              '78b2104d-3173-441e-8e33-46ab57f3ef42/TCGA-86-7701-01Z-00-DX1.a8a6e71e-9fa9-42c6-a186-0ac7526e9960.svs',
              '1c27c0a1-5e0f-4143-a6c2-a22af744830b/TCGA-50-6593-01Z-00-DX1.a63e298c-cbe7-44e8-8d8e-34ebb93530ca.svs',
              ]
    """
    #slides = [slide for slide in slides if os.path.isfile(os.path.join(source, slide))]
    def remove_elements_with_string(file_list, string):
        # Drop every path containing the given substring (substring blacklist).
        return [file for file in file_list if string not in file]
    # Hard-coded blacklist of slides known to be broken/unreadable in past runs.
    file_to_remove = '18ed4f5f-5f1c-4f87-a79c-e8d2b6f167b0/TCGA-19-1389-01Z-00-DX1.bd925898-9fb1-4c7c-81b6-9b492e956ca1.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '035a89f8-a30c-4039-8424-0c38a32546ff/TCGA-06-1086-01Z-00-DX2.e1961f1f-a823-4775-acf7-04a46f05e15e.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'patient_103_node_1.tif'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '11327473-2921-4916-a111-89d0eda6be8a/TCGA-A8-A06U-01A-01-TS1.63824040-373f-4c6c-a74e-881c127567a6.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '9c393d1e-c0ef-453f-882c-cd5541fa07fd/TCGA-CV-7242-11A-01-TS1.1838afb1-9eee-4a70-9ae3-50e3ab45e242.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'AML/C3L-06352-41.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'AML/C3L-05870-42.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'CM/C3L-02622-28.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'LG-0252_B.svs'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = '3790f55cad63053e956fb73027179707.tiff' # unreadable (fails to open)
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'Angiomatous meningioma/a1982bd3-357f-11eb-aec7-001a7dda7111.ndpi'
    slides = remove_elements_with_string(slides, file_to_remove)
    file_to_remove = 'a1982bd3-357f-11eb-aec7-001a7dda7111.ndpi' # unreadable (fails to open)
    slides = remove_elements_with_string(slides, file_to_remove)
    #"""
    #slides = slides[:1000]
    # Determine base list filename, respecting a provided --process_list for clarity
    if process_list is None:
        base_list_file_name = 'process_list_autogen.csv'
    else:
        base_list_file_name = os.path.basename(process_list)
    if process_list is None:
        full_df = initialize_df(slides, seg_params, filter_params, vis_params, patch_params)
    else:
        full_df = pd.read_csv(process_list)
        full_df = initialize_df(full_df, seg_params, filter_params, vis_params, patch_params)
    rank = 0 if rank is None else int(rank)
    world_size = int(world_size) if world_size else 1
    if world_size < 1:
        world_size = 1
    # Candidate selection must be strict: only 'tbp' stays as a candidate.
    # Exclude everything else including processed, already_exist, and any failed statuses.
    if 'status' in full_df.columns:
        candidate_mask = full_df['status'].astype(str).eq('tbp')
        # Keep 'process' updated for downstream visibility, but don't use it to decide candidates.
        full_df['process'] = 0
        full_df.loc[candidate_mask, 'process'] = 1
    else:
        # Fall back: if no status, use process when present; otherwise consider all as candidates
        if 'process' in full_df.columns:
            candidate_mask = full_df['process'] == 1
        else:
            candidate_mask = pd.Series([True] * len(full_df), index=full_df.index)
    process_candidates = full_df[candidate_mask]
    total_candidates = len(process_candidates)
    print("total:", total_candidates)
    candidate_indices = list(process_candidates.index)
    # Round-robin sharding: candidate k is handled by rank k % world_size.
    if rank < 0 or rank >= world_size:
        assigned_indices = []
    elif world_size > 1:
        assigned_indices = [idx for order, idx in enumerate(candidate_indices) if order % world_size == rank]
    else:
        assigned_indices = candidate_indices
    assigned_count = len(assigned_indices)
    print(f"rank {rank}/{world_size} assigned slides: {assigned_count}")
    df = full_df.loc[assigned_indices].copy()
    process_stack = df[df['process'] == 1]
    total = len(process_stack)
    base_stem, base_ext = os.path.splitext(base_list_file_name)
    if not base_ext:
        base_ext = '.csv'
    if world_size > 1:
        tentative_name = f'{base_stem}_rank{rank:04d}{base_ext}'
    else:
        tentative_name = f'{base_stem}{base_ext}'
    # Avoid overwriting an existing progress CSV; if exists, append a timestamp suffix
    list_file_path = os.path.join(save_dir, tentative_name)
    if os.path.exists(list_file_path):
        ts = time.strftime('%Y%m%d_%H%M%S')
        if world_size > 1:
            list_file_name = f'{base_stem}_rank{rank:04d}_{ts}{base_ext}'
        else:
            list_file_name = f'{base_stem}_{ts}{base_ext}'
    else:
        list_file_name = tentative_name
    if total == 0:
        print(f"rank {rank} has no slides to process.")
    # Legacy CSVs used a single 'a' area column; expand it into the current schema.
    legacy_support = 'a' in df.keys()
    if legacy_support:
        print('detected legacy segmentation csv file, legacy support enabled')
        df = df.assign(**{'a_t': np.full((len(df)), int(filter_params['a_t']), dtype=np.uint32),
                          'a_h': np.full((len(df)), int(filter_params['a_h']), dtype=np.uint32),
                          'max_n_holes': np.full((len(df)), int(filter_params['max_n_holes']), dtype=np.uint32),
                          'line_thickness': np.full((len(df)), int(vis_params['line_thickness']), dtype=np.uint32),
                          'contour_fn': np.full((len(df)), patch_params['contour_fn'])})
    # Accumulated per-stage wall-clock totals; converted to averages at the end.
    seg_times = 0.
    patch_times = 0.
    stitch_times = 0.
    ###### patching start#######
    for i in range(total):
        # Checkpoint the progress CSV before each slide so a crash loses at most one.
        df.to_csv(os.path.join(save_dir, list_file_name), index=False)
        idx = process_stack.index[i]
        slide = process_stack.loc[idx, 'slide_id']
        print("\n\nprogress: {:.2f}, {}/{}".format(i/total, i, total))
        print('processing {}'.format(slide))
        # Known problematic slides: mark as failed_seg and skip processing
        # This prevents repeated crashes while keeping a record in the CSV
        skip_as_failed_seg = {
            'a7a607ab-7b7a-469d-bffb-6003cce2d6ae/TCGA-UZ-A9PQ-01Z-00-DX1.C2CB0E94-2548-4399-BCAB-E4D556D533EF.svs'
        }
        if slide in skip_as_failed_seg:
            print(f"{slide} is known to fail; marking as failed_seg and skipping.")
            df.loc[idx, 'status'] = 'failed_seg'
            df.loc[idx, 'process'] = 0
            continue
        df.loc[idx, 'process'] = 0
        slide_id, _ = os.path.splitext(slide)
        if auto_skip:
            # Two possible layouts are checked for backwards compatibility
            legacy_path = os.path.join(patch_save_dir, slide_id + '.h5')
            nested_path = os.path.join(patch_save_dir, slide_id, slide_id + '.h5')
            # Newer code writes to patch_save_dir/<patient_id>/<wsi_id>.h5
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            canonical_path = os.path.join(patch_save_dir, patient_id, wsi_id + '.h5')
            existing_paths = [p for p in [canonical_path, legacy_path, nested_path] if os.path.isfile(p)]
            if existing_paths:
                # Prefer canonical path for inspection
                candidate = canonical_path if os.path.isfile(canonical_path) else existing_paths[0]
                if _h5_is_complete(candidate):
                    print('{} already exist in destination location (complete), skipped'.format(slide_id))
                    df.loc[idx, 'status'] = 'already_exist'
                    continue
                else:
                    # Incomplete leftovers from an interrupted run are deleted and regenerated.
                    print(f"found incomplete h5 for {slide_id}: {candidate} -> removing and regenerating")
                    try:
                        os.remove(candidate)
                    except Exception as e:
                        print(f"warning: failed to remove incomplete file {candidate}: {e}")
        # Initialize WSI
        full_path = os.path.join(source, slide)
        WSI_object = WholeSlideImage(full_path)
        if use_default_params:
            current_vis_params = vis_params.copy()
            current_filter_params = filter_params.copy()
            current_seg_params = seg_params.copy()
            current_patch_params = patch_params.copy()
        else:
            # Pull per-slide parameters from the CSV row (with legacy-column translation).
            current_vis_params = {}
            current_filter_params = {}
            current_seg_params = {}
            current_patch_params = {}
            for key in vis_params.keys():
                if legacy_support and key == 'vis_level':
                    df.loc[idx, key] = -1
                current_vis_params.update({key: df.loc[idx, key]})
            for key in filter_params.keys():
                if legacy_support and key == 'a_t':
                    # Legacy CSVs stored area at a fixed 512x512 reference; rescale to seg level.
                    old_area = df.loc[idx, 'a']
                    seg_level = df.loc[idx, 'seg_level']
                    scale = WSI_object.level_downsamples[seg_level]
                    adjusted_area = int(old_area * (scale[0] * scale[1]) / (512 * 512))
                    current_filter_params.update({key: adjusted_area})
                    df.loc[idx, key] = adjusted_area
                current_filter_params.update({key: df.loc[idx, key]})
            for key in seg_params.keys():
                if legacy_support and key == 'seg_level':
                    df.loc[idx, key] = -1
                current_seg_params.update({key: df.loc[idx, key]})
            for key in patch_params.keys():
                current_patch_params.update({key: df.loc[idx, key]})
        # Resolve auto (-1) visualization/segmentation levels to the pyramid
        # level closest to a 64x downsample.
        if current_vis_params['vis_level'] < 0:
            if len(WSI_object.level_dim) == 1:
                current_vis_params['vis_level'] = 0
            else:
                wsi = WSI_object.getOpenSlide()
                best_level = wsi.get_best_level_for_downsample(64)
                current_vis_params['vis_level'] = best_level
        if current_seg_params['seg_level'] < 0:
            if len(WSI_object.level_dim) == 1:
                current_seg_params['seg_level'] = 0
            else:
                wsi = WSI_object.getOpenSlide()
                best_level = wsi.get_best_level_for_downsample(64)
                current_seg_params['seg_level'] = best_level
        # Parse comma-separated contour-id filter strings into int arrays ('none' -> []).
        keep_ids = str(current_seg_params['keep_ids'])
        if keep_ids != 'none' and len(keep_ids) > 0:
            str_ids = current_seg_params['keep_ids']
            current_seg_params['keep_ids'] = np.array(str_ids.split(',')).astype(int)
        else:
            current_seg_params['keep_ids'] = []
        exclude_ids = str(current_seg_params['exclude_ids'])
        if exclude_ids != 'none' and len(exclude_ids) > 0:
            str_ids = current_seg_params['exclude_ids']
            current_seg_params['exclude_ids'] = np.array(str_ids.split(',')).astype(int)
        else:
            current_seg_params['exclude_ids'] = []
        w, h = WSI_object.level_dim[current_seg_params['seg_level']]
        l = current_seg_params['seg_level']
        print('seg_level {}, level_dim {} x {}'.format(l, w, h))
        # Safety valve: refuse to segment absurdly large level dimensions (> 4e10 px).
        if w * h > 4e10:
            print('level_dim {} x {} is likely too large for successful segmentation, aborting'.format(w, h))
            df.loc[idx, 'status'] = 'failed_seg'
            continue
        df.loc[idx, 'vis_level'] = current_vis_params['vis_level']
        df.loc[idx, 'seg_level'] = current_seg_params['seg_level']
        seg_time_elapsed = -1
        if seg:
            try:
                WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params)
            except Exception as e:
                # NOTE(review): on any segmentation failure this retries once at a
                # hard-coded seg_level=2 -- assumes the slide has at least 3 pyramid
                # levels; confirm for single-level formats.
                print(str(e))
                current_seg_params['seg_level']=2
                WSI_object, seg_time_elapsed = segment(WSI_object, current_seg_params, current_filter_params)
            if len(WSI_object.contours_tissue)==0:
                print('failed to extract contours')
                df.loc[idx, 'status'] = 'failed_seg'
                continue
        if save_mask:
            mask = WSI_object.visWSI(**current_vis_params)
            seg_ds = current_seg_params.get('seg_downsample', 1)
            try:
                seg_ds = float(seg_ds)
            except (TypeError, ValueError):
                seg_ds = 1
            if seg_ds > 1:
                new_w = max(1, int(mask.width / seg_ds))
                new_h = max(1, int(mask.height / seg_ds))
                # Downsample the saved mask to match seg_downsample for lightweight outputs
                mask = mask.resize((new_w, new_h), resample=Image.Resampling.BILINEAR)
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            os.makedirs(os.path.join(mask_save_dir, patient_id), exist_ok=True)
            # Try JPG first for smaller file size, fallback to PNG if size exceeds JPEG limit
            try:
                mask_path = os.path.join(mask_save_dir, patient_id, wsi_id+'.jpg')
                mask.save(mask_path)
            except (OSError, ValueError) as e:
                print(f"JPG save failed ({e}), using PNG instead for {wsi_id}")
                mask_path = os.path.join(mask_save_dir, patient_id, wsi_id+'.png')
                mask.save(mask_path)
        patch_time_elapsed = -1 # Default time
        if patch:
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            os.makedirs(os.path.join(patch_save_dir, patient_id), exist_ok=True)
            ### add for patching based on x20 256px patching
            print(WSI_object.wsi.properties)
            # NOTE(review): output version is inferred from the save_dir name
            # containing 'ver-'; brittle convention inherited from prior runs.
            if 'ver-' in save_dir: #patch gen version 0.1 or 0.2 or 0.3
                # Unified MPP inference with float crop factor
                this_mpp, mpp_src = _infer_or_assume_mpp(WSI_object.wsi, assumed_mpp=assumed_mpp)
                print(f"mpp: {this_mpp} (source={mpp_src})")
                standard_mpp = 0.5
                crop_factor = standard_mpp / this_mpp
                new_patch_size = int(patch_size * crop_factor)
                new_step_size = int(step_size * crop_factor)
                print("patch_size: ", new_patch_size )
            else: #init version
                # Unified MPP inference with rounded crop factor (to preserve original behavior)
                this_mpp, mpp_src = _infer_or_assume_mpp(WSI_object.wsi, assumed_mpp=assumed_mpp)
                print(f"mpp: {this_mpp} (source={mpp_src})")
                standard_mpp = 0.5
                crop_factor = round(standard_mpp / this_mpp)
                if crop_factor <= 0:
                    crop_factor = 1
                new_patch_size = int(patch_size * crop_factor)
                new_step_size = int(step_size * crop_factor)
                print("patch_size: ", new_patch_size )
            # Historical objective-power-based sizing, kept for reference:
            #if openslide.PROPERTY_NAME_OBJECTIVE_POWER in WSI_object.wsi.properties:
            #    if int(WSI_object.wsi.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER]) == 40 : #x40
            #        new_patch_size = patch_size*2
            #        new_step_size = step_size*2
            #    else: #x20
            #        new_patch_size = patch_size
            #        new_step_size = step_size
            #========================================
            current_patch_params.update({'patch_level': patch_level, 'patch_size': new_patch_size, 'step_size': new_step_size,
                                         'save_path': patch_save_dir})
            file_path, patch_time_elapsed = patching(WSI_object = WSI_object, patient_id=patient_id, **current_patch_params,)
            # Optional: sanity check completion marker
            if file_path and not _h5_is_complete(file_path):
                print(f"warning: generated file missing completion marker: {file_path}")
        stitch_time_elapsed = -1
        if stitch:
            patient_id, wsi_id = _extract_patient_wsi_ids(slide_id)
            file_path = os.path.join(patch_save_dir, patient_id, wsi_id+'.h5')
            if os.path.isfile(file_path):
                heatmap, stitch_time_elapsed = stitching(file_path, WSI_object, downscale=64)
                #patient_id, wsi_id = slide_id.split('/')
                os.makedirs(os.path.join(stitch_save_dir, patient_id), exist_ok=True)
                # Try JPG first for smaller file size, fallback to PNG if size exceeds JPEG limit
                try:
                    stitch_path = os.path.join(stitch_save_dir, patient_id, wsi_id+'.jpg')
                    heatmap.save(stitch_path)
                except (OSError, ValueError) as e:
                    print(f"JPG save failed ({e}), using PNG instead for {wsi_id}")
                    stitch_path = os.path.join(stitch_save_dir, patient_id, wsi_id+'.png')
                    heatmap.save(stitch_path)
        print("segmentation took {} seconds".format(seg_time_elapsed))
        print("patching took {} seconds".format(patch_time_elapsed))
        print("stitching took {} seconds".format(stitch_time_elapsed))
        df.loc[idx, 'status'] = 'processed'
        seg_times += seg_time_elapsed
        patch_times += patch_time_elapsed
        stitch_times += stitch_time_elapsed
    # Convert accumulated totals into per-slide averages (guard empty shard).
    if total > 0:
        seg_times /= total
        patch_times /= total
        stitch_times /= total
    else:
        seg_times = 0.0
        patch_times = 0.0
        stitch_times = 0.0
    # Final checkpoint of the progress CSV after the loop completes.
    df.to_csv(os.path.join(save_dir, list_file_name), index=False)
    print("average segmentation time in s per slide: {}".format(seg_times))
    print("average patching time in s per slide: {}".format(patch_times))
    print("average stiching time in s per slide: {}".format(stitch_times))
    return seg_times, patch_times
def get_args_parser():
    """Build the argument parser for the seg-and-patch CLI.

    Constructed with add_help=False so it can be attached as a parent
    parser by the launching script.
    """
    p = argparse.ArgumentParser(description='seg and patch', add_help=False)
    p.add_argument('--source', type=str,
                   help='path to folder containing raw wsi image files')
    p.add_argument('--step_size', type=int, default=256,
                   help='step_size')
    p.add_argument('--patch_size', type=int, default=256,
                   help='patch_size')
    p.add_argument('--patch', type=str_to_bool, default=True,
                   help="whether to run patching; accepts true/false")
    p.add_argument('--seg', type=str_to_bool, default=True,
                   help="whether to run segmentation; accepts true/false")
    p.add_argument('--stitch', type=str_to_bool, default=False,
                   help="whether to run stitching; accepts true/false")
    p.add_argument('--auto_skip', type=str_to_bool, default=False,
                   help="if true, auto-skip is enabled (skip slides with existing outputs); if false, reprocess even if outputs exist")
    p.add_argument('--save_dir', type=str,
                   help='directory to save processed data')
    p.add_argument('--preset', default=None, type=str,
                   help='predefined profile of default segmentation and filter parameters (.csv)')
    p.add_argument('--patch_level', type=int, default=0,
                   help='downsample level at which to patch')
    p.add_argument('--process_list', type=str, default=None,
                   help='name of list of images to process with parameters (.csv)')
    p.add_argument('--rank', type=int, default=0,
                   help='Process rank used to shard slides across parallel jobs')
    p.add_argument('--world_size', type=int, default=1,
                   help='Total number of parallel jobs that split the slide list')
    p.add_argument('--assumed_mpp', type=float, default=0.5,
                   help='Fallback microns-per-pixel to use when slide metadata lacks MPP information (default: 0.5).')
    p.add_argument('--seg_downsample', type=float, default=1.0,
                   help='Optional extra downsample factor (>1) applied before segmentation to speed up single-level slides.')
    return p
def main(args):
    """
    Entrypoint for running seg-and-patch from code or CLI.

    Derives the patches/masks/stitches output folders under args.save_dir
    and creates them, assembles the per-stage parameter dicts (optionally
    overridden by a preset CSV looked up in ./presets), then delegates to
    seg_and_patch. Returns (avg seg time, avg patch time) per slide.
    """
    patch_save_dir = os.path.join(args.save_dir, 'patches')
    mask_save_dir = os.path.join(args.save_dir, 'masks')
    stitch_save_dir = os.path.join(args.save_dir, 'stitches')
    if args.process_list:
        # The process list CSV is resolved relative to save_dir.
        process_list = os.path.join(args.save_dir, args.process_list)
    else:
        process_list = None
    print('source: ', args.source)
    print('patch_save_dir: ', patch_save_dir)
    print('mask_save_dir: ', mask_save_dir)
    print('stitch_save_dir: ', stitch_save_dir)
    directories = {'source': args.source,
                   'save_dir': args.save_dir,
                   'patch_save_dir': patch_save_dir,
                   'mask_save_dir' : mask_save_dir,
                   'stitch_save_dir': stitch_save_dir}
    # Create every output directory (the source folder is only read).
    for key, val in directories.items():
        print("{} : {}".format(key, val))
        if key not in ['source']:
            os.makedirs(val, exist_ok=True)
    seg_params = {'seg_level': -1, 'sthresh': 8, 'mthresh': 7, 'close': 4, 'use_otsu': False,
                  'keep_ids': 'none', 'exclude_ids': 'none', 'seg_downsample': args.seg_downsample}
    #mod_v0.2
    filter_params = {'a_t':1, 'a_h': 1, 'max_n_holes':100}
    #filter_params = {'a_t':100, 'a_h': 16, 'max_n_holes':8}
    vis_params = {'vis_level': -1, 'line_thickness': 100}
    patch_params = {'use_padding': True, 'contour_fn': 'four_pt'}
    if args.preset:
        # A preset CSV's first row overrides any matching keys in the four dicts.
        preset_df = pd.read_csv(os.path.join('presets', args.preset))
        for key in seg_params.keys():
            if key in preset_df.columns:
                seg_params[key] = preset_df.loc[0, key]
        for key in filter_params.keys():
            if key in preset_df.columns:
                filter_params[key] = preset_df.loc[0, key]
        for key in vis_params.keys():
            if key in preset_df.columns:
                vis_params[key] = preset_df.loc[0, key]
        for key in patch_params.keys():
            if key in preset_df.columns:
                patch_params[key] = preset_df.loc[0, key]
    parameters = {'seg_params': seg_params,
                  'filter_params': filter_params,
                  'patch_params': patch_params,
                  'vis_params': vis_params}
    print(parameters)
    # getattr with defaults keeps this callable from code paths whose args
    # namespace lacks the sharding attributes.
    rank = getattr(args, 'rank', 0)
    world_size = getattr(args, 'world_size', 1)
    print(f"rank: {rank}, world_size: {world_size}")
    seg_times, patch_times = seg_and_patch(**directories, **parameters,
                                           patch_size = args.patch_size, step_size=args.step_size,
                                           seg = args.seg, use_default_params=False, save_mask = True,
                                           stitch= args.stitch,
                                           patch_level=args.patch_level, patch = args.patch,
                                           process_list = process_list, auto_skip=args.auto_skip,
                                           rank=rank, world_size=world_size,
                                           assumed_mpp=args.assumed_mpp)
    return seg_times, patch_times
if __name__ == '__main__':
    # CLI entry: wrap the shared parser (built with add_help=False) so that
    # -h/--help works here, then run the pipeline.
    parser = argparse.ArgumentParser('create_patches', parents=[get_args_parser()])
    args = parser.parse_args()
    main(args)