whats2000

feat(eda): normalize dataset paths and deduplicate results in summary

95969f7 about 1 month ago

58.1 kB

	#!/usr/bin/env python3
	"""Distributed EDA with chunk-slice architecture for billion-scale datasets.

	Each dataset is sliced into small row-ranges. Each slice is a Dask task that
	processes a bounded amount of memory (chunk_size * n_vars). Slice results are
	merged per-dataset on the scheduler side using O(n_vars) memory for the gene
	arrays and O(1) memory for every other accumulator.

	No quantiles - only non-zero min, max, mean, sparsity, gene-level stats,
	and metadata summaries. This handles datasets from 2 GB to 500 GB.
	"""

	from __future__ import annotations

	import argparse
	import concurrent.futures
	import gc
	import hashlib
	import json
	import math
	import time
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any

	import anndata as ad
	import dask
	import numpy as np
	import pandas as pd
	import yaml
	from dask.distributed import Client, LocalCluster
	from scipy import sparse
	from tqdm import tqdm


	# ---------------------------------------------------------------------------
	# Slice result: the only thing returned from each Dask task
	# ---------------------------------------------------------------------------
	@dataclass
	class SliceResult:
	    """Mergeable statistics from one row-slice of a dataset.

	    Every field is O(1) memory except the gene arrays, which are O(n_vars).
	    All fields are built-in Python types; ``to_dict()`` returns them as a
	    plain dict.
	    """

	    # identity
	    dataset_path: str = ""
	    slice_start: int = 0
	    slice_end: int = 0

	    # matrix global
	    n_obs_slice: int = 0
	    n_vars: int = 0
	    nnz: int = 0
	    x_sum: float = 0.0
	    x_sum_sq: float = 0.0

	    # cell-level running stats (non-zero counts per cell)
	    # min/max start at sentinel extremes so the first min()/max() update wins
	    cell_total_counts_sum: float = 0.0
	    cell_total_counts_min: float = math.inf
	    cell_total_counts_max: float = -math.inf
	    cell_n_genes_sum: int = 0
	    cell_n_genes_min: int = 2**63 - 1
	    cell_n_genes_max: int = 0

	    # gene-level accumulators (length = n_vars, stored as list for serialisation)
	    gene_n_cells: list | None = None
	    gene_total_counts: list | None = None

	    # status
	    status: str = "ok"
	    error: str = ""
	    elapsed_sec: float = 0.0

	    def to_dict(self) -> dict:
	        """Return all fields as a plain ``{field_name: value}`` dict.

	        The gene lists are copied, so mutating the returned dict does not
	        affect this instance. Untouched min/max sentinels are returned
	        as-is (``math.inf`` / ``-math.inf`` / ``2**63 - 1``).
	        """
	        # Local import: the module only imports ``dataclass`` at file level.
	        from dataclasses import asdict

	        return asdict(self)


	def merge_slice_results(slices: list[SliceResult], n_obs: int, n_vars: int) -> dict:
	    """Merge many SliceResults into one per-dataset summary dict.

	    Slices whose ``status`` is not ``"ok"`` are skipped. Uses O(n_vars)
	    memory for the gene arrays; every other accumulator is O(1).

	    Matrix-level ratios (``sparsity``, ``x_mean``, ``x_std``) are normalised
	    by the entries that were actually merged (``n_obs_processed * n_vars``).
	    Dividing by the declared ``n_obs * n_vars`` would count the rows of
	    failed slices as all-zero and overstate sparsity for partial results.

	    Args:
	        slices: Per-slice results for a single dataset.
	        n_obs: Declared number of rows of the dataset (reported as-is).
	        n_vars: Number of columns; also the length of the gene arrays.

	    Returns:
	        A dict with matrix-, cell- and gene-level summary statistics.
	        Matrix- and cell-level values are ``None`` when no rows were merged.
	    """
	    nnz_total = 0
	    x_sum = 0.0
	    x_sum_sq = 0.0
	    n_obs_seen = 0

	    cell_total_counts_sum = 0.0
	    cell_total_counts_min = math.inf
	    cell_total_counts_max = -math.inf
	    cell_n_genes_sum = 0
	    cell_n_genes_min = 2**63 - 1
	    cell_n_genes_max = 0

	    gene_n_cells = np.zeros(n_vars, dtype=np.int64)
	    gene_total_counts = np.zeros(n_vars, dtype=np.float64)

	    for s in slices:
	        if s.status != "ok":
	            continue
	        n_obs_seen += s.n_obs_slice
	        nnz_total += s.nnz
	        x_sum += s.x_sum
	        x_sum_sq += s.x_sum_sq

	        cell_total_counts_sum += s.cell_total_counts_sum
	        cell_total_counts_min = min(cell_total_counts_min, s.cell_total_counts_min)
	        cell_total_counts_max = max(cell_total_counts_max, s.cell_total_counts_max)
	        cell_n_genes_sum += s.cell_n_genes_sum
	        cell_n_genes_min = min(cell_n_genes_min, s.cell_n_genes_min)
	        cell_n_genes_max = max(cell_n_genes_max, s.cell_n_genes_max)

	        if s.gene_n_cells is not None:
	            gene_n_cells += np.asarray(s.gene_n_cells, dtype=np.int64)
	        if s.gene_total_counts is not None:
	            gene_total_counts += np.asarray(s.gene_total_counts, dtype=np.float64)

	    # Denominator covers only the rows that contributed to the sums above.
	    processed_entries = n_obs_seen * n_vars
	    row: dict[str, Any] = {
	        "n_obs": n_obs,
	        "n_vars": n_vars,
	        "n_obs_processed": n_obs_seen,
	        "nnz": int(nnz_total),
	    }
	    if processed_entries:
	        mean = x_sum / processed_entries
	        # Clamp at 0: E[x^2] - E[x]^2 can go slightly negative from rounding.
	        var = max(0.0, x_sum_sq / processed_entries - mean ** 2)
	        row["sparsity"] = float(1.0 - nnz_total / processed_entries)
	        row["x_mean"] = float(mean)
	        row["x_std"] = float(math.sqrt(var))
	    else:
	        row["sparsity"] = None
	        row["x_mean"] = None
	        row["x_std"] = None

	    # Cell-level summaries
	    if n_obs_seen > 0:
	        row["cell_total_counts_min"] = float(cell_total_counts_min)
	        row["cell_total_counts_max"] = float(cell_total_counts_max)
	        row["cell_total_counts_mean"] = float(cell_total_counts_sum / n_obs_seen)
	        row["cell_n_genes_detected_min"] = int(cell_n_genes_min)
	        row["cell_n_genes_detected_max"] = int(cell_n_genes_max)
	        row["cell_n_genes_detected_mean"] = float(cell_n_genes_sum / n_obs_seen)
	    else:
	        for k in ("cell_total_counts_min", "cell_total_counts_max", "cell_total_counts_mean",
	                  "cell_n_genes_detected_min", "cell_n_genes_detected_max",
	                  "cell_n_genes_detected_mean"):
	            row[k] = None

	    # Gene-level summaries (restricted to genes seen in at least one cell)
	    genes_detected = int(np.count_nonzero(gene_n_cells))
	    row["genes_detected_in_any_cell"] = genes_detected
	    row["genes_detected_in_any_cell_pct"] = float(genes_detected / n_vars * 100) if n_vars else 0.0
	    if genes_detected > 0:
	        mask = gene_n_cells > 0
	        detected_cells = gene_n_cells[mask]
	        detected_counts = gene_total_counts[mask]
	        row["gene_n_cells_min"] = int(detected_cells.min())
	        row["gene_n_cells_max"] = int(detected_cells.max())
	        row["gene_n_cells_mean"] = float(detected_cells.mean())
	        row["gene_total_counts_min"] = float(detected_counts.min())
	        row["gene_total_counts_max"] = float(detected_counts.max())
	        row["gene_total_counts_mean"] = float(detected_counts.mean())
	    else:
	        for k in ("gene_n_cells_min", "gene_n_cells_max", "gene_n_cells_mean",
	                  "gene_total_counts_min", "gene_total_counts_max", "gene_total_counts_mean"):
	            row[k] = 0

	    return row


	# ---------------------------------------------------------------------------
	# Simple worker function for small datasets (no Dask overhead)
	# ---------------------------------------------------------------------------
	def process_dataset_simple(
	    path_str: str,
	    n_obs: int,
	    n_vars: int,
	    chunk_size: int,
	    max_meta_cols: int,
	    max_categories: int,
	) -> dict:
	    """Process entire small dataset in one worker (no slicing, no Dask).

	    Opens the file in backed mode, streams ``X`` in row chunks of
	    ``chunk_size`` and accumulates matrix-, cell- and gene-level statistics,
	    then adds obs/var metadata summaries and schemas.

	    Args:
	        path_str: Path of the ``.h5ad`` file.
	        n_obs: Number of rows to scan (also the chunk-loop upper bound).
	        n_vars: Number of columns; length of the gene accumulators.
	        chunk_size: Rows read from ``X`` per iteration.
	        max_meta_cols: Passed to ``summarize_metadata`` as ``max_cols``.
	        max_categories: Passed to ``summarize_metadata``.

	    Returns:
	        A summary dict. ``status`` is ``"ok"`` on success; on any exception
	        it is ``"failed"`` and ``error`` holds the message. ``elapsed_sec``
	        is always set. The function does not raise.
	    """
	    t0 = time.time()
	    path = Path(path_str)
	    row: dict[str, Any] = {
	        "dataset_path": path_str,
	        "dataset_file": path.name,
	        "n_obs": n_obs,
	        "n_vars": n_vars,
	    }
	    # Bound before the try so the finally clause can always inspect it.
	    adata = None

	    try:
	        adata = ad.read_h5ad(path, backed="r")
	        total_entries = n_obs * n_vars

	        nnz_total = 0
	        x_sum = 0.0
	        x_sum_sq = 0.0

	        # Cell-level accumulators
	        cell_total_counts_sum = 0.0
	        cell_total_counts_min = math.inf
	        cell_total_counts_max = -math.inf
	        cell_n_genes_sum = 0
	        cell_n_genes_min = 2**63 - 1
	        cell_n_genes_max = 0

	        # Gene-level accumulators
	        gene_n_cells = np.zeros(n_vars, dtype=np.int64)
	        gene_total_counts = np.zeros(n_vars, dtype=np.float64)

	        # Process in chunks
	        for start in range(0, n_obs, chunk_size):
	            end = min(start + chunk_size, n_obs)
	            chunk = adata.X[start:end, :]

	            if sparse.issparse(chunk):
	                csr = chunk.tocsr() if not sparse.isspmatrix_csr(chunk) else chunk
	                data = csr.data.astype(np.float64, copy=False)

	                nnz_total += int(csr.nnz)
	                x_sum += float(data.sum())
	                x_sum_sq += float(np.square(data).sum())

	                # Cell stats
	                cell_counts = np.asarray(csr.sum(axis=1)).ravel()
	                cell_genes = np.diff(csr.indptr).astype(np.int64)

	                # Gene stats: accumulate straight from the CSR column indices
	                # (same approach as process_slice) instead of building a CSC copy.
	                gene_total_counts += np.bincount(csr.indices, weights=data, minlength=n_vars)
	                gene_n_cells += np.bincount(csr.indices, minlength=n_vars)

	                del csr, data
	            else:
	                arr = np.asarray(chunk, dtype=np.float64)
	                nz = arr != 0

	                nnz_total += int(nz.sum())
	                x_sum += float(arr.sum())
	                x_sum_sq += float(np.square(arr).sum())

	                # Cell stats
	                cell_counts = arr.sum(axis=1)
	                cell_genes = nz.sum(axis=1).astype(np.int64)

	                # Gene stats
	                gene_n_cells += nz.sum(axis=0).astype(np.int64)
	                gene_total_counts += arr.sum(axis=0)

	                del arr, nz

	            # Update cell-level running stats (identical for both layouts)
	            cell_total_counts_sum += float(cell_counts.sum())
	            cell_total_counts_min = min(cell_total_counts_min, float(cell_counts.min()))
	            cell_total_counts_max = max(cell_total_counts_max, float(cell_counts.max()))
	            cell_n_genes_sum += int(cell_genes.sum())
	            cell_n_genes_min = min(cell_n_genes_min, int(cell_genes.min()))
	            cell_n_genes_max = max(cell_n_genes_max, int(cell_genes.max()))

	            del chunk, cell_counts, cell_genes
	            gc.collect()

	        # Matrix-level stats
	        row["nnz"] = int(nnz_total)
	        row["sparsity"] = float(1.0 - nnz_total / total_entries) if total_entries else None
	        row["x_mean"] = float(x_sum / total_entries) if total_entries else None
	        if total_entries:
	            # Clamp at 0: E[x^2] - E[x]^2 can go slightly negative from rounding.
	            var = max(0.0, x_sum_sq / total_entries - (x_sum / total_entries) ** 2)
	            row["x_std"] = float(math.sqrt(var))
	        else:
	            row["x_std"] = None

	        # Cell-level stats
	        if n_obs > 0:
	            row["cell_total_counts_min"] = float(cell_total_counts_min)
	            row["cell_total_counts_max"] = float(cell_total_counts_max)
	            row["cell_total_counts_mean"] = float(cell_total_counts_sum / n_obs)
	            row["cell_n_genes_detected_min"] = int(cell_n_genes_min)
	            row["cell_n_genes_detected_max"] = int(cell_n_genes_max)
	            row["cell_n_genes_detected_mean"] = float(cell_n_genes_sum / n_obs)
	        else:
	            row["cell_total_counts_min"] = None
	            row["cell_total_counts_max"] = None
	            row["cell_total_counts_mean"] = None
	            row["cell_n_genes_detected_min"] = None
	            row["cell_n_genes_detected_max"] = None
	            row["cell_n_genes_detected_mean"] = None

	        # Gene-level stats (restricted to genes seen in at least one cell)
	        genes_detected = int(np.count_nonzero(gene_n_cells))
	        row["genes_detected_in_any_cell"] = genes_detected
	        row["genes_detected_in_any_cell_pct"] = float(genes_detected / n_vars * 100) if n_vars else 0.0
	        if genes_detected > 0:
	            mask = gene_n_cells > 0
	            row["gene_n_cells_min"] = int(gene_n_cells[mask].min())
	            row["gene_n_cells_max"] = int(gene_n_cells[mask].max())
	            row["gene_n_cells_mean"] = float(gene_n_cells[mask].mean())
	            row["gene_total_counts_min"] = float(gene_total_counts[mask].min())
	            row["gene_total_counts_max"] = float(gene_total_counts[mask].max())
	            row["gene_total_counts_mean"] = float(gene_total_counts[mask].mean())
	        else:
	            for k in ("gene_n_cells_min", "gene_n_cells_max", "gene_n_cells_mean",
	                      "gene_total_counts_min", "gene_total_counts_max", "gene_total_counts_mean"):
	                row[k] = 0

	        # Metadata
	        row["obs_columns"] = int(len(adata.obs.columns))
	        row["var_columns"] = int(len(adata.var.columns))
	        row["metadata_obs_summary"] = summarize_metadata(
	            adata.obs, max_cols=max_meta_cols, max_categories=max_categories
	        )
	        row["metadata_var_summary"] = summarize_metadata(
	            adata.var, max_cols=max_meta_cols, max_categories=max_categories
	        )
	        row["obs_schema"] = extract_schema(adata.obs)
	        row["var_schema"] = extract_schema(adata.var)

	        del gene_n_cells, gene_total_counts

	        row["status"] = "ok"
	        row["n_slices_total"] = 1
	        row["n_slices_ok"] = 1
	        row["n_slices_failed"] = 0

	    except Exception as exc:
	        row["status"] = "failed"
	        row["error"] = str(exc)

	    finally:
	        # Close the backed file on the failure path too, so the handle
	        # is not leaked when anything above raises.
	        if adata is not None:
	            try:
	                if hasattr(adata, "file") and adata.file is not None:
	                    adata.file.close()
	            except Exception:
	                pass  # best-effort close; the computed result is still valid
	        del adata

	    gc.collect()
	    row["elapsed_sec"] = round(time.time() - t0, 2)
	    return row


	# ---------------------------------------------------------------------------
	# Core worker function: process ONE slice of ONE dataset (Dask)
	# ---------------------------------------------------------------------------
	def process_slice(
	    path_str: str,
	    obs_start: int,
	    obs_end: int,
	    chunk_size: int,
	) -> SliceResult:
	    """Process rows [obs_start, obs_end) of a dataset.

	    The file is opened in backed mode and the row range is streamed in
	    sub-chunks of ``chunk_size`` rows. Memory usage is bounded (per the
	    original estimate) by: chunk_size * n_vars * ~12 bytes * 3x overhead.

	    Args:
	        path_str: Path of the ``.h5ad`` file.
	        obs_start: First row of the slice (inclusive).
	        obs_end: End row of the slice (exclusive).
	        chunk_size: Rows read from ``X`` per iteration.

	    Returns:
	        A ``SliceResult``. On any exception ``status`` is ``"failed"`` and
	        ``error`` holds the message; the function does not raise once the
	        result object has been created.
	    """
	    t0 = time.time()
	    path = Path(path_str)
	    result = SliceResult(dataset_path=path_str, slice_start=obs_start, slice_end=obs_end)
	    # Bound before the try so the finally clause can always inspect it.
	    adata = None

	    try:
	        adata = ad.read_h5ad(path, backed="r")
	        n_vars = int(adata.n_vars)
	        result.n_vars = n_vars
	        result.n_obs_slice = obs_end - obs_start

	        # Gene-level accumulators for this slice
	        gene_n_cells = np.zeros(n_vars, dtype=np.int64)
	        gene_total_counts = np.zeros(n_vars, dtype=np.float64)

	        # Process in sub-chunks within this slice
	        for start in range(obs_start, obs_end, chunk_size):
	            end = min(start + chunk_size, obs_end)
	            chunk = adata.X[start:end, :]

	            if sparse.issparse(chunk):
	                csr = chunk.tocsr() if not sparse.isspmatrix_csr(chunk) else chunk
	                data = csr.data.astype(np.float64, copy=False)

	                result.nnz += int(csr.nnz)
	                result.x_sum += float(data.sum())
	                result.x_sum_sq += float(np.square(data).sum())

	                # Cell stats
	                cell_counts = np.asarray(csr.sum(axis=1)).ravel()
	                cell_genes = np.diff(csr.indptr).astype(np.int64)

	                # Gene stats (optimized: use bincount instead of CSC conversion)
	                # Accumulate counts directly from CSR indices/data
	                gene_total_counts += np.bincount(
	                    csr.indices,
	                    weights=data,
	                    minlength=n_vars,
	                )
	                gene_n_cells += np.bincount(
	                    csr.indices,
	                    minlength=n_vars,
	                )

	                del csr, data
	            else:
	                arr = np.asarray(chunk, dtype=np.float64)
	                nz = arr != 0

	                result.nnz += int(nz.sum())
	                result.x_sum += float(arr.sum())
	                result.x_sum_sq += float(np.square(arr).sum())

	                # Cell stats
	                cell_counts = arr.sum(axis=1)
	                cell_genes = nz.sum(axis=1).astype(np.int64)

	                # Gene stats
	                gene_n_cells += nz.sum(axis=0).astype(np.int64)
	                gene_total_counts += arr.sum(axis=0)

	                del arr, nz

	            # Update cell-level running stats
	            result.cell_total_counts_sum += float(cell_counts.sum())
	            result.cell_total_counts_min = min(result.cell_total_counts_min, float(cell_counts.min()))
	            result.cell_total_counts_max = max(result.cell_total_counts_max, float(cell_counts.max()))
	            result.cell_n_genes_sum += int(cell_genes.sum())
	            result.cell_n_genes_min = min(result.cell_n_genes_min, int(cell_genes.min()))
	            result.cell_n_genes_max = max(result.cell_n_genes_max, int(cell_genes.max()))

	            del chunk, cell_counts, cell_genes
	            gc.collect()

	        # Store gene arrays as lists for serialisation
	        result.gene_n_cells = gene_n_cells.tolist()
	        result.gene_total_counts = gene_total_counts.tolist()
	        del gene_n_cells, gene_total_counts

	    except Exception as exc:
	        result.status = "failed"
	        result.error = str(exc)

	    finally:
	        # Close the backed file on the failure path too; retried slices
	        # would otherwise leave one open handle per failed attempt.
	        if adata is not None:
	            try:
	                if hasattr(adata, "file") and adata.file is not None:
	                    adata.file.close()
	            except Exception:
	                pass  # best-effort close; the slice result is still valid
	        del adata

	    gc.collect()
	    result.elapsed_sec = round(time.time() - t0, 2)
	    return result


	# ---------------------------------------------------------------------------
	# Metadata helpers (run on scheduler, not workers)
	# ---------------------------------------------------------------------------
	def safe_name(path: Path) -> str:
	    """Build a filesystem-friendly name: ``<stem>_<10-hex-digit md5 of the path>``.

	    Spaces in the stem become underscores and the stem is capped at 80
	    characters. The hash is taken over the full path string, so files that
	    share a stem but live at different paths get distinct names.
	    """
	    path_bytes = str(path).encode("utf-8")
	    # md5 is used purely as a fingerprint here, not for security.
	    fingerprint = hashlib.md5(path_bytes, usedforsecurity=False).hexdigest()
	    base = path.stem.replace(" ", "_")[:80]
	    return "_".join((base, fingerprint[:10]))


	def summarize_metadata(df: pd.DataFrame, max_cols: int, max_categories: int) -> dict[str, dict]:
	    """Summarize DataFrame metadata with top categories.

	    At most ``max_cols`` columns are summarised. A fixed list of preferred
	    columns is taken first (in that order), then the remaining columns in
	    DataFrame order until the budget is reached.

	    Args:
	        df: The obs/var table to summarise.
	        max_cols: Upper bound on the number of columns in the result.
	        max_categories: Number of most frequent values kept per column.

	    Returns:
	        ``{column: summary}`` where each summary has ``dtype`` and
	        ``missing_fraction``; categorical and string/object columns also
	        get ``n_unique`` and ``top_values``. Empty frames yield ``{}``.
	    """
	    if df.empty:
	        return {}

	    preferred = ["cell_type", "assay", "tissue", "disease", "sex", "donor_id"]
	    selected: list[str] = [c for c in preferred if c in df.columns]
	    already_selected = set(selected)
	    for col in df.columns:
	        # Check the budget *before* appending so the cap is never overshot.
	        if len(selected) >= max_cols:
	            break
	        if col not in already_selected:
	            selected.append(col)
	            already_selected.add(col)
	    # The preferred columns alone may exceed the budget; enforce it strictly.
	    selected = selected[:max(0, max_cols)]

	    out: dict[str, dict] = {}
	    n_rows = max(1, len(df))
	    for col in selected:
	        s = df[col]
	        summary: dict[str, Any] = {
	            "dtype": str(s.dtype),
	            "missing_fraction": float(s.isna().sum()) / n_rows,
	        }
	        if isinstance(s.dtype, pd.CategoricalDtype):
	            summary["n_unique"] = int(len(s.cat.categories))
	            vc = s.value_counts(dropna=False).head(max_categories)
	            summary["top_values"] = {str(k): int(v) for k, v in vc.items()}
	        elif pd.api.types.is_string_dtype(s.dtype) or s.dtype == object:
	            # Missing values are dropped first so they do not show up as "nan"/"None".
	            s_str = s.dropna().astype(str)
	            summary["n_unique"] = int(s_str.nunique())
	            vc = s_str.value_counts(dropna=False).head(max_categories)
	            summary["top_values"] = {str(k): int(v) for k, v in vc.items()}
	        out[col] = summary
	    return out


	def extract_schema(df: pd.DataFrame) -> dict[str, object]:
	    """Describe a DataFrame's columns: count, stringified names and dtypes."""
	    names: list[str] = []
	    dtype_by_name: dict[str, str] = {}
	    for label in df.columns:
	        key = str(label)
	        names.append(key)
	        dtype_by_name[key] = str(df[label].dtype)

	    schema: dict[str, object] = {}
	    schema["n_columns"] = int(len(df.columns))
	    schema["columns"] = names
	    schema["dtypes"] = dtype_by_name
	    return schema


	def extract_metadata_on_scheduler(
	    path: Path,
	    max_meta_cols: int,
	    max_categories: int,
	) -> dict:
	    """Extract obs/var metadata. Runs on scheduler (lightweight, no X access).

	    Args:
	        path: Path of the ``.h5ad`` file, opened in backed read-only mode.
	        max_meta_cols: Passed to ``summarize_metadata`` as ``max_cols``.
	        max_categories: Passed to ``summarize_metadata``.

	    Returns:
	        A dict with column counts, obs/var summaries and schemas, or
	        ``{"metadata_error": <message>}`` if anything fails. Never raises.
	    """
	    # Bound before the try so the finally clause can always inspect it.
	    adata = None
	    try:
	        adata = ad.read_h5ad(path, backed="r")
	        return {
	            "obs_columns": int(len(adata.obs.columns)),
	            "var_columns": int(len(adata.var.columns)),
	            "metadata_obs_summary": summarize_metadata(
	                adata.obs, max_cols=max_meta_cols, max_categories=max_categories
	            ),
	            "metadata_var_summary": summarize_metadata(
	                adata.var, max_cols=max_meta_cols, max_categories=max_categories
	            ),
	            "obs_schema": extract_schema(adata.obs),
	            "var_schema": extract_schema(adata.var),
	        }
	    except Exception as exc:
	        return {"metadata_error": str(exc)}
	    finally:
	        # Close the backed file even when summarising raised, so a bad
	        # dataset does not leave an open handle behind.
	        if adata is not None:
	            try:
	                if hasattr(adata, "file") and adata.file is not None:
	                    adata.file.close()
	            except Exception:
	                pass  # best-effort close
	        del adata
	        gc.collect()


	# ---------------------------------------------------------------------------
	# Dask configuration
	# ---------------------------------------------------------------------------
	def configure_dask_for_hpc() -> None:
	    """Apply this pipeline's Dask settings for HPC runs via ``dask.config.set``.

	    The settings cover worker memory thresholds, worker process options,
	    scheduler retry/work-stealing behaviour, comm timeouts and admin
	    housekeeping. They are applied globally in a single call.
	    """
	    worker_memory = {
	        "distributed.worker.memory.target": 0.60,
	        "distributed.worker.memory.spill": 0.70,
	        "distributed.worker.memory.pause": 0.80,
	        "distributed.worker.memory.terminate": 0.95,
	    }
	    worker_process = {
	        "distributed.worker.daemon": False,
	        "distributed.worker.use-file-locking": False,
	    }
	    scheduler = {
	        "distributed.scheduler.allowed-failures": 10,
	        "distributed.scheduler.work-stealing": True,
	        "distributed.scheduler.work-stealing-interval": "100ms",
	    }
	    comm_timeouts = {
	        "distributed.comm.timeouts.connect": "120s",
	        "distributed.comm.timeouts.tcp": "120s",
	    }
	    admin = {
	        "distributed.admin.tick.interval": "2s",
	        "distributed.admin.log-length": 500,
	    }

	    settings = {**worker_memory, **worker_process, **scheduler, **comm_timeouts, **admin}
	    dask.config.set(settings)


	# ---------------------------------------------------------------------------
	# Config / metadata helpers
	# ---------------------------------------------------------------------------
	def load_config(config_path: Path) -> dict:
	    """Load the YAML configuration file at ``config_path``.

	    The file is read as UTF-8 (rather than the platform default encoding)
	    and parsed with ``yaml.safe_load``. An empty file yields ``{}`` instead
	    of ``None`` so the result matches the declared return type.

	    Raises:
	        OSError: If the file cannot be opened.
	        yaml.YAMLError: If the content is not valid YAML.
	    """
	    with open(config_path, encoding="utf-8") as f:
	        loaded = yaml.safe_load(f)
	    return {} if loaded is None else loaded


	def load_enhanced_metadata(cache_path: Path) -> pd.DataFrame:
	    """Read the enhanced metadata cache (a parquet file) into a DataFrame.

	    Raises:
	        FileNotFoundError: If ``cache_path`` does not exist; the message
	            includes the command that builds the cache.
	    """
	    if cache_path.exists():
	        return pd.read_parquet(cache_path)

	    raise FileNotFoundError(
	        f"Enhanced metadata cache not found: {cache_path}\n"
	        "Run: uv run python scripts/build_metadata_cache.py --config <config.yaml>"
	    )


	def get_datasets_for_shard(
	    metadata_df: pd.DataFrame,
	    config: dict,
	    num_shards: int,
	    shard_index: int,
	) -> list[dict]:
	    """Select the datasets that belong to one shard.

	    With more than one shard, rows are ordered by ``total_entries``
	    (largest first) and dealt out round-robin; this shard keeps the rows
	    whose position modulo ``num_shards`` equals ``shard_index``. Rows whose
	    ``status`` does not start with ``"ok"`` and rows above
	    ``config["dataset_thresholds"]["max_entries"]`` are then dropped.

	    Returns:
	        A list of dicts with keys: dataset_path (resolved to an absolute
	        path), n_obs, n_vars, total_entries, size_category.
	    """
	    if num_shards > 1:
	        ranked = metadata_df.sort_values("total_entries", ascending=False).reset_index(drop=True)
	        in_this_shard = ranked.index % num_shards == shard_index
	        candidates = ranked[in_this_shard].copy()
	    else:
	        candidates = metadata_df.copy()

	    status_ok = candidates["status"].str.startswith("ok", na=False)
	    candidates = candidates[status_ok].copy()
	    entry_limit = config["dataset_thresholds"]["max_entries"]
	    within_limit = candidates["total_entries"] <= entry_limit
	    candidates = candidates[within_limit].copy()

	    def _to_entry(record) -> dict:
	        # Resolve to an absolute path so relative/absolute spellings of the
	        # same file do not end up as separate datasets.
	        resolved = Path(str(record["dataset_path"])).resolve()
	        return {
	            "dataset_path": str(resolved),
	            "n_obs": int(record.get("n_obs", 0)),
	            "n_vars": int(record.get("n_vars", 0)),
	            "total_entries": int(record.get("total_entries", 0)),
	            "size_category": str(record.get("size_category", "large")),
	        }

	    return [_to_entry(record) for _, record in candidates.iterrows()]


	# ---------------------------------------------------------------------------
	# Main processing pipeline
	# ---------------------------------------------------------------------------
	def create_slice_tasks(
	    dataset: dict,
	    obs_slice_size: int,
	    small_dataset_threshold: int,
	) -> list[tuple[str, int, int]]:
	    """Build the list of ``(path, start, end)`` row-range tasks for a dataset.

	    - No rows (``n_obs <= 0``): one empty task ``(path, 0, 0)``.
	    - Fewer total entries than ``small_dataset_threshold``: one task that
	      covers every row (no slicing overhead).
	    - Otherwise: consecutive ranges of at most ``obs_slice_size`` rows.

	    ``total_entries`` falls back to ``n_obs * n_vars`` when the key is absent.
	    """
	    source = dataset["dataset_path"]
	    row_count = dataset["n_obs"]
	    entry_count = dataset.get("total_entries", row_count * dataset.get("n_vars", 0))

	    if row_count <= 0:
	        return [(source, 0, 0)]

	    if entry_count < small_dataset_threshold:
	        return [(source, 0, row_count)]

	    return [
	        (source, lo, min(lo + obs_slice_size, row_count))
	        for lo in range(0, row_count, obs_slice_size)
	    ]


	def process_all_datasets(
	datasets: list[dict],
	config: dict,
	per_dataset_dir: Path,
	client: Client \| None,
	max_retries: int = 3,
	) -> tuple[list[dict], list[dict]]:
	"""Process all datasets: small ones with ProcessPoolExecutor, large ones with Dask."""
	base_chunk_size = config["resources"]["chunk_size"]
	base_obs_slice_size = config["slicing"].get("obs_slice_size", 75_000)
	obs_slice_size_xlarge = config["slicing"].get("obs_slice_size_xlarge", 150_000)
	small_threshold = config["dataset_thresholds"]["small"]
	max_meta_cols = config["metadata"]["max_meta_cols"]
	max_categories = config["metadata"]["max_categories"]
	max_workers_base = config["resources"]["max_workers"]

	# Helper function to get adjusted parameters based on size category
	def get_dataset_params(dataset):
	size_cat = dataset.get("size_category", "large")
	strategy = config.get("strategy", {}).get(size_cat, config["strategy"]["large"])

	chunk_mult = strategy.get("chunk_size_multiplier", 1.0)
	chunk_size = int(base_chunk_size * chunk_mult)

	# Use smaller slice size for xlarge datasets
	if size_cat == "xlarge":
	obs_slice = obs_slice_size_xlarge
	else:
	obs_slice = base_obs_slice_size

	return chunk_size, obs_slice, size_cat

	successes = []
	failures = []

	# Categorize datasets: small, dask-ready (medium/large), xlarge (skip Dask)
	small_datasets = [d for d in datasets if d.get("total_entries", 0) < small_threshold]
	non_small = [d for d in datasets if d.get("total_entries", 0) >= small_threshold]

	# Split non-small into Dask-compatible and xlarge (which skip Dask due to failures)
	dask_datasets = [d for d in non_small if d.get("size_category", "large") != "xlarge"]
	xlarge_datasets = [d for d in non_small if d.get("size_category", "large") == "xlarge"]

	small_datasets.sort(key=lambda d: d["total_entries"])
	dask_datasets.sort(key=lambda d: d["total_entries"])
	xlarge_datasets.sort(key=lambda d: d["total_entries"])

	datasets_sorted = small_datasets + dask_datasets + xlarge_datasets
	small_count = len(small_datasets)
	dask_count = len(dask_datasets)
	xlarge_count = len(xlarge_datasets)

	datasets_sorted = small_datasets + dask_datasets + xlarge_datasets
	small_count = len(small_datasets)
	dask_count = len(dask_datasets)
	xlarge_count = len(xlarge_datasets)

	print(f"\n{'=' * 80}")
	print(f"Processing {len(datasets_sorted)} datasets")
	print(f" Small datasets (ProcessPoolExecutor): {small_count}")
	print(f" Medium/Large (Dask + slicing): {dask_count}")
	print(f" XLarge (Direct, skip Dask): {xlarge_count}")
	print(f"Slice size: {base_obs_slice_size:,} rows (medium/large), {obs_slice_size_xlarge:,} rows (xlarge)")
	print(f"Small threshold: {small_threshold:,} entries")
	print(f"Base chunk size: {base_chunk_size:,} rows (adjusted per dataset size)")
	print(f"{'=' * 80}\n")

	total_datasets = len(datasets_sorted)

	# ========================================================================
	# Phase 1: Process small datasets with ProcessPoolExecutor (batched)
	# ========================================================================
	if small_count > 0:
	print(f"{'='*80}")
	print(f"PHASE 1: Small datasets ({small_count}) - ProcessPoolExecutor")
	print(f"{'='*80}\n")

	# Adaptive worker management
	current_workers = max_workers_base
	min_workers_ratio = config["resources"].get("min_workers_ratio", 0.25)
	min_workers = max(1, int(max_workers_base * min_workers_ratio))
	batch_size = max(30, min(100, small_count // 4))

	# Throughput monitoring
	check_interval = 50
	baseline_throughput = None
	slowdown_threshold = config["resources"].get("slowdown_threshold", 0.5)
	last_check_idx = 0
	batch_start_time = time.time()

	print(f"Workers: {current_workers} (adaptive: {min_workers}-{max_workers_base})")
	print(f"Batch size: {batch_size} (recycled between batches)\n")

	with tqdm(total=small_count, desc="Small datasets", position=0) as pbar:
	for batch_start in range(0, small_count, batch_size):
	batch_end = min(batch_start + batch_size, small_count)
	batch = small_datasets[batch_start:batch_end]

	# Check throughput and adjust workers
	processed = len(successes) + len(failures)
	if processed >= last_check_idx + check_interval and processed > check_interval:
	elapsed = time.time() - batch_start_time
	current_throughput = processed / elapsed if elapsed > 0 else 0

	if baseline_throughput is None and processed >= check_interval * 2:
	baseline_throughput = current_throughput
	tqdm.write(f"Baseline: {baseline_throughput:.2f} ds/sec")

	if baseline_throughput and current_throughput < baseline_throughput * slowdown_threshold:
	if current_workers > min_workers:
	old_workers = current_workers
	current_workers = max(min_workers, current_workers // 2)
	tqdm.write(f"⚠️ Slowdown detected. Workers: {old_workers} → {current_workers}")
	baseline_throughput = None

	last_check_idx = processed

	# Process batch
	executor = concurrent.futures.ProcessPoolExecutor(max_workers=current_workers)
	futures = {}

	try:
	for dataset in batch:
	# Get chunk size for this dataset
	chunk_size, _, _ = get_dataset_params(dataset)
	future = executor.submit(
	process_dataset_simple,
	dataset["dataset_path"],
	dataset["n_obs"],
	dataset["n_vars"],
	chunk_size,
	max_meta_cols,
	max_categories,
	)
	futures[future] = dataset

	for future in concurrent.futures.as_completed(futures):
	dataset = futures[future]
	ds_path = dataset["dataset_path"]
	ds_name = Path(ds_path).name

	try:
	row = future.result(timeout=3600)

	# File size
	try:
	row["file_size_gib"] = round(Path(ds_path).stat().st_size / (1024 ** 3), 4)
	except Exception:
	pass

	# Save JSON
	try:
	payload_name = safe_name(Path(ds_path)) + ".json"
	(per_dataset_dir / payload_name).write_text(json.dumps(row, indent=2))
	except Exception as exc:
	row["save_error"] = str(exc)

	if row.get("status") == "ok":
	successes.append(row)
	elapsed = row.get("elapsed_sec", "?")
	tqdm.write(f" [{len(successes)}/{total_datasets}] ✓ {ds_name[:50]} \| {elapsed}s")
	else:
	failures.append(row)
	error = row.get("error", "Unknown")[:60]
	tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} \| {error}")

	except concurrent.futures.TimeoutError:
	failures.append({
	"dataset_path": ds_path,
	"dataset_file": ds_name,
	"status": "failed",
	"error": "Timeout",
	})
	tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} \| Timeout")
	except Exception as exc:
	failures.append({
	"dataset_path": ds_path,
	"dataset_file": ds_name,
	"status": "failed",
	"error": str(exc),
	})
	tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} \| {exc}")
	finally:
	pbar.update(1)
	finally:
	executor.shutdown(wait=True)
	gc.collect()
	time.sleep(1)

	print(f"\nPhase 1 complete: {len([s for s in successes if s in successes[-small_count:]])} ok, " +
	f"{len([f for f in failures if f in failures[-small_count:]])} failed\n")

	# ========================================================================
	# Phase 2: Process medium/large datasets with Dask
	# ========================================================================
	if dask_count > 0 and client:
	print(f"{'='*80}")
	print(f"PHASE 2: Medium/Large datasets ({dask_count}) - Dask + slicing")
	print(f"{'='*80}\n")

	with tqdm(
	total=dask_count,
	desc="Med/Large datasets",
	position=0,
	leave=True,
	ncols=100
	) as dataset_pbar:
	for ds_local_idx, dataset in enumerate(dask_datasets):
	dataset_idx = small_count + ds_local_idx
	ds_path = dataset["dataset_path"]
	ds_name = Path(ds_path).name
	n_obs = dataset["n_obs"]
	n_vars = dataset["n_vars"]
	total_entries = dataset["total_entries"]

	# Get size-specific parameters
	chunk_size, obs_slice_size, size_cat = get_dataset_params(dataset)

	t0 = time.time()

	# Create slice tasks with adjusted slice size
	slice_tasks = create_slice_tasks(dataset, obs_slice_size, small_threshold)
	n_slices = len(slice_tasks)

	dataset_pbar.set_description(f"Med/Large [{ds_local_idx + 1}/{dask_count}] ({size_cat})")

	# Submit all slices for this dataset
	slice_results: list[SliceResult] = []
	failed_slices: list[tuple[str, int, int]] = []

	# Submit slice tasks to Dask
	futures = client.map(
	lambda t: process_slice(t[0], t[1], t[2], chunk_size),
	slice_tasks,
	pure=False,
	)

	# Collect results with progress bar (show for sliced datasets)
	show_slice_bar = n_slices > 1
	slice_pbar = tqdm(
	total=n_slices,
	desc=f" \u2514\u2500 Slices",
	position=1,
	leave=False,
	ncols=100,
	disable=not show_slice_bar
	) if show_slice_bar else None

	if slice_pbar:
	slice_pbar.set_postfix(ok=0, fail=0)

	for task, future in zip(slice_tasks, futures):
	try:
	sr = future.result(timeout=3600)
	if sr.status == "ok":
	slice_results.append(sr)
	else:
	failed_slices.append(task)
	except Exception:
	failed_slices.append(task)
	finally:
	if slice_pbar:
	slice_pbar.set_postfix(ok=len(slice_results), fail=len(failed_slices))
	slice_pbar.update(1)

	if slice_pbar:
	slice_pbar.close()

	# Retry failed slices
	for retry in range(max_retries):
	if not failed_slices:
	break
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] Retry {retry + 1}/{max_retries}: {len(failed_slices)} failed slices")
	time.sleep(1)

	retry_futures = client.map(
	lambda t: process_slice(t[0], t[1], t[2], chunk_size),
	failed_slices,
	pure=False,
	)
	next_failed = []
	for task, future in zip(failed_slices, retry_futures):
	try:
	sr = future.result(timeout=3600)
	if sr.status == "ok":
	slice_results.append(sr)
	else:
	next_failed.append(task)
	except Exception:
	next_failed.append(task)
	failed_slices = next_failed

	# EMERGENCY MODE: If still failing after all retries, use extreme settings
	if failed_slices and len(failed_slices) > 0:
	emergency_chunk = max(10000, chunk_size // 10) # Use 10% of original or 10K min
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] ⚠️ EMERGENCY MODE: {len(failed_slices)} slices with extreme settings (chunk={emergency_chunk:,})")
	time.sleep(2)

	# Process failed slices one at a time with minimal chunk size
	emergency_ok = 0
	for task in failed_slices:
	try:
	future = client.submit(
	process_slice, task[0], task[1], task[2], emergency_chunk,
	pure=False,
	)
	sr = future.result(timeout=7200) # 2 hour timeout for extreme cases
	if sr.status == "ok":
	slice_results.append(sr)
	emergency_ok += 1
	except Exception as e:
	tqdm.write(f" Emergency failed for slice {task[1]}-{task[2]}: {str(e)[:100]}")
	continue

	if emergency_ok > 0:
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] ✓ Emergency mode recovered {emergency_ok}/{len(failed_slices)} slices")

	# Update failed_slices to only those that still failed
	failed_slices = [t for t in failed_slices if not any(
	sr.slice_start == t[1] and sr.slice_end == t[2] for sr in slice_results
	)]

	# Check if we got enough data
	ok_count = len(slice_results)
	fail_count = len(failed_slices)
	elapsed = round(time.time() - t0, 1)

	if ok_count == 0:
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] ✗ FAILED: {ds_name[:50]} \| all {n_slices} slices failed \| {elapsed}s")
	failures.append({
	"dataset_path": ds_path,
	"dataset_file": ds_name,
	"status": "failed",
	"error": f"All {n_slices} slices failed",
	"elapsed_sec": elapsed,
	})
	dataset_pbar.update(1)
	continue

	# Merge slice results into dataset summary
	row = merge_slice_results(slice_results, n_obs, n_vars)
	row["dataset_path"] = ds_path
	row["dataset_file"] = ds_name
	row["n_slices_total"] = n_slices
	row["n_slices_ok"] = ok_count
	row["n_slices_failed"] = fail_count

	# File size
	try:
	row["file_size_gib"] = round(Path(ds_path).stat().st_size / (1024 ** 3), 4)
	except Exception:
	pass

	# Extract metadata (lightweight, on scheduler)
	meta = extract_metadata_on_scheduler(
	Path(ds_path), max_meta_cols, max_categories
	)
	row.update(meta)

	row["status"] = "ok" if fail_count == 0 else "partial"
	row["elapsed_sec"] = elapsed

	# Save per-dataset JSON
	try:
	payload_name = safe_name(Path(ds_path)) + ".json"
	(per_dataset_dir / payload_name).write_text(json.dumps(row, indent=2))
	except Exception as exc:
	row["save_error"] = str(exc)

	successes.append(row)
	status = "✓" if fail_count == 0 else "⚠"
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] {status} {ds_name[:50]} \| {ok_count}/{n_slices} slices \| {elapsed}s")

	# Free memory
	del slice_results
	gc.collect()

	# Update dataset progress
	dataset_pbar.update(1)

	print(f"\nPhase 2 complete\n")

	# Close Dask cluster before Phase 3 (xlarge direct processing doesn't use Dask)
	if xlarge_count > 0 and client:
	print("Closing Dask cluster before Phase 3 (xlarge datasets process directly)...")
	try:
	client.close()
	del client
	gc.collect()
	time.sleep(2)
	except Exception as e:
	print(f"Warning: Error closing Dask client: {e}")

	# ========================================================================
	# Phase 3: Process xlarge datasets DIRECTLY (skip Dask - causes failures)
	# ========================================================================
	if xlarge_count > 0:
	print(f"{'='*80}")
	print(f"PHASE 3: XLarge datasets ({xlarge_count}) - Direct processing (no Dask)")
	print(f"{'='*80}\n")

	with tqdm(
	total=xlarge_count,
	desc="XLarge datasets",
	position=0,
	leave=True,
	ncols=100
	) as dataset_pbar:
	for ds_local_idx, dataset in enumerate(xlarge_datasets):
	dataset_idx = small_count + dask_count + ds_local_idx
	ds_path = dataset["dataset_path"]
	ds_name = Path(ds_path).name
	n_obs = dataset["n_obs"]
	n_vars = dataset["n_vars"]

	# Get xlarge-specific parameters
	chunk_size, obs_slice_size, size_cat = get_dataset_params(dataset)

	t0 = time.time()

	# Create slice tasks
	slice_tasks = create_slice_tasks(dataset, obs_slice_size, small_threshold)
	n_slices = len(slice_tasks)

	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] Processing {ds_name[:50]} \| {n_slices} slices \| chunk={chunk_size:,}")

	# Process slices DIRECTLY without Dask (one at a time)
	slice_results: list[SliceResult] = []
	for slice_idx, (path, start, end) in enumerate(slice_tasks):
	try:
	sr = process_slice(path, start, end, chunk_size)
	if sr.status == "ok":
	slice_results.append(sr)
	else:
	tqdm.write(f" Slice {slice_idx+1}/{n_slices} failed: {sr.error}")
	except Exception as e:
	tqdm.write(f" Slice {slice_idx+1}/{n_slices} error: {str(e)[:100]}")

	ok_count = len(slice_results)
	fail_count = n_slices - ok_count
	elapsed = round(time.time() - t0, 1)

	if ok_count == 0:
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] ✗ FAILED: {ds_name[:50]} \| all slices failed \| {elapsed}s")
	failures.append({
	"dataset_path": ds_path,
	"dataset_file": ds_name,
	"status": "failed",
	"error": f"All {n_slices} slices failed (xlarge direct mode)",
	"elapsed_sec": elapsed,
	})
	dataset_pbar.update(1)
	continue

	# Merge results
	row = merge_slice_results(slice_results, n_obs, n_vars)
	row["dataset_path"] = ds_path
	row["dataset_file"] = ds_name
	row["n_slices_total"] = n_slices
	row["n_slices_ok"] = ok_count
	row["n_slices_failed"] = fail_count
	row["processing_mode"] = "xlarge_direct"

	# File size
	try:
	row["file_size_gib"] = round(Path(ds_path).stat().st_size / (1024 ** 3), 4)
	except Exception:
	pass

	# Extract metadata
	meta = extract_metadata_on_scheduler(
	Path(ds_path), max_meta_cols, max_categories
	)
	row.update(meta)

	row["status"] = "ok" if fail_count == 0 else "partial"
	row["elapsed_sec"] = elapsed

	# Save per-dataset JSON
	try:
	payload_name = safe_name(Path(ds_path)) + ".json"
	(per_dataset_dir / payload_name).write_text(json.dumps(row, indent=2))
	except Exception as exc:
	row["save_error"] = str(exc)

	successes.append(row)
	status = "✓" if fail_count == 0 else "⚠"
	tqdm.write(f" [{dataset_idx + 1}/{total_datasets}] {status} {ds_name[:50]} \| {ok_count}/{n_slices} slices \| {elapsed}s")

	dataset_pbar.update(1)
	gc.collect()

	print(f"\nPhase 3 complete\n")

	return successes, failures


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------
	def main() -> None:
	    """Run the EDA pipeline for one shard, from CLI parsing to saved summaries.

	    Steps, in order:
	      1. Parse CLI arguments and load the YAML config; ``--num-shards`` and
	         ``--shard-index`` override the ``sharding`` section of the config.
	      2. Load the enhanced metadata cache and select this shard's datasets.
	      3. Drop datasets whose per-dataset JSON already has status "ok" (resume).
	      4. Start an adaptive Dask ``LocalCluster`` only when at least one dataset
	         is at/above the small threshold and not in the "xlarge" category.
	      5. Call ``process_all_datasets``; on interrupt or error, rebuild the
	         result lists from the per-dataset JSON files already on disk.
	      6. Write the summary CSV and failures JSON, then close Dask resources.

	    Relative ``enhanced_metadata_cache`` and ``output_dir`` paths are resolved
	    against the grandparent directory of the config file.
	    """
	    parser = argparse.ArgumentParser(description=__doc__)
	    parser.add_argument("--config", type=Path, required=True, help="YAML config")
	    parser.add_argument("--num-shards", type=int, help="Override num_shards")
	    parser.add_argument("--shard-index", type=int, help="Override shard_index")
	    parser.add_argument("--max-retries", type=int, default=3, help="Max retries per slice")
	    args = parser.parse_args()

	    config = load_config(args.config)

	    # CLI overrides take precedence over the YAML sharding section.
	    if args.num_shards is not None:
	        config["sharding"]["num_shards"] = args.num_shards
	        config["sharding"]["enabled"] = args.num_shards > 1
	    if args.shard_index is not None:
	        config["sharding"]["shard_index"] = args.shard_index

	    num_shards = config["sharding"]["num_shards"]
	    shard_index = config["sharding"]["shard_index"]

	    configure_dask_for_hpc()

	    # Base directory used to resolve relative paths from the config.
	    config_root = Path(args.config).parent.parent

	    # Load metadata
	    cache_path = Path(config["paths"]["enhanced_metadata_cache"])
	    if not cache_path.is_absolute():
	        cache_path = config_root / cache_path
	    print(f"Loading metadata from: {cache_path}")
	    metadata_df = load_enhanced_metadata(cache_path)

	    datasets = get_datasets_for_shard(metadata_df, config, num_shards, shard_index)
	    if not datasets:
	        print("No datasets scheduled for this shard.")
	        return

	    # Output dirs
	    output_dir = Path(config["paths"]["output_dir"])
	    if not output_dir.is_absolute():
	        output_dir = config_root / output_dir
	    output_dir.mkdir(parents=True, exist_ok=True)
	    per_dataset_dir = output_dir / "per_dataset"
	    per_dataset_dir.mkdir(parents=True, exist_ok=True)

	    # Filter out already-completed datasets (resume capability)
	    def is_dataset_done(ds_path: str) -> bool:
	        """Return True if a per-dataset JSON exists with status "ok"."""
	        try:
	            payload_name = safe_name(Path(ds_path)) + ".json"
	            result_file = per_dataset_dir / payload_name
	            if result_file.exists():
	                result_data = json.loads(result_file.read_text())
	                return result_data.get("status") == "ok"
	        except Exception:
	            # Unreadable or malformed payload: treat as not done and re-process.
	            pass
	        return False

	    original_count = len(datasets)
	    datasets = [d for d in datasets if not is_dataset_done(d["dataset_path"])]
	    skipped_count = original_count - len(datasets)

	    if skipped_count > 0:
	        print(f"\n{'='*80}")
	        print(f"RESUME MODE: Skipping {skipped_count} already-completed datasets")
	        print(f"Remaining to process: {len(datasets)}")
	        print(f"{'='*80}\n")

	    if not datasets:
	        print("All datasets already completed. Nothing to do.")
	        return

	    # Check if we need Dask cluster (for medium/large datasets)
	    small_threshold = config["dataset_thresholds"]["small"]

	    def is_xlarge(d: dict[str, Any]) -> bool:
	        return d.get("size_category", "large") == "xlarge"

	    def needs_dask(d: dict[str, Any]) -> bool:
	        return d.get("total_entries", 0) >= small_threshold and not is_xlarge(d)

	    # Count datasets by processing type
	    small_count_init = sum(1 for d in datasets if d.get("total_entries", 0) < small_threshold)
	    dask_count_init = sum(1 for d in datasets if needs_dask(d))
	    xlarge_count_init = sum(1 for d in datasets if is_xlarge(d))

	    client = None
	    cluster = None

	    # Everything that may create Dask resources lives inside this try so the
	    # finally clause can release them even if cluster start-up itself fails.
	    try:
	        if dask_count_init > 0:
	            # Cluster setup for large datasets
	            max_memory_gib = config["resources"]["max_memory_gib"]
	            max_workers = config["resources"]["max_workers"]
	            min_workers = config["resources"].get("min_workers", min(4, max_workers))
	            threads_per_worker = config["resources"].get("threads_per_worker", 1)

	            # Adaptive scaling config
	            adaptive_config = config["resources"].get("adaptive_scaling", {})
	            target_duration = adaptive_config.get("target_duration", "30s")
	            wait_count = adaptive_config.get("wait_count", 3)
	            interval = adaptive_config.get("interval", "2s")

	            # Never go below 2 GiB per worker, whatever the budget says.
	            memory_per_worker_gib = max(2.0, max_memory_gib / max_workers)

	            # .get() keeps this consistent with the counting above, which
	            # already tolerates datasets without a total_entries field.
	            total_entries = sum(d.get("total_entries", 0) for d in datasets)
	            obs_slice_size = config["slicing"].get("obs_slice_size", 75_000)
	            total_slices = sum(
	                max(1, math.ceil(d["n_obs"] / obs_slice_size))
	                for d in datasets if needs_dask(d)
	            )

	            print(json.dumps({
	                "total_datasets": len(datasets),
	                "small_datasets": small_count_init,
	                "large_datasets": dask_count_init + xlarge_count_init,
	                "total_slices": total_slices,
	                "total_entries": total_entries,
	                "shard_index": shard_index,
	                "num_shards": num_shards,
	                "memory_per_worker_gib": round(memory_per_worker_gib, 1),
	                "max_workers": max_workers,
	            }, indent=2))

	            print(f"\nStarting Dask LocalCluster (for {dask_count_init} medium/large datasets):")
	            print(f" Workers: {min_workers} -> {max_workers} (adaptive)")
	            print(f" Memory per worker: {memory_per_worker_gib:.1f} GiB")
	            print(f" Total memory budget: {max_memory_gib} GiB\n")

	            cluster = LocalCluster(
	                n_workers=min_workers,
	                threads_per_worker=threads_per_worker,
	                processes=True,
	                memory_limit=f"{memory_per_worker_gib}GiB",
	                silence_logs=True,
	                dashboard_address=None,
	                lifetime="180 minutes",
	                lifetime_stagger="20 minutes",
	            )

	            cluster.adapt(
	                minimum=min_workers,
	                maximum=max_workers,
	                target_duration=target_duration,
	                wait_count=wait_count,
	                interval=interval,
	            )

	            client = Client(cluster)
	            print(f"Dask cluster ready: {client}\n")
	        else:
	            print("No Dask-compatible datasets (all small or xlarge)\n")

	        if xlarge_count_init > 0:
	            print(f"Note: {xlarge_count_init} xlarge datasets will be processed directly (Phase 3, no Dask)\n")

	        try:
	            successes, failures = process_all_datasets(
	                datasets, config, per_dataset_dir, client,
	                max_retries=args.max_retries,
	            )

	            # Include previously completed datasets in final summary
	            if skipped_count > 0:
	                print(f"\nLoading {skipped_count} previously completed results...")
	                # Set lookup instead of rescanning `successes` for every file.
	                known_paths = {s.get("dataset_path") for s in successes}
	                for json_file in per_dataset_dir.glob("*.json"):
	                    try:
	                        result = json.loads(json_file.read_text())
	                        if result.get("status") == "ok":
	                            # Check if not already in successes
	                            ds_path = result.get("dataset_path", "")
	                            if ds_path not in known_paths:
	                                successes.append(result)
	                                # Record the raw value (None when the key is
	                                # absent) so path-less payloads are never
	                                # collapsed into one another.
	                                known_paths.add(result.get("dataset_path"))
	                    except Exception:
	                        # Best effort: skip payloads that cannot be read.
	                        pass
	                print(f"Total results (new + previous): {len(successes)}")

	            print(f"\n{'=' * 80}")
	            print("PROCESSING COMPLETE")
	            print(f" Succeeded: {len(successes)}")
	            print(f" Failed: {len(failures)}")
	            print(f" Success rate: {len(successes) / max(1, original_count) * 100:.1f}%")
	            print(f"{'=' * 80}\n")

	            if failures:
	                print("WARNING: Some datasets failed permanently:")
	                for fail in failures[:10]:
	                    # Tolerate incomplete failure records: an exception here
	                    # would discard the in-memory results gathered above.
	                    fail_name = fail.get("dataset_file", "<unknown>")
	                    fail_error = str(fail.get("error", "Unknown"))[:80]
	                    print(f" - {fail_name}: {fail_error}")
	                if len(failures) > 10:
	                    print(f" ... and {len(failures) - 10} more")

	        except KeyboardInterrupt:
	            # The first banner line needs the f-prefix; without it the
	            # placeholder text was printed literally.
	            print(f"\n\n{'=' * 80}")
	            print("INTERRUPTED - Saving partial results...")
	            print(f"{'=' * 80}\n")
	            successes, failures = _reload_results_from_disk(per_dataset_dir)
	        except Exception as exc:
	            print(f"\n\nERROR during processing: {exc}")
	            print("Saving partial results...")
	            successes, failures = _reload_results_from_disk(per_dataset_dir)

	        # Always save results, even on error/interrupt
	        try:
	            summary_df = pd.DataFrame(successes)
	            # Deduplicate by dataset_path (keeps the first occurrence)
	            if not summary_df.empty and 'dataset_path' in summary_df.columns:
	                rows_before_dedup = len(summary_df)
	                summary_df = summary_df.drop_duplicates(subset=['dataset_path'], keep='first')
	                if len(summary_df) < rows_before_dedup:
	                    print(f"\nRemoved {rows_before_dedup - len(summary_df)} duplicate entries from results")

	            summary_csv = output_dir / f"eda_summary_shard_{shard_index:03d}_of_{num_shards:03d}.csv"
	            summary_df.to_csv(summary_csv, index=False)

	            failures_path = output_dir / f"eda_failures_shard_{shard_index:03d}_of_{num_shards:03d}.json"
	            failures_path.write_text(json.dumps(failures, indent=2))

	            print(f"\n{'=' * 80}")
	            print("RESULTS SAVED")
	            print(f" Summary CSV: {summary_csv}")
	            print(f" Failures JSON: {failures_path}")
	            print(json.dumps({
	                "ok_count": len(successes),
	                "failed_count": len(failures),
	            }, indent=2))
	            print(f"{'=' * 80}\n")
	        except Exception as save_exc:
	            print(f"ERROR saving results: {save_exc}")

	    finally:
	        # Close client and cluster independently so a failure in one close
	        # call cannot leave the other resource open.
	        for dask_resource in (client, cluster):
	            if dask_resource is None:
	                continue
	            try:
	                dask_resource.close()
	            except Exception as close_exc:
	                print(f"Warning: Error closing Dask resource: {close_exc}")


	def _reload_results_from_disk(
	    per_dataset_dir: Path,
	) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
	    """Rebuild (successes, failures) from the per-dataset JSON files on disk.

	    Used when processing was interrupted or raised, so the in-memory result
	    lists are unavailable. Payloads are deduplicated by ``dataset_path`` (the
	    first file seen wins); payloads with status "ok" go to successes, every
	    other payload goes to failures. Unreadable files are skipped.

	    Args:
	        per_dataset_dir: Directory containing the ``*.json`` payloads.

	    Returns:
	        A ``(successes, failures)`` tuple of decoded payload dicts.
	    """
	    successes: list[dict[str, Any]] = []
	    failures: list[dict[str, Any]] = []
	    seen_paths = set()
	    for json_file in per_dataset_dir.glob("*.json"):
	        try:
	            result = json.loads(json_file.read_text())
	            ds_path = result.get("dataset_path", "")
	            if ds_path and ds_path in seen_paths:
	                continue  # Skip duplicate
	            seen_paths.add(ds_path)
	            if result.get("status") == "ok":
	                successes.append(result)
	            else:
	                failures.append(result)
	        except Exception:
	            # Best effort: a corrupt payload must not block saving the rest.
	            continue
	    return successes, failures


	# Script entry point: run the pipeline only when this file is executed directly.
	if __name__ == "__main__":
	main()