whats2000 committed on
Commit
b8d98f3
·
1 Parent(s): 74e20c3

feat(eda): implement hybrid processing strategy for small and large datasets

Browse files
Files changed (1) hide show
  1. scripts/distributed_eda.py +400 -131
scripts/distributed_eda.py CHANGED
@@ -12,6 +12,7 @@ and metadata summaries. This handles datasets from 2 GB to 500 GB.
12
  from __future__ import annotations
13
 
14
  import argparse
 
15
  import gc
16
  import hashlib
17
  import json
@@ -165,7 +166,184 @@ def merge_slice_results(slices: list[SliceResult], n_obs: int, n_vars: int) -> d
165
 
166
 
167
  # ---------------------------------------------------------------------------
168
- # Core worker function: process ONE slice of ONE dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  # ---------------------------------------------------------------------------
170
  def process_slice(
171
  path_str: str,
@@ -458,122 +636,203 @@ def process_all_datasets(
458
  datasets: list[dict],
459
  config: dict,
460
  per_dataset_dir: Path,
461
- client: Client,
462
  max_retries: int = 3,
463
  ) -> tuple[list[dict], list[dict]]:
464
- """Process all datasets by slicing into bounded tasks.
465
-
466
- Each task processes at most obs_slice_size rows. Results are merged
467
- per-dataset with O(n_vars) memory on the scheduler side.
468
- """
469
  chunk_size = config["resources"]["chunk_size"]
470
  obs_slice_size = config["slicing"].get("obs_slice_size", 75_000)
471
  small_threshold = config["dataset_thresholds"]["small"]
472
  max_meta_cols = config["metadata"]["max_meta_cols"]
473
  max_categories = config["metadata"]["max_categories"]
 
474
 
475
  successes = []
476
  failures = []
477
 
478
- # Categorize and sort datasets
479
  small_datasets = [d for d in datasets if d.get("total_entries", 0) < small_threshold]
480
  large_datasets = [d for d in datasets if d.get("total_entries", 0) >= small_threshold]
481
 
482
- # Sort each category
483
  small_datasets.sort(key=lambda d: d["total_entries"])
484
  large_datasets.sort(key=lambda d: d["total_entries"])
485
 
486
- # Process small first in batches for speed, then large one-by-one
487
  datasets_sorted = small_datasets + large_datasets
488
  small_count = len(small_datasets)
489
  sliced_count = len(large_datasets)
490
 
491
- # Calculate batch size for small datasets (maximize worker utilization)
492
- max_workers = client.cluster.maximum if hasattr(client.cluster, 'maximum') else 48
493
- small_batch_size = max(1, min(max_workers, small_count)) # Process up to max_workers at once
494
-
495
  print(f"\n{'=' * 80}")
496
  print(f"Processing {len(datasets_sorted)} datasets")
497
- print(f" Small datasets (no slicing): {small_count} (batch size: {small_batch_size})")
498
- print(f" Medium/Large (sliced): {sliced_count}")
499
  print(f"Slice size: {obs_slice_size:,} rows per task (for medium/large)")
500
  print(f"Small threshold: {small_threshold:,} entries")
501
  print(f"Chunk size: {chunk_size:,} rows per sub-chunk")
502
  print(f"{'=' * 80}\n")
503
 
504
  total_datasets = len(datasets_sorted)
505
- is_small_batch = True # Track if we're in small dataset phase
506
-
507
- # Overall progress bar for all datasets
508
- with tqdm(
509
- total=total_datasets,
510
- desc="Datasets",
511
- position=0,
512
- leave=True,
513
- ncols=100
514
- ) as dataset_pbar:
515
- # Process datasets in batches for small, one-by-one for large
516
- ds_idx = 0
517
- while ds_idx < total_datasets:
518
- # Determine batch size
519
- if ds_idx < small_count:
520
- # Small datasets: batch processing
521
- batch_end = min(ds_idx + small_batch_size, small_count)
522
- batch = datasets_sorted[ds_idx:batch_end]
523
- is_small_batch = True
524
- else:
525
- # Large datasets: process one at a time
526
- batch = [datasets_sorted[ds_idx]]
527
- batch_end = ds_idx + 1
528
- is_small_batch = False
529
-
530
- # Submit all tasks for the batch
531
- batch_futures = []
532
- batch_info = []
533
-
534
- for dataset in batch:
535
- dataset_idx = ds_idx + batch.index(dataset)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  ds_path = dataset["dataset_path"]
537
  ds_name = Path(ds_path).name
538
  n_obs = dataset["n_obs"]
539
  n_vars = dataset["n_vars"]
540
  total_entries = dataset["total_entries"]
541
 
542
- # Create slice tasks (small datasets = 1 task, large = sliced)
 
 
543
  slice_tasks = create_slice_tasks(dataset, obs_slice_size, small_threshold)
544
  n_slices = len(slice_tasks)
545
 
 
 
 
 
 
 
546
  # Submit slice tasks to Dask
547
  futures = client.map(
548
  lambda t: process_slice(t[0], t[1], t[2], chunk_size),
549
  slice_tasks,
550
  pure=False,
551
  )
552
-
553
- batch_futures.append(futures)
554
- batch_info.append({
555
- 'dataset': dataset,
556
- 'ds_idx': dataset_idx,
557
- 'ds_path': ds_path,
558
- 'ds_name': ds_name,
559
- 'n_obs': n_obs,
560
- 'n_vars': n_vars,
561
- 'total_entries': total_entries,
562
- 'slice_tasks': slice_tasks,
563
- 'n_slices': n_slices,
564
- 't0': time.time()
565
- })
566
-
567
- # Process results for the batch
568
- dataset_pbar.set_description(f"Datasets [{ds_idx + 1}-{batch_end}/{total_datasets}]" if len(batch) > 1 else f"Datasets [{ds_idx + 1}/{total_datasets}]")
569
-
570
- for info_idx, (futures, info) in enumerate(zip(batch_futures, batch_info)):
571
- dataset = info['dataset']
572
- dataset_idx = info['ds_idx']
573
- ds_path = info['ds_path']
574
- ds_name = info['ds_name']
575
- n_obs = info['n_obs']
576
- n_vars = info['n_vars']
577
  total_entries = info['total_entries']
578
  slice_tasks = info['slice_tasks']
579
  n_slices = info['n_slices']
@@ -583,11 +842,11 @@ def process_all_datasets(
583
  slice_results: list[SliceResult] = []
584
  failed_slices: list[tuple[str, int, int]] = []
585
 
586
- # Collect results with progress bar (show only for sliced datasets)
587
- show_slice_bar = n_slices > 1 and not is_small_batch
588
  slice_pbar = tqdm(
589
  total=n_slices,
590
- desc=f" └─ Slices",
591
  position=1,
592
  leave=False,
593
  ncols=100,
@@ -696,8 +955,7 @@ def process_all_datasets(
696
  # Update dataset progress
697
  dataset_pbar.update(1)
698
 
699
- # Move to next batch
700
- ds_idx = batch_end
701
 
702
  return successes, failures
703
 
@@ -746,60 +1004,69 @@ def main() -> None:
746
  per_dataset_dir = output_dir / "per_dataset"
747
  per_dataset_dir.mkdir(parents=True, exist_ok=True)
748
 
749
- # Cluster setup
750
- max_memory_gib = config["resources"]["max_memory_gib"]
751
- max_workers = config["resources"]["max_workers"]
752
- min_workers = min(4, max_workers)
753
-
754
- # Each worker needs enough memory for: chunk_size * n_vars * 12 bytes * 3x overhead
755
- # With slice architecture, workers are lightweight - give them decent memory
756
- memory_per_worker_gib = max(2.0, max_memory_gib / max_workers)
757
-
758
- total_entries = sum(d["total_entries"] for d in datasets)
759
- total_slices = sum(
760
- max(1, math.ceil(d["n_obs"] / config["slicing"].get("obs_slice_size", 50_000)))
761
- for d in datasets
762
- )
763
-
764
- print(json.dumps({
765
- "total_datasets": len(datasets),
766
- "total_slices": total_slices,
767
- "total_entries": total_entries,
768
- "shard_index": shard_index,
769
- "num_shards": num_shards,
770
- "memory_per_worker_gib": round(memory_per_worker_gib, 1),
771
- "max_workers": max_workers,
772
- }, indent=2))
773
-
774
- print(f"\nStarting Dask LocalCluster:")
775
- print(f" Workers: {min_workers} -> {max_workers} (adaptive)")
776
- print(f" Memory per worker: {memory_per_worker_gib:.1f} GiB")
777
- print(f" Total memory budget: {max_memory_gib} GiB\n")
778
-
779
- cluster = LocalCluster(
780
- n_workers=min_workers,
781
- threads_per_worker=1,
782
- processes=True,
783
- memory_limit=f"{memory_per_worker_gib}GiB",
784
- silence_logs=True,
785
- dashboard_address=None,
786
- lifetime="120 minutes",
787
- lifetime_stagger="15 minutes",
788
- )
789
-
790
- cluster.adapt(
791
- minimum=min_workers,
792
- maximum=max_workers,
793
- target_duration="30s",
794
- wait_count=3,
795
- interval="2s",
796
- )
797
-
798
- client = Client(cluster)
799
 
800
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  print(f"Dask cluster ready: {client}\n")
 
 
802
 
 
803
  successes, failures = process_all_datasets(
804
  datasets, config, per_dataset_dir, client,
805
  max_retries=args.max_retries,
@@ -835,8 +1102,10 @@ def main() -> None:
835
  }, indent=2))
836
 
837
  finally:
838
- client.close()
839
- cluster.close()
 
 
840
 
841
 
842
  if __name__ == "__main__":
 
12
  from __future__ import annotations
13
 
14
  import argparse
15
+ import concurrent.futures
16
  import gc
17
  import hashlib
18
  import json
 
166
 
167
 
168
  # ---------------------------------------------------------------------------
169
+ # Simple worker function for small datasets (no Dask overhead)
170
+ # ---------------------------------------------------------------------------
171
def process_dataset_simple(
    path_str: str,
    n_obs: int,
    n_vars: int,
    chunk_size: int,
    max_meta_cols: int,
    max_categories: int,
) -> dict:
    """Process an entire small dataset in one worker (no slicing, no Dask).

    Reads the ``.h5ad`` file at *path_str* in backed (on-disk) mode and streams
    over ``adata.X`` in row blocks of *chunk_size*, accumulating matrix-, cell-
    and gene-level summary statistics plus obs/var metadata summaries into a
    single flat result dict.

    Args:
        path_str: Filesystem path to the ``.h5ad`` dataset.
        n_obs: Number of observations (rows) in the matrix, supplied by the
            caller's pre-scanned manifest.
        n_vars: Number of variables (columns) in the matrix.
        chunk_size: Rows per streamed sub-chunk.
        max_meta_cols: Cap on metadata columns passed to ``summarize_metadata``.
        max_categories: Cap on category values passed to ``summarize_metadata``.

    Returns:
        A dict with ``status`` of ``"ok"`` (statistics populated, slice counters
        set to a single slice) or ``"failed"`` (``error`` holds the exception
        text). Always includes ``elapsed_sec``.

    Note:
        Any exception during processing is caught and reported via the returned
        dict rather than raised, so a pool of these workers never dies on one
        bad file.
    """
    t0 = time.time()
    path = Path(path_str)
    # Base fields present in the result regardless of success/failure.
    row: dict[str, Any] = {
        "dataset_path": path_str,
        "dataset_file": path.name,
        "n_obs": n_obs,
        "n_vars": n_vars,
    }

    try:
        # Backed mode keeps X on disk; chunks are materialized per-slice below.
        adata = ad.read_h5ad(path, backed="r")
        total_entries = n_obs * n_vars

        # Matrix-level accumulators (nnz, first/second moments of the values).
        nnz_total = 0
        x_sum = 0.0
        x_sum_sq = 0.0

        # Cell-level accumulators
        cell_total_counts_sum = 0.0
        cell_total_counts_min = math.inf
        cell_total_counts_max = -math.inf
        cell_n_genes_sum = 0
        # Sentinel: max int64, so any real per-cell gene count replaces it.
        cell_n_genes_min = 2**63 - 1
        cell_n_genes_max = 0

        # Gene-level accumulators
        gene_n_cells = np.zeros(n_vars, dtype=np.int64)
        gene_total_counts = np.zeros(n_vars, dtype=np.float64)

        # Process in chunks
        for start in range(0, n_obs, chunk_size):
            end = min(start + chunk_size, n_obs)
            chunk = adata.X[start:end, :]

            if sparse.issparse(chunk):
                # Normalize to CSR so row sums / indptr diffs are cheap.
                csr = chunk.tocsr() if not sparse.isspmatrix_csr(chunk) else chunk
                data = csr.data.astype(np.float64, copy=False)

                nnz_total += int(csr.nnz)
                x_sum += float(data.sum())
                x_sum_sq += float(np.square(data).sum())

                # Cell stats: per-row totals; indptr diff = nonzeros per row
                # (genes detected per cell).
                cell_counts = np.asarray(csr.sum(axis=1)).ravel()
                cell_genes = np.diff(csr.indptr).astype(np.int64)

                cell_total_counts_sum += float(cell_counts.sum())
                cell_total_counts_min = min(cell_total_counts_min, float(cell_counts.min()))
                cell_total_counts_max = max(cell_total_counts_max, float(cell_counts.max()))
                cell_n_genes_sum += int(cell_genes.sum())
                cell_n_genes_min = min(cell_n_genes_min, int(cell_genes.min()))
                cell_n_genes_max = max(cell_n_genes_max, int(cell_genes.max()))

                # Gene stats: convert to CSC so per-column counts/sums are cheap.
                csc = csr.tocsc()
                gene_n_cells += np.diff(csc.indptr).astype(np.int64)
                gene_total_counts += np.asarray(csc.sum(axis=0)).ravel()

                # Drop chunk-local arrays promptly to bound peak memory.
                del csr, csc, data
            else:
                # Dense branch: same statistics via a boolean nonzero mask.
                arr = np.asarray(chunk, dtype=np.float64)
                nz = arr != 0

                nnz_total += int(nz.sum())
                x_sum += float(arr.sum())
                x_sum_sq += float(np.square(arr).sum())

                # Cell stats
                cell_counts = arr.sum(axis=1)
                cell_genes = nz.sum(axis=1).astype(np.int64)

                cell_total_counts_sum += float(cell_counts.sum())
                cell_total_counts_min = min(cell_total_counts_min, float(cell_counts.min()))
                cell_total_counts_max = max(cell_total_counts_max, float(cell_counts.max()))
                cell_n_genes_sum += int(cell_genes.sum())
                cell_n_genes_min = min(cell_n_genes_min, int(cell_genes.min()))
                cell_n_genes_max = max(cell_n_genes_max, int(cell_genes.max()))

                # Gene stats
                gene_n_cells += nz.sum(axis=0).astype(np.int64)
                gene_total_counts += arr.sum(axis=0)

                del arr, nz

            del chunk
            gc.collect()

        # Matrix-level stats. Mean/std over ALL entries (zeros included), with
        # std via E[x^2] - E[x]^2 clamped at 0 against float round-off.
        row["nnz"] = int(nnz_total)
        row["sparsity"] = float(1.0 - nnz_total / total_entries) if total_entries else None
        row["x_mean"] = float(x_sum / total_entries) if total_entries else None
        if total_entries:
            var = max(0.0, x_sum_sq / total_entries - (x_sum / total_entries) ** 2)
            row["x_std"] = float(math.sqrt(var))
        else:
            row["x_std"] = None

        # Cell-level stats (None when the dataset has no rows, so the inf/max
        # sentinels above never leak into the output).
        if n_obs > 0:
            row["cell_total_counts_min"] = float(cell_total_counts_min)
            row["cell_total_counts_max"] = float(cell_total_counts_max)
            row["cell_total_counts_mean"] = float(cell_total_counts_sum / n_obs)
            row["cell_n_genes_detected_min"] = int(cell_n_genes_min)
            row["cell_n_genes_detected_max"] = int(cell_n_genes_max)
            row["cell_n_genes_detected_mean"] = float(cell_n_genes_sum / n_obs)
        else:
            row["cell_total_counts_min"] = None
            row["cell_total_counts_max"] = None
            row["cell_total_counts_mean"] = None
            row["cell_n_genes_detected_min"] = None
            row["cell_n_genes_detected_max"] = None
            row["cell_n_genes_detected_mean"] = None

        # Gene-level stats, restricted to genes detected in at least one cell.
        genes_detected = int(np.count_nonzero(gene_n_cells))
        row["genes_detected_in_any_cell"] = genes_detected
        row["genes_detected_in_any_cell_pct"] = float(genes_detected / n_vars * 100) if n_vars else 0.0
        if genes_detected > 0:
            mask = gene_n_cells > 0
            row["gene_n_cells_min"] = int(gene_n_cells[mask].min())
            row["gene_n_cells_max"] = int(gene_n_cells[mask].max())
            row["gene_n_cells_mean"] = float(gene_n_cells[mask].mean())
            row["gene_total_counts_min"] = float(gene_total_counts[mask].min())
            row["gene_total_counts_max"] = float(gene_total_counts[mask].max())
            row["gene_total_counts_mean"] = float(gene_total_counts[mask].mean())
        else:
            # NOTE(review): zeros here (vs None used for the empty cell stats
            # above) — presumably intentional for downstream aggregation;
            # confirm consumers treat 0 and None consistently.
            for k in ("gene_n_cells_min", "gene_n_cells_max", "gene_n_cells_mean",
                      "gene_total_counts_min", "gene_total_counts_max", "gene_total_counts_mean"):
                row[k] = 0

        # Metadata summaries and schemas from obs/var dataframes (helpers are
        # defined elsewhere in this module).
        row["obs_columns"] = int(len(adata.obs.columns))
        row["var_columns"] = int(len(adata.var.columns))
        row["metadata_obs_summary"] = summarize_metadata(
            adata.obs, max_cols=max_meta_cols, max_categories=max_categories
        )
        row["metadata_var_summary"] = summarize_metadata(
            adata.var, max_cols=max_meta_cols, max_categories=max_categories
        )
        row["obs_schema"] = extract_schema(adata.obs)
        row["var_schema"] = extract_schema(adata.var)

        # Clean up: explicitly close the backing HDF5 file handle (best-effort;
        # backed AnnData keeps the file open until closed).
        del gene_n_cells, gene_total_counts
        try:
            if hasattr(adata, "file") and adata.file is not None:
                adata.file.close()
        except Exception:
            pass
        del adata

        row["status"] = "ok"
        # Mirror the sliced-pipeline bookkeeping: a small dataset is one slice.
        row["n_slices_total"] = 1
        row["n_slices_ok"] = 1
        row["n_slices_failed"] = 0

    except Exception as exc:
        # Report failure in-band so the worker pool keeps running.
        row["status"] = "failed"
        row["error"] = str(exc)

    gc.collect()
    row["elapsed_sec"] = round(time.time() - t0, 2)
    return row
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # Core worker function: process ONE slice of ONE dataset (Dask)
347
  # ---------------------------------------------------------------------------
348
  def process_slice(
349
  path_str: str,
 
636
  datasets: list[dict],
637
  config: dict,
638
  per_dataset_dir: Path,
639
+ client: Client | None,
640
  max_retries: int = 3,
641
  ) -> tuple[list[dict], list[dict]]:
642
+ """Process all datasets: small ones with ProcessPoolExecutor, large ones with Dask."""
 
 
 
 
643
  chunk_size = config["resources"]["chunk_size"]
644
  obs_slice_size = config["slicing"].get("obs_slice_size", 75_000)
645
  small_threshold = config["dataset_thresholds"]["small"]
646
  max_meta_cols = config["metadata"]["max_meta_cols"]
647
  max_categories = config["metadata"]["max_categories"]
648
+ max_workers_base = config["resources"]["max_workers"]
649
 
650
  successes = []
651
  failures = []
652
 
653
+ # Categorize datasets
654
  small_datasets = [d for d in datasets if d.get("total_entries", 0) < small_threshold]
655
  large_datasets = [d for d in datasets if d.get("total_entries", 0) >= small_threshold]
656
 
 
657
  small_datasets.sort(key=lambda d: d["total_entries"])
658
  large_datasets.sort(key=lambda d: d["total_entries"])
659
 
 
660
  datasets_sorted = small_datasets + large_datasets
661
  small_count = len(small_datasets)
662
  sliced_count = len(large_datasets)
663
 
 
 
 
 
664
  print(f"\n{'=' * 80}")
665
  print(f"Processing {len(datasets_sorted)} datasets")
666
+ print(f" Small datasets (ProcessPoolExecutor): {small_count}")
667
+ print(f" Medium/Large (Dask + slicing): {sliced_count}")
668
  print(f"Slice size: {obs_slice_size:,} rows per task (for medium/large)")
669
  print(f"Small threshold: {small_threshold:,} entries")
670
  print(f"Chunk size: {chunk_size:,} rows per sub-chunk")
671
  print(f"{'=' * 80}\n")
672
 
673
  total_datasets = len(datasets_sorted)
674
+
675
+ # ========================================================================
676
+ # Phase 1: Process small datasets with ProcessPoolExecutor (batched)
677
+ # ========================================================================
678
+ if small_count > 0:
679
+ print(f"{'='*80}")
680
+ print(f"PHASE 1: Small datasets ({small_count}) - ProcessPoolExecutor")
681
+ print(f"{'='*80}\n")
682
+
683
+ # Adaptive worker management
684
+ current_workers = max_workers_base
685
+ min_workers = max(1, max_workers_base // 4)
686
+ batch_size = max(30, min(100, small_count // 4))
687
+
688
+ # Throughput monitoring
689
+ check_interval = 50
690
+ baseline_throughput = None
691
+ slowdown_threshold = 0.5
692
+ last_check_idx = 0
693
+ batch_start_time = time.time()
694
+
695
+ print(f"Workers: {current_workers} (adaptive: {min_workers}-{max_workers_base})")
696
+ print(f"Batch size: {batch_size} (recycled between batches)\n")
697
+
698
+ with tqdm(total=small_count, desc="Small datasets", position=0) as pbar:
699
+ for batch_start in range(0, small_count, batch_size):
700
+ batch_end = min(batch_start + batch_size, small_count)
701
+ batch = small_datasets[batch_start:batch_end]
702
+
703
+ # Check throughput and adjust workers
704
+ processed = len(successes) + len(failures)
705
+ if processed >= last_check_idx + check_interval and processed > check_interval:
706
+ elapsed = time.time() - batch_start_time
707
+ current_throughput = processed / elapsed if elapsed > 0 else 0
708
+
709
+ if baseline_throughput is None and processed >= check_interval * 2:
710
+ baseline_throughput = current_throughput
711
+ tqdm.write(f"Baseline: {baseline_throughput:.2f} ds/sec")
712
+
713
+ if baseline_throughput and current_throughput < baseline_throughput * slowdown_threshold:
714
+ if current_workers > min_workers:
715
+ old_workers = current_workers
716
+ current_workers = max(min_workers, current_workers // 2)
717
+ tqdm.write(f"⚠️ Slowdown detected. Workers: {old_workers} → {current_workers}")
718
+ baseline_throughput = None
719
+
720
+ last_check_idx = processed
721
+
722
+ # Process batch
723
+ executor = concurrent.futures.ProcessPoolExecutor(max_workers=current_workers)
724
+ futures = {}
725
+
726
+ try:
727
+ for dataset in batch:
728
+ future = executor.submit(
729
+ process_dataset_simple,
730
+ dataset["dataset_path"],
731
+ dataset["n_obs"],
732
+ dataset["n_vars"],
733
+ chunk_size,
734
+ max_meta_cols,
735
+ max_categories,
736
+ )
737
+ futures[future] = dataset
738
+
739
+ for future in concurrent.futures.as_completed(futures):
740
+ dataset = futures[future]
741
+ ds_path = dataset["dataset_path"]
742
+ ds_name = Path(ds_path).name
743
+
744
+ try:
745
+ row = future.result(timeout=3600)
746
+
747
+ # File size
748
+ try:
749
+ row["file_size_gib"] = round(Path(ds_path).stat().st_size / (1024 ** 3), 4)
750
+ except Exception:
751
+ pass
752
+
753
+ # Save JSON
754
+ try:
755
+ payload_name = safe_name(Path(ds_path)) + ".json"
756
+ (per_dataset_dir / payload_name).write_text(json.dumps(row, indent=2))
757
+ except Exception as exc:
758
+ row["save_error"] = str(exc)
759
+
760
+ if row.get("status") == "ok":
761
+ successes.append(row)
762
+ elapsed = row.get("elapsed_sec", "?")
763
+ tqdm.write(f" [{len(successes)}/{total_datasets}] ✓ {ds_name[:50]} | {elapsed}s")
764
+ else:
765
+ failures.append(row)
766
+ error = row.get("error", "Unknown")[:60]
767
+ tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} | {error}")
768
+
769
+ except concurrent.futures.TimeoutError:
770
+ failures.append({
771
+ "dataset_path": ds_path,
772
+ "dataset_file": ds_name,
773
+ "status": "failed",
774
+ "error": "Timeout",
775
+ })
776
+ tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} | Timeout")
777
+ except Exception as exc:
778
+ failures.append({
779
+ "dataset_path": ds_path,
780
+ "dataset_file": ds_name,
781
+ "status": "failed",
782
+ "error": str(exc),
783
+ })
784
+ tqdm.write(f" [{len(successes) + len(failures)}/{total_datasets}] ✗ {ds_name[:50]} | {exc}")
785
+ finally:
786
+ pbar.update(1)
787
+ finally:
788
+ executor.shutdown(wait=True)
789
+ gc.collect()
790
+ time.sleep(1)
791
+
792
+ print(f"\nPhase 1 complete: {len([s for s in successes if s in successes[-small_count:]])} ok, " +
793
+ f"{len([f for f in failures if f in failures[-small_count:]])} failed\n")
794
+
795
+ # ========================================================================
796
+ # Phase 2: Process large datasets with Dask (existing logic)
797
+ # ========================================================================
798
+ if sliced_count > 0 and client:
799
+ print(f"{'='*80}")
800
+ print(f"PHASE 2: Medium/Large datasets ({sliced_count}) - Dask + slicing")
801
+ print(f"{'='*80}\n")
802
+
803
+ with tqdm(
804
+ total=sliced_count,
805
+ desc="Med/Large datasets",
806
+ position=0,
807
+ leave=True,
808
+ ncols=100
809
+ ) as dataset_pbar:
810
+ for ds_local_idx, dataset in enumerate(large_datasets):
811
+ dataset_idx = small_count + ds_local_idx
812
  ds_path = dataset["dataset_path"]
813
  ds_name = Path(ds_path).name
814
  n_obs = dataset["n_obs"]
815
  n_vars = dataset["n_vars"]
816
  total_entries = dataset["total_entries"]
817
 
818
+ t0 = time.time()
819
+
820
+ # Create slice tasks
821
  slice_tasks = create_slice_tasks(dataset, obs_slice_size, small_threshold)
822
  n_slices = len(slice_tasks)
823
 
824
+ dataset_pbar.set_description(f"Med/Large [{ds_local_idx + 1}/{sliced_count}]")
825
+
826
+ # Submit all slices for this dataset
827
+ slice_results: list[SliceResult] = []
828
+ failed_slices: list[tuple[str, int, int]] = []
829
+
830
  # Submit slice tasks to Dask
831
  futures = client.map(
832
  lambda t: process_slice(t[0], t[1], t[2], chunk_size),
833
  slice_tasks,
834
  pure=False,
835
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
836
  total_entries = info['total_entries']
837
  slice_tasks = info['slice_tasks']
838
  n_slices = info['n_slices']
 
842
  slice_results: list[SliceResult] = []
843
  failed_slices: list[tuple[str, int, int]] = []
844
 
845
+ # Collect results with progress bar (show for sliced datasets)
846
+ show_slice_bar = n_slices > 1
847
  slice_pbar = tqdm(
848
  total=n_slices,
849
+ desc=f" \u2514\u2500 Slices",
850
  position=1,
851
  leave=False,
852
  ncols=100,
 
955
  # Update dataset progress
956
  dataset_pbar.update(1)
957
 
958
+ print(f"\nPhase 2 complete\n")
 
959
 
960
  return successes, failures
961
 
 
1004
  per_dataset_dir = output_dir / "per_dataset"
1005
  per_dataset_dir.mkdir(parents=True, exist_ok=True)
1006
 
1007
+ # Check if we need Dask cluster (for medium/large datasets)
1008
+ small_threshold = config["dataset_thresholds"]["small"]
1009
+ large_count = sum(1 for d in datasets if d.get("total_entries", 0) >= small_threshold)
1010
+
1011
+ client = None
1012
+ cluster = None
1013
+
1014
+ if large_count > 0:
1015
+ # Cluster setup for large datasets
1016
+ max_memory_gib = config["resources"]["max_memory_gib"]
1017
+ max_workers = config["resources"]["max_workers"]
1018
+ min_workers = min(4, max_workers)
1019
+
1020
+ memory_per_worker_gib = max(2.0, max_memory_gib / max_workers)
1021
+
1022
+ total_entries = sum(d["total_entries"] for d in datasets)
1023
+ total_slices = sum(
1024
+ max(1, math.ceil(d["n_obs"] / config["slicing"].get("obs_slice_size", 75_000)))
1025
+ for d in datasets if d.get("total_entries", 0) >= small_threshold
1026
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1027
 
1028
+ print(json.dumps({
1029
+ "total_datasets": len(datasets),
1030
+ "small_datasets": len(datasets) - large_count,
1031
+ "large_datasets": large_count,
1032
+ "total_slices": total_slices,
1033
+ "total_entries": total_entries,
1034
+ "shard_index": shard_index,
1035
+ "num_shards": num_shards,
1036
+ "memory_per_worker_gib": round(memory_per_worker_gib, 1),
1037
+ "max_workers": max_workers,
1038
+ }, indent=2))
1039
+
1040
+ print(f"\nStarting Dask LocalCluster (for {large_count} large datasets):")
1041
+ print(f" Workers: {min_workers} -> {max_workers} (adaptive)")
1042
+ print(f" Memory per worker: {memory_per_worker_gib:.1f} GiB")
1043
+ print(f" Total memory budget: {max_memory_gib} GiB\n")
1044
+
1045
+ cluster = LocalCluster(
1046
+ n_workers=min_workers,
1047
+ threads_per_worker=1,
1048
+ processes=True,
1049
+ memory_limit=f"{memory_per_worker_gib}GiB",
1050
+ silence_logs=True,
1051
+ dashboard_address=None,
1052
+ lifetime="120 minutes",
1053
+ lifetime_stagger="15 minutes",
1054
+ )
1055
+
1056
+ cluster.adapt(
1057
+ minimum=min_workers,
1058
+ maximum=max_workers,
1059
+ target_duration="30s",
1060
+ wait_count=3,
1061
+ interval="2s",
1062
+ )
1063
+
1064
+ client = Client(cluster)
1065
  print(f"Dask cluster ready: {client}\n")
1066
+ else:
1067
+ print(f"All {len(datasets)} datasets are small - using ProcessPoolExecutor only\n")
1068
 
1069
+ try:
1070
  successes, failures = process_all_datasets(
1071
  datasets, config, per_dataset_dir, client,
1072
  max_retries=args.max_retries,
 
1102
  }, indent=2))
1103
 
1104
  finally:
1105
+ if client:
1106
+ client.close()
1107
+ if cluster:
1108
+ cluster.close()
1109
 
1110
 
1111
  if __name__ == "__main__":