whats2000 committed on
Commit
ac07329
·
1 Parent(s): 4e03c42

fix(eda): optimize gene statistics calculation in distributed EDA

Browse files

refactor(eda): adjust resource settings for improved performance and reduced I/O contention

configs/eda_optimized.yaml CHANGED
@@ -4,8 +4,8 @@
4
 
5
  resources:
6
  max_memory_gib: 5500 # Leave ~500 GB buffer for system
7
- max_workers: 100 # Based on actual RAM availability
8
- chunk_size: 50000 # Increased for larger memory
9
 
10
  paths:
11
  input_dirs:
@@ -24,30 +24,30 @@ paths:
24
  enhanced_metadata_cache: output/cache/enhanced_metadata.parquet
25
 
26
  dataset_thresholds:
27
- small: 2_000_000_000 # < 2B entries: full speed
28
- medium: 15_000_000_000 # < 15B entries: moderate
29
- large: 75_000_000_000 # < 75B entries: slice required
30
  max_entries: 1_000_000_000_000 # Max 1T entries (accommodates largest dataset: 520B)
31
 
32
  slicing:
33
  enabled: true
34
- obs_slice_size: 150000 # Increased for larger memory
35
  overlap: 0
36
  merge_strategy: "combine"
37
 
38
  strategy:
39
  small:
40
- workers_fraction: 1.0 # Use all 42 workers
41
  chunk_size_multiplier: 1.0
42
  priority: 1
43
 
44
  medium:
45
- workers_fraction: 0.7 # ~30 workers
46
  chunk_size_multiplier: 0.85
47
  priority: 2
48
 
49
  large:
50
- workers_fraction: 0.4 # ~17 workers with slicing
51
  chunk_size_multiplier: 0.6
52
  priority: 3
53
  require_slicing: true
 
4
 
5
  resources:
6
  max_memory_gib: 5500 # Leave ~500 GB buffer for system
7
+ max_workers: 48 # Reduced to minimize I/O contention and oversubscription
8
+ chunk_size: 50000 # Optimized for current workload
9
 
10
  paths:
11
  input_dirs:
 
24
  enhanced_metadata_cache: output/cache/enhanced_metadata.parquet
25
 
26
  dataset_thresholds:
27
+ small: 5_000_000_000 # < 5B entries: full speed (reduced overhead)
28
+ medium: 30_000_000_000 # < 30B entries: moderate (sparse matrices safe)
29
+ large: 150_000_000_000 # < 150B entries: slice required
30
  max_entries: 1_000_000_000_000 # Max 1T entries (accommodates largest dataset: 520B)
31
 
32
  slicing:
33
  enabled: true
34
+ obs_slice_size: 300000 # Increased to reduce scheduling overhead and HDF5 opens
35
  overlap: 0
36
  merge_strategy: "combine"
37
 
38
  strategy:
39
  small:
40
+ workers_fraction: 1.0 # Use all 48 workers
41
  chunk_size_multiplier: 1.0
42
  priority: 1
43
 
44
  medium:
45
+ workers_fraction: 0.75 # ~36 workers (reduced I/O pressure)
46
  chunk_size_multiplier: 0.85
47
  priority: 2
48
 
49
  large:
50
+ workers_fraction: 0.5 # ~24 workers (I/O-bound, fewer is faster)
51
  chunk_size_multiplier: 0.6
52
  priority: 3
53
  require_slicing: true
run_eda_slurm.sh CHANGED
@@ -29,6 +29,13 @@ echo "========================================="
29
 
30
  cd /project/GOV108018/whats2000_work/cell_x_gene_visualization
31
 
 
 
 
 
 
 
 
32
  # Create logs directory if it doesn't exist
33
  mkdir -p logs
34
 
 
29
 
30
  cd /project/GOV108018/whats2000_work/cell_x_gene_visualization
31
 
32
+ # Limit BLAS/NumPy threading to prevent oversubscription
33
+ export OMP_NUM_THREADS=1
34
+ export MKL_NUM_THREADS=1
35
+ export OPENBLAS_NUM_THREADS=1
36
+ export NUMEXPR_NUM_THREADS=1
37
+ export VECLIB_MAXIMUM_THREADS=1
38
+
39
  # Create logs directory if it doesn't exist
40
  mkdir -p logs
41
 
scripts/distributed_eda.py CHANGED
@@ -386,12 +386,19 @@ def process_slice(
386
  cell_counts = np.asarray(csr.sum(axis=1)).ravel()
387
  cell_genes = np.diff(csr.indptr).astype(np.int64)
388
 
389
- # Gene stats
390
- csc = csr.tocsc()
391
- gene_n_cells += np.diff(csc.indptr).astype(np.int64)
392
- gene_total_counts += np.asarray(csc.sum(axis=0)).ravel()
 
 
 
 
 
 
 
393
 
394
- del csr, csc, data
395
  else:
396
  arr = np.asarray(chunk, dtype=np.float64)
397
  nz = arr != 0
 
386
  cell_counts = np.asarray(csr.sum(axis=1)).ravel()
387
  cell_genes = np.diff(csr.indptr).astype(np.int64)
388
 
389
+ # Gene stats (optimized: use bincount instead of CSC conversion)
390
+ # Accumulate counts directly from CSR indices/data
391
+ gene_total_counts += np.bincount(
392
+ csr.indices,
393
+ weights=data,
394
+ minlength=n_vars
395
+ )
396
+ gene_n_cells += np.bincount(
397
+ csr.indices,
398
+ minlength=n_vars
399
+ )
400
 
401
+ del csr, data
402
  else:
403
  arr = np.asarray(chunk, dtype=np.float64)
404
  nz = arr != 0