whats2000 committed on
Commit
32516b1
·
1 Parent(s): 5910420

feat(eda): enhance resource utilization by optimizing worker allocation and processing parameters

Browse files
configs/eda_optimized.yaml CHANGED
@@ -3,9 +3,9 @@
3
  # Aggressively optimized for maximum throughput on medium/large/xlarge datasets
4
 
5
  resources:
6
- max_memory_gib: 5000 # Up to 5TB available for processing
7
- max_workers: 80 # Increased from 48 to utilize more cores (80/112 = 71%)
8
- chunk_size: 150000 # Doubled to reduce overhead and increase throughput
9
 
10
  paths:
11
  input_dirs:
@@ -31,7 +31,7 @@ dataset_thresholds:
31
 
32
  slicing:
33
  enabled: true
34
- obs_slice_size: 500000 # Larger slices = fewer file opens, faster processing
35
  overlap: 0
36
  merge_strategy: "combine"
37
 
 
3
  # Aggressively optimized for maximum throughput on medium/large/xlarge datasets
4
 
5
  resources:
6
+ max_memory_gib: 5800 # Use nearly all 6TB available
7
+ max_workers: 100 # More workers = more parallel tasks (90% of 112 cores)
8
+ chunk_size: 200000 # Large chunks to fill memory - load more data per operation
9
 
10
  paths:
11
  input_dirs:
 
31
 
32
  slicing:
33
  enabled: true
34
+ obs_slice_size: 250000 # Smaller slices = MORE parallel tasks to feed workers
35
  overlap: 0
36
  merge_strategy: "combine"
37
 
scripts/distributed_eda.py CHANGED
@@ -1041,7 +1041,8 @@ def main() -> None:
1041
  # Cluster setup for large datasets
1042
  max_memory_gib = config["resources"]["max_memory_gib"]
1043
  max_workers = config["resources"]["max_workers"]
1044
- min_workers = min(4, max_workers)
 
1045
 
1046
  memory_per_worker_gib = max(2.0, max_memory_gib / max_workers)
1047
 
@@ -1070,21 +1071,21 @@ def main() -> None:
1070
 
1071
  cluster = LocalCluster(
1072
  n_workers=min_workers,
1073
- threads_per_worker=1,
1074
  processes=True,
1075
  memory_limit=f"{memory_per_worker_gib}GiB",
1076
  silence_logs=True,
1077
  dashboard_address=None,
1078
- lifetime="120 minutes",
1079
- lifetime_stagger="15 minutes",
1080
  )
1081
 
1082
  cluster.adapt(
1083
  minimum=min_workers,
1084
  maximum=max_workers,
1085
- target_duration="30s",
1086
- wait_count=3,
1087
- interval="2s",
1088
  )
1089
 
1090
  client = Client(cluster)
 
1041
  # Cluster setup for large datasets
1042
  max_memory_gib = config["resources"]["max_memory_gib"]
1043
  max_workers = config["resources"]["max_workers"]
1044
+ # Start with ALL workers immediately for maximum resource utilization
1045
+ min_workers = max_workers # Start with 100% workers from the beginning
1046
 
1047
  memory_per_worker_gib = max(2.0, max_memory_gib / max_workers)
1048
 
 
1071
 
1072
  cluster = LocalCluster(
1073
  n_workers=min_workers,
1074
+ threads_per_worker=1, # 1 thread = more processes = better parallelism
1075
  processes=True,
1076
  memory_limit=f"{memory_per_worker_gib}GiB",
1077
  silence_logs=True,
1078
  dashboard_address=None,
1079
+ lifetime="180 minutes",
1080
+ lifetime_stagger="20 minutes",
1081
  )
1082
 
1083
  cluster.adapt(
1084
  minimum=min_workers,
1085
  maximum=max_workers,
1086
+ target_duration="5s", # Very aggressive scaling for large datasets
1087
+ wait_count=1, # React immediately to workload
1088
+ interval="1s", # Check frequently
1089
  )
1090
 
1091
  client = Client(cluster)