whats2000 committed on
Commit
75aa70e
·
1 Parent(s): 685b361

feat(eda): update large file processing to support parallel workers and enhance metadata caching

Browse files
README.md CHANGED
@@ -177,7 +177,8 @@ strategy:
177
 
178
  - **slowdown_threshold**: If processing throughput drops below this fraction of baseline (e.g., 0.5 = 50%), the pipeline automatically reduces worker count to prevent memory thrashing
179
  - **min_workers_ratio**: Minimum workers to keep as a fraction of max_workers (e.g., 0.25 = always keep at least 25% of workers)
180
- - **large_file_threshold_gib**: Files larger than this size (in GB) are processed serially during metadata cache building to avoid out-of-memory errors
 
181
  - **chunk_size_multiplier**: Adjusts the chunk size based on dataset category (smaller = safer for large datasets)
182
 
183
  ### Dataset Slicing
@@ -333,7 +334,8 @@ Reduce workers and chunk size in config:
333
  resources:
334
  max_workers: 24
335
  chunk_size: 4096
336
- large_file_threshold_gib: 20.0 # Process larger files serially
 
337
  slicing:
338
  obs_slice_size: 50000
339
  ```
 
177
 
178
  - **slowdown_threshold**: If processing throughput drops below this fraction of baseline (e.g., 0.5 = 50%), the pipeline automatically reduces worker count to prevent memory thrashing
179
  - **min_workers_ratio**: Minimum workers to keep as a fraction of max_workers (e.g., 0.25 = always keep at least 25% of workers)
180
+ - **large_file_threshold_gib**: Files larger than this size (in GiB) use a separate worker pool during metadata cache building
181
+ - **large_file_workers**: Number of parallel workers for large files (0 = serial processing, >0 = parallel). On systems with ample RAM, using 4-8 workers significantly speeds up cache building
182
  - **chunk_size_multiplier**: Adjusts the chunk size based on dataset category (smaller = safer for large datasets)
183
 
184
  ### Dataset Slicing
 
334
  resources:
335
  max_workers: 24
336
  chunk_size: 4096
337
+ large_file_threshold_gib: 20.0 # Process larger files separately
338
+ large_file_workers: 2 # Fewer workers for large files (or 0 for serial)
339
  slicing:
340
  obs_slice_size: 50000
341
  ```
configs/eda_config_template.yaml CHANGED
@@ -14,7 +14,8 @@ resources:
14
  min_workers_ratio: 0.25 # Minimum workers as ratio of max_workers (0.25 = keep at least 25% of workers)
15
 
16
  # File size thresholds for metadata cache building
17
- large_file_threshold_gib: 30.0 # Files larger than this (GB) are processed serially to avoid OOM
 
18
 
19
  # Adaptive scaling parameters for Dask cluster
20
  adaptive_scaling:
 
14
  min_workers_ratio: 0.25 # Minimum workers as ratio of max_workers (0.25 = keep at least 25% of workers)
15
 
16
  # File size thresholds for metadata cache building
17
+ large_file_threshold_gib: 30.0 # Files larger than this (GiB) use separate worker pool
18
+ large_file_workers: 4 # Parallel workers for large files (0 = serial, >0 = parallel)
19
 
20
  # Adaptive scaling parameters for Dask cluster
21
  adaptive_scaling:
configs/eda_optimized.yaml CHANGED
@@ -14,7 +14,8 @@ resources:
14
  min_workers_ratio: 0.25 # Minimum workers as ratio of max_workers (0.25 = 1/4)
15
 
16
  # File size thresholds for metadata cache building
17
- large_file_threshold_gib: 30.0 # Files larger than this processed serially
 
18
 
19
  # Adaptive scaling parameters
20
  adaptive_scaling:
 
14
  min_workers_ratio: 0.25 # Minimum workers as ratio of max_workers (0.25 = 1/4)
15
 
16
  # File size thresholds for metadata cache building
17
+ large_file_threshold_gib: 30.0 # Files larger than this use separate worker pool
18
+ large_file_workers: 16 # Parallel workers for large files (0 = serial, >0 = parallel)
19
 
20
  # Adaptive scaling parameters
21
  adaptive_scaling:
scripts/build_metadata_cache.py CHANGED
@@ -79,6 +79,7 @@ def build_enhanced_metadata(
79
  force_rescan: bool = False,
80
  workers: int = 0,
81
  large_file_threshold_gib: float | None = None,
 
82
  config: dict = None,
83
  ) -> pd.DataFrame:
84
  """Build enhanced metadata by combining CELLxGENE metadata with quick scans."""
@@ -90,14 +91,19 @@ def build_enhanced_metadata(
90
  medium_threshold = thresholds.get("medium", 15_000_000_000)
91
  large_threshold = thresholds.get("large", 40_000_000_000)
92
  # Get large file threshold from config resources
 
93
  if large_file_threshold_gib is None:
94
- large_file_threshold_gib = config.get("resources", {}).get("large_file_threshold_gib", 30.0)
 
 
95
  else:
96
  small_threshold = 2_000_000_000
97
  medium_threshold = 15_000_000_000
98
  large_threshold = 40_000_000_000
99
  if large_file_threshold_gib is None:
100
  large_file_threshold_gib = 30.0
 
 
101
 
102
  # Auto-detect workers if not specified
103
  if workers <= 0:
@@ -162,7 +168,10 @@ def build_enhanced_metadata(
162
 
163
  print(f"\nScan strategy:")
164
  print(f" Small/medium files (<={large_file_threshold_gib} GB): {len(small_files)} files -> parallel with {workers} workers")
165
- print(f" Large files (>{large_file_threshold_gib} GB): {len(large_files)} files -> serial processing")
 
 
 
166
  print()
167
 
168
  scan_results = []
@@ -190,28 +199,59 @@ def build_enhanced_metadata(
190
  finally:
191
  pbar.update(1)
192
 
193
- # Phase 2: Serial processing of large files (one at a time to avoid OOM)
194
  if large_files:
195
- print(f"\nPhase 2: Scanning {len(large_files)} large files serially (one at a time)...")
196
- for i, path in enumerate(large_files, 1):
197
- size_gib = path.stat().st_size / (1024**3)
198
- print(f" [{i}/{len(large_files)}] {path.name} ({size_gib:.1f} GB)...", end=" ", flush=True)
199
- try:
200
- result = quick_scan_dataset(path)
201
- scan_results.append(result)
202
- if result["status"] == "ok":
203
- print(f"✓ ({result.get('scan_time_sec', '?')}s)")
204
- else:
205
- print(f"✗ {result.get('error', 'Unknown error')[:50]}")
206
- except Exception as e:
207
- scan_results.append({
208
- "dataset_path": str(path),
209
- "dataset_file": path.name,
210
- "dataset_id": path.stem,
211
- "error": str(e),
212
- "status": "failed",
213
- })
214
- print(f"✗ {str(e)[:50]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  new_scans_df = pd.DataFrame(scan_results)
217
 
 
79
  force_rescan: bool = False,
80
  workers: int = 0,
81
  large_file_threshold_gib: float | None = None,
82
+ large_file_workers: int | None = None,
83
  config: dict = None,
84
  ) -> pd.DataFrame:
85
  """Build enhanced metadata by combining CELLxGENE metadata with quick scans."""
 
91
  medium_threshold = thresholds.get("medium", 15_000_000_000)
92
  large_threshold = thresholds.get("large", 40_000_000_000)
93
  # Get large file threshold from config resources
94
+ resources = config.get("resources", {})
95
  if large_file_threshold_gib is None:
96
+ large_file_threshold_gib = resources.get("large_file_threshold_gib", 30.0)
97
+ if large_file_workers is None:
98
+ large_file_workers = resources.get("large_file_workers", 0)
99
  else:
100
  small_threshold = 2_000_000_000
101
  medium_threshold = 15_000_000_000
102
  large_threshold = 40_000_000_000
103
  if large_file_threshold_gib is None:
104
  large_file_threshold_gib = 30.0
105
+ if large_file_workers is None:
106
+ large_file_workers = 0
107
 
108
  # Auto-detect workers if not specified
109
  if workers <= 0:
 
168
 
169
  print(f"\nScan strategy:")
170
  print(f" Small/medium files (<={large_file_threshold_gib} GB): {len(small_files)} files -> parallel with {workers} workers")
171
+ if large_file_workers > 0:
172
+ print(f" Large files (>{large_file_threshold_gib} GB): {len(large_files)} files -> parallel with {large_file_workers} workers")
173
+ else:
174
+ print(f" Large files (>{large_file_threshold_gib} GB): {len(large_files)} files -> serial processing")
175
  print()
176
 
177
  scan_results = []
 
199
  finally:
200
  pbar.update(1)
201
 
202
+ # Phase 2: Processing large files (parallel or serial based on config)
203
  if large_files:
204
+ if large_file_workers > 0:
205
+ # Parallel processing with limited workers
206
+ print(f"\nPhase 2: Scanning {len(large_files)} large files in parallel ({large_file_workers} workers)...")
207
+ with tqdm(total=len(large_files), desc="Large files", unit="file") as pbar:
208
+ with concurrent.futures.ProcessPoolExecutor(max_workers=large_file_workers) as executor:
209
+ futures = {executor.submit(quick_scan_dataset, path): path for path in large_files}
210
+
211
+ for future in concurrent.futures.as_completed(futures):
212
+ try:
213
+ result = future.result()
214
+ scan_results.append(result)
215
+ # Show file size in progress
216
+ path = futures[future]
217
+ size_gib = path.stat().st_size / (1024**3)
218
+ status = "✓" if result.get("status") == "ok" else "✗"
219
+ elapsed = result.get("scan_time_sec", "?")
220
+ tqdm.write(f" {status} {path.name} ({size_gib:.1f} GB, {elapsed}s)")
221
+ except Exception as e:
222
+ path = futures[future]
223
+ scan_results.append({
224
+ "dataset_path": str(path),
225
+ "dataset_file": path.name,
226
+ "dataset_id": path.stem,
227
+ "error": str(e),
228
+ "status": "failed",
229
+ })
230
+ tqdm.write(f" ✗ {path.name} - {str(e)[:50]}")
231
+ finally:
232
+ pbar.update(1)
233
+ else:
234
+ # Serial processing (original behavior)
235
+ print(f"\nPhase 2: Scanning {len(large_files)} large files serially (one at a time)...")
236
+ for i, path in enumerate(large_files, 1):
237
+ size_gib = path.stat().st_size / (1024**3)
238
+ print(f" [{i}/{len(large_files)}] {path.name} ({size_gib:.1f} GB)...", end=" ", flush=True)
239
+ try:
240
+ result = quick_scan_dataset(path)
241
+ scan_results.append(result)
242
+ if result["status"] == "ok":
243
+ print(f"✓ ({result.get('scan_time_sec', '?')}s)")
244
+ else:
245
+ print(f"✗ {result.get('error', 'Unknown error')[:50]}")
246
+ except Exception as e:
247
+ scan_results.append({
248
+ "dataset_path": str(path),
249
+ "dataset_file": path.name,
250
+ "dataset_id": path.stem,
251
+ "error": str(e),
252
+ "status": "failed",
253
+ })
254
+ print(f"✗ {str(e)[:50]}")
255
 
256
  new_scans_df = pd.DataFrame(scan_results)
257