whats2000 committed on
Commit cee344c · 1 Parent(s): 874e4c6

feat(recovery): add corrupted file redownload script and documentation

Files changed (2):
  1. README.md +54 -0
  2. scripts/redownload_corrupted.py +244 -0
README.md CHANGED
@@ -41,6 +41,12 @@ This creates `output/cache/enhanced_metadata.parquet` with:
 
 Cache is incremental - only new/changed files are rescanned. Use `--force-rescan` to rebuild.
 
+**Handling corrupted files**: If some files fail during scanning (status='failed' or 'corrupted'), you can:
+1. Retry them with a more robust strategy: `uv run python scripts/retry_failed_cache.py --cache output/cache/enhanced_metadata.parquet`
+2. Re-download corrupted files from CELLxGENE: `uv run python scripts/redownload_corrupted.py --config configs/eda_optimized.yaml`
+
+See [Troubleshooting](#troubleshooting) for details.
+
 ### 3) Run EDA pipeline
 
 Single command to run everything:
@@ -221,6 +227,54 @@ The notebook provides:
 
 ## Troubleshooting
 
+### Corrupted or failed datasets
+
+If the metadata cache builder reports failed or corrupted files:
+
+**Step 1: Retry with robust strategy**
+
+Some files may fail due to transient issues or need special handling:
+
+```bash
+uv run python scripts/retry_failed_cache.py --cache output/cache/enhanced_metadata.parquet
+```
+
+This script:
+- Retries failed datasets with progressively safer strategies (anndata backed mode → h5py direct)
+- Categorizes truly corrupted files (truncated/damaged HDF5 structure)
+- Merges retry results back into the cache
+- Reports final statistics (successful recoveries vs truly corrupted)
+
+**Step 2: Re-download corrupted files**
+
+For files that are truly corrupted (status='corrupted'), re-download fresh copies from CELLxGENE:
+
+```bash
+uv run python scripts/redownload_corrupted.py --config configs/eda_optimized.yaml
+```
+
+This script:
+- Identifies corrupted files from the metadata cache
+- Looks up dataset IDs and download URLs from CELLxGENE metadata CSVs
+- Downloads files to `output/temp/` for safety
+- Verifies each downloaded file is valid HDF5
+- Moves verified files to replace corrupted originals
+- Keeps failed downloads in temp for inspection
+
+After re-downloading, rebuild the metadata cache to update the status:
+
+```bash
+uv run python scripts/build_metadata_cache.py --config configs/eda_optimized.yaml --force-rescan
+```
+
+**Typical corruption causes:**
+- Interrupted downloads during dataset collection
+- HDF5 file not properly closed/finalized during creation
+- Storage/filesystem errors
+- Network transfer errors from original source
+
+**Note:** Files marked as 'corrupted' have HDF5 structural issues (truncated superblock, missing data blocks) and cannot be repaired - they must be re-downloaded from the source.
+
 ### Metadata cache not found
 
 ```bash
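The corrupted-file lookup performed by `scripts/redownload_corrupted.py` can be reproduced interactively to preview what would be re-downloaded. A minimal sketch, assuming the cache columns the script reads (`status`, `dataset_file`, `file_size_gib`):

```python
import pandas as pd

def corrupted_entries(cache_df: pd.DataFrame) -> pd.DataFrame:
    """Rows the redownload script would target (status == 'corrupted')."""
    return cache_df.loc[cache_df["status"] == "corrupted",
                        ["dataset_file", "file_size_gib"]]

# Preview against the real cache:
# corrupted_entries(pd.read_parquet("output/cache/enhanced_metadata.parquet"))
```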
scripts/redownload_corrupted.py ADDED
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""Re-download corrupted files based on metadata."""
+
+import argparse
+import yaml
+import pandas as pd
+import requests
+import h5py
+from pathlib import Path
+from tqdm import tqdm
+
+
+def load_config(config_path: Path) -> dict:
+    """Load YAML configuration."""
+    with open(config_path) as f:
+        return yaml.safe_load(f)
+
+
+def download_file(url: str, output_path: Path, temp_dir: Path, chunk_size: int = 8192) -> bool:
+    """Download a file with progress bar and return success status."""
+    try:
+        print(f"\n  Downloading {output_path.name}...")
+
+        # Download to project temp directory first
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        temp_path = temp_dir / output_path.name
+
+        response = requests.get(url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        total_size = int(response.headers.get('content-length', 0))
+
+        with open(temp_path, 'wb') as f, tqdm(
+            total=total_size,
+            unit='B',
+            unit_scale=True,
+            unit_divisor=1024,
+            desc="  Progress",
+        ) as pbar:
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                if chunk:
+                    f.write(chunk)
+                    pbar.update(len(chunk))
+
+        # Verify it's a valid HDF5 file
+        print("  Verifying HDF5 integrity...")
+        try:
+            with h5py.File(temp_path, 'r') as h5:
+                # Try to access basic structure
+                _ = list(h5.keys())
+            print("  ✓ Valid HDF5 file")
+
+            # Move to final destination, replacing the corrupted file
+            print(f"  Moving to {output_path}...")
+            if output_path.exists():
+                output_path.unlink()
+            temp_path.rename(output_path)
+            print("  ✓ Successfully replaced corrupted file")
+            return True
+
+        except Exception as e:
+            print(f"  ✗ Downloaded file is corrupted: {e}")
+            print(f"  Keeping temp file for inspection: {temp_path}")
+            return False
+
+    except requests.exceptions.RequestException as e:
+        print(f"  ✗ Download failed: {e}")
+        return False
+    except Exception as e:
+        print(f"  ✗ Error: {e}")
+        return False
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Re-download corrupted datasets from CELLxGENE"
+    )
+    parser.add_argument(
+        "--config",
+        type=Path,
+        default=Path("configs/eda_optimized.yaml"),
+        help="Path to YAML configuration file",
+    )
+
+    args = parser.parse_args()
+
+    # Load configuration
+    print(f"Loading configuration from {args.config}...")
+    config = load_config(args.config)
+
+    # Extract paths from config
+    cache_path = Path(config["paths"]["enhanced_metadata_cache"])
+    metadata_csvs = [Path(p) for p in config["paths"]["metadata_csvs"]]
+    input_dirs = [Path(p) for p in config["paths"]["input_dirs"]]
+
+    print(f"Cache: {cache_path}")
+    print(f"Metadata CSVs: {len(metadata_csvs)} files")
+    print(f"Input dirs: {len(input_dirs)} directories\n")
+
+    # Load corrupted files from cache
+    print("Loading metadata cache...")
+    cache_df = pd.read_parquet(cache_path)
+
+    # Get corrupted files
+    corrupted = cache_df[cache_df["status"] == "corrupted"].copy()
+    print(f"Found {len(corrupted)} corrupted files\n")
+
+    # Load CELLxGENE metadata
+    print("Loading CELLxGENE metadata...")
+    metadata_dfs = []
+    for csv_path in metadata_csvs:
+        if csv_path.exists():
+            df = pd.read_csv(csv_path)
+            metadata_dfs.append(df)
+            print(f"  Loaded {len(df)} records from {csv_path.name}")
+        else:
+            print(f"  ⚠ Not found: {csv_path}")
+
+    if not metadata_dfs:
+        print("No metadata CSVs could be loaded; aborting.")
+        return
+
+    metadata = pd.concat(metadata_dfs, ignore_index=True)
+
+    # Create organism to directory mapping
+    organism_to_dir = {}
+    for input_dir in input_dirs:
+        if "homo_sapiens" in str(input_dir).lower():
+            organism_to_dir["Homo sapiens"] = input_dir
+        elif "mus_musculus" in str(input_dir).lower():
+            organism_to_dir["Mus musculus"] = input_dir
+
+    # Extract dataset IDs from filenames
+    print("\nMatching corrupted files with metadata...\n")
+    results = []
+
+    for _, row in corrupted.iterrows():
+        filename = row["dataset_file"]
+        # Extract dataset_id from filename (format: {dataset_id}__{title}.h5ad)
+        dataset_id = filename.split("__")[0]
+
+        # Find in metadata
+        match = metadata[metadata["dataset_id"] == dataset_id]
+
+        if len(match) > 0:
+            record = match.iloc[0]
+            dataset_version_id = record["dataset_version_id"]
+            title = record["dataset_title"]
+            organism = record["organism"]
+
+            # CELLxGENE download URL format
+            download_url = f"https://datasets.cellxgene.cziscience.com/{dataset_version_id}.h5ad"
+
+            # Output path based on organism
+            output_dir = organism_to_dir.get(organism)
+            if not output_dir:
+                print(f"  ⚠ Unknown organism: {organism}, skipping")
+                continue
+
+            output_path = output_dir / filename
+
+            results.append({
+                "dataset_id": dataset_id,
+                "version_id": dataset_version_id,
+                "title": title,
+                "organism": organism,
+                "filename": filename,
+                "size_gb": row["file_size_gib"],
+                "download_url": download_url,
+                "output_path": str(output_path),
+            })
+
+            print(f"✓ {dataset_id}")
+            print(f"  Title: {title}")
+            print(f"  Size: {row['file_size_gib']:.2f} GB")
+            print(f"  URL: {download_url}")
+            print()
+        else:
+            print(f"✗ {dataset_id} - NOT FOUND in metadata")
+            print()
+
+    if not results:
+        print("No files to re-download.")
+        return
+
+    # Summary
+    print("\n" + "=" * 80)
+    print("DOWNLOAD SUMMARY")
+    print("=" * 80)
+    total_size = sum(r['size_gb'] for r in results)
+    print(f"\nFound {len(results)} corrupted files to re-download")
+    print(f"Total download size: {total_size:.2f} GB\n")
+
+    for i, r in enumerate(results, 1):
+        print(f"{i}. {r['title']} ({r['size_gb']:.2f} GB)")
+
+    # Save CSV for reference
+    results_df = pd.DataFrame(results)
+    info_csv_path = Path("output/corrupted_files_redownload_info.csv")
+    info_csv_path.parent.mkdir(parents=True, exist_ok=True)
+    results_df.to_csv(info_csv_path, index=False)
+    print(f"\nDetails saved to: {info_csv_path}")
+
+    # Download files
+    print("\n" + "=" * 80)
+    print("DOWNLOADING FILES")
+    print("=" * 80)
+
+    # Use project temp directory
+    temp_dir = Path("output/temp")
+
+    success_count = 0
+    failed_files = []
+
+    for i, r in enumerate(results, 1):
+        print(f"\n[{i}/{len(results)}] {r['title']} ({r['size_gb']:.2f} GB)")
+
+        output_path = Path(r['output_path'])
+        success = download_file(r['download_url'], output_path, temp_dir)
+
+        if success:
+            success_count += 1
+        else:
+            failed_files.append(r['filename'])
+
+    # Final summary
+    print("\n" + "=" * 80)
+    print("FINAL RESULTS")
+    print("=" * 80)
+    print(f"\nSuccessfully downloaded: {success_count}/{len(results)}")
+
+    if failed_files:
+        print("\nFailed downloads:")
+        for fname in failed_files:
+            print(f"  - {fname}")
+        print("\nYou can retry failed downloads by running this script again.")
+    else:
+        print("\n✓ All files downloaded successfully!")
+    print("\nNext steps:")
+    print("  1. Re-run the metadata cache builder to update the cache:")
+    print("     uv run python scripts/build_metadata_cache.py --config configs/eda_optimized.yaml")
+    print("  2. Or re-run the retry script to update just these files:")
+    print("     uv run python scripts/retry_failed_cache.py --cache output/cache/enhanced_metadata.parquet")
+
+
+if __name__ == "__main__":
+    main()
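The naming and URL conventions the script relies on can be isolated into two small helpers, which makes them easy to unit-test. A sketch, assuming the `{dataset_id}__{title}.h5ad` filename format and the per-version CELLxGENE URL pattern hard-coded in the script:

```python
def dataset_id_from_filename(filename: str) -> str:
    """Extract the dataset_id prefix from a '{dataset_id}__{title}.h5ad' name."""
    return filename.split("__")[0]

def cellxgene_download_url(dataset_version_id: str) -> str:
    """Per-version CELLxGENE download URL, matching the script's pattern."""
    return f"https://datasets.cellxgene.cziscience.com/{dataset_version_id}.h5ad"
```

Keeping these as pure functions would let a future refactor of the matching loop validate filename parsing without touching the cache or the network.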