"""Test that mapping JSON files in dataLoader.py point to existing data files. Randomly samples 30 entries per JSON file and checks that the file paths exist on disk. """ import os import sys import json import random # Resolve paths relative to the Dataloader directory, matching dataLoader.py logic ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'Dataloader') ROOT_DIR = os.path.normpath(ROOT_DIR) # Replicate the mapping_files dict from dataLoader.py mapping_files = { 'MSD': 'nifty_mappings/MSD_mappings.json', 'TotalSegmentor': 'nifty_mappings/TotalSegmentorCT_MRI_mappings.json', 'Kaggle_osic': 'nifty_mappings/Kaggle_osic_mappings.json', 'CancerImageArchive': 'nifty_mappings/CIA_mappings.json', 'MnMs': 'nifty_mappings/MnMs_mappings.json', 'Brats2019': 'nifty_mappings/Brats2019_mappings.json', 'Brats2020': 'nifty_mappings/Brats2020_mappings.json', 'Brats2021': 'nifty_mappings/Brats2021_mappings.json', 'OASIS_1': 'nifty_mappings/OASIS_1_mappings.json', 'OASIS_2': 'nifty_mappings/OASIS_2_mappings.json', 'PSMA-FDG-PET-CT-LESION': 'nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json', 'PSMA-CT': 'nifty_mappings/PSMA-CT-Longitud_mappings.json', 'AbdomenAtlas': 'nifty_mappings/AbdomenAtlas_mappings.json', 'AbdomenCT1k': 'nifty_mappings/AbdomenCT1k_mappings.json', } for k, v in mapping_files.items(): mapping_files[k] = os.path.join(ROOT_DIR, v) SAMPLE_SIZE = 30 def _check_mapping(name, json_path): """Load a mapping JSON, sample up to 30 keys, and check if files exist. Returns (total, sampled, missing_paths). """ with open(json_path, 'r') as f: data = json.load(f) all_keys = list(data.keys()) total = len(all_keys) sampled_keys = random.sample(all_keys, min(SAMPLE_SIZE, total)) missing = [] for key in sampled_keys: if not os.path.isfile(key): missing.append(key) return total, len(sampled_keys), missing def test_all_json_files_exist(): """Every JSON mapping file listed in mapping_files must exist on disk.""" missing_jsons = [] for name, path in mapping_files.items(): if not os.path.isfile(path): missing_jsons.append((name, path)) assert not missing_jsons, ( "Missing JSON mapping files:\n" + "\n".join(f" {name}: {path}" for name, path in missing_jsons) ) def test_mapping_paths_exist(): """Randomly check 30 data file paths per mapping JSON.""" random.seed(42) all_results = {} any_failure = False for name, json_path in sorted(mapping_files.items()): if not os.path.isfile(json_path): all_results[name] = f"JSON file not found: {json_path}" any_failure = True continue total, sampled, missing = _check_mapping(name, json_path) all_results[name] = { 'total_entries': total, 'sampled': sampled, 'missing_count': len(missing), 'missing_paths': missing, } if missing: any_failure = True # Build a readable report lines = [] for name, result in sorted(all_results.items()): if isinstance(result, str): lines.append(f"\n[FAIL] {name}: {result}") continue status = "PASS" if result['missing_count'] == 0 else "FAIL" lines.append( f"\n[{status}] {name}: " f"{result['sampled']}/{result['total_entries']} sampled, " f"{result['missing_count']} missing" ) for p in result['missing_paths']: lines.append(f" MISSING: {p}") report = "\n".join(lines) assert not any_failure, f"Some data paths are missing or invalid:\n{report}" # Print report on success too (visible with pytest -v or -s) print(report) if __name__ == '__main__': print(f"Checking mapping files under: {ROOT_DIR}") print(f"Sampling {SAMPLE_SIZE} entries per JSON file\n") random.seed(42) overall_pass = True for name, json_path in sorted(mapping_files.items()): if not os.path.isfile(json_path): print(f"[SKIP] {name}: JSON not found at {json_path}") overall_pass = False continue total, sampled, missing = _check_mapping(name, json_path) status = "PASS" if not missing else "FAIL" if missing: overall_pass = False print(f"[{status}] {name}: {sampled}/{total} sampled, {len(missing)} missing") for p in missing: print(f" MISSING: {p}") print() if overall_pass: print("All checked paths exist.") else: print("Some paths are MISSING — mappings may need updating.") sys.exit(1)