# File size: 4,893 Bytes
# Revision: 2af0e94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""Test that mapping JSON files in dataLoader.py point to existing data files.



Randomly samples 30 entries per JSON file and checks that the file paths exist on disk.

"""
import os
import sys
import json
import random

# Absolute, normalised location of the sibling ``Dataloader`` directory; the
# mapping JSON paths below are resolved against it, mirroring dataLoader.py.
ROOT_DIR = os.path.normpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'Dataloader')
)

# Same dataset -> mapping JSON table as dataLoader.py, with each relative
# location turned into an absolute path under ROOT_DIR.
mapping_files = {
    dataset: os.path.join(ROOT_DIR, relative_json)
    for dataset, relative_json in {
        'MSD': 'nifty_mappings/MSD_mappings.json',
        'TotalSegmentor': 'nifty_mappings/TotalSegmentorCT_MRI_mappings.json',
        'Kaggle_osic': 'nifty_mappings/Kaggle_osic_mappings.json',
        'CancerImageArchive': 'nifty_mappings/CIA_mappings.json',
        'MnMs': 'nifty_mappings/MnMs_mappings.json',
        'Brats2019': 'nifty_mappings/Brats2019_mappings.json',
        'Brats2020': 'nifty_mappings/Brats2020_mappings.json',
        'Brats2021': 'nifty_mappings/Brats2021_mappings.json',
        'OASIS_1': 'nifty_mappings/OASIS_1_mappings.json',
        'OASIS_2': 'nifty_mappings/OASIS_2_mappings.json',
        'PSMA-FDG-PET-CT-LESION': 'nifty_mappings/PSMA-FDG-PET-CT-LESION_mappings.json',
        'PSMA-CT': 'nifty_mappings/PSMA-CT-Longitud_mappings.json',
        'AbdomenAtlas': 'nifty_mappings/AbdomenAtlas_mappings.json',
        'AbdomenCT1k': 'nifty_mappings/AbdomenCT1k_mappings.json',
    }.items()
}

# Upper bound on how many entries are spot-checked per mapping JSON.
SAMPLE_SIZE = 30


def _check_mapping(name, json_path, sample_size=None):
    """Sample keys from a mapping JSON and report which are not files on disk.

    Args:
        name: Label of the mapping being checked. The check itself does not
            use it; it is accepted so existing callers keep working.
        json_path: Path to the mapping JSON file. The keys of its top-level
            object are treated as file paths.
        sample_size: Maximum number of keys to check. When omitted, the
            module-level ``SAMPLE_SIZE`` is used.

    Returns:
        A ``(total, sampled, missing_paths)`` tuple: the number of keys in
        the mapping, the number of keys actually checked, and the list of
        checked keys for which ``os.path.isfile`` was false.
    """
    if sample_size is None:
        sample_size = SAMPLE_SIZE

    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which can differ between machines.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    all_keys = list(data)
    total = len(all_keys)
    # random.sample rejects a sample larger than the population, so cap it.
    sampled_keys = random.sample(all_keys, min(sample_size, total))

    missing = [key for key in sampled_keys if not os.path.isfile(key)]

    return total, len(sampled_keys), missing


def test_all_json_files_exist():
    """Assert that each mapping JSON registered in ``mapping_files`` is a file on disk.

    On failure the assertion message lists every absent mapping as
    ``name: path``, one per line.
    """
    absent = [
        (dataset, location)
        for dataset, location in mapping_files.items()
        if not os.path.isfile(location)
    ]
    listing = "\n".join(f"  {dataset}: {location}" for dataset, location in absent)
    assert not absent, "Missing JSON mapping files:\n" + listing


def test_mapping_paths_exist():
    """Spot-check that sampled data paths from every mapping JSON exist on disk.

    Seeds the global ``random`` generator with 42 so the sample is
    reproducible, runs ``_check_mapping`` for each mapping in name order, and
    fails with a per-mapping report if a mapping JSON is absent or any sampled
    path is missing. When everything passes the report is printed instead.
    """
    random.seed(42)
    report_lines = []
    failed = False

    for dataset in sorted(mapping_files):
        mapping_json = mapping_files[dataset]

        # An absent mapping JSON counts as a failure, but the remaining
        # mappings are still checked so the report is complete.
        if not os.path.isfile(mapping_json):
            failed = True
            reason = f"JSON file not found: {mapping_json}"
            report_lines.append(f"\n[FAIL] {dataset}: {reason}")
            continue

        n_total, n_sampled, absent = _check_mapping(dataset, mapping_json)
        if absent:
            failed = True
            verdict = "FAIL"
        else:
            verdict = "PASS"

        report_lines.append(
            f"\n[{verdict}] {dataset}: "
            f"{n_sampled}/{n_total} sampled, "
            f"{len(absent)} missing"
        )
        report_lines.extend(f"    MISSING: {path}" for path in absent)

    report = "\n".join(report_lines)
    assert not failed, f"Some data paths are missing or invalid:\n{report}"
    # Show the report on success as well (visible with pytest -v or -s).
    print(report)


if __name__ == '__main__':
    # Script mode: run the same sampled existence check as the pytest test,
    # print one line per mapping, and exit with status 1 if anything is absent.
    print(f"Checking mapping files under: {ROOT_DIR}")
    print(f"Sampling {SAMPLE_SIZE} entries per JSON file\n")
    random.seed(42)

    problems = 0
    for dataset, mapping_json in sorted(mapping_files.items()):
        if not os.path.isfile(mapping_json):
            # Reported as SKIP, but still makes the overall run fail.
            print(f"[SKIP] {dataset}: JSON not found at {mapping_json}")
            problems += 1
            continue

        n_total, n_sampled, absent = _check_mapping(dataset, mapping_json)
        if absent:
            problems += 1
            verdict = "FAIL"
        else:
            verdict = "PASS"
        print(f"[{verdict}] {dataset}: {n_sampled}/{n_total} sampled, {len(absent)} missing")
        for path in absent:
            print(f"    MISSING: {path}")

    print()
    if problems:
        print("Some paths are MISSING — mappings may need updating.")
        sys.exit(1)
    print("All checked paths exist.")