File size: 4,870 Bytes
f204f29
 
 
 
 
ec87a22
f204f29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec87a22
 
 
 
 
 
 
f204f29
ec87a22
 
f204f29
 
 
 
 
ec87a22
 
 
 
 
 
 
 
 
 
 
 
 
 
f204f29
 
 
 
ec87a22
f204f29
ec87a22
 
 
 
 
203d15a
f204f29
 
 
ec87a22
f204f29
ec87a22
f204f29
 
 
ec87a22
f204f29
ec87a22
f204f29
 
 
 
 
 
 
 
ec87a22
f204f29
 
 
203d15a
 
 
 
f204f29
 
 
ec87a22
f204f29
ec87a22
f204f29
 
 
 
 
 
 
 
 
 
 
 
 
 
ec87a22
 
f204f29
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import yaml
import os
import subprocess
import argparse

def check_dataset_files(yaml_file, rerun=False):
    """
    Check that every .bin file required by each dataset in a YAML config exists.

    For each entry under the config's 'Datasets' section this verifies:
      1. raw chunk files:       <save_dir>/<name>_<chunk>.bin
      2. prebatched fold files: <save_dir>/<name>_prebatched_padded_<i>_n_<n_folds>_f_<folds>.bin
         for both the 'test' and 'train' fold lists (when non-empty).

    Parameters
    ----------
    yaml_file : str
        Path to the YAML configuration file.
    rerun : bool, optional
        If True, invoke jobs/prep_data/prep_data.sh to regenerate any
        dataset that has missing files.

    Returns
    -------
    bool
        True if every required file exists, False if at least one file is
        missing or the config could not be read.
    """
    # Keep the try narrow: only the open/parse step can raise here, so a
    # failure message points at the actual cause instead of hiding it.
    try:
        with open(yaml_file, 'r') as file:
            config = yaml.safe_load(file)
    except (OSError, yaml.YAMLError) as e:
        print(f"Error processing {yaml_file}: {e}")
        return False

    # safe_load returns None for an empty file; guard before key lookups.
    if not isinstance(config, dict) or 'Datasets' not in config:
        print(f"No 'Datasets' section found in {yaml_file}.")
        return True

    all_files_exist = True

    for dataset_name, dataset_config in config['Datasets'].items():
        # A malformed dataset entry should not abort the remaining datasets.
        try:
            save_dir = dataset_config['args']['save_dir']
            chunks = dataset_config['args']['chunks']
        except (KeyError, TypeError) as e:
            print(f"\n== Skipping dataset: {dataset_name} (bad config: {e}) ==")
            all_files_exist = False
            continue

        folding = dataset_config.get('folding', {})
        n_folds = folding.get('n_folds', 0)
        test_folds = folding.get('test', [])
        train_folds = folding.get('train', [])

        print(f"\n== Checking dataset: {dataset_name} ==")
        print(f"  save_dir: {save_dir}")
        print(f"  chunks: {chunks}")
        print(f"  n_folds: {n_folds}")
        print(f"  test_folds: {test_folds}")
        print(f"  train_folds: {train_folds}")

        missing_files = _missing_dataset_files(
            dataset_name, save_dir, chunks, n_folds, test_folds, train_folds
        )

        if not missing_files:
            print(f"  All files exist for dataset '{dataset_name}'.")
            continue

        all_files_exist = False
        print(f"  Missing files for dataset '{dataset_name}':")
        for missing_file in missing_files:
            print(f"    - {missing_file}")

        # Optionally rerun data prep for this dataset.
        if rerun:
            print(f"  Reprocessing dataset '{dataset_name}' ...")
            # Argument list + shell=False so yaml_file / dataset_name are
            # passed verbatim and can never be interpreted by the shell.
            prep_command = [
                "jobs/prep_data/prep_data.sh", yaml_file, dataset_name, str(chunks)
            ]
            try:
                subprocess.run(prep_command, check=True)
            except (subprocess.CalledProcessError, OSError) as e:
                print(f"  Could NOT reprocess '{dataset_name}': {e}")

    # Final summary
    if all_files_exist:
        print("\nAll required files exist for all datasets.")
    else:
        print("\nSome files are missing.")
    return all_files_exist


def _missing_dataset_files(dataset_name, save_dir, chunks, n_folds,
                           test_folds, train_folds):
    """Return the list of expected .bin paths for one dataset that are absent."""
    missing = []

    # 1. Raw chunk files.
    for chunk in range(chunks):
        chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
        if not os.path.exists(chunk_file):
            missing.append(chunk_file)

    # 2. Prebatched fold files (test and train).
    #    Naming: dataset_name_prebatched_padded_{fold}_n_{n_folds}_f_{foldlist}.bin
    for folds in (test_folds, train_folds):
        if not folds:
            continue
        foldlist_str = '_'.join(map(str, folds))
        for i in range(chunks):
            prebatched_file = os.path.join(
                save_dir,
                f"{dataset_name}_prebatched_padded_{i}_n_{n_folds}_f_{foldlist_str}.bin"
            )
            if not os.path.exists(prebatched_file):
                missing.append(prebatched_file)

    return missing

def main(pargs):
    """
    Resolve each requested config path and check it for missing dataset files.

    Parameters
    ----------
    pargs : argparse.Namespace
        Expects a 'configs' attribute (comma-separated relative paths, or
        None/empty to use the built-in default list) and a 'rerun' bool.
    """
    # Base directory containing the YAML files, resolved relative to the cwd.
    base_directory = os.path.join(os.getcwd(), "configs")

    if pargs.configs:
        configs = [p.strip() for p in pargs.configs.split(',')]
    else:
        # Default set of configs checked when --configs is not supplied.
        configs = [
            "higgs_production/baseline.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_2048.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_4096.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_8192.yaml",
        ]

    for config in configs:
        yaml_file = os.path.join(base_directory, config)
        if os.path.exists(yaml_file):
            print(f"\nProcessing file: {config}")
            check_dataset_files(yaml_file, pargs.rerun)
        else:
            print(f"File not found: {yaml_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check YAML config files")
    parser.add_argument(
        "--configs", "-c",
        type=str,
        required=False,
        help="Comma-separated list of YAML config paths relative to base directory"
    )
    parser.add_argument(
        "--rerun", "-r",
        action='store_true',   # Correct way for a boolean flag
        help="Automatically re-run data processing to fix missing files"
    )
    args = parser.parse_args()
    main(args)