import yaml
import os
import subprocess
import argparse
def check_dataset_files(yaml_file, rerun=False):
    """
    Check that every required .bin file exists for each dataset in the YAML file.

    Args:
        yaml_file: Path to a YAML config containing a top-level 'Datasets'
            mapping; each dataset provides 'args' (save_dir, chunks) and an
            optional 'folding' section (n_folds, test, train).
        rerun: When True, invoke the data-prep script for any dataset that has
            missing files.

    Prints a per-dataset report and a final summary; returns None.
    """
    # Keep the try narrow: only the open/parse step is guarded, so genuine
    # programming errors below are no longer silently swallowed.
    try:
        with open(yaml_file, 'r') as file:
            config = yaml.safe_load(file)
    except (OSError, yaml.YAMLError) as e:
        print(f"Error processing {yaml_file}: {e}")
        return

    # An empty YAML file parses to None; treat it like a missing section.
    if not config or 'Datasets' not in config:
        print(f"No 'Datasets' section found in {yaml_file}.")
        return

    all_files_exist = True
    for dataset_name, dataset_config in config['Datasets'].items():
        # Extract required information; .get() keeps a malformed dataset
        # entry from aborting the whole scan with a KeyError.
        ds_args = dataset_config.get('args', {})
        save_dir = ds_args.get('save_dir', '')
        chunks = ds_args.get('chunks', 0)
        folding = dataset_config.get('folding', {})
        n_folds = folding.get('n_folds', 0)
        test_folds = folding.get('test', [])
        train_folds = folding.get('train', [])

        print(f"\n== Checking dataset: {dataset_name} ==")
        print(f" save_dir: {save_dir}")
        print(f" chunks: {chunks}")
        print(f" n_folds: {n_folds}")
        print(f" test_folds: {test_folds}")
        print(f" train_folds: {train_folds}")

        missing_files = []

        # 1. Per-chunk raw files: {dataset_name}_{chunk}.bin
        for chunk in range(chunks):
            chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
            if not os.path.exists(chunk_file):
                missing_files.append(chunk_file)

        # 2. Prebatched fold files (test and train).
        # Naming: dataset_name_prebatched_padded_{chunk}_n_{n_folds}_f_{foldlist}.bin
        for folds in (test_folds, train_folds):
            if not folds:
                continue
            foldlist_str = '_'.join(map(str, folds))
            for i in range(chunks):
                prebatched_file = os.path.join(
                    save_dir,
                    f"{dataset_name}_prebatched_padded_{i}_n_{n_folds}_f_{foldlist_str}.bin"
                )
                if not os.path.exists(prebatched_file):
                    missing_files.append(prebatched_file)

        # Print results for the current dataset.
        if missing_files:
            all_files_exist = False
            print(f" Missing files for dataset '{dataset_name}':")
            for missing_file in missing_files:
                print(f" - {missing_file}")
            # Optionally rerun data prep.
            if rerun:
                print(f" Reprocessing dataset '{dataset_name}' ...")
                try:
                    # Argv list with shell=False (the default): immune to
                    # shell injection / quoting issues in paths and names.
                    subprocess.run(
                        ["jobs/prep_data/prep_data.sh",
                         str(yaml_file), str(dataset_name), str(chunks)],
                        check=True,
                    )
                except subprocess.CalledProcessError as e:
                    print(f" Could NOT reprocess '{dataset_name}': {e}")
        else:
            print(f" All files exist for dataset '{dataset_name}'.")

    # Final summary.
    if all_files_exist:
        print("\nAll required files exist for all datasets.")
    else:
        print("\nSome files are missing.")
def main(pargs):
    """
    Resolve each requested config path and check it for missing dataset files.

    Args:
        pargs: Parsed CLI namespace with attributes 'configs' (comma-separated
            YAML paths relative to <cwd>/configs/, or None for the defaults)
            and 'rerun' (bool, forwarded to check_dataset_files).
    """
    # Base directory containing the YAML files; os.path.join instead of
    # string concatenation keeps the path separator handling portable.
    base_directory = os.path.join(os.getcwd(), "configs")

    if pargs.configs:
        configs = [p.strip() for p in pargs.configs.split(',')]
    else:
        # Default set of configs checked when --configs is not given.
        configs = [
            "higgs_production/baseline.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_2048.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_4096.yaml",
            "higgs_production/higgs_production_batch_size/higgs_production_bs_8192.yaml",
        ]

    for config in configs:
        yaml_file = os.path.join(base_directory, config)
        if os.path.exists(yaml_file):
            print(f"\nProcessing file: {config}")
            check_dataset_files(yaml_file, pargs.rerun)
        else:
            print(f"File not found: {yaml_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Check YAML config files")
parser.add_argument(
"--configs", "-c",
type=str,
required=False,
help="Comma-separated list of YAML config paths relative to base directory"
)
parser.add_argument(
"--rerun", "-r",
action='store_true', # Correct way for a boolean flag
help="Automatically re-run data processing to fix missing files"
)
args = parser.parse_args()
main(args) |