Commit
·
ec87a22
1
Parent(s):
184cb7b
fixing the dataset checking script
Browse files
root_gnn_dgl/scripts/check_dataset_files.py
CHANGED
|
@@ -3,7 +3,7 @@ import os
|
|
| 3 |
import subprocess
|
| 4 |
import argparse
|
| 5 |
|
| 6 |
-
def check_dataset_files(yaml_file, rerun = False):
|
| 7 |
"""
|
| 8 |
Check if all required .bin files exist for each dataset in the YAML file.
|
| 9 |
"""
|
|
@@ -29,50 +29,59 @@ def check_dataset_files(yaml_file, rerun = False):
|
|
| 29 |
test_folds = folding.get('test', [])
|
| 30 |
train_folds = folding.get('train', [])
|
| 31 |
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
missing_files = []
|
|
|
|
|
|
|
| 34 |
for chunk in range(chunks):
|
| 35 |
chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
|
| 36 |
if not os.path.exists(chunk_file):
|
| 37 |
missing_files.append(chunk_file)
|
| 38 |
|
| 39 |
-
# Check for prebatched files
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
# Print results for the current dataset
|
| 54 |
if missing_files:
|
| 55 |
all_files_exist = False
|
| 56 |
-
print(f"Missing files for dataset '{dataset_name}':")
|
| 57 |
for missing_file in missing_files:
|
| 58 |
-
print(f"
|
| 59 |
-
|
| 60 |
-
#
|
| 61 |
-
if
|
| 62 |
-
print(f"Reprocessing dataset '{dataset_name}")
|
| 63 |
prep_command = f"bash/prep_data.sh {yaml_file} {dataset_name} {chunks}"
|
| 64 |
try:
|
| 65 |
subprocess.run(prep_command, shell=True, check=True)
|
| 66 |
except subprocess.CalledProcessError as e:
|
| 67 |
-
print(f"Could
|
| 68 |
else:
|
| 69 |
-
print(f"All files exist for dataset '{dataset_name}'.")
|
| 70 |
|
| 71 |
# Final summary
|
| 72 |
if all_files_exist:
|
| 73 |
-
print("
|
| 74 |
else:
|
| 75 |
-
print("
|
| 76 |
|
| 77 |
except Exception as e:
|
| 78 |
print(f"Error processing {yaml_file}: {e}")
|
|
@@ -81,12 +90,9 @@ def main(pargs):
|
|
| 81 |
# Base directory containing the YAML files
|
| 82 |
base_directory = os.getcwd() + "/configs/"
|
| 83 |
|
| 84 |
-
if
|
| 85 |
-
# If pargs is a comma-separated list, split it
|
| 86 |
configs = [p.strip() for p in pargs.configs.split(',')]
|
| 87 |
-
|
| 88 |
else:
|
| 89 |
-
# List of YAML configuration files (relative to the base directory)
|
| 90 |
configs = [
|
| 91 |
"attention/ttH_CP_even_vs_odd.yaml",
|
| 92 |
|
|
@@ -99,11 +105,10 @@ def main(pargs):
|
|
| 99 |
"stats_all/ttH_CP_even_vs_odd.yaml",
|
| 100 |
]
|
| 101 |
|
| 102 |
-
# Loop through each config file
|
| 103 |
for config in configs:
|
| 104 |
-
yaml_file = os.path.join(base_directory, config)
|
| 105 |
if os.path.exists(yaml_file):
|
| 106 |
-
print(f"
|
| 107 |
check_dataset_files(yaml_file, pargs.rerun)
|
| 108 |
else:
|
| 109 |
print(f"File not found: {yaml_file}")
|
|
@@ -118,10 +123,8 @@ if __name__ == "__main__":
|
|
| 118 |
)
|
| 119 |
parser.add_argument(
|
| 120 |
"--rerun", "-r",
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
required=False,
|
| 124 |
-
help="Automatically re-runs data processing to fix missing files"
|
| 125 |
)
|
| 126 |
args = parser.parse_args()
|
| 127 |
main(args)
|
|
|
|
| 3 |
import subprocess
|
| 4 |
import argparse
|
| 5 |
|
| 6 |
+
def check_dataset_files(yaml_file, rerun=False):
|
| 7 |
"""
|
| 8 |
Check if all required .bin files exist for each dataset in the YAML file.
|
| 9 |
"""
|
|
|
|
| 29 |
test_folds = folding.get('test', [])
|
| 30 |
train_folds = folding.get('train', [])
|
| 31 |
|
| 32 |
+
print(f"\n== Checking dataset: {dataset_name} ==")
|
| 33 |
+
print(f" save_dir: {save_dir}")
|
| 34 |
+
print(f" chunks: {chunks}")
|
| 35 |
+
print(f" n_folds: {n_folds}")
|
| 36 |
+
print(f" test_folds: {test_folds}")
|
| 37 |
+
print(f" train_folds: {train_folds}")
|
| 38 |
+
|
| 39 |
missing_files = []
|
| 40 |
+
|
| 41 |
+
# 1. Check for chunk files
|
| 42 |
for chunk in range(chunks):
|
| 43 |
chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
|
| 44 |
if not os.path.exists(chunk_file):
|
| 45 |
missing_files.append(chunk_file)
|
| 46 |
|
| 47 |
+
# 2. Check for prebatched fold files (test and train)
|
| 48 |
+
# Naming: dataset_name_prebatched_padded_{fold}_n_{n_folds}_f_{foldlist}.bin
|
| 49 |
+
fold_types = [('test', test_folds), ('train', train_folds)]
|
| 50 |
+
for fold_type, folds in fold_types:
|
| 51 |
+
if not folds:
|
| 52 |
+
continue
|
| 53 |
+
foldlist_str = '_'.join(map(str, folds))
|
| 54 |
+
for i in range(chunks):
|
| 55 |
+
prebatched_file = os.path.join(
|
| 56 |
+
save_dir,
|
| 57 |
+
f"{dataset_name}_prebatched_padded_{i}_n_{n_folds}_f_{foldlist_str}.bin"
|
| 58 |
+
)
|
| 59 |
+
if not os.path.exists(prebatched_file):
|
| 60 |
+
missing_files.append(prebatched_file)
|
| 61 |
|
| 62 |
# Print results for the current dataset
|
| 63 |
if missing_files:
|
| 64 |
all_files_exist = False
|
| 65 |
+
print(f" Missing files for dataset '{dataset_name}':")
|
| 66 |
for missing_file in missing_files:
|
| 67 |
+
print(f" - {missing_file}")
|
| 68 |
+
|
| 69 |
+
# Optionally rerun data prep
|
| 70 |
+
if rerun:
|
| 71 |
+
print(f" Reprocessing dataset '{dataset_name}' ...")
|
| 72 |
prep_command = f"bash/prep_data.sh {yaml_file} {dataset_name} {chunks}"
|
| 73 |
try:
|
| 74 |
subprocess.run(prep_command, shell=True, check=True)
|
| 75 |
except subprocess.CalledProcessError as e:
|
| 76 |
+
print(f" Could NOT reprocess '{dataset_name}': {e}")
|
| 77 |
else:
|
| 78 |
+
print(f" All files exist for dataset '{dataset_name}'.")
|
| 79 |
|
| 80 |
# Final summary
|
| 81 |
if all_files_exist:
|
| 82 |
+
print("\nAll required files exist for all datasets.")
|
| 83 |
else:
|
| 84 |
+
print("\nSome files are missing.")
|
| 85 |
|
| 86 |
except Exception as e:
|
| 87 |
print(f"Error processing {yaml_file}: {e}")
|
|
|
|
| 90 |
# Base directory containing the YAML files
|
| 91 |
base_directory = os.getcwd() + "/configs/"
|
| 92 |
|
| 93 |
+
if pargs.configs:
|
|
|
|
| 94 |
configs = [p.strip() for p in pargs.configs.split(',')]
|
|
|
|
| 95 |
else:
|
|
|
|
| 96 |
configs = [
|
| 97 |
"attention/ttH_CP_even_vs_odd.yaml",
|
| 98 |
|
|
|
|
| 105 |
"stats_all/ttH_CP_even_vs_odd.yaml",
|
| 106 |
]
|
| 107 |
|
|
|
|
| 108 |
for config in configs:
|
| 109 |
+
yaml_file = os.path.join(base_directory, config)
|
| 110 |
if os.path.exists(yaml_file):
|
| 111 |
+
print(f"\nProcessing file: {config}")
|
| 112 |
check_dataset_files(yaml_file, pargs.rerun)
|
| 113 |
else:
|
| 114 |
print(f"File not found: {yaml_file}")
|
|
|
|
| 123 |
)
|
| 124 |
parser.add_argument(
|
| 125 |
"--rerun", "-r",
|
| 126 |
+
action='store_true', # Correct way for a boolean flag
|
| 127 |
+
help="Automatically re-run data processing to fix missing files"
|
|
|
|
|
|
|
| 128 |
)
|
| 129 |
args = parser.parse_args()
|
| 130 |
main(args)
|