ho22joshua committed on
Commit
ec87a22
·
1 Parent(s): 184cb7b

fixing the dataset checking script

Browse files
root_gnn_dgl/scripts/check_dataset_files.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  import subprocess
4
  import argparse
5
 
6
- def check_dataset_files(yaml_file, rerun = False):
7
  """
8
  Check if all required .bin files exist for each dataset in the YAML file.
9
  """
@@ -29,50 +29,59 @@ def check_dataset_files(yaml_file, rerun = False):
29
  test_folds = folding.get('test', [])
30
  train_folds = folding.get('train', [])
31
 
32
- # Check for chunk files
 
 
 
 
 
 
33
  missing_files = []
 
 
34
  for chunk in range(chunks):
35
  chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
36
  if not os.path.exists(chunk_file):
37
  missing_files.append(chunk_file)
38
 
39
- # Check for prebatched files for each fold
40
- for fold in range(n_folds):
41
- # Test files
42
- if fold in test_folds:
43
- test_file = os.path.join(save_dir, f"{dataset_name}_prebatched_padded_{fold}_n_{n_folds}_f_{'_'.join(map(str, test_folds))}.bin")
44
- if not os.path.exists(test_file):
45
- missing_files.append(test_file)
46
-
47
- # Train files
48
- if fold in train_folds:
49
- train_file = os.path.join(save_dir, f"{dataset_name}_prebatched_padded_{fold}_n_{n_folds}_f_{'_'.join(map(str, train_folds))}.bin")
50
- if not os.path.exists(train_file):
51
- missing_files.append(train_file)
 
52
 
53
  # Print results for the current dataset
54
  if missing_files:
55
  all_files_exist = False
56
- print(f"Missing files for dataset '{dataset_name}':")
57
  for missing_file in missing_files:
58
- print(f" - {missing_file}")
59
-
60
- # Reprocess the dataset with drop_last=False
61
- if (rerun):
62
- print(f"Reprocessing dataset '{dataset_name}")
63
  prep_command = f"bash/prep_data.sh {yaml_file} {dataset_name} {chunks}"
64
  try:
65
  subprocess.run(prep_command, shell=True, check=True)
66
  except subprocess.CalledProcessError as e:
67
- print(f"Could Not Reprocess '{dataset_name}': {e}")
68
  else:
69
- print(f"All files exist for dataset '{dataset_name}'.")
70
 
71
  # Final summary
72
  if all_files_exist:
73
- print("All required files exist for all datasets.")
74
  else:
75
- print("Some files are missing.")
76
 
77
  except Exception as e:
78
  print(f"Error processing {yaml_file}: {e}")
@@ -81,12 +90,9 @@ def main(pargs):
81
  # Base directory containing the YAML files
82
  base_directory = os.getcwd() + "/configs/"
83
 
84
- if (pargs.configs):
85
- # If pargs is a comma-separated list, split it
86
  configs = [p.strip() for p in pargs.configs.split(',')]
87
-
88
  else:
89
- # List of YAML configuration files (relative to the base directory)
90
  configs = [
91
  "attention/ttH_CP_even_vs_odd.yaml",
92
 
@@ -99,11 +105,10 @@ def main(pargs):
99
  "stats_all/ttH_CP_even_vs_odd.yaml",
100
  ]
101
 
102
- # Loop through each config file
103
  for config in configs:
104
- yaml_file = os.path.join(base_directory, config) # Construct full path
105
  if os.path.exists(yaml_file):
106
- print(f"Processing file: {config}")
107
  check_dataset_files(yaml_file, pargs.rerun)
108
  else:
109
  print(f"File not found: {yaml_file}")
@@ -118,10 +123,8 @@ if __name__ == "__main__":
118
  )
119
  parser.add_argument(
120
  "--rerun", "-r",
121
- type=bool,
122
- default=False,
123
- required=False,
124
- help="Automatically re-runs data processing to fix missing files"
125
  )
126
  args = parser.parse_args()
127
  main(args)
 
3
  import subprocess
4
  import argparse
5
 
6
+ def check_dataset_files(yaml_file, rerun=False):
7
  """
8
  Check if all required .bin files exist for each dataset in the YAML file.
9
  """
 
29
  test_folds = folding.get('test', [])
30
  train_folds = folding.get('train', [])
31
 
32
+ print(f"\n== Checking dataset: {dataset_name} ==")
33
+ print(f" save_dir: {save_dir}")
34
+ print(f" chunks: {chunks}")
35
+ print(f" n_folds: {n_folds}")
36
+ print(f" test_folds: {test_folds}")
37
+ print(f" train_folds: {train_folds}")
38
+
39
  missing_files = []
40
+
41
+ # 1. Check for chunk files
42
  for chunk in range(chunks):
43
  chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
44
  if not os.path.exists(chunk_file):
45
  missing_files.append(chunk_file)
46
 
47
+ # 2. Check for prebatched fold files (test and train)
48
+ # Naming: dataset_name_prebatched_padded_{fold}_n_{n_folds}_f_{foldlist}.bin
49
+ fold_types = [('test', test_folds), ('train', train_folds)]
50
+ for fold_type, folds in fold_types:
51
+ if not folds:
52
+ continue
53
+ foldlist_str = '_'.join(map(str, folds))
54
+ for i in range(chunks):
55
+ prebatched_file = os.path.join(
56
+ save_dir,
57
+ f"{dataset_name}_prebatched_padded_{i}_n_{n_folds}_f_{foldlist_str}.bin"
58
+ )
59
+ if not os.path.exists(prebatched_file):
60
+ missing_files.append(prebatched_file)
61
 
62
  # Print results for the current dataset
63
  if missing_files:
64
  all_files_exist = False
65
+ print(f" Missing files for dataset '{dataset_name}':")
66
  for missing_file in missing_files:
67
+ print(f" - {missing_file}")
68
+
69
+ # Optionally rerun data prep
70
+ if rerun:
71
+ print(f" Reprocessing dataset '{dataset_name}' ...")
72
  prep_command = f"bash/prep_data.sh {yaml_file} {dataset_name} {chunks}"
73
  try:
74
  subprocess.run(prep_command, shell=True, check=True)
75
  except subprocess.CalledProcessError as e:
76
+ print(f" Could NOT reprocess '{dataset_name}': {e}")
77
  else:
78
+ print(f" All files exist for dataset '{dataset_name}'.")
79
 
80
  # Final summary
81
  if all_files_exist:
82
+ print("\nAll required files exist for all datasets.")
83
  else:
84
+ print("\nSome files are missing.")
85
 
86
  except Exception as e:
87
  print(f"Error processing {yaml_file}: {e}")
 
90
  # Base directory containing the YAML files
91
  base_directory = os.getcwd() + "/configs/"
92
 
93
+ if pargs.configs:
 
94
  configs = [p.strip() for p in pargs.configs.split(',')]
 
95
  else:
 
96
  configs = [
97
  "attention/ttH_CP_even_vs_odd.yaml",
98
 
 
105
  "stats_all/ttH_CP_even_vs_odd.yaml",
106
  ]
107
 
 
108
  for config in configs:
109
+ yaml_file = os.path.join(base_directory, config)
110
  if os.path.exists(yaml_file):
111
+ print(f"\nProcessing file: {config}")
112
  check_dataset_files(yaml_file, pargs.rerun)
113
  else:
114
  print(f"File not found: {yaml_file}")
 
123
  )
124
  parser.add_argument(
125
  "--rerun", "-r",
126
+ action='store_true', # Correct way for a boolean flag
127
+ help="Automatically re-run data processing to fix missing files"
 
 
128
  )
129
  args = parser.parse_args()
130
  main(args)