ho22joshua committed on
Commit
0be75a7
·
1 Parent(s): d1bd3cb

added check data script

Browse files
root_gnn_dgl/jobs/prep_data/run_processing.py CHANGED
@@ -77,9 +77,9 @@ def main():
77
  configs = [
78
  # "configs/stats_100K/pretraining_multiclass.yaml",
79
  # "configs/stats_100K/ttH_CP_even_vs_odd.yaml",
80
- # "configs/stats_all/pretraining_multiclass.yaml",
81
- # "configs/stats_all/ttH_CP_even_vs_odd.yaml",
82
- "configs/attention/ttH_CP_even_vs_odd.yaml",
83
  ]
84
 
85
  # Path to the bash script to be called
 
77
  configs = [
78
  # "configs/stats_100K/pretraining_multiclass.yaml",
79
  # "configs/stats_100K/ttH_CP_even_vs_odd.yaml",
80
+ "configs/stats_all/pretraining_multiclass.yaml",
81
+ "configs/stats_all/ttH_CP_even_vs_odd.yaml",
82
+ # "configs/attention/ttH_CP_even_vs_odd.yaml",
83
  ]
84
 
85
  # Path to the bash script to be called
root_gnn_dgl/scripts/check_dataset_files.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import os
3
+ import subprocess
4
+ import argparse
5
+
6
def check_dataset_files(yaml_file, rerun=False):
    """Verify that every .bin file required by each dataset in a YAML config exists.

    For each entry under the YAML ``Datasets`` section this checks:
      * one ``<dataset>_<chunk>.bin`` file per chunk in ``args.chunks``, and
      * one prebatched/padded file per fold listed in ``folding.test`` and
        ``folding.train``.

    Missing files are reported on stdout.  When ``rerun`` is True, the
    ``bash/prep_data.sh`` script is invoked to regenerate any dataset that
    has missing files.

    Args:
        yaml_file (str): Path to the YAML configuration file.
        rerun (bool): Re-run data processing for datasets with missing files.
    """
    try:
        # Open and parse the YAML file.
        with open(yaml_file, 'r') as file:
            config = yaml.safe_load(file)

        # Nothing to check if the file has no 'Datasets' section.
        if 'Datasets' not in config:
            print(f"No 'Datasets' section found in {yaml_file}.")
            return

        datasets = config['Datasets']
        all_files_exist = True

        for dataset_name, dataset_config in datasets.items():
            # Required layout information for this dataset.
            save_dir = dataset_config['args']['save_dir']
            chunks = dataset_config['args']['chunks']
            folding = dataset_config.get('folding', {})
            n_folds = folding.get('n_folds', 0)
            test_folds = folding.get('test', [])
            train_folds = folding.get('train', [])

            missing_files = []

            # Chunk files: <dataset>_<chunk>.bin
            for chunk in range(chunks):
                chunk_file = os.path.join(save_dir, f"{dataset_name}_{chunk}.bin")
                if not os.path.exists(chunk_file):
                    missing_files.append(chunk_file)

            # Prebatched/padded files for each test/train fold.  The file name
            # encodes the fold index, the fold count, and the full fold list.
            for fold in range(n_folds):
                if fold in test_folds:
                    test_file = os.path.join(
                        save_dir,
                        f"{dataset_name}_prebatched_padded_{fold}_n_{n_folds}"
                        f"_f_{'_'.join(map(str, test_folds))}.bin",
                    )
                    if not os.path.exists(test_file):
                        missing_files.append(test_file)

                if fold in train_folds:
                    train_file = os.path.join(
                        save_dir,
                        f"{dataset_name}_prebatched_padded_{fold}_n_{n_folds}"
                        f"_f_{'_'.join(map(str, train_folds))}.bin",
                    )
                    if not os.path.exists(train_file):
                        missing_files.append(train_file)

            # Report per-dataset results.
            if missing_files:
                all_files_exist = False
                print(f"Missing files for dataset '{dataset_name}':")
                for missing_file in missing_files:
                    print(f" - {missing_file}")

                # Optionally regenerate the dataset via the prep script.
                if rerun:
                    # Fixed: the original message was missing the closing quote.
                    print(f"Reprocessing dataset '{dataset_name}'")
                    # Argument-list form (shell=False) so paths containing
                    # spaces or shell metacharacters cannot break or inject
                    # into the command line.
                    prep_command = [
                        "bash/prep_data.sh",
                        str(yaml_file),
                        str(dataset_name),
                        str(chunks),
                    ]
                    try:
                        subprocess.run(prep_command, check=True)
                    except subprocess.CalledProcessError as e:
                        print(f"Could Not Reprocess '{dataset_name}': {e}")
            else:
                print(f"All files exist for dataset '{dataset_name}'.")

        # Final summary across all datasets.
        if all_files_exist:
            print("All required files exist for all datasets.")
        else:
            print("Some files are missing.")

    except Exception as e:
        # Boundary catch for a CLI helper: report the failure and let the
        # caller move on to the next config file instead of aborting.
        print(f"Error processing {yaml_file}: {e}")
79
+
80
+ def main(pargs):
81
+ # Base directory containing the YAML files
82
+ base_directory = os.getcwd() + "/configs/"
83
+
84
+ if (pargs.configs):
85
+ # If pargs is a comma-separated list, split it
86
+ configs = [p.strip() for p in pargs.configs.split(',')]
87
+
88
+ else:
89
+ # List of YAML configuration files (relative to the base directory)
90
+ configs = [
91
+ "attention/ttH_CP_even_vs_odd.yaml",
92
+
93
+ "stats_100K/finetuning_ttH_CP_even_vs_odd.yaml",
94
+ "stats_100K/pretraining_multiclass.yaml",
95
+ "stats_100K/ttH_CP_even_vs_odd.yaml",
96
+
97
+ "stats_all/finetuning_ttH_CP_even_vs_odd.yaml",
98
+ "stats_all/pretraining_multiclass.yaml",
99
+ "stats_all/ttH_CP_even_vs_odd.yaml",
100
+ ]
101
+
102
+ # Loop through each config file
103
+ for config in configs:
104
+ yaml_file = os.path.join(base_directory, config) # Construct full path
105
+ if os.path.exists(yaml_file):
106
+ print(f"Processing file: {config}")
107
+ check_dataset_files(yaml_file, pargs.rerun)
108
+ else:
109
+ print(f"File not found: {yaml_file}")
110
+
111
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Check YAML config files")
    parser.add_argument(
        "--configs", "-c",
        type=str,
        required=False,
        help="Comma-separated list of YAML config paths relative to base directory"
    )
    # Fixed: argparse's type=bool converts ANY non-empty string (including
    # "False") to True, so the flag could never be meaningfully parsed.
    # A boolean switch should be action="store_true": absent -> False,
    # present -> True.
    parser.add_argument(
        "--rerun", "-r",
        action="store_true",
        help="Automatically re-runs data processing to fix missing files"
    )
    args = parser.parse_args()
    main(args)