|
|
import os |
|
|
import sys |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
import uproot |
|
|
import awkward as ak |
|
|
import random |
|
|
from ROOT import TLorentzVector |
|
|
import argparse |
|
|
|
|
|
from utils import * |
|
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
from utils_plot import * |
|
|
|
|
|
|
|
|
# Command-line interface.  Each analysis stage is an independent boolean
# switch; --all runs the whole chain and --selective-step2 is a variant of
# step 2 that converts only two named files.
parser = argparse.ArgumentParser(description='Process physics data analysis steps')

# (flag, help text) for the simple on/off stage switches.
_STAGE_FLAGS = [
    ('--step1', 'Step 1: List ROOT files'),
    ('--step2', 'Step 2: ROOT -> NumPy conversion'),
    ('--step3', 'Step 3: Background estimation'),
    ('--step4', 'Step 4: Classification'),
    ('--step5', 'Step 5: Categorization'),
    ('--plot', 'Generate plots'),
    ('--sig', 'Calculate significance'),
    ('--all', 'Run all steps'),
]
for _flag, _help in _STAGE_FLAGS:
    parser.add_argument(_flag, action='store_true', help=_help)

parser.add_argument("--fraction", type=float, default=1.0,
                    help="Fraction of events to process per file (0.0 - 1.0)")
parser.add_argument("--selective-step2", action='store_true',
                    help='Step 2: Process only specific files with custom output names')
parser.add_argument("--data-file", type=str, default='data_A.GamGam.root',
                    help='Data file to process in selective mode')
parser.add_argument("--signal-file", type=str,
                    default='mc_345318.WpH125J_Wincl_gamgam.GamGam.root',
                    help='Signal file to process in selective mode')
parser.add_argument("--data-output", type=str, default='data_A_raw.npy',
                    help='Output filename for data in selective mode')
parser.add_argument("--signal-output", type=str, default='signal_WH_raw.npy',
                    help='Output filename for signal in selective mode')

args = parser.parse_args()
|
|
|
|
|
|
|
|
# Resolve the requested stages into plain booleans.  --all turns everything
# on; otherwise each switch is taken as given, with --selective-step2
# implying step 2.
if args.all:
    step1 = step2 = step3 = step4 = step5 = plot = sig = True
else:
    step1 = args.step1
    step2 = args.step2 or args.selective_step2
    step3 = args.step3
    step4 = args.step4
    step5 = args.step5
    plot = args.plot
    sig = args.sig
|
|
|
|
|
|
|
|
|
|
|
# Seed every random-number source we might touch so repeated runs are
# reproducible.  Torch and TensorFlow are optional dependencies: seed them
# only when they can be imported.
_SEED = 42
np.random.seed(_SEED)
random.seed(_SEED)

try:
    import torch

    torch.manual_seed(_SEED)
    torch.cuda.manual_seed(_SEED)
    torch.cuda.manual_seed_all(_SEED)
    # Trade cuDNN autotuning for determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("[Info] PyTorch random seeds set for reproducibility")
except ImportError:
    print("[Info] PyTorch not available, skipping PyTorch seed setting")

try:
    import tensorflow as tf

    tf.random.set_seed(_SEED)
    print("[Info] TensorFlow random seeds set for reproducibility")
except ImportError:
    print("[Info] TensorFlow not available, skipping TensorFlow seed setting")

# Fraction of events read from each ROOT file (0.0 - 1.0).
fraction = args.fraction
|
|
|
|
|
|
|
|
def _tree_names(root_file):
    """Return the keys of *root_file* that behave like TTrees (expose .keys())."""
    names = []
    for key in root_file.keys():
        try:
            if hasattr(root_file[key], 'keys'):
                names.append(key)
        except Exception:
            # Unreadable object: skip it rather than abort the whole scan.
            continue
    return names


if step1:
    # Step 1: enumerate the input ROOT files, persist their basenames for
    # later steps, and dump a human-readable structure summary.
    print("-----------------------------------------")
    print("Step 1: Listing ROOT files")
    print("-----------------------------------------")

    os.makedirs('arrays', exist_ok=True)

    # Collect every regular .root file in the data directory (sorted so the
    # ordering — and hence all downstream arrays — is deterministic).
    filelist = []
    data_dir = '/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/'
    for file in sorted(os.listdir(data_dir)):
        print(file)
        filepath = os.path.join(data_dir, file)
        if os.path.isfile(filepath) and filepath.endswith('.root'):
            filelist.append(filepath)

    # Persist the basenames: step 2 re-reads this list and re-joins data_dir.
    file_list_path = os.path.abspath('arrays/file_list.txt')
    with open('arrays/file_list.txt', 'w') as f:
        for filepath in filelist:
            filename = os.path.basename(filepath)
            # BUG FIX: previously wrote the literal string "(unknown)" for
            # every entry, which made the list useless to step 2.
            f.write(f"{filename}\n")
    print("-------------------------------------------------------------------------")
    print(f"Found {len(filelist)} ROOT files.")
    print(f"File list saved to: {file_list_path}")
    print("-------------------------------------------------------------------------")

    print("Analyzing ROOT file structures...")
    print("-------------------------------------------------------------------------")
    root_summary_path = os.path.abspath('arrays/root_summary.txt')
    with open('arrays/root_summary.txt', 'w') as f:
        f.write("=" * 80 + "\n")
        f.write("ROOT FILES ANALYSIS SUMMARY\n")
        f.write("=" * 80 + "\n\n")

        print("Extracting common branches across all files...")
        # tree name -> list of per-file branch-name sets.
        all_branches = {}
        for filepath in filelist:
            try:
                with uproot.open(filepath) as root_file:
                    for tree_name in _tree_names(root_file):
                        try:
                            branches = list(root_file[tree_name].keys())
                            all_branches.setdefault(tree_name, []).append(set(branches))
                        except Exception:
                            continue
            except Exception:
                continue

        # Branches present in EVERY file that contains the tree.
        common_branches = {}
        for tree_name, branch_sets in all_branches.items():
            if len(branch_sets) > 1:
                common_branches[tree_name] = sorted(set.intersection(*branch_sets))
            elif len(branch_sets) == 1:
                common_branches[tree_name] = sorted(branch_sets[0])

        f.write("COMMON BRANCHES ACROSS ALL FILES\n")
        f.write("=" * 40 + "\n\n")
        for tree_name, branches in common_branches.items():
            f.write(f"Tree: {tree_name}\n")
            f.write(f"Common branches ({len(branches)}):\n")
            f.write(f"  {', '.join(sorted(branches))}\n")
            f.write("\n")
        f.write("=" * 80 + "\n\n")

        # Per-file structure dump: object counts, trees, entry counts and
        # branches grouped by prefix.
        for i, filepath in enumerate(filelist, 1):
            filename = os.path.basename(filepath)
            # BUG FIX: previously wrote "(unknown)" instead of the file name.
            f.write(f"File {i}: {filename}\n")
            f.write("-" * (len(filename) + 8) + "\n")

            try:
                with uproot.open(filepath) as root_file:
                    keys = list(root_file.keys())
                    f.write(f"Total objects: {len(keys)}\n")

                    trees = _tree_names(root_file)
                    f.write(f"Trees found: {len(trees)}\n")

                    for tree_name in trees:
                        f.write(f"\nTree: {tree_name}\n")
                        try:
                            tree = root_file[tree_name]
                            f.write(f"  Entries: {tree.num_entries}\n")

                            branches = list(tree.keys())
                            f.write(f"  Branches ({len(branches)}):\n")

                            # Group branches by the text before the first '_'
                            # to keep the dump readable.
                            branch_groups = {}
                            for branch in sorted(branches):
                                prefix = branch.split('_')[0] if '_' in branch else 'other'
                                branch_groups.setdefault(prefix, []).append(branch)

                            for prefix, branch_list in sorted(branch_groups.items()):
                                f.write(f"    {prefix}: {', '.join(branch_list)}\n")

                        except Exception as e:
                            f.write(f"  Error analyzing tree: {e}\n")

                    f.write("\n" + "=" * 50 + "\n")

            except Exception as e:
                f.write(f"Error opening file: {e}\n\n")

    print(f"ROOT analysis saved to: {root_summary_path}")
    print("--------------------------------")
    print("[Step 1] completed successfully!")
    if not (step2 or step3 or step4 or step5 or plot or sig):
        exit(0)
|
|
|
|
|
|
|
|
def _root_to_array(filepath, fraction):
    """Read the 'mini' tree of *filepath* into a flat (N, 46) float array.

    Only the first ``fraction`` of entries is read.  Column layout:
      0-5   leading/subleading photon (pt, eta, phi)
      6-11  up to two leptons (pt, eta, phi); NaN where the lepton is absent
      12-29 up to six jets (pt, eta, phi); NaN where the jet is absent
      30-31 MET (et, phi)
      32    mcWeight, 33 SumWeights, 34 XSection
      35-36 isTightID flag of the two leading photons
      37-43 per-object scale factors
      44-45 reserved (filled with m_yy / pt_yy in step 3)
    """
    tree = uproot.open(filepath)['mini;1']
    entry_stop = int(tree.num_entries * fraction)
    branches = next(tree.iterate(library="ak", entry_stop=entry_stop))

    N = len(branches)
    arr = np.full((N, 46), np.nan)

    # Photons: the two leading photons are assumed present in every event.
    for i in range(0, 2):
        arr[:, 0 + 3 * i] = ak.to_numpy(branches['photon_pt'][:, i])
        arr[:, 1 + 3 * i] = ak.to_numpy(branches['photon_eta'][:, i])
        arr[:, 2 + 3 * i] = ak.to_numpy(branches['photon_phi'][:, i])

    # Leptons: only fill rows that actually contain an i-th lepton.
    for i in range(0, 2):
        mask = branches['lep_n'] > i
        arr[mask, 6 + 3 * i] = ak.to_numpy(branches['lep_pt'][mask, i])
        arr[mask, 7 + 3 * i] = ak.to_numpy(branches['lep_eta'][mask, i])
        arr[mask, 8 + 3 * i] = ak.to_numpy(branches['lep_phi'][mask, i])

    # Jets: up to six, same masking scheme.
    for i in range(0, 6):
        mask = branches['jet_n'] > i
        arr[mask, 12 + 3 * i] = ak.to_numpy(branches['jet_pt'][mask, i])
        arr[mask, 13 + 3 * i] = ak.to_numpy(branches['jet_eta'][mask, i])
        arr[mask, 14 + 3 * i] = ak.to_numpy(branches['jet_phi'][mask, i])

    # Missing transverse energy.
    arr[:, 30] = ak.to_numpy(branches['met_et'])
    arr[:, 31] = ak.to_numpy(branches['met_phi'])

    # MC bookkeeping and photon identification flags.
    arr[:, 32] = ak.to_numpy(branches['mcWeight'])
    arr[:, 33] = ak.to_numpy(branches['SumWeights'])
    arr[:, 34] = ak.to_numpy(branches['XSection'])
    arr[:, 35] = ak.to_numpy(branches['photon_isTightID'][:, 0])
    arr[:, 36] = ak.to_numpy(branches['photon_isTightID'][:, 1])

    # Per-object scale factors.
    arr[:, 37] = ak.to_numpy(branches['scaleFactor_PILEUP'])
    arr[:, 38] = ak.to_numpy(branches['scaleFactor_PHOTON'])
    arr[:, 39] = ak.to_numpy(branches['scaleFactor_PhotonTRIGGER'])
    arr[:, 40] = ak.to_numpy(branches['scaleFactor_ELE'])
    arr[:, 41] = ak.to_numpy(branches['scaleFactor_MUON'])
    arr[:, 42] = ak.to_numpy(branches['scaleFactor_LepTRIGGER'])
    arr[:, 43] = ak.to_numpy(branches['scaleFactor_BTAG'])

    return arr


if step2:
    # Step 2: convert ROOT files into the 46-column NumPy layout documented
    # on _root_to_array.  The identical conversion used to be copy-pasted
    # three times; it now lives in that single helper.
    print("-----------------------------------------")
    print("Step 2: ROOT -> NumPy conversion")
    print("-----------------------------------------")

    if args.selective_step2:
        # Selective mode: convert exactly one data file and one signal file,
        # each with a user-chosen output name.
        print("Running in selective mode - processing only specified files...")
        print(f"Data file: {args.data_file}")
        print(f"Signal file: {args.signal_file}")
        print(f"Data output: {args.data_output}")
        print(f"Signal output: {args.signal_output}")

        data_dir = '/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/'

        # Relative paths are resolved against the shared data directory.
        if os.path.isabs(args.data_file):
            data_filepath = args.data_file
        else:
            data_filepath = os.path.join(data_dir, args.data_file)

        if not os.path.exists(data_filepath):
            print(f"Error: Data file {data_filepath} not found.")
            exit(1)

        print(f'Processing data file: {args.data_file}')
        arr = _root_to_array(data_filepath, fraction)
        np.save(f'arrays/{args.data_output}', arr)
        print(f"Saved data to arrays/{args.data_output} with shape: {arr.shape}")

        if os.path.isabs(args.signal_file):
            signal_filepath = args.signal_file
        else:
            signal_filepath = os.path.join(data_dir, args.signal_file)

        if not os.path.exists(signal_filepath):
            print(f"Error: Signal file {signal_filepath} not found.")
            exit(1)

        print(f'Processing signal file: {args.signal_file}')
        arr = _root_to_array(signal_filepath, fraction)
        np.save(f'arrays/{args.signal_output}', arr)
        print(f"Saved signal to arrays/{args.signal_output} with shape: {arr.shape}")

        print("-----------------------------------------------------")
        print("Selective file processing completed successfully!")
        print("[Step 2] completed successfully!")
        if not (step3 or step4 or step5 or plot or sig):
            exit(0)

    else:
        # Bulk mode: convert every file recorded by step 1 and split the
        # results into MC ("signal") and data ("background") arrays.
        if not os.path.exists('arrays/file_list.txt'):
            print("Error: file_list.txt not found. Run --step1 first.")
            exit(1)

        filelist = []
        with open('arrays/file_list.txt', 'r') as f:
            for line in f.readlines():
                if line.strip():
                    filename = line.strip()
                    filepath = os.path.join('/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/', filename)
                    filelist.append(filepath)

        signal_data = []
        background_data = []

        for filepath in filelist:
            name = os.path.basename(filepath).removesuffix('.root')
            print(f'Starting to read {name} into np array')
            arr = _root_to_array(filepath, fraction)

            # MC files carry 'mc' in their name; everything else is data.
            if 'mc' in filepath:
                signal_data.append(arr)
            else:
                background_data.append(arr)

        if signal_data:
            signal_array = np.concatenate(signal_data, axis=0)
            np.save('arrays/signal_raw.npy', signal_array)
            print(f"Saved {len(signal_data)} signal files to signal_raw.npy with shape: {signal_array.shape}")

        if background_data:
            background_array = np.concatenate(background_data, axis=0)
            np.save('arrays/data_raw.npy', background_array)
            print(f"Saved {len(background_data)} data files to data_raw.npy with shape: {background_array.shape}")

        print("-----------------------------------------------------")
        print(f"Converted {len(filelist)} ROOT files to NumPy arrays")
        print("[Step 2] completed successfully!")
        if not (step3 or step4 or step5 or plot or sig):
            exit(0)
|
|
|
|
|
|
|
|
def _add_diphoton_kinematics(arr):
    """Fill columns 44 (m_yy) and 45 (pt_yy) in place from the two leading
    photons (columns 0-5), treating each photon as massless."""
    m_yy = []
    pt_yy = []
    for pt1, eta1, phi1, pt2, eta2, phi2 in zip(arr[:, 0], arr[:, 1], arr[:, 2],
                                                arr[:, 3], arr[:, 4], arr[:, 5]):
        photon1 = TLorentzVector()
        photon1.SetPtEtaPhiM(pt1, eta1, phi1, 0.0)
        photon2 = TLorentzVector()
        photon2.SetPtEtaPhiM(pt2, eta2, phi2, 0.0)
        diphoton = photon1 + photon2
        m_yy.append(diphoton.M())
        pt_yy.append(diphoton.Pt())
    arr[:, 44] = np.array(m_yy)
    arr[:, 45] = np.array(pt_yy)


def _photon_selection_mask(arr):
    """Boolean mask for the photon fiducial selection.

    Both photons must be inside the calorimeter acceptance (|eta| < 2.37,
    excluding the 1.37-1.52 crack) and pass pt > 25 GeV with pt/m_yy above
    0.35 (leading) / 0.25 (subleading).
    """
    ph1_eta_mask = (np.abs(arr[:, 1]) < 1.37) | ((np.abs(arr[:, 1]) > 1.52) & (np.abs(arr[:, 1]) < 2.37))
    ph2_eta_mask = (np.abs(arr[:, 4]) < 1.37) | ((np.abs(arr[:, 4]) > 1.52) & (np.abs(arr[:, 4]) < 2.37))

    # Guard against division by zero for pathological m_yy == 0 rows.
    m_yy_safe = np.where(arr[:, 44] == 0, 1e-10, arr[:, 44])
    ph1_pt_mask = (arr[:, 0] > 25 * 1000) & ((arr[:, 0] / m_yy_safe) > 0.35)
    ph2_pt_mask = (arr[:, 3] > 25 * 1000) & ((arr[:, 3] / m_yy_safe) > 0.25)

    return ph1_eta_mask & ph2_eta_mask & ph1_pt_mask & ph2_pt_mask


if step3:
    # Step 3: reconstruct diphoton kinematics, apply MC event weights and the
    # photon selection, then derive a data-driven background estimate from
    # the non-tight-ID sidebands (ABCD-style).
    print("-----------------------------------------------")
    print("Step 3: Preprocessing and Background Estimation")
    print("-----------------------------------------------")

    if not os.path.exists('arrays/signal_raw.npy') or not os.path.exists('arrays/data_raw.npy'):
        print("Error: signal_raw.npy or data_raw.npy not found. Run --step2 first.")
        exit(1)

    signal_raw = np.load('arrays/signal_raw.npy')
    data_raw = np.load('arrays/data_raw.npy')

    print(f"[Message] Loaded {len(signal_raw)} MC events and {len(data_raw)} data events")

    # The 46-column layout is produced by step 2; refuse anything else.
    if signal_raw.shape[1] != 46:
        print(f"Error: signal_raw.npy has {signal_raw.shape[1]} columns, expected 46. Please re-run --step2 to regenerate with correct format.")
        exit(1)
    if data_raw.shape[1] != 46:
        print(f"Error: data_raw.npy has {data_raw.shape[1]} columns, expected 46. Please re-run --step2 to regenerate with correct format.")
        exit(1)

    # ---- MC (signal) preprocessing -------------------------------------
    signal = []
    for arr in [signal_raw]:
        _add_diphoton_kinematics(arr)

        mcWeight = arr[:, 32]
        scaleFactor_PILEUP = arr[:, 37]
        scaleFactor_PHOTON = arr[:, 38]
        scaleFactor_PhotonTRIGGER = arr[:, 39]
        scaleFactor_ELE = arr[:, 40]
        scaleFactor_MUON = arr[:, 41]
        scaleFactor_LepTRIGGER = arr[:, 42]
        scaleFactor_BTAG = arr[:, 43]

        sigma_per_event = arr[:, 34]          # cross section [pb]
        sum_mcWeight_per_event = arr[:, 33]   # sum of generator weights

        # 10 fb^-1 expressed in pb^-1.
        luminosity = 10 * 1000

        # One SM Higgs sample carries a known-bad cross section; rescale it.
        # BUG FIX: the assignment and the report are kept inside the guard so
        # correction_factor is never referenced when no event matches.
        problematic_xsec_mask = np.abs(sigma_per_event - 2.64338632e-06) < 1e-10
        if np.any(problematic_xsec_mask):
            correction_factor = 0.000116 / 2.64338632e-06
            sigma_per_event = sigma_per_event.copy()
            sigma_per_event[problematic_xsec_mask] *= correction_factor
            arr[:, 34] = sigma_per_event
            print(f"[Message] Applied cross-section correction factor {correction_factor:.1f} to {np.sum(problematic_xsec_mask)} SM Higgs events")

        # Per-event luminosity normalisation: sigma * L / sum(mcWeight).
        normalization_per_event = (sigma_per_event * luminosity) / sum_mcWeight_per_event

        scale_factors = {
            'PILEUP': scaleFactor_PILEUP,
            'PHOTON': scaleFactor_PHOTON,
            'PhotonTRIGGER': scaleFactor_PhotonTRIGGER,
            'ELE': scaleFactor_ELE,
            'MUON': scaleFactor_MUON,
            'LepTRIGGER': scaleFactor_LepTRIGGER,
            'BTAG': scaleFactor_BTAG
        }

        # Multiply all scale factors together, warning about zeros (a zero
        # anywhere kills the whole event weight).
        combined_scale_factor = np.ones_like(mcWeight)
        for name, sf in scale_factors.items():
            zero_count = np.sum(sf == 0)
            if zero_count > 0:
                print(f"[Warning] {zero_count} events have zero {name} scale factor")
            combined_scale_factor *= sf

        event_weight = mcWeight * normalization_per_event * combined_scale_factor

        # Column 32 now holds the fully-corrected per-event weight.
        arr[:, 32] = event_weight

        print("-----------------------------------")
        print("Overall scale factor check for MC: (skipped, treating as 1)")
        print("-----------------------------------")

        # Photon fiducial selection, diphoton mass window, tight photon ID.
        arr = arr[_photon_selection_mask(arr)]

        myy_mask = (arr[:, 44] > 105 * 1000) & (arr[:, 44] < 160 * 1000)
        arr = arr[myy_mask]

        ti_mask = (arr[:, 35] == True) & (arr[:, 36] == True)
        arr = arr[ti_mask]

        signal.append(arr)

    signal = np.concatenate(signal, axis=0) if signal else np.empty((0, 46))

    # ---- Data (background) preprocessing --------------------------------
    bkgd = []
    for arr in [data_raw]:
        _add_diphoton_kinematics(arr)
        # Data carries unit weights.
        arr[:, 32] = np.ones_like(arr[:, 32])

        arr = arr[_photon_selection_mask(arr)]

        myy_mask = (arr[:, 44] > 105 * 1000) & (arr[:, 44] < 160 * 1000)
        arr = arr[myy_mask]

        bkgd.append(arr)

    bkgd = np.concatenate(bkgd, axis=0) if bkgd else np.empty((0, 46))

    print(f"[Message] After preprocessing: {len(signal)} signal events passing cuts, {len(bkgd)} background events passing cuts")

    print("-----------------------------------")
    print("Performing background estimation...")
    print("-----------------------------------")

    # Regions: SB = sidebands of the 120-130 GeV window, SR = 123-127 GeV;
    # TI = both photons tight-ID, NTI = at least one photon fails tight-ID.
    sb_mask = (bkgd[:, 44] < 120 * 1000) | (bkgd[:, 44] > 130 * 1000)
    sr_mask = (bkgd[:, 44] > 123 * 1000) & (bkgd[:, 44] < 127 * 1000)
    ti_mask = (bkgd[:, 35] == True) & (bkgd[:, 36] == True)
    nti_mask = (bkgd[:, 35] == False) | (bkgd[:, 36] == False)

    ti_sb_yield = np.sum(bkgd[ti_mask & sb_mask, 32])
    ti_sr_yield = np.sum(bkgd[ti_mask & sr_mask, 32])
    nti_sr_yield = np.sum(bkgd[nti_mask & sr_mask, 32])
    nti_sb_yield = np.sum(bkgd[nti_mask & sb_mask, 32])

    # Transfer factors: NTI SB -> TI SB (ID efficiency) and SB -> SR (shape).
    sf1 = ti_sb_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    sf2 = nti_sr_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    expected_bkgd = nti_sb_yield * sf1 * sf2

    # Keep the NTI sideband events as the background template and rescale
    # their uniform weights so the total equals the expected SR yield.
    bkgd = bkgd[nti_mask & sb_mask]
    if len(bkgd) > 0:
        bkgd[:, 32] = expected_bkgd / len(bkgd)

    # Signal region mass cut on the MC signal.
    sr_mask = (signal[:, 44] > 123 * 1000) & (signal[:, 44] < 127 * 1000)
    print("Signal yield without mass cut: ", np.sum(signal[:, 32]))
    signal = signal[sr_mask]
    print("Signal yield with mass cut: ", np.sum(signal[:, 32]))

    print("\n=== Background Estimation Results ===")
    print(f"NTI SB yield: {nti_sb_yield:.2f}")
    print(f"Scale factors: SF1 = {sf1:.4f}, SF2 = {sf2:.4f}")
    print(f"Expected background: {expected_bkgd:.2f}")
    print("\nRegion yields:")
    print("")
    print(" NTI SR | NTI SB | TI SR | TI SB")
    print("------------------------------------------------")
    print(f" {nti_sr_yield:>7.2f} | {nti_sb_yield:>7.2f} | {ti_sr_yield:>7.2f} | {ti_sb_yield:>7.2f}")

    np.save('arrays/signal.npy', signal)
    np.save('arrays/bkgd.npy', bkgd)

    print("--------------------------------")
    print("[Step 3] completed successfully!")
    print("--------------------------------")
    print("")
    if not (step4 or step5 or plot or sig):
        exit(0)
|
|
|
|
|
|
|
|
# When none of the preprocessing stages ran in this invocation, recover the
# processed arrays that a previous --step3 run saved to disk.
if not (step1 or step2 or step3):
    _signal_path = 'arrays/signal.npy'
    _bkgd_path = 'arrays/bkgd.npy'
    if os.path.exists(_signal_path) and os.path.exists(_bkgd_path):
        signal = np.load(_signal_path)
        bkgd = np.load(_bkgd_path)
    else:
        print("Error: Processed arrays not found. Run --step3 first to generate them.")
        exit(1)
|
|
|
|
|
|
|
|
|
|
|
if step4:
    # Step 4: score every signal/background event with the TabPFN
    # classifier (half of the events are held out for testing) and persist
    # the scores for the categorization and plotting stages.
    signal_scores, bkgd_scores = tabpfn(
        signal,
        bkgd,
        batch_size=20_000,
        test_size=0.5,
        random_state=42,
    )
    np.save('arrays/signal_scores.npy', signal_scores)
    np.save('arrays/bkgd_scores.npy', bkgd_scores)
    print("[Step 4] completed successfully!")
    print("--------------------------------")
    if not (step5 or plot or sig):
        exit(0)
|
|
|
|
|
|
|
|
if not step4:
    # Classification did not run: recover the scores from a previous run if
    # possible, otherwise leave them unset so later stages can bail out.
    signal_scores = None
    bkgd_scores = None
    if os.path.exists('arrays/signal_scores.npy') and os.path.exists('arrays/bkgd_scores.npy'):
        signal_scores = np.load('arrays/signal_scores.npy')
        bkgd_scores = np.load('arrays/bkgd_scores.npy')
|
|
|
|
|
|
|
|
|
|
|
if step5:
    # Step 5: categorization.  Greedily add boundaries on the ML score, one
    # per iteration, until the relative significance gain drops to <= 5%.
    if signal_scores is None or bkgd_scores is None:
        print("Error: Cannot run step 5 without signal scores. Run --step4 first.")
        exit(1)

    # NOTE: a redundant reload of signal_scores.npy / bkgd_scores.npy was
    # removed here — the in-memory arrays are by construction identical to
    # the files (either step 4 just saved them, or they were loaded above).
    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)

    bb = [0, 1]            # category boundaries in score space (start: one bin)
    num_divisions = 0
    cur_Z = get_significance(signal_df, bkgd_df, np.array(bb))
    ZZ = [cur_Z]           # significance after each boundary insertion

    # `len(ZZ) < 3` forces at least two insertions before the relative-gain
    # stopping criterion is consulted.
    while len(ZZ) < 3 or (ZZ[-1] - ZZ[-2]) / ZZ[-2] > 0.05:
        num_bins = 1000    # scan granularity for the new boundary
        min_events = 100   # minimum events required per category
        new_boundary, _ = place_boundary(signal_df, bkgd_df, np.array(bb), num_bins, min_events)
        bb.append(new_boundary)
        bb.sort()
        cur_Z = get_significance(signal_df, bkgd_df, np.array(bb))
        ZZ.append(cur_Z)
        num_divisions += 1

    # Pull scores and normalized weights back out of the dataframes.
    signal_np = signal_df.AsNumpy(columns=['ml_score', 'normalized_weight']).copy()
    signal_scores = signal_np['ml_score']
    signal_weights = signal_np['normalized_weight']
    bkgd_np = bkgd_df.AsNumpy(columns=['ml_score', 'normalized_weight']).copy()
    bkgd_scores = bkgd_np['ml_score']
    bkgd_weights = bkgd_np['normalized_weight']

    bb = np.array(bb)
    ZZ = np.array(ZZ)

    print("Boundaries: ", bb)
    print("Significances: ", ZZ)

    np.save('arrays/boundaries.npy', bb)
    np.save('arrays/significances.npy', ZZ)
    print("[Step 5] completed successfully!")
    print("--------------------------------")
    if not (plot or sig):
        exit(0)
|
|
|
|
|
|
|
|
if not step5:
    # Categorization did not run: recover boundaries/significances from a
    # previous run if present, otherwise mark them missing.
    bb = None
    significances = None
    if os.path.exists('arrays/boundaries.npy') and os.path.exists('arrays/significances.npy'):
        bb = np.load('arrays/boundaries.npy')
        significances = np.load('arrays/significances.npy')
|
|
|
|
|
if plot:
    # Plotting requires the scores and boundaries from the earlier stages.
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot plot without required data. Run --all to generate all necessary files.")
        exit(1)

    os.makedirs('plots', exist_ok=True)

    plot_scores(signal, bkgd, signal_scores, bkgd_scores, bb)
    plot_myy(signal, bkgd)
    print("Plotting completed successfully!")
    print("--------------------------------")
    if not sig:
        exit(0)
|
|
|
|
|
if sig:
    # Final significance on the categorized datasets.
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot calculate significance without required data. Run --all to generate all necessary files.")
        exit(1)

    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)
    Z = get_significance(signal_df, bkgd_df, bb)
    print('Final significance: {:.2f}'.format(Z))

    # Best-effort cleanup of intermediate ROOT files left in the results
    # directory by the significance machinery.
    results_dir = os.path.join(os.environ.get('OUTPUT_DIR', os.getcwd()), 'results')
    for leftover in ('signal.root', 'bkgd.root'):
        root_file = os.path.join(results_dir, leftover)
        if os.path.exists(root_file):
            try:
                os.remove(root_file)
            except OSError:
                pass

    print("Significance calculation completed successfully!")

exit(0)