"""Command-line driver for a staged diphoton analysis.

Each stage is enabled by a flag (``--step1`` ... ``--step5``, ``--plot``,
``--sig`` or ``--all``) and exchanges data with the other stages through
``.npy``/``.txt`` files in the ``arrays/`` directory:

1. list the ROOT files and summarise their trees/branches,
2. convert ROOT trees to NumPy arrays (46 columns per event),
3. compute diphoton variables, MC event weights, apply the event selection
   and estimate the background from data,
4. run the classifier (``tabpfn`` from ``utils``),
5. place category boundaries on the classifier score,
then optionally plot and compute the final significance.

Column layout of the per-event array written by step 2 / completed by step 3:

    0-5    leading / subleading photon (pt, eta, phi)
    6-11   up to 2 leptons (pt, eta, phi), NaN when absent
    12-29  up to 6 jets (pt, eta, phi), NaN when absent
    30-31  MET (et, phi)
    32     mcWeight (replaced by the final event weight in step 3)
    33     SumWeights
    34     XSection
    35-36  photon_isTightID for photon 0 / 1
    37-43  scale factors (PILEUP, PHOTON, PhotonTRIGGER, ELE, MUON,
           LepTRIGGER, BTAG)
    44-45  m_yy, pt_yy (filled in step 3)
"""
import argparse
import os
import random
import sys

import awkward as ak
import numpy as np
import pandas as pd
import uproot
from ROOT import TLorentzVector

from utils import *

# Add the parent directory to the path for utils_plot
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils_plot import *

# ---------------------------------------------------------------------------
# Constants and helpers (underscore-prefixed so they cannot be confused with
# names provided by the star imports above).
# ---------------------------------------------------------------------------
_DATA_DIR = '/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/'
_TREE_NAME = 'mini;1'
_N_COLUMNS = 46
_LUMINOSITY_PB = 10 * 1000  # 10 fb^-1 expressed in pb^-1 to match XSection units

# Order matters: scale factor k is stored in column 37 + k.
_SCALE_FACTOR_NAMES = (
    'PILEUP',
    'PHOTON',
    'PhotonTRIGGER',
    'ELE',
    'MUON',
    'LepTRIGGER',
    'BTAG',
)

# Only these branches are read from the tree in step 2.
_BRANCHES = [
    'photon_pt', 'photon_eta', 'photon_phi', 'photon_isTightID',
    'lep_n', 'lep_pt', 'lep_eta', 'lep_phi',
    'jet_n', 'jet_pt', 'jet_eta', 'jet_phi',
    'met_et', 'met_phi',
    'mcWeight', 'SumWeights', 'XSection',
] + [f'scaleFactor_{sf_name}' for sf_name in _SCALE_FACTOR_NAMES]


def _tree_keys(root_file):
    """Return the keys of *root_file* whose objects expose a ``keys`` method."""
    trees = []
    for key in root_file.keys():
        try:
            obj = root_file[key]
        except Exception:
            # Best effort: objects that cannot be read are left out of the summary.
            continue
        if hasattr(obj, 'keys'):
            trees.append(key)
    return trees


def _fill_optional_objects(arr, branches, prefix, max_objects, first_column):
    """Copy (pt, eta, phi) of up to *max_objects* objects into *arr*.

    Events with fewer than ``i + 1`` objects keep NaN in the columns of
    object ``i``. The multiplicity is read from ``<prefix>_n``.
    """
    for i in range(max_objects):
        present = ak.to_numpy(branches[f'{prefix}_n'] > i)
        for j, var in enumerate(('pt', 'eta', 'phi')):
            column = first_column + 3 * i + j
            arr[present, column] = ak.to_numpy(branches[f'{prefix}_{var}'][present, i])


def _root_to_array(filepath, fraction):
    """Read the first *fraction* of events of *filepath* into an (N, 46) array.

    The whole requested entry range is read with ``TTree.arrays``. The
    previous ``next(tree.iterate(...))`` returned only the first chunk of the
    iteration, which truncates files larger than the default step size and
    raises ``StopIteration`` when no entries are requested.

    Columns 44 and 45 (m_yy, pt_yy) are left as NaN; step 3 fills them.
    """
    with uproot.open(filepath) as root_file:
        tree = root_file[_TREE_NAME]
        entry_stop = int(tree.num_entries * fraction)
        branches = tree.arrays(_BRANCHES, library="ak", entry_stop=entry_stop)

    n_events = len(branches)
    arr = np.full((n_events, _N_COLUMNS), np.nan)

    # photons: indexing [:, i] assumes every event has at least two photons
    # (NOTE(review): presumably guaranteed by the input skim -- confirm).
    for i in range(2):
        for j, var in enumerate(('pt', 'eta', 'phi')):
            arr[:, 3 * i + j] = ak.to_numpy(branches[f'photon_{var}'][:, i])

    _fill_optional_objects(arr, branches, 'lep', max_objects=2, first_column=6)
    _fill_optional_objects(arr, branches, 'jet', max_objects=6, first_column=12)

    # MET
    arr[:, 30] = ak.to_numpy(branches['met_et'])
    arr[:, 31] = ak.to_numpy(branches['met_phi'])

    # Weights, normalisation inputs and photon identification flags
    arr[:, 32] = ak.to_numpy(branches['mcWeight'])
    arr[:, 33] = ak.to_numpy(branches['SumWeights'])
    arr[:, 34] = ak.to_numpy(branches['XSection'])
    arr[:, 35] = ak.to_numpy(branches['photon_isTightID'][:, 0])
    arr[:, 36] = ak.to_numpy(branches['photon_isTightID'][:, 1])

    # Scale factors
    for offset, sf_name in enumerate(_SCALE_FACTOR_NAMES):
        arr[:, 37 + offset] = ak.to_numpy(branches[f'scaleFactor_{sf_name}'])

    return arr


def _resolve_input(path):
    """Return *path* unchanged if absolute, else relative to the data directory."""
    if os.path.isabs(path):
        return path
    return os.path.join(_DATA_DIR, path)


def _add_diphoton_kinematics(arr):
    """Fill columns 44 (m_yy) and 45 (pt_yy) of *arr* in place."""
    n_events = len(arr)
    m_yy = np.empty(n_events)
    pt_yy = np.empty(n_events)
    # The two vectors are reused; SetPtEtaPhiM overwrites all four components.
    photon1 = TLorentzVector()
    photon2 = TLorentzVector()
    kinematics = zip(arr[:, 0], arr[:, 1], arr[:, 2], arr[:, 3], arr[:, 4], arr[:, 5])
    for idx, (pt1, eta1, phi1, pt2, eta2, phi2) in enumerate(kinematics):
        photon1.SetPtEtaPhiM(pt1, eta1, phi1, 0.0)  # mass = 0 for photons
        photon2.SetPtEtaPhiM(pt2, eta2, phi2, 0.0)  # mass = 0 for photons
        diphoton = photon1 + photon2
        m_yy[idx] = diphoton.M()
        pt_yy[idx] = diphoton.Pt()
    arr[:, 44] = m_yy    # diphoton invariant mass
    arr[:, 45] = pt_yy   # diphoton pT


def _photon_eta_mask(eta):
    """Accept |eta| < 1.37 or 1.52 < |eta| < 2.37."""
    abs_eta = np.abs(eta)
    return (abs_eta < 1.37) | ((abs_eta > 1.52) & (abs_eta < 2.37))


def _apply_kinematic_selection(arr):
    """Return the rows of *arr* passing the photon and mass-window selection.

    Selections (thresholds are written as ``value * 1000`` in the same units
    as the stored pt / m_yy columns):
      * photon |eta| acceptance for both photons,
      * pT > 25*1000 for both photons,
      * pT / m_yy > 0.35 (leading) and > 0.25 (subleading),
      * 105*1000 < m_yy < 160*1000.
    """
    m_yy = arr[:, 44]
    # Avoid division by zero in the pT / m_yy cuts.
    m_yy_safe = np.where(m_yy == 0, 1e-10, m_yy)
    selected = (
        _photon_eta_mask(arr[:, 1])
        & _photon_eta_mask(arr[:, 4])
        & (arr[:, 0] > 25 * 1000) & ((arr[:, 0] / m_yy_safe) > 0.35)
        & (arr[:, 3] > 25 * 1000) & ((arr[:, 3] / m_yy_safe) > 0.25)
        & (m_yy > 105 * 1000) & (m_yy < 160 * 1000)
    )
    return arr[selected]


def _apply_mc_event_weights(arr):
    """Replace column 32 of *arr* by the final MC event weight (in place).

    weight = mcWeight * (XSection * luminosity / SumWeights) * prod(scale factors)

    Zero scale factors are kept (they zero the event weight) but reported.
    """
    mc_weight = arr[:, 32]
    sigma_per_event = arr[:, 34]
    sum_weights_per_event = arr[:, 33]

    # NOTE(review): the original comment states that running with --fraction < 1
    # "still represents the full 10 fb-1 dataset"; no rescaling for the
    # fraction is applied here -- confirm that this is intended.

    # Events whose stored cross-section is ~2.64e-06 are rescaled to
    # 0.000116 pb (described in the original as the SM Higgs -> gamma gamma value).
    problematic_xsec_mask = np.abs(sigma_per_event - 2.64338632e-06) < 1e-10
    if np.any(problematic_xsec_mask):
        correction_factor = 0.000116 / 2.64338632e-06  # ~ 43.9
        sigma_per_event = sigma_per_event.copy()
        sigma_per_event[problematic_xsec_mask] *= correction_factor
        arr[:, 34] = sigma_per_event
        print(f"[Message] Applied cross-section correction factor {correction_factor:.1f} to {np.sum(problematic_xsec_mask)} SM Higgs events")

    normalization_per_event = (sigma_per_event * _LUMINOSITY_PB) / sum_weights_per_event

    combined_scale_factor = np.ones_like(mc_weight)
    for offset, name in enumerate(_SCALE_FACTOR_NAMES):
        sf = arr[:, 37 + offset]
        zero_count = np.sum(sf == 0)
        if zero_count > 0:
            print(f"[Warning] {zero_count} events have zero {name} scale factor")
        combined_scale_factor *= sf

    # The right-hand side is evaluated into a new array before the view
    # ``mc_weight`` (column 32) is overwritten.
    arr[:, 32] = mc_weight * normalization_per_event * combined_scale_factor


# ---------------------------------------------------------------------------
# Parse command-line arguments
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Process physics data analysis steps')
parser.add_argument('--step1', action='store_true', help='Step 1: List ROOT files')
parser.add_argument('--step2', action='store_true', help='Step 2: ROOT -> NumPy conversion')
parser.add_argument('--step3', action='store_true', help='Step 3: Background estimation')
parser.add_argument('--step4', action='store_true', help='Step 4: Classification')
parser.add_argument('--step5', action='store_true', help='Step 5: Categorization')
parser.add_argument('--plot', action='store_true', help='Generate plots')
parser.add_argument('--sig', action='store_true', help='Calculate significance')
parser.add_argument('--all', action='store_true', help='Run all steps')
parser.add_argument("--fraction", type=float, default=1.0, help="Fraction of events to process per file (0.0 - 1.0)")
parser.add_argument("--selective-step2", action='store_true', help='Step 2: Process only specific files with custom output names')
parser.add_argument("--data-file", type=str, default='data_A.GamGam.root', help='Data file to process in selective mode')
parser.add_argument("--signal-file", type=str, default='mc_345318.WpH125J_Wincl_gamgam.GamGam.root', help='Signal file to process in selective mode')
parser.add_argument("--data-output", type=str, default='data_A_raw.npy', help='Output filename for data in selective mode')
parser.add_argument("--signal-output", type=str, default='signal_WH_raw.npy', help='Output filename for signal in selective mode')
args = parser.parse_args()

# Set flags based on arguments
if args.all:
    step1 = step2 = step3 = step4 = step5 = plot = sig = True
else:
    step1 = args.step1
    step2 = args.step2 or args.selective_step2
    step3 = args.step3
    step4 = args.step4
    step5 = args.step5
    plot = args.plot
    sig = args.sig

# Fix random seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Try to set PyTorch seeds if available
try:
    import torch
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("[Info] PyTorch random seeds set for reproducibility")
except ImportError:
    print("[Info] PyTorch not available, skipping PyTorch seed setting")

# Try to set TensorFlow seeds if available
try:
    import tensorflow as tf
    tf.random.set_seed(42)
    print("[Info] TensorFlow random seeds set for reproducibility")
except ImportError:
    print("[Info] TensorFlow not available, skipping TensorFlow seed setting")

fraction = args.fraction  # Use configurable fraction of events

# ---------------------------------------------------------------------------
# Step 1: List ROOT files
# ---------------------------------------------------------------------------
if step1:
    print("-----------------------------------------")
    print("Step 1: Listing ROOT files")
    print("-----------------------------------------")

    # Create arrays directory if it doesn't exist
    os.makedirs('arrays', exist_ok=True)

    filelist = []
    for entry in sorted(os.listdir(_DATA_DIR)):
        print(entry)
        filepath = os.path.join(_DATA_DIR, entry)
        if os.path.isfile(filepath) and filepath.endswith('.root'):
            filelist.append(filepath)

    # Save file names (not full paths) to the arrays directory
    file_list_path = os.path.abspath('arrays/file_list.txt')
    with open('arrays/file_list.txt', 'w') as f:
        for filepath in filelist:
            filename = os.path.basename(filepath)
            f.write(f"{filename}\n")

    print("-------------------------------------------------------------------------")
    print(f"Found {len(filelist)} ROOT files.")
    print(f"File list saved to: {file_list_path}")
    print("-------------------------------------------------------------------------")

    # Analyze trees and branches for each ROOT file
    print("Analyzing ROOT file structures...")
    print("-------------------------------------------------------------------------")

    root_summary_path = os.path.abspath('arrays/root_summary.txt')
    with open('arrays/root_summary.txt', 'w') as f:
        f.write("=" * 80 + "\n")
        f.write("ROOT FILES ANALYSIS SUMMARY\n")
        f.write("=" * 80 + "\n\n")

        # Extract common branches across all files
        print("Extracting common branches across all files...")
        all_branches = {}
        for filepath in filelist:
            try:
                with uproot.open(filepath) as root_file:
                    for tree_name in _tree_keys(root_file):
                        try:
                            branch_names = set(root_file[tree_name].keys())
                        except Exception:
                            continue
                        all_branches.setdefault(tree_name, []).append(branch_names)
            except Exception as e:
                # Best effort: an unreadable file is skipped here and reported
                # again in the per-file section below.
                print(f"[Warning] Skipping {os.path.basename(filepath)} in common-branch scan: {e}")
                continue

        # Find common branches for each tree
        common_branches = {
            tree_name: sorted(set.intersection(*branch_sets))
            for tree_name, branch_sets in all_branches.items()
            if branch_sets
        }

        # Write common branches section
        f.write("COMMON BRANCHES ACROSS ALL FILES\n")
        f.write("=" * 40 + "\n\n")
        for tree_name, branches in common_branches.items():
            f.write(f"Tree: {tree_name}\n")
            f.write(f"Common branches ({len(branches)}):\n")
            f.write(f" {', '.join(sorted(branches))}\n")
            f.write("\n")
        f.write("=" * 80 + "\n\n")

        for i, filepath in enumerate(filelist, 1):
            filename = os.path.basename(filepath)
            f.write(f"File {i}: {filename}\n")
            f.write("-" * (len(filename) + 8) + "\n")
            try:
                with uproot.open(filepath) as root_file:
                    # Get all keys (trees and other objects)
                    keys = list(root_file.keys())
                    f.write(f"Total objects: {len(keys)}\n")

                    # Find trees (objects that can be treated as trees)
                    trees = _tree_keys(root_file)
                    f.write(f"Trees found: {len(trees)}\n")

                    # Analyze each tree
                    for tree_name in trees:
                        f.write(f"\nTree: {tree_name}\n")
                        try:
                            tree = root_file[tree_name]
                            n_entries = tree.num_entries
                            f.write(f" Entries: {n_entries}\n")

                            # Get branch names
                            branches = list(tree.keys())
                            f.write(f" Branches ({len(branches)}):\n")

                            # Group branches by prefix for better readability
                            branch_groups = {}
                            for branch in sorted(branches):
                                prefix = branch.split('_')[0] if '_' in branch else 'other'
                                branch_groups.setdefault(prefix, []).append(branch)
                            for prefix, branch_list in sorted(branch_groups.items()):
                                f.write(f" {prefix}: {', '.join(branch_list)}\n")
                        except Exception as e:
                            f.write(f" Error analyzing tree: {e}\n")
                    f.write("\n" + "=" * 50 + "\n")
            except Exception as e:
                f.write(f"Error opening file: {e}\n\n")

    print(f"ROOT analysis saved to: {root_summary_path}")
    print("--------------------------------")
    print("[Step 1] completed successfully!")

    if not (step2 or step3 or step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if only step1 was requested

# ---------------------------------------------------------------------------
# Step 2: ROOT -> NumPy conversion
# ---------------------------------------------------------------------------
if step2:
    print("-----------------------------------------")
    print("Step 2: ROOT -> NumPy conversion")
    print("-----------------------------------------")

    if args.selective_step2:
        print("Running in selective mode - processing only specified files...")
        print(f"Data file: {args.data_file}")
        print(f"Signal file: {args.signal_file}")
        print(f"Data output: {args.data_output}")
        print(f"Signal output: {args.signal_output}")

        # Selective mode does not depend on step 1, so make sure the output
        # directory exists before saving.
        os.makedirs('arrays', exist_ok=True)

        # Process data file
        data_filepath = _resolve_input(args.data_file)
        if not os.path.exists(data_filepath):
            print(f"Error: Data file {data_filepath} not found.")
            sys.exit(1)
        print(f'Processing data file: {args.data_file}')
        arr = _root_to_array(data_filepath, fraction)
        np.save(f'arrays/{args.data_output}', arr)
        print(f"Saved data to arrays/{args.data_output} with shape: {arr.shape}")

        # Process signal file
        signal_filepath = _resolve_input(args.signal_file)
        if not os.path.exists(signal_filepath):
            print(f"Error: Signal file {signal_filepath} not found.")
            sys.exit(1)
        print(f'Processing signal file: {args.signal_file}')
        arr = _root_to_array(signal_filepath, fraction)
        np.save(f'arrays/{args.signal_output}', arr)
        print(f"Saved signal to arrays/{args.signal_output} with shape: {arr.shape}")

        print("-----------------------------------------------------")
        print("Selective file processing completed successfully!")
        print("[Step 2] completed successfully!")
    else:
        # Original Step 2 logic for processing all files
        # Load filelist from saved file
        if not os.path.exists('arrays/file_list.txt'):
            print("Error: file_list.txt not found. Run --step1 first.")
            sys.exit(1)

        with open('arrays/file_list.txt', 'r') as f:
            # Process all non-empty lines (no header to skip)
            filelist = [
                os.path.join(_DATA_DIR, line.strip())
                for line in f
                if line.strip()
            ]

        signal_data = []
        background_data = []
        for filepath in filelist:
            basename = os.path.basename(filepath)
            name = basename.removesuffix('.root')
            print(f'Starting to read {name} into np array')
            arr = _root_to_array(filepath, fraction)

            # Separate signal (MC) and background (data). The test is made on
            # the file name only so directory names cannot affect it.
            if 'mc' in basename:
                signal_data.append(arr)
            else:
                background_data.append(arr)

        # Concatenate signal and background separately
        if signal_data:
            signal_array = np.concatenate(signal_data, axis=0)
            np.save('arrays/signal_raw.npy', signal_array)
            print(f"Saved {len(signal_data)} signal files to signal_raw.npy with shape: {signal_array.shape}")
        if background_data:
            background_array = np.concatenate(background_data, axis=0)
            np.save('arrays/data_raw.npy', background_array)
            print(f"Saved {len(background_data)} data files to data_raw.npy with shape: {background_array.shape}")

        print("-----------------------------------------------------")
        print(f"Converted {len(filelist)} ROOT files to NumPy arrays")
        print("[Step 2] completed successfully!")

    if not (step3 or step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if only step1 and step2 were requested

# ---------------------------------------------------------------------------
# Step 3: Preprocessing and Background Estimation
# ---------------------------------------------------------------------------
if step3:
    print("-----------------------------------------------")
    print("Step 3: Preprocessing and Background Estimation")
    print("-----------------------------------------------")

    # Load raw data
    if not os.path.exists('arrays/signal_raw.npy') or not os.path.exists('arrays/data_raw.npy'):
        print("Error: signal_raw.npy or data_raw.npy not found. Run --step2 first.")
        sys.exit(1)

    signal_raw = np.load('arrays/signal_raw.npy')
    data_raw = np.load('arrays/data_raw.npy')
    print(f"[Message] Loaded {len(signal_raw)} MC events and {len(data_raw)} data events")

    # Check that arrays have exactly 46 columns
    if signal_raw.shape[1] != _N_COLUMNS:
        print(f"Error: signal_raw.npy has {signal_raw.shape[1]} columns, expected 46. Please re-run --step2 to regenerate with correct format.")
        sys.exit(1)
    if data_raw.shape[1] != _N_COLUMNS:
        print(f"Error: data_raw.npy has {data_raw.shape[1]} columns, expected 46. Please re-run --step2 to regenerate with correct format.")
        sys.exit(1)

    # Process MC: diphoton variables, event weights, selection, tight ID
    _add_diphoton_kinematics(signal_raw)
    _apply_mc_event_weights(signal_raw)

    print("-----------------------------------")
    print("Overall scale factor check for MC: (skipped, treating as 1)")
    print("-----------------------------------")

    signal = _apply_kinematic_selection(signal_raw)
    # Tight ID selection for signal (flags are stored as 0.0 / 1.0)
    signal = signal[(signal[:, 35] == 1) & (signal[:, 36] == 1)]

    # Process data (background): same selection, unit weights, no tight-ID cut
    _add_diphoton_kinematics(data_raw)
    data_raw[:, 32] = 1.0  # Data weights = 1
    bkgd = _apply_kinematic_selection(data_raw)

    print(f"[Message] After preprocessing: {len(signal)} signal events passing cuts, {len(bkgd)} background events passing cuts")

    # Background estimation
    print("-----------------------------------")
    print("Performing background estimation...")
    print("-----------------------------------")

    # Define regions: sidebands (SB), signal region (SR), tight-isolated (TI)
    # and not-tight-isolated (NTI) photon pairs.
    sb_mask = (bkgd[:, 44] < 120 * 1000) | (bkgd[:, 44] > 130 * 1000)
    sr_mask = (bkgd[:, 44] > 123 * 1000) & (bkgd[:, 44] < 127 * 1000)
    ti_mask = (bkgd[:, 35] == 1) & (bkgd[:, 36] == 1)
    nti_mask = (bkgd[:, 35] == 0) | (bkgd[:, 36] == 0)

    # Calculate yields
    ti_sb_yield = np.sum(bkgd[ti_mask & sb_mask, 32])
    ti_sr_yield = np.sum(bkgd[ti_mask & sr_mask, 32])
    nti_sr_yield = np.sum(bkgd[nti_mask & sr_mask, 32])
    nti_sb_yield = np.sum(bkgd[nti_mask & sb_mask, 32])

    # Calculate scale factors
    sf1 = ti_sb_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    sf2 = nti_sr_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    expected_bkgd = nti_sb_yield * sf1 * sf2

    # Apply background estimation: the NTI sideband events model the
    # background, each carrying an equal share of the expected yield.
    bkgd = bkgd[nti_mask & sb_mask]
    if len(bkgd) > 0:
        bkgd[:, 32] = expected_bkgd / len(bkgd)

    # Signal mass cut
    sr_mask = (signal[:, 44] > 123 * 1000) & (signal[:, 44] < 127 * 1000)
    print("Signal yield without mass cut: ", np.sum(signal[:, 32]))
    signal = signal[sr_mask]
    print("Signal yield with mass cut: ", np.sum(signal[:, 32]))

    # Keep only the first 46 columns (no-op for correctly formatted input)
    signal = signal[:, 0:_N_COLUMNS]
    bkgd = bkgd[:, 0:_N_COLUMNS]

    # Print background estimation results
    print("\n=== Background Estimation Results ===")
    print(f"NTI SB yield: {nti_sb_yield:.2f}")
    print(f"Scale factors: SF1 = {sf1:.4f}, SF2 = {sf2:.4f}")
    print(f"Expected background: {expected_bkgd:.2f}")
    print("\nRegion yields:")
    print("")
    print(" NTI SR | NTI SB | TI SR | TI SB")
    print("------------------------------------------------")
    print(f" {nti_sr_yield:>7.2f} | {nti_sb_yield:>7.2f} | {ti_sr_yield:>7.2f} | {ti_sb_yield:>7.2f}")

    np.save('arrays/signal.npy', signal)
    np.save('arrays/bkgd.npy', bkgd)

    print("--------------------------------")
    print("[Step 3] completed successfully!")
    print("--------------------------------")
    print("")

    if not (step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if only step1, step2, and step3 were requested

# Load data for subsequent steps. Only step 3 defines ``signal``/``bkgd`` in
# memory, so they must be read from disk whenever step 3 did not run in this
# invocation (previously e.g. ``--step1 --step4`` hit a NameError).
if not step3:
    if os.path.exists('arrays/signal.npy') and os.path.exists('arrays/bkgd.npy'):
        signal = np.load('arrays/signal.npy')
        bkgd = np.load('arrays/bkgd.npy')
    else:
        print("Error: Processed arrays not found. Run --step3 first to generate them.")
        sys.exit(1)

# ---------------------------------------------------------------------------
# Step 4: classification
# ---------------------------------------------------------------------------
if step4:
    batch_size = 20_000
    test_size = 0.5
    signal_scores, bkgd_scores = tabpfn(signal, bkgd, batch_size=batch_size, test_size=test_size, random_state=42)
    np.save('arrays/signal_scores.npy', signal_scores)
    np.save('arrays/bkgd_scores.npy', bkgd_scores)
    print("[Step 4] completed successfully!")
    print("--------------------------------")

    if not (step5 or plot or sig):
        sys.exit(0)  # Exit if only step1-4 were requested

# Load scores for subsequent steps
if not step4:
    if os.path.exists('arrays/signal_scores.npy') and os.path.exists('arrays/bkgd_scores.npy'):
        signal_scores = np.load('arrays/signal_scores.npy')
        bkgd_scores = np.load('arrays/bkgd_scores.npy')
    else:
        signal_scores = None
        bkgd_scores = None

# ---------------------------------------------------------------------------
# Step 5: categorization
# ---------------------------------------------------------------------------
if step5:
    if signal_scores is None or bkgd_scores is None:
        print("Error: Cannot run step 5 without signal scores. Run --step4 first.")
        sys.exit(1)

    # Always work from the scores as stored on disk.
    signal_scores = np.load('arrays/signal_scores.npy')
    bkgd_scores = np.load('arrays/bkgd_scores.npy')

    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)

    num_bins = 1000
    min_events = 100

    bb = [0, 1]
    ZZ = [get_significance(signal_df, bkgd_df, np.array(bb))]

    # Add boundaries until at least two have been placed and the latest one
    # improves the significance by no more than 5 %.
    while len(ZZ) < 3 or (ZZ[-1] - ZZ[-2]) / ZZ[-2] > 0.05:
        new_boundary, _ = place_boundary(signal_df, bkgd_df, np.array(bb), num_bins, min_events)
        bb.append(new_boundary)
        bb.sort()
        ZZ.append(get_significance(signal_df, bkgd_df, np.array(bb)))

    # NOTE(review): from here on the in-memory scores are those stored in the
    # data frames, which is what --plot / --sig receive when run together with
    # --step5; a separate invocation uses the .npy files instead -- confirm
    # the two are equivalent.
    signal_np = signal_df.AsNumpy(columns=['ml_score', 'normalized_weight']).copy()
    signal_scores = signal_np['ml_score']
    bkgd_np = bkgd_df.AsNumpy(columns=['ml_score', 'normalized_weight']).copy()
    bkgd_scores = bkgd_np['ml_score']

    bb = np.array(bb)
    ZZ = np.array(ZZ)
    print("Boundaries: ", bb)
    print("Significances: ", ZZ)
    np.save('arrays/boundaries.npy', bb)
    np.save('arrays/significances.npy', ZZ)
    print("[Step 5] completed successfully!")
    print("--------------------------------")

    if not (plot or sig):
        sys.exit(0)  # Exit if only step1-5 were requested

# Load boundaries for subsequent steps
if not step5:
    if os.path.exists('arrays/boundaries.npy') and os.path.exists('arrays/significances.npy'):
        bb = np.load('arrays/boundaries.npy')
        significances = np.load('arrays/significances.npy')
    else:
        bb = None
        significances = None

# ---------------------------------------------------------------------------
# Plotting
# ---------------------------------------------------------------------------
if plot:
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot plot without required data. Run --all to generate all necessary files.")
        sys.exit(1)

    # Create plots directory if it doesn't exist
    os.makedirs('plots', exist_ok=True)
    plot_scores(signal, bkgd, signal_scores, bkgd_scores, bb)
    plot_myy(signal, bkgd)
    print("Plotting completed successfully!")
    print("--------------------------------")

    if not sig:
        sys.exit(0)  # Exit if plotting was the last requested step

# ---------------------------------------------------------------------------
# Significance
# ---------------------------------------------------------------------------
if sig:
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot calculate significance without required data. Run --all to generate all necessary files.")
        sys.exit(1)

    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)
    Z = get_significance(signal_df, bkgd_df, bb)
    print('Final significance: {:.2f}'.format(Z))

    # Clean up temporary ROOT files created for significance calculation
    # Respect OUTPUT_DIR if set; otherwise fall back to current directory
    output_dir = os.environ.get('OUTPUT_DIR', os.getcwd())
    results_dir = os.path.join(output_dir, 'results')
    for tmp_name in ('signal.root', 'bkgd.root'):
        tmp_path = os.path.join(results_dir, tmp_name)
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # File might be in use or already deleted

    print("Significance calculation completed successfully!")
    sys.exit(0)  # Exit after significance calculation