import os
import sys
import numpy as np
import pandas as pd
import uproot
import awkward as ak
import random
from ROOT import TLorentzVector
import argparse

from utils import *
# Add the parent directory to the path for utils_plot
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils_plot import *

# Parse command-line arguments
def _fraction_arg(value):
    """argparse ``type`` callback for --fraction.

    Parses ``value`` as a float and rejects anything outside [0.0, 1.0]
    (NaN included), so a bad fraction is reported at the command line
    instead of being turned into an unexpected ``entry_stop`` in Step 2.
    """
    try:
        number = float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {value!r}") from None
    if not 0.0 <= number <= 1.0:
        raise argparse.ArgumentTypeError(f"fraction must be within [0.0, 1.0], got {value!r}")
    return number


parser = argparse.ArgumentParser(description='Process physics data analysis steps')
parser.add_argument('--step1', action='store_true', help='Step 1: List ROOT files')
parser.add_argument('--step2', action='store_true', help='Step 2: ROOT -> NumPy conversion')
parser.add_argument('--step3', action='store_true', help='Step 3: Background estimation')
parser.add_argument('--step4', action='store_true', help='Step 4: Classification')
parser.add_argument('--step5', action='store_true', help='Step 5: Categorization')
parser.add_argument('--plot', action='store_true', help='Generate plots')
parser.add_argument('--sig', action='store_true', help='Calculate significance')
parser.add_argument('--all', action='store_true', help='Run all steps')
parser.add_argument("--fraction", type=_fraction_arg, default=1.0, help="Fraction of events to process per file (0.0 - 1.0)")
parser.add_argument("--selective-step2", action='store_true', help='Step 2: Process only specific files with custom output names')
parser.add_argument("--data-file", type=str, default='data_A.GamGam.root', help='Data file to process in selective mode')
parser.add_argument("--signal-file", type=str, default='mc_345318.WpH125J_Wincl_gamgam.GamGam.root', help='Signal file to process in selective mode')
parser.add_argument("--data-output", type=str, default='data_A_raw.npy', help='Output filename for data in selective mode')
parser.add_argument("--signal-output", type=str, default='signal_WH_raw.npy', help='Output filename for signal in selective mode')

args = parser.parse_args()

# Set flags based on arguments: --all switches every step on, otherwise each
# step follows its own flag. --selective-step2 implies Step 2.
step1 = args.all or args.step1
step2 = args.all or args.step2 or args.selective_step2
step3 = args.all or args.step3
step4 = args.all or args.step4
step5 = args.all or args.step5
plot = args.all or args.plot
sig = args.all or args.sig


# Seed every random number generator in use (always 42) so runs are reproducible
for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(42)

# PyTorch is optional: seed it (CPU and CUDA) only when it can be imported
try:
    import torch
    for _seed_rng in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_rng(42)
    # cuDNN flags requested alongside the seeds
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    print("[Info] PyTorch random seeds set for reproducibility")
except ImportError:
    print("[Info] PyTorch not available, skipping PyTorch seed setting")

# TensorFlow is optional as well
try:
    import tensorflow as tf
    tf.random.set_seed(42)
    print("[Info] TensorFlow random seeds set for reproducibility")
except ImportError:
    print("[Info] TensorFlow not available, skipping TensorFlow seed setting")

# Fraction of each file's events to read in Step 2 (from --fraction)
fraction = args.fraction

# Step 1: List ROOT files
def _tree_like_keys(root_file):
    """Return the keys of ``root_file`` whose objects expose a ``keys`` attribute.

    Objects that cannot be read are skipped. Only ``Exception`` is caught so
    that Ctrl-C and ``SystemExit`` are no longer swallowed.
    """
    found = []
    for key in root_file.keys():
        try:
            obj = root_file[key]
        except Exception:
            continue
        if hasattr(obj, 'keys'):  # It's a tree/directory
            found.append(key)
    return found


if step1:
    print("-----------------------------------------")
    print("Step 1: Listing ROOT files")
    print("-----------------------------------------")
    # Create arrays directory if it doesn't exist
    os.makedirs('arrays', exist_ok=True)

    # Collect every regular *.root file in the data directory (sorted by name)
    filelist = []
    data_dir = '/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/'
    for file in sorted(os.listdir(data_dir)):
        print(file)
        filepath = os.path.join(data_dir, file)
        if os.path.isfile(filepath) and filepath.endswith('.root'):
            filelist.append(filepath)

    # Save the base names (one per line, no header); Step 2 re-joins them
    # with the data directory.
    file_list_path = os.path.abspath('arrays/file_list.txt')
    with open(file_list_path, 'w') as f:
        for filepath in filelist:
            f.write(f"{os.path.basename(filepath)}\n")
    print("-------------------------------------------------------------------------")
    print(f"Found {len(filelist)} ROOT files.")
    print(f"File list saved to: {file_list_path}")
    print("-------------------------------------------------------------------------")

    # Analyze trees and branches for each ROOT file
    print("Analyzing ROOT file structures...")
    print("-------------------------------------------------------------------------")
    root_summary_path = os.path.abspath('arrays/root_summary.txt')
    with open(root_summary_path, 'w') as f:
        f.write("=" * 80 + "\n")
        f.write("ROOT FILES ANALYSIS SUMMARY\n")
        f.write("=" * 80 + "\n\n")

        # First pass: per tree name, collect one set of branch names for each
        # file that contains the tree. Unreadable files/trees are skipped.
        print("Extracting common branches across all files...")
        all_branches = {}
        for filepath in filelist:
            try:
                with uproot.open(filepath) as root_file:
                    for tree_name in _tree_like_keys(root_file):
                        try:
                            branches = set(root_file[tree_name].keys())
                        except Exception:
                            continue
                        all_branches.setdefault(tree_name, []).append(branches)
            except Exception:
                continue

        # Common branches = intersection over the files in which the tree was
        # readable (a single set intersects to itself).
        common_branches = {
            tree_name: sorted(set.intersection(*branch_sets))
            for tree_name, branch_sets in all_branches.items()
        }

        # Write common branches section
        f.write("COMMON BRANCHES ACROSS ALL FILES\n")
        f.write("=" * 40 + "\n\n")
        for tree_name, branches in common_branches.items():
            f.write(f"Tree: {tree_name}\n")
            f.write(f"Common branches ({len(branches)}):\n")
            f.write(f"  {', '.join(branches)}\n")
            f.write("\n")
        f.write("=" * 80 + "\n\n")

        # Second pass: one section per file with entry counts and branch names
        for i, filepath in enumerate(filelist, 1):
            filename = os.path.basename(filepath)
            f.write(f"File {i}: {filename}\n")
            f.write("-" * (len(filename) + 8) + "\n")

            try:
                with uproot.open(filepath) as root_file:
                    # Get all keys (trees and other objects)
                    keys = list(root_file.keys())
                    f.write(f"Total objects: {len(keys)}\n")

                    trees = _tree_like_keys(root_file)
                    f.write(f"Trees found: {len(trees)}\n")

                    # Analyze each tree
                    for tree_name in trees:
                        f.write(f"\nTree: {tree_name}\n")
                        try:
                            tree = root_file[tree_name]
                            n_entries = tree.num_entries
                            f.write(f"  Entries: {n_entries}\n")

                            branches = list(tree.keys())
                            f.write(f"  Branches ({len(branches)}):\n")

                            # Group branches by the text before the first '_'
                            # ('other' when there is no underscore) for readability
                            branch_groups = {}
                            for branch in sorted(branches):
                                prefix = branch.split('_')[0] if '_' in branch else 'other'
                                branch_groups.setdefault(prefix, []).append(branch)

                            for prefix, branch_list in sorted(branch_groups.items()):
                                f.write(f"    {prefix}: {', '.join(branch_list)}\n")

                        except Exception as e:
                            # Record the problem in the summary and keep going
                            f.write(f"  Error analyzing tree: {e}\n")

                    f.write("\n" + "=" * 50 + "\n")

            except Exception as e:
                f.write(f"Error opening file: {e}\n\n")

    print(f"ROOT analysis saved to: {root_summary_path}")
    print("--------------------------------")
    print("[Step 1] completed successfully!")
    if not (step2 or step3 or step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if only step1 was requested

# Step 2: ROOT -> NumPy conversion
#
# Column layout of every array written by this step (46 float columns,
# NaN where an object is absent):
#    0-5   photon 1, photon 2: pt, eta, phi
#    6-11  lepton 1, lepton 2: pt, eta, phi
#   12-29  jets 1-6:           pt, eta, phi
#   30-31  met_et, met_phi
#   32-34  mcWeight, SumWeights, XSection
#   35-36  photon_isTightID of photon 1, photon 2
#   37-43  scale factors (PILEUP, PHOTON, PhotonTRIGGER, ELE, MUON, LepTRIGGER, BTAG)
#   44-45  left NaN here; Step 3 fills them with m_yy and pt_yy
_STEP2_DATA_DIR = '/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/'

# (column, branch) pairs for the one-value-per-event branches
_STEP2_SCALAR_COLUMNS = (
    (30, 'met_et'),
    (31, 'met_phi'),
    (32, 'mcWeight'),
    (33, 'SumWeights'),
    (34, 'XSection'),
    (37, 'scaleFactor_PILEUP'),
    (38, 'scaleFactor_PHOTON'),
    (39, 'scaleFactor_PhotonTRIGGER'),
    (40, 'scaleFactor_ELE'),
    (41, 'scaleFactor_MUON'),
    (42, 'scaleFactor_LepTRIGGER'),
    (43, 'scaleFactor_BTAG'),
)

# Every branch the conversion reads; only these are loaded from disk
_STEP2_BRANCHES = [
    'photon_pt', 'photon_eta', 'photon_phi', 'photon_isTightID',
    'lep_n', 'lep_pt', 'lep_eta', 'lep_phi',
    'jet_n', 'jet_pt', 'jet_eta', 'jet_phi',
] + [branch for _, branch in _STEP2_SCALAR_COLUMNS]


def _root_to_numpy(filepath, fraction):
    """Convert the leading ``fraction`` of tree 'mini;1' in ``filepath`` to an (N, 46) array.

    The entries are read in one ``TTree.arrays`` call. The previous
    ``next(tree.iterate(...))`` returned only the first chunk (uproot's
    default ``step_size`` is "100 MB"), which silently dropped events from
    large files. The file is closed as soon as the arrays are in memory.
    """
    with uproot.open(filepath) as root_file:
        tree = root_file['mini;1']
        entry_stop = int(tree.num_entries * fraction)
        branches = tree.arrays(filter_name=_STEP2_BRANCHES, entry_stop=entry_stop, library="ak")

    arr = np.full((len(branches), 46), np.nan)

    # Photons: the first two entries in stored order. Every event is assumed
    # to have at least two photons, presumably pT-ordered in the ntuple
    # (TODO confirm) -- nothing is re-sorted here.
    for i in range(2):
        arr[:, 0 + 3*i] = ak.to_numpy(branches['photon_pt'][:, i])
        arr[:, 1 + 3*i] = ak.to_numpy(branches['photon_eta'][:, i])
        arr[:, 2 + 3*i] = ak.to_numpy(branches['photon_phi'][:, i])

    # Leptons (2 slots from column 6) and jets (6 slots from column 12):
    # slot i is filled only for events that have more than i such objects.
    for prefix, first_col, n_slots in (('lep', 6, 2), ('jet', 12, 6)):
        for i in range(n_slots):
            mask = branches[f'{prefix}_n'] > i
            arr[mask, first_col + 3*i] = ak.to_numpy(branches[f'{prefix}_pt'][mask, i])
            arr[mask, first_col + 1 + 3*i] = ak.to_numpy(branches[f'{prefix}_eta'][mask, i])
            arr[mask, first_col + 2 + 3*i] = ak.to_numpy(branches[f'{prefix}_phi'][mask, i])

    # MET, weights, cross-section and scale factors
    for col, branch in _STEP2_SCALAR_COLUMNS:
        arr[:, col] = ak.to_numpy(branches[branch])

    # Tight-ID flags of the two photons
    arr[:, 35] = ak.to_numpy(branches['photon_isTightID'][:, 0])
    arr[:, 36] = ak.to_numpy(branches['photon_isTightID'][:, 1])

    return arr


if step2:
    print("-----------------------------------------")
    print("Step 2: ROOT -> NumPy conversion")
    print("-----------------------------------------")

    if args.selective_step2:
        # NOTE: the default output names of this mode (data_A_raw.npy,
        # signal_WH_raw.npy) are not the ones Step 3 loads
        # (arrays/data_raw.npy, arrays/signal_raw.npy).
        print("Running in selective mode - processing only specified files...")
        print(f"Data file: {args.data_file}")
        print(f"Signal file: {args.signal_file}")
        print(f"Data output: {args.data_output}")
        print(f"Signal output: {args.signal_output}")

        # Selective mode does not depend on Step 1, so make sure the output
        # directory exists before saving.
        os.makedirs('arrays', exist_ok=True)

        # Data first, then signal; each is checked, converted and saved in turn
        for label, in_name, out_name in (
            ('data', args.data_file, args.data_output),
            ('signal', args.signal_file, args.signal_output),
        ):
            # os.path.join keeps an absolute in_name unchanged
            in_path = os.path.join(_STEP2_DATA_DIR, in_name)
            if not os.path.exists(in_path):
                print(f"Error: {label.capitalize()} file {in_path} not found.")
                sys.exit(1)

            print(f'Processing {label} file: {in_name}')
            arr = _root_to_numpy(in_path, fraction)
            np.save(f'arrays/{out_name}', arr)
            print(f"Saved {label} to arrays/{out_name} with shape: {arr.shape}")

        print("-----------------------------------------------------")
        print("Selective file processing completed successfully!")
        print("[Step 2] completed successfully!")

    else:
        # Process every file listed by Step 1
        if not os.path.exists('arrays/file_list.txt'):
            print("Error: file_list.txt not found. Run --step1 first.")
            sys.exit(1)

        # One base name per non-empty line (no header to skip)
        with open('arrays/file_list.txt', 'r') as f:
            filelist = [os.path.join(_STEP2_DATA_DIR, line.strip()) for line in f if line.strip()]

        signal_data = []
        background_data = []

        for filepath in filelist:
            name = os.path.basename(filepath).removesuffix('.root')
            print(f'Starting to read {name} into np array')
            arr = _root_to_numpy(filepath, fraction)

            # Files with 'mc' in their *name* are simulation (signal); everything
            # else is data. The directory part is ignored so a path component
            # containing 'mc' cannot misclassify a data file.
            if 'mc' in name:
                signal_data.append(arr)
            else:
                background_data.append(arr)

        # Concatenate signal and background separately
        if signal_data:
            signal_array = np.concatenate(signal_data, axis=0)
            np.save('arrays/signal_raw.npy', signal_array)
            print(f"Saved {len(signal_data)} signal files to signal_raw.npy with shape: {signal_array.shape}")

        if background_data:
            background_array = np.concatenate(background_data, axis=0)
            np.save('arrays/data_raw.npy', background_array)
            print(f"Saved {len(background_data)} data files to data_raw.npy with shape: {background_array.shape}")

        print("-----------------------------------------------------")
        print(f"Converted {len(filelist)} ROOT files to NumPy arrays")
        print("[Step 2] completed successfully!")

    if not (step3 or step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if no later step was requested

# Step 3: Preprocessing and Background Estimation
def _fill_diphoton_columns(arr):
    """Fill column 44 (m_yy) and column 45 (pt_yy) of ``arr`` in place.

    Both photons are built as massless TLorentzVectors from columns 0-2 and
    3-5 (pt, eta, phi); the diphoton system is their sum.
    """
    m_yy = np.empty(len(arr))
    pt_yy = np.empty(len(arr))
    photon_columns = zip(arr[:, 0], arr[:, 1], arr[:, 2], arr[:, 3], arr[:, 4], arr[:, 5])
    for row, (pt1, eta1, phi1, pt2, eta2, phi2) in enumerate(photon_columns):
        photon1 = TLorentzVector()
        photon1.SetPtEtaPhiM(pt1, eta1, phi1, 0.0)  # mass = 0 for photons
        photon2 = TLorentzVector()
        photon2.SetPtEtaPhiM(pt2, eta2, phi2, 0.0)  # mass = 0 for photons
        diphoton = photon1 + photon2
        m_yy[row] = diphoton.M()
        pt_yy[row] = diphoton.Pt()
    arr[:, 44] = m_yy   # diphoton invariant mass
    arr[:, 45] = pt_yy  # diphoton pT


def _select_diphoton_events(arr):
    """Return the rows of ``arr`` that pass the kinematic diphoton selection.

    Cuts (thresholds are written as GeV * 1000, i.e. the columns are
    presumably in MeV -- confirm against the ntuple):
      * |eta| < 2.37 and not in [1.37, 1.52] for both photons
      * pT > 25 GeV for both photons
      * pT / m_yy > 0.35 (leading) and > 0.25 (subleading)
      * 105 GeV < m_yy < 160 GeV
    """
    abs_eta1 = np.abs(arr[:, 1])
    abs_eta2 = np.abs(arr[:, 4])
    ph1_eta_mask = (abs_eta1 < 1.37) | ((abs_eta1 > 1.52) & (abs_eta1 < 2.37))
    ph2_eta_mask = (abs_eta2 < 1.37) | ((abs_eta2 > 1.52) & (abs_eta2 < 2.37))

    # Avoid division by zero in the pT/m_yy cuts
    m_yy_safe = np.where(arr[:, 44] == 0, 1e-10, arr[:, 44])
    ph1_pt_mask = (arr[:, 0] > 25*1000) & ((arr[:, 0] / m_yy_safe) > 0.35)
    ph2_pt_mask = (arr[:, 3] > 25*1000) & ((arr[:, 3] / m_yy_safe) > 0.25)

    myy_mask = (arr[:, 44] > 105*1000) & (arr[:, 44] < 160*1000)

    return arr[ph1_eta_mask & ph2_eta_mask & ph1_pt_mask & ph2_pt_mask & myy_mask]


if step3:
    print("-----------------------------------------------")
    print("Step 3: Preprocessing and Background Estimation")
    print("-----------------------------------------------")

    # Load raw data
    if not os.path.exists('arrays/signal_raw.npy') or not os.path.exists('arrays/data_raw.npy'):
        print("Error: signal_raw.npy or data_raw.npy not found. Run --step2 first.")
        sys.exit(1)

    signal_raw = np.load('arrays/signal_raw.npy')
    data_raw = np.load('arrays/data_raw.npy')

    print(f"[Message] Loaded {len(signal_raw)} MC events and {len(data_raw)} data events")

    # Check that arrays have exactly 46 columns
    for raw_name, raw in (('signal_raw.npy', signal_raw), ('data_raw.npy', data_raw)):
        if raw.shape[1] != 46:
            print(f"Error: {raw_name} has {raw.shape[1]} columns, expected 46. Please re-run --step2 to regenerate with correct format.")
            sys.exit(1)

    # ------------------------------------------------------------------
    # MC (signal)
    # ------------------------------------------------------------------
    arr = signal_raw
    _fill_diphoton_columns(arr)

    # Luminosity in pb^-1 (10 fb^-1) to match the XSection units.
    # NOTE(review): the weights are not rescaled for --fraction, so running
    # Step 2 with fraction < 1 lowers the MC yield accordingly -- confirm
    # this is intended.
    luminosity = 10 * 1000

    # CORRECTION: events whose cross-section is ~2.64e-06 get it replaced by
    # the expected SM Higgs -> gamma gamma value of 0.000116 pb; the corrected
    # value is also written back to column 34.
    sigma_per_event = arr[:, 34].copy()
    problematic_xsec_mask = np.abs(sigma_per_event - 2.64338632e-06) < 1e-10
    if np.any(problematic_xsec_mask):
        correction_factor = 0.000116 / 2.64338632e-06  # ≈ 43.9
        sigma_per_event[problematic_xsec_mask] *= correction_factor
        arr[:, 34] = sigma_per_event
        print(f"[Message] Applied cross-section correction factor {correction_factor:.1f} to {np.sum(problematic_xsec_mask)} SM Higgs events")

    # Normalization per event: (cross_section * luminosity) / sum_of_weights
    normalization_per_event = (sigma_per_event * luminosity) / arr[:, 33]

    # Product of all per-event scale factors. Zero values are kept (they zero
    # the event weight) but are reported.
    scale_factor_columns = {
        'PILEUP': 37,
        'PHOTON': 38,
        'PhotonTRIGGER': 39,
        'ELE': 40,
        'MUON': 41,
        'LepTRIGGER': 42,
        'BTAG': 43,
    }
    combined_scale_factor = np.ones(len(arr))
    for name, column in scale_factor_columns.items():
        sf = arr[:, column]
        zero_count = np.sum(sf == 0)
        if zero_count > 0:
            print(f"[Warning] {zero_count} events have zero {name} scale factor")
        combined_scale_factor *= sf

    # Final event weight = mcWeight * normalization * combined scale factor;
    # it replaces the raw mcWeight in column 32.
    event_weight = arr[:, 32] * normalization_per_event * combined_scale_factor
    arr[:, 32] = event_weight

    # This banner refers to an additional overall factor that is not applied;
    # the per-object scale factors above ARE part of the event weight.
    print("-----------------------------------")
    print("Overall scale factor check for MC: (skipped, treating as 1)")
    print("-----------------------------------")

    arr = _select_diphoton_events(arr)

    # Tight ID selection for signal: both photons must be tight
    ti_mask = (arr[:, 35] == 1) & (arr[:, 36] == 1)
    signal = arr[ti_mask]

    # ------------------------------------------------------------------
    # Data (background)
    # ------------------------------------------------------------------
    arr = data_raw
    _fill_diphoton_columns(arr)
    arr[:, 32] = 1.0  # Data weights = 1

    # Same kinematic selection as signal (no tight-ID requirement here)
    bkgd = _select_diphoton_events(arr)

    print(f"[Message] After preprocessing: {len(signal)} signal events passing cuts, {len(bkgd)} background events passing cuts")

    # Background estimation
    print("-----------------------------------")
    print("Performing background estimation...")
    print("-----------------------------------")

    # Define regions: sidebands (SB) are m_yy < 120 or > 130 GeV, the signal
    # region (SR) is 123-127 GeV; TI = both photons tight, NTI = at least
    # one photon not tight.
    sb_mask = (bkgd[:, 44] < 120*1000) | (bkgd[:, 44] > 130*1000)
    sr_mask = (bkgd[:, 44] > 123*1000) & (bkgd[:, 44] < 127*1000)
    ti_mask = (bkgd[:, 35] == 1) & (bkgd[:, 36] == 1)
    nti_mask = (bkgd[:, 35] == 0) | (bkgd[:, 36] == 0)

    # Calculate yields
    ti_sb_yield = np.sum(bkgd[ti_mask & sb_mask, 32])
    ti_sr_yield = np.sum(bkgd[ti_mask & sr_mask, 32])
    nti_sr_yield = np.sum(bkgd[nti_mask & sr_mask, 32])
    nti_sb_yield = np.sum(bkgd[nti_mask & sb_mask, 32])

    # Calculate scale factors (fall back to 1 when the NTI sideband is empty)
    sf1 = ti_sb_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    sf2 = nti_sr_yield / nti_sb_yield if nti_sb_yield > 0 else 1.0
    expected_bkgd = nti_sb_yield * sf1 * sf2

    # Apply background estimation: keep the NTI sideband events as the
    # background template and spread the expected yield evenly over them.
    bkgd = bkgd[nti_mask & sb_mask]
    if len(bkgd) > 0:
        bkgd[:, 32] = expected_bkgd / len(bkgd)

    # Signal mass cut
    sr_mask = (signal[:, 44] > 123*1000) & (signal[:, 44] < 127*1000)
    print("Signal yield without mass cut: ", np.sum(signal[:,32]))
    signal = signal[sr_mask]
    print("Signal yield with mass cut: ", np.sum(signal[:,32]))

    # Print background estimation results
    print("\n=== Background Estimation Results ===")
    print(f"NTI SB yield: {nti_sb_yield:.2f}")
    print(f"Scale factors: SF1 = {sf1:.4f}, SF2 = {sf2:.4f}")
    print(f"Expected background: {expected_bkgd:.2f}")
    print("\nRegion yields:")
    print("")
    print("  NTI SR   | NTI SB    | TI SR    | TI SB")
    print("------------------------------------------------")
    print(f"  {nti_sr_yield:>7.2f} | {nti_sb_yield:>7.2f} | {ti_sr_yield:>7.2f} | {ti_sb_yield:>7.2f}")

    # All 46 columns are kept in the saved arrays
    np.save('arrays/signal.npy', signal)
    np.save('arrays/bkgd.npy', bkgd)

    print("--------------------------------")
    print("[Step 3] completed successfully!")
    print("--------------------------------")
    print("")
    if not (step4 or step5 or plot or sig):
        sys.exit(0)  # Exit if no later step was requested

# Load data for subsequent steps
# `signal` and `bkgd` exist in memory only when Step 3 ran in this invocation.
# Steps 1 and 2 never define them, so the arrays must be loaded whenever
# Step 3 was skipped (previously e.g. `--step2 --step4` reached Step 4 with
# both names undefined).
if not step3:
    if os.path.exists('arrays/signal.npy') and os.path.exists('arrays/bkgd.npy'):
        signal = np.load('arrays/signal.npy')
        bkgd = np.load('arrays/bkgd.npy')
    else:
        print("Error: Processed arrays not found. Run --step3 first to generate them.")
        sys.exit(1)


# Step 4: classification
# Scores the signal and background arrays with the `tabpfn` helper and stores
# the two returned score arrays under arrays/ for the later stages.
if step4:
    batch_size = 20_000
    test_size = 0.5
    signal_scores, bkgd_scores = tabpfn(
        signal,
        bkgd,
        batch_size=batch_size,
        test_size=test_size,
        random_state=42,  # fixed seed passed through to the helper
    )
    np.save('arrays/signal_scores.npy', signal_scores)
    np.save('arrays/bkgd_scores.npy', bkgd_scores)
    print("[Step 4] completed successfully!")
    print("--------------------------------")
    if not (step5 or plot or sig):
        # Nothing beyond step 4 was requested. sys.exit is used instead of the
        # builtin exit(), which is an interactive helper installed by `site`.
        sys.exit(0)

# When step 4 did not run in this invocation, pick up previously saved scores.
# Both files must exist; otherwise both names are set to None so that the
# later stages can detect the missing input.
if not step4:
    score_files = ('arrays/signal_scores.npy', 'arrays/bkgd_scores.npy')
    if all(os.path.exists(score_file) for score_file in score_files):
        signal_scores, bkgd_scores = (np.load(score_file) for score_file in score_files)
    else:
        signal_scores = bkgd_scores = None


# Step 5: categorization
# Starting from the single category [0, 1], repeatedly asks `place_boundary`
# for one more score boundary and records the significance after each split.
# The resulting boundaries and significances are saved under arrays/.
if step5:
    if signal_scores is None or bkgd_scores is None:
        print("Error: Cannot run step 5 without signal scores. Run --step4 first.")
        # sys.exit rather than the interactive-only builtin exit().
        sys.exit(1)

    # Build the dataframes from the scores as persisted on disk.
    signal_scores = np.load('arrays/signal_scores.npy')
    bkgd_scores = np.load('arrays/bkgd_scores.npy')
    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)

    # Loop-invariant settings forwarded to place_boundary.
    num_bins = 1000
    min_events = 100

    bb = [0, 1]
    ZZ = [get_significance(signal_df, bkgd_df, np.array(bb))]

    # Always place at least two boundaries (len(ZZ) < 3), then keep splitting
    # while the latest split improved the significance by more than 5%.
    # The comparison is written multiplicatively so that a zero previous
    # significance cannot trigger a division by zero.
    while len(ZZ) < 3 or ZZ[-1] - ZZ[-2] > 0.05 * ZZ[-2]:
        new_boundary, _ = place_boundary(signal_df, bkgd_df, np.array(bb), num_bins, min_events)
        bb.append(new_boundary)
        bb.sort()
        ZZ.append(get_significance(signal_df, bkgd_df, np.array(bb)))
        # signal_scores / bkgd_scores are intentionally left untouched here:
        # the plotting and significance stages use them together with
        # `signal` / `bkgd`, exactly as when they are loaded from disk.

    bb = np.array(bb)
    ZZ = np.array(ZZ)

    print("Boundaries: ", bb)
    print("Significances: ", ZZ)

    np.save('arrays/boundaries.npy', bb)
    np.save('arrays/significances.npy', ZZ)
    print("[Step 5] completed successfully!")
    print("--------------------------------")
    if not (plot or sig):
        sys.exit(0)  # Nothing beyond step 5 was requested

# When step 5 did not run in this invocation, pick up previously saved
# category boundaries and significances. Both files must exist; otherwise
# both names are set to None so that the later stages can detect it.
if not step5:
    boundaries_path = 'arrays/boundaries.npy'
    significances_path = 'arrays/significances.npy'
    have_categories = os.path.exists(boundaries_path) and os.path.exists(significances_path)
    bb = np.load(boundaries_path) if have_categories else None
    significances = np.load(significances_path) if have_categories else None

# Plotting stage: needs the scores and the category boundaries in addition to
# the signal/background arrays.
if plot:
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot plot without required data. Run --all to generate all necessary files.")
        # sys.exit rather than the interactive-only builtin exit().
        sys.exit(1)

    # Create plots directory if it doesn't exist
    os.makedirs('plots', exist_ok=True)

    plot_scores(signal, bkgd, signal_scores, bkgd_scores, bb)
    plot_myy(signal, bkgd)
    print("Plotting completed successfully!")
    print("--------------------------------")
    if not sig:
        sys.exit(0)  # Plotting was the last requested step

# Significance stage: evaluates the significance for the stored boundaries,
# removes the temporary ROOT files under <output_dir>/results, then exits.
if sig:
    if signal_scores is None or bkgd_scores is None or bb is None:
        print("Error: Cannot calculate significance without required data. Run --all to generate all necessary files.")
        # sys.exit rather than the interactive-only builtin exit().
        sys.exit(1)

    signal_df, bkgd_df = load_datasets(signal, bkgd, signal_scores, bkgd_scores)
    Z = get_significance(signal_df, bkgd_df, bb)
    print('Final significance: {:.2f}'.format(Z))

    # Clean up temporary ROOT files created for significance calculation.
    # Respect OUTPUT_DIR if set; otherwise fall back to current directory.
    output_dir = os.environ.get('OUTPUT_DIR', os.getcwd())
    results_dir = os.path.join(output_dir, 'results')
    for root_name in ('signal.root', 'bkgd.root'):
        root_file = os.path.join(results_dir, root_name)
        try:
            os.remove(root_file)
        except FileNotFoundError:
            pass  # Nothing to clean up
        except OSError as err:
            # Cleanup is best-effort: report the problem but do not fail.
            print(f"Warning: could not remove {root_file}: {err}")

    print("Significance calculation completed successfully!")
    sys.exit(0)  # Exit after significance calculation