File size: 15,914 Bytes
cfcbbc8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 |
#!/usr/bin/env python3
"""
Generic particle analyzer for ROOT files
Usage: python analyze_particles.py <prefix> [filepath]
Examples:
python analyze_particles.py lep
python analyze_particles.py photon
python analyze_particles.py tau
python analyze_particles.py jet
"""
import sys
import os
import uproot
import numpy as np
import argparse
def get_available_prefixes(filepath):
    """Return the sorted list of particle branch prefixes in the ROOT file.

    A prefix is everything before the first underscore in a branch name
    of the 'mini;1' tree (e.g. 'photon' from 'photon_pt').
    """
    with uproot.open(filepath) as root_file:
        branch_names = list(root_file['mini;1'].keys())
        found = {name.split('_')[0] for name in branch_names if '_' in name}
        return sorted(found)
def analyze_particles(filepath, prefix, max_events=None):
    """Analyze all branches sharing a particle prefix in a ROOT file.

    Args:
        filepath: Path to a ROOT file containing a 'mini;1' tree.
        prefix: Particle branch prefix (e.g. 'lep', 'photon', 'tau', 'jet').
        max_events: If given, restrict the analysis to the first N events.
    """
    print(f"Analyzing {prefix} properties in: {filepath}")
    print("=" * 60)
    with uproot.open(filepath) as file:
        tree = file['mini;1']
        branches = list(tree.keys())
        # Find all branches with the given prefix
        prefix_branches = [b for b in branches if b.startswith(prefix + '_')]
        if not prefix_branches:
            print(f"No branches found with prefix '{prefix}_'")
            print(f"Available prefixes: {get_available_prefixes(filepath)}")
            return
        print(f"Found {len(prefix_branches)} branches with prefix '{prefix}_':")
        for branch in sorted(prefix_branches):
            print(f" - {branch}")
        print()
        # Load data, truncating each array to max_events when requested
        data = {}
        for branch in prefix_branches:
            try:
                array = tree[branch].array()
                data[branch] = array[:max_events] if max_events else array
            except Exception as e:
                print(f"Warning: Could not load {branch}: {e}")
                continue
        if not data:
            print("No data could be loaded!")
            return
        # All loaded arrays share the same event count; peek at any one
        n_events = len(next(iter(data.values())))
        print(f"Total events analyzed: {n_events}")
        print()
        # A '<prefix>_n' branch means a variable particle count per event
        multiplicity_branch = f"{prefix}_n"
        has_variable_multiplicity = multiplicity_branch in data
        # For photons, ID variables may instead be stored as fixed-size
        # (events x 2) arrays; detect that by probing the second column.
        has_fixed_multiplicity = False
        if prefix == 'photon':
            id_branches = [b for b in prefix_branches
                           if b in [f'{prefix}_isTightID', f'{prefix}_truthMatched', f'{prefix}_trigMatched']]
            if id_branches:
                sample_id = data[id_branches[0]]
                try:
                    # [:, 1] only succeeds on a rectangular 2D layout
                    sample_id[:, 1]
                    has_fixed_multiplicity = True
                except Exception:
                    # was a bare except; narrowed so Ctrl-C still interrupts
                    has_fixed_multiplicity = False
        if has_variable_multiplicity:
            analyze_multiplicity(data[multiplicity_branch], prefix)
        # Kinematic variables (pt/E/m are reported in GeV downstream)
        for var in ['pt', 'eta', 'phi', 'E', 'm']:
            branch_name = f"{prefix}_{var}"
            if branch_name in data:
                analyze_kinematic(data[branch_name], var.upper(), prefix,
                                  has_variable_multiplicity, has_fixed_multiplicity)
        # Identification variables
        for var in ['type', 'charge', 'isTightID', 'truthMatched', 'trigMatched']:
            branch_name = f"{prefix}_{var}"
            if branch_name in data:
                analyze_identification(data[branch_name], var, prefix,
                                       has_variable_multiplicity, has_fixed_multiplicity, prefix)
def analyze_multiplicity(mult_data, prefix):
    """Print the distribution of particle multiplicity across events."""
    print(f"{prefix.upper()} multiplicity distribution:")
    total_events = len(mult_data)
    distinct_counts, occurrences = np.unique(mult_data, return_counts=True)
    for multiplicity, n_events in zip(distinct_counts, occurrences):
        share = n_events / total_events * 100
        print(" {} {}(s): {:6d} events ({:.1f}%)".format(multiplicity, prefix, n_events, share))
    print()
def analyze_kinematic(var_data, var_name, prefix, has_variable_multiplicity=False, has_fixed_multiplicity=False):
    """Analyze and print statistics for one kinematic variable.

    Fixes a crash in the leading/subleading ratio section: when no event
    had two or more particles, ``subleading`` was ``None`` and
    ``len(subleading_gev)`` raised ``TypeError``.

    Args:
        var_data: Per-event values; jagged (variable multiplicity),
            2D rectangular (fixed multiplicity), or flat (one per event).
        var_name: Upper-cased variable label ('PT', 'ETA', 'PHI', 'E', 'M').
        prefix: Particle prefix used in printed headings.
        has_variable_multiplicity: True when events hold a variable number
            of particles.
        has_fixed_multiplicity: True when each event holds a fixed-size
            array of particles (e.g. exactly 2 photons).
    """
    print(f"{prefix.upper()} {var_name} analysis:")
    if has_variable_multiplicity:
        # Flatten the jagged structure, tracking leading/subleading per event
        all_values = []
        leading_values = []
        subleading_values = []
        for event_values in var_data:
            if len(event_values) > 0:
                # Sort descending only for pT so index 0 is the leading object
                if var_name == 'PT':
                    sorted_values = sorted(event_values, reverse=True)
                else:
                    sorted_values = event_values
                all_values.extend(sorted_values)
                if len(sorted_values) >= 1:
                    leading_values.append(sorted_values[0])
                if len(sorted_values) >= 2:
                    subleading_values.append(sorted_values[1])
        values = np.array(all_values)
        leading = np.array(leading_values) if leading_values else None
        subleading = np.array(subleading_values) if subleading_values else None
        print(f" Total number of {prefix}(s): {len(values)}")
        print(f" Events with ≥1 {prefix}: {len(leading) if leading is not None else 0}")
        print(f" Events with ≥2 {prefix}(s): {len(subleading) if subleading is not None else 0}")
    elif has_fixed_multiplicity:
        # Fixed-size layout: column 0 is leading, column 1 subleading.
        # (The original had an if/else on var_name here with identical
        # branches — collapsed.)
        values = np.array(var_data)
        leading = values[:, 0] if values.shape[1] > 0 else None
        subleading = values[:, 1] if values.shape[1] > 1 else None
        print(f" Total number of {prefix}(s): {len(values)}")
    else:
        # One scalar per event; no leading/subleading split applies
        values = np.array(var_data)
        leading = None
        subleading = None
        print(f" Total number of {prefix}(s): {len(values)}")
    if len(values) == 0:
        print(" No data available")
        return
    # MeV -> GeV conversion for energy/momentum-like variables
    if var_name in ['PT', 'E', 'M']:
        values_gev = values / 1000
        leading_gev = leading / 1000 if leading is not None else None
        subleading_gev = subleading / 1000 if subleading is not None else None
        unit = "GeV"
    else:
        values_gev = values
        leading_gev = leading
        subleading_gev = subleading
        unit = ""
    print(f" {var_name} statistics ({unit}) - All {prefix}(s):")
    print(" Mean: {:.3f}".format(np.mean(values_gev)))
    print(" Median: {:.3f}".format(np.median(values_gev)))
    print(" Min: {:.3f}".format(np.min(values_gev)))
    print(" Max: {:.3f}".format(np.max(values_gev)))
    print(" Std: {:.3f}".format(np.std(values_gev)))
    # Show leading particle stats
    if leading_gev is not None and len(leading_gev) > 0:
        print(f" {var_name} statistics ({unit}) - Leading {prefix}:")
        print(" Mean: {:.3f}".format(np.mean(leading_gev)))
        print(" Median: {:.3f}".format(np.median(leading_gev)))
        print(" Min: {:.3f}".format(np.min(leading_gev)))
        print(" Max: {:.3f}".format(np.max(leading_gev)))
        print(" Std: {:.3f}".format(np.std(leading_gev)))
    # Show subleading particle stats
    if subleading_gev is not None and len(subleading_gev) > 0:
        print(f" {var_name} statistics ({unit}) - Subleading {prefix}:")
        print(" Mean: {:.3f}".format(np.mean(subleading_gev)))
        print(" Median: {:.3f}".format(np.median(subleading_gev)))
        print(" Min: {:.3f}".format(np.min(subleading_gev)))
        print(" Max: {:.3f}".format(np.max(subleading_gev)))
        print(" Std: {:.3f}".format(np.std(subleading_gev)))
    # Ratio only makes sense for event-aligned leading/subleading arrays.
    # BUG FIX: also require subleading_gev is not None before len().
    if leading_gev is not None and subleading_gev is not None \
            and len(leading_gev) == len(subleading_gev):
        ratio = subleading_gev / leading_gev
        print(f" {var_name} ratio (Subleading/Leading):")
        print(" Mean: {:.3f}".format(np.mean(ratio)))
        print(" Median: {:.3f}".format(np.median(ratio)))
        print(" Min: {:.3f}".format(np.min(ratio)))
        print(" Max: {:.3f}".format(np.max(ratio)))
    print()
def analyze_identification(var_data, var_name, prefix, has_variable_multiplicity=False, has_fixed_multiplicity=False, particle_prefix=None):
    """Analyze identification variables"""
    print(f"{prefix.upper()} {var_name} analysis:")
    # Photons prefer the fixed-size (events x 2) path even when a
    # multiplicity branch is also present.
    use_fixed_multiplicity = has_fixed_multiplicity and (particle_prefix == 'photon' or not has_variable_multiplicity)
    if use_fixed_multiplicity:
        values = np.array(var_data)
        if values.shape[1] >= 2:
            first_col = values[:, 0]
            second_col = values[:, 1]
            print(f" Overall {var_name} distribution:")
            analyze_id_distribution(values.flatten(), var_name, prefix)
            print(f" Leading {prefix} {var_name} distribution:")
            analyze_id_distribution(first_col, var_name, prefix)
            print(f" Subleading {prefix} {var_name} distribution:")
            analyze_id_distribution(second_col, var_name, prefix)
            # Boolean flags get a leading-vs-subleading contingency table
            if var_name in ['isTightID', 'truthMatched', 'trigMatched']:
                analyze_correlation(first_col, second_col, var_name, prefix)
            print()
            return
    elif has_variable_multiplicity:
        # Concatenate per-event values into one flat array
        flat = [v for event_values in var_data if len(event_values) > 0
                for v in event_values]
        values = np.array(flat)
    else:
        # One value per event already
        values = np.array(var_data)
    analyze_id_distribution(values, var_name, prefix)
    print()
def analyze_id_distribution(values, var_name, prefix):
    """Route one identification variable to its specialized analyzer."""
    if var_name == 'type':
        analyze_particle_types(values, prefix)
        return
    if var_name in ['isTightID', 'truthMatched', 'trigMatched']:
        analyze_boolean_flags(values, var_name, prefix)
        return
    if var_name == 'charge':
        analyze_charges(values, prefix)
        return
    # Fallback: generic frequency table, truncated to the first 10 values
    distinct, tallies = np.unique(values, return_counts=True)
    total = len(values)
    print(f" Distribution:")
    for val, tally in zip(distinct[:10], tallies[:10]):
        print(" {}: {:6d} ({:.1f}%)".format(val, tally, tally / total * 100))
    if len(distinct) > 10:
        print(f" ... and {len(distinct) - 10} more values")
def analyze_correlation(leading, subleading, var_name, prefix):
    """Print a 2x2 contingency table of a boolean flag for the leading vs subleading particle."""
    print(f" {prefix.upper()} {var_name} correlation (Leading × Subleading):")
    # Four cells of the contingency table
    tt = np.sum((leading == True) & (subleading == True))
    tf = np.sum((leading == True) & (subleading == False))
    ft = np.sum((leading == False) & (subleading == True))
    ff = np.sum((leading == False) & (subleading == False))
    total = len(leading)
    print(" Both True: {:6d} ({:.1f}%)".format(tt, tt/total*100))
    print(" Leading True, Subleading False: {:6d} ({:.1f}%)".format(
        tf, tf/total*100))
    print(" Leading False, Subleading True: {:6d} ({:.1f}%)".format(
        ft, ft/total*100))
    print(" Both False: {:6d} ({:.1f}%)".format(ff, ff/total*100))
def analyze_particle_types(types, prefix):
    """Print the breakdown of PDG particle type codes."""
    # Map of the PDG codes this analysis expects to encounter
    pdg_names = {11: 'electron', 13: 'muon', 15: 'tau', 22: 'photon'}
    print(f" {prefix.upper()} type distribution:")
    total = len(types)
    codes, tallies = np.unique(types, return_counts=True)
    for code, tally in zip(codes, tallies):
        label = pdg_names.get(code, f'unknown({code})')
        print(" {}: {:6d} ({:.1f}%)".format(label, tally, tally / total * 100))
    print()
def analyze_boolean_flags(flags, flag_name, prefix):
    """Print True/False counts and percentages for a boolean flag array."""
    total = len(flags)
    n_true = np.sum(flags)
    n_false = total - n_true
    print(f" {prefix.upper()} {flag_name} distribution:")
    print(" True: {:6d} ({:.1f}%)".format(n_true, n_true / total * 100))
    print(" False: {:6d} ({:.1f}%)".format(n_false, n_false / total * 100))
    print()
def analyze_charges(charges, prefix):
    """Print the distribution of particle charge values."""
    print(f" {prefix.upper()} charge distribution:")
    total = len(charges)
    distinct, tallies = np.unique(charges, return_counts=True)
    for charge, tally in zip(distinct, tallies):
        print(" {}: {:6d} ({:.1f}%)".format(charge, tally, tally / total * 100))
    print()
def main():
    """Command-line entry point: parse arguments and run the requested analysis."""
    parser = argparse.ArgumentParser(description='Generic particle analyzer for ROOT files')
    parser.add_argument('--list-prefixes', action='store_true',
                        help='List all available prefixes in the file')
    parser.add_argument('prefix', nargs='?', help='Particle prefix (e.g., lep, photon, tau, jet)')
    parser.add_argument('filepath', nargs='?',
                        default="/global/cfs/projectdirs/atlas/eligd/llm_for_analysis_copy/data/mc_341081.ttH125_gamgam.GamGam.root",
                        help='Path to ROOT file')
    parser.add_argument('--max-events', type=int, help='Limit analysis to first N events')
    args = parser.parse_args()

    # --list-prefixes mode: just enumerate prefixes and stop
    if args.list_prefixes:
        if not os.path.exists(args.filepath):
            print(f"Error: File '{args.filepath}' does not exist!")
            return
        print("Available prefixes in the file:")
        for prefix in get_available_prefixes(args.filepath):
            print(f" - {prefix}")
        return

    # Guard clauses: a prefix and an existing file are both required
    if not args.prefix:
        print("Error: Please specify a particle prefix (e.g., lep, photon, tau, jet)")
        print("Use --list-prefixes to see available options")
        return
    if not os.path.exists(args.filepath):
        print(f"Error: File '{args.filepath}' does not exist!")
        return

    analyze_particles(args.filepath, args.prefix, args.max_events)


if __name__ == "__main__":
    main()
|