github-actions[bot] commited on
Commit
cc0720f
·
0 Parent(s):

Sync from GitHub f6dbbfb

Browse files
Files changed (47) hide show
  1. .github/workflows/sync-to-hf-space.yml +41 -0
  2. .gitignore +49 -0
  3. Dockerfile +15 -0
  4. README.md +79 -0
  5. app.py +772 -0
  6. config_files/config_hits_track_v4.yaml +146 -0
  7. scripts/evaluation.sh +26 -0
  8. scripts/train_clustering.sh +20 -0
  9. scripts/train_energy_pid.sh +24 -0
  10. src/data/config.py +218 -0
  11. src/data/fileio.py +101 -0
  12. src/data/preprocess.py +253 -0
  13. src/data/tools.py +191 -0
  14. src/dataset/dataclasses.py +126 -0
  15. src/dataset/dataset.py +287 -0
  16. src/dataset/functions_data.py +26 -0
  17. src/dataset/functions_graph.py +105 -0
  18. src/dataset/functions_particles.py +122 -0
  19. src/inference.py +735 -0
  20. src/layers/clustering.py +99 -0
  21. src/layers/inference_oc.py +251 -0
  22. src/layers/object_cond.py +609 -0
  23. src/layers/regression/loss_regression.py +59 -0
  24. src/layers/shower_dataframe.py +441 -0
  25. src/layers/shower_matching.py +127 -0
  26. src/layers/tools_for_regression.py +131 -0
  27. src/layers/utils_training.py +166 -0
  28. src/models/E_correction_module.py +43 -0
  29. src/models/Gatr_pf_e_noise.py +332 -0
  30. src/models/energy_correction_NN.py +299 -0
  31. src/models/energy_correction_charged.py +116 -0
  32. src/models/energy_correction_neutral.py +157 -0
  33. src/models/wrapper/example_mode_gatr_noise.py +21 -0
  34. src/train_lightning1.py +128 -0
  35. src/utils/callbacks.py +30 -0
  36. src/utils/import_tools.py +8 -0
  37. src/utils/inference/pandas_helpers.py +36 -0
  38. src/utils/load_pretrained_models.py +32 -0
  39. src/utils/logger_wandb.py +33 -0
  40. src/utils/parser_args.py +246 -0
  41. src/utils/pid_conversion.py +7 -0
  42. src/utils/post_clustering_features.py +82 -0
  43. src/utils/train_utils.py +281 -0
  44. tests/test_cpu_attention.py +99 -0
  45. tests/test_csv_priority.py +162 -0
  46. tests/test_energy_correction_no_matches.py +90 -0
  47. tests/test_pfo_links.py +231 -0
.github/workflows/sync-to-hf-space.yml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Mirror this repo to the Hugging Face Space as a single squashed commit on
# every push to main (history is discarded so the token never appears in it).
name: Sync to Hugging Face Space

on:
  push:
    branches:
      - main

permissions:
  contents: read

jobs:
  sync-to-hf:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repo (no history)
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          lfs: false

      - name: Push to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Configure git
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "github-actions[bot]"

          # Use a credential helper to avoid embedding the token in the URL
          git config --global credential.helper store
          printf 'https://user:%s@huggingface.co\n' "$HF_TOKEN" > ~/.git-credentials

          # Create a fresh repo with a single commit (no history)
          cd $GITHUB_WORKSPACE
          rm -rf .git
          git init --initial-branch main
          git add .
          git commit -m "Sync from GitHub ${GITHUB_SHA::7}"

          # Force-push the single commit to HF Space
          git push --force https://huggingface.co/spaces/gregorkrzmanc/HitPF_demo main
.gitignore ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ .eggs/
10
+
11
+ # Jupyter
12
+ .ipynb_checkpoints/
13
+ *.ipynb
14
+
15
+ # Weights & Biases
16
+ wandb/
17
+
18
+ # Model checkpoints and outputs
19
+ *.pt
20
+ *.pth
21
+ showers_df_evaluation/
22
+
23
+ # Data files
24
+ *.root
25
+ *.h5
26
+ *.hdf5
27
+ *.pkl
28
+ *.pickle
29
+ *.npy
30
+ *.npz
31
+
32
+ # Demo files are downloaded at runtime from Hugging Face Hub
33
+ model_clustering.ckpt
34
+ model_e_pid.ckpt
35
+ test_data.parquet
36
+
37
+ # Logs
38
+ *.log
39
+ logs/
40
+
41
+ # Editors
42
+ .vscode/
43
+ .idea/
44
+ *.swp
45
+ *.swo
46
+ *~
47
+
48
+ # OS
49
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: project-provided GATr environment (see README "Dependencies").
FROM dologarcia/gatr:v9

WORKDIR /app

# Extra packages needed on top of the base image for the live demo UI.
RUN pip install --no-cache-dir \
    densitypeakclustering \
    lightning-utilities \
    torchmetrics \
    gradio \
    plotly

COPY . .
EXPOSE 7860
# Bind Gradio to all interfaces so the container port is reachable from outside.
ENV GRADIO_SERVER_NAME="0.0.0.0"
CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: HitPF
3
+ emoji: ⚛️
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
10
+
11
+ # HitPF
12
+
13
+ **HitPF** is a GATr-based particle-flow reconstruction model for the CLD detector at the FCC-ee.
14
+ It performs two sequential tasks:
15
+
16
+ 1. **Clustering** — groups calorimeter hits and tracks into particle-flow objects using an object-condensation loss.
17
+ 2. **Property regression** — regresses an energy-correction factor and predicts a PID class for each reconstructed cluster using a GNN-based model.
18
+
19
+ ---
20
+
21
+ ## Dependencies
22
+
23
+ The code can be used with this container:
24
+ ```docker://dologarcia/gatr:v9```
25
+
26
+ For the live demo, gradio and plotly also need to be installed:
27
+ ```
28
+ pip install gradio plotly
29
+ ```
30
+
31
+ ---
32
+
33
+ ## Dataset
34
+
35
+ Input data is stored as `.parquet` files; each file stores 100 events. A sample of the dataset in ML-ready format is available [on Zenodo](https://zenodo.org/records/18749298). The full dataset is hosted on CERN's EOS space.
36
+
37
+
38
+ ---
39
+
40
+ ## Training
41
+
42
+ ### Step 1 — Clustering
43
+
44
+ ```bash
45
+ bash scripts/train_clustering.sh
46
+ ```
47
+
48
+
49
+ ### Step 2 — Energy correction
50
+
51
+
52
+ ```bash
53
+ bash scripts/train_energy_pid.sh
54
+ ```
55
+
56
+ ### Validation
57
+
58
+ ```bash
59
+ bash scripts/evaluation.sh
60
+ ```
61
+
62
+ ---
63
+ ### Live demo (work in progress)
64
+
65
+ ```bash
66
+ python -m app
67
+ ```
68
+
69
+ ## Citation
70
+
71
+ If you use this code, please cite:
72
+
73
+ ```bibtex
74
+ @software{hitpf2026,
75
+ title = {End-to-end event reconstruction for precision physics at future colliders code},
76
+ year = {2026},
77
+ url = {https://github.com/mgarciam/HitPF}
78
+ }
79
+ ```
app.py ADDED
@@ -0,0 +1,772 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Gradio UI for single-event MLPF inference.
4
+
5
+ Launch with:
6
+ python app.py [--device cpu]
7
+
8
+ The UI lets you:
9
+ 1. Load an event from a parquet file (pick file + event index), **or**
10
+ paste hit / track / particle data in CSV format.
11
+ 2. (Optionally) load pre-trained model checkpoints.
12
+ 3. Run inference → view predicted particles and the hit→cluster mapping.
13
+ """
14
+
15
+ import argparse
16
+ import os
17
+ import shutil
18
+ import traceback
19
+
20
+ import gradio as gr
21
+ import pandas as pd
22
+ import numpy as np
23
+ import plotly.graph_objects as go
24
+ from huggingface_hub import hf_hub_download
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Auto-download demo files from Hugging Face Hub if they are not present
28
+ # ---------------------------------------------------------------------------
29
+
30
+ _HF_REPO_ID = "gregorkrzmanc/hitpf_demo_files"
31
+ _DEMO_FILES = [
32
+ "model_clustering.ckpt",
33
+ "model_e_pid.ckpt",
34
+ "test_data.parquet",
35
+ ]
36
+
37
+
38
def _ensure_demo_files(dest_dir: str = ".") -> None:
    """Fetch any demo file missing from *dest_dir* from the Hugging Face Hub.

    Best-effort: a failed download is reported on stdout and skipped, so the
    app can still start (e.g. when offline) with whatever files are present.
    """
    for fname in _DEMO_FILES:
        dest = os.path.join(dest_dir, fname)
        if os.path.isfile(dest):
            continue  # already present — nothing to do
        try:
            print(f"Downloading {fname} from HF Hub ({_HF_REPO_ID}) …")
            # hf_hub_download returns a path inside the HF cache; copy the
            # file next to the app so the rest of the code can use plain paths.
            downloaded = hf_hub_download(
                repo_id=_HF_REPO_ID,
                filename=fname,
                repo_type="dataset",
            )
            shutil.copy(downloaded, dest)
            print(f" → saved to {dest}")
        except Exception as exc:
            print(f" ⚠️ Could not download {fname}: {exc}")
54
+
55
+
56
+ _ensure_demo_files()
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Global state – filled lazily
60
+ # ---------------------------------------------------------------------------
61
+ _MODEL = None
62
+ _ARGS = None
63
+ _DEVICE = "cpu"
64
+
65
+
66
+ def _set_device(device: str):
67
+ global _DEVICE
68
+ _DEVICE = device
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Model loading
73
+ # ---------------------------------------------------------------------------
74
+
75
def load_model_ui(clustering_ckpt: str, energy_pid_ckpt: str, device: str):
    """Load model from checkpoint paths (called by the UI button).

    Returns a Markdown status string: a warning when the clustering checkpoint
    path is invalid, a success message otherwise, or the full traceback on
    failure. Sets the module globals ``_MODEL``, ``_ARGS`` and ``_DEVICE``.
    """
    global _MODEL, _ARGS, _DEVICE
    _DEVICE = device or "cpu"

    # The clustering checkpoint is mandatory; bail out early if it's missing.
    if not clustering_ckpt or not os.path.isfile(clustering_ckpt):
        return "⚠️ Please provide a valid path to the clustering checkpoint."

    # The energy/PID checkpoint is optional — silently ignored when absent.
    energy_pid = None
    if energy_pid_ckpt and os.path.isfile(energy_pid_ckpt):
        energy_pid = energy_pid_ckpt

    try:
        from src.inference import load_model
        _MODEL, _ARGS = load_model(
            clustering_ckpt=clustering_ckpt,
            energy_pid_ckpt=energy_pid,
            device=_DEVICE,
        )
    except Exception:
        return f"❌ Failed to load model:\n```\n{traceback.format_exc()}\n```"

    suffix = (
        " (clustering + energy/PID correction)"
        if energy_pid
        else " (clustering only — no energy/PID correction)"
    )
    return f"✅ Model loaded on **{_DEVICE}**" + suffix
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Event loading helpers
104
+ # ---------------------------------------------------------------------------
105
+
106
+ def _count_events_in_parquet(parquet_path: str) -> str:
107
+ """Return a short info string about the parquet file."""
108
+ if not parquet_path or not os.path.isfile(parquet_path):
109
+ return "No file selected"
110
+ try:
111
+ from src.inference import load_event_from_parquet
112
+ from src.data.fileio import _read_parquet
113
+ table = _read_parquet(parquet_path)
114
+ n = len(table["X_track"])
115
+ return f"File has **{n}** events (indices 0–{n-1})"
116
+ except Exception as e:
117
+ return f"Error reading file: {e}"
118
+
119
+
120
def _load_event_into_csv(parquet_path: str, event_index: int):
    """Load an event from a parquet file and return CSV strings for the text fields.

    Returns a 6-tuple ``(hits_csv, tracks_csv, particles_csv, pandora_csv,
    pfo_links_csv, status_markdown)``. On any failure all five CSV strings are
    empty and the status carries the error message.
    """
    if not parquet_path or not os.path.isfile(parquet_path):
        return "", "", "", "", "", "⚠️ Please provide a valid parquet file path."
    try:
        from src.inference import load_event_from_parquet
        event = load_event_from_parquet(parquet_path, int(event_index))

        hits_arr = np.asarray(event.get("X_hit", []))
        tracks_arr = np.asarray(event.get("X_track", []))
        particles_arr = np.asarray(event.get("X_gen", []))
        pandora_arr = np.asarray(event.get("X_pandora", []))

        # 2-D array → one comma-separated row per entity; anything else → "".
        def _arr_to_csv(arr):
            if arr.ndim != 2:
                return ""
            return "\n".join(",".join(str(v) for v in row) for row in arr)

        # 1-D array → single comma-separated line of ints; empty → "".
        def _1d_to_csv(arr):
            if len(arr) == 0:
                return ""
            return ",".join(str(int(v)) for v in arr)

        pfo_calohit = np.asarray(event.get("pfo_calohit", []), dtype=np.int64)
        pfo_track = np.asarray(event.get("pfo_track", []), dtype=np.int64)
        calohit_csv = _1d_to_csv(pfo_calohit)
        track_csv = _1d_to_csv(pfo_track)
        # PFO-links text field: line 1 = per-calohit PFO ids, line 2 = per-track
        # ids. A leading "\n" keeps track ids on line 2 when there are no
        # calohit ids.
        if calohit_csv and track_csv:
            pfo_links_csv = calohit_csv + "\n" + track_csv
        elif calohit_csv:
            pfo_links_csv = calohit_csv
        elif track_csv:
            pfo_links_csv = "\n" + track_csv
        else:
            pfo_links_csv = ""

        return (
            _arr_to_csv(hits_arr),
            _arr_to_csv(tracks_arr),
            _arr_to_csv(particles_arr),
            _arr_to_csv(pandora_arr),
            pfo_links_csv,
            f"✅ Loaded event **{int(event_index)}**: "
            f"{hits_arr.shape[0] if hits_arr.ndim == 2 else 0} hits, "
            f"{tracks_arr.shape[0] if tracks_arr.ndim == 2 else 0} tracks, "
            f"{particles_arr.shape[0] if particles_arr.ndim == 2 else 0} MC particles, "
            f"{pandora_arr.shape[0] if pandora_arr.ndim == 2 else 0} Pandora PFOs",
        )
    except Exception as e:
        return "", "", "", "", "", f"❌ Error loading event: {e}"
170
+
171
+
172
def _build_cluster_plot(hit_cluster_df: pd.DataFrame) -> go.Figure:
    """Build an interactive 3D scatter plot of hits colored by cluster ID.

    Expects columns ``x, y, z, hit_energy, hit_type, hit_index, cluster_id``.
    One Scatter3d trace is added per cluster so the legend can toggle them;
    marker size encodes the min-max-normalized hit energy (range 3–15 px).
    Returns a placeholder figure when no (valid) hit data is available.
    """
    if hit_cluster_df.empty:
        fig = go.Figure()
        fig.update_layout(title="No hit data available", height=600)
        return fig

    df = hit_cluster_df.copy()

    # Coerce coordinates/energy to numeric and drop rows with NaN/Inf values —
    # plotly would otherwise silently misrender or drop whole traces.
    for col in ("x", "y", "z", "hit_energy"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=["x", "y", "z", "hit_energy"])
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No valid hit data (all NaN/Inf)", height=600)
        return fig

    # Normalize hit energies for marker sizes
    energies = df["hit_energy"].values.astype(float)
    e_min, e_max = float(energies.min()), float(energies.max())
    if e_max > e_min:
        norm_e = (energies - e_min) / (e_max - e_min)
    else:
        norm_e = np.ones_like(energies) * 0.5  # midpoint when all equal
    marker_sizes = 3 + norm_e * 12  # min size 3, max size 15

    # Build per-hit hover text (avoids mixed-type customdata serialization issues)
    df["_hover"] = (
        "<b>" + df["hit_type"].astype(str) + "</b> hit #" + df["hit_index"].astype(int).astype(str) + "<br>"
        + "Cluster: " + df["cluster_id"].astype(int).astype(str) + "<br>"
        + "Energy: " + df["hit_energy"].map(lambda v: f"{v:.4f}") + "<br>"
        + "x: " + df["x"].map(lambda v: f"{v:.2f}")
        + ", y: " + df["y"].map(lambda v: f"{v:.2f}")
        + ", z: " + df["z"].map(lambda v: f"{v:.2f}")
    )

    cluster_ids = df["cluster_id"].values
    unique_clusters = sorted(set(int(c) for c in cluster_ids))

    fig = go.Figure()
    # One trace per cluster; cluster id 0 is rendered as the "noise" class.
    for cid in unique_clusters:
        mask = cluster_ids == cid
        subset = df[mask]
        sizes = marker_sizes[mask].tolist()
        label = "noise" if cid == 0 else f"cluster {cid}"
        fig.add_trace(go.Scatter3d(
            x=subset["x"].tolist(),
            y=subset["y"].tolist(),
            z=subset["z"].tolist(),
            mode="markers",
            name=label,
            marker=dict(size=sizes, opacity=0.8),
            hovertext=subset["_hover"].tolist(),
            hoverinfo="text",
        ))

    fig.update_layout(
        title="Hit → Cluster 3D Map",
        scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
        legend_title="Cluster",
        height=600,
        margin=dict(l=0, r=0, t=40, b=0),
    )
    return fig
237
+
238
+
239
def _build_pandora_cluster_plot(hit_cluster_df: pd.DataFrame) -> go.Figure:
    """Build an interactive 3D scatter plot of hits colored by Pandora cluster ID.

    Same layout as :func:`_build_cluster_plot` but grouped by the
    ``pandora_cluster_id`` column; id ``-1`` is shown as "unassigned".
    Returns a placeholder figure when the column is absent or no valid rows remain.
    """
    if hit_cluster_df.empty or "pandora_cluster_id" not in hit_cluster_df.columns:
        fig = go.Figure()
        fig.update_layout(title="No Pandora cluster data available", height=600)
        return fig

    df = hit_cluster_df.copy()

    # Coerce coordinates/energy to numeric and drop NaN/Inf rows. Note: hits
    # with pandora_cluster_id == -1 are NOT filtered out — they get their own
    # "unassigned" trace below.
    for col in ("x", "y", "z", "hit_energy"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=["x", "y", "z", "hit_energy"])
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No valid hit data for Pandora plot (all NaN/Inf)", height=600)
        return fig

    # Normalize hit energies for marker sizes (3–15 px)
    energies = df["hit_energy"].values.astype(float)
    e_min, e_max = float(energies.min()), float(energies.max())
    if e_max > e_min:
        norm_e = (energies - e_min) / (e_max - e_min)
    else:
        norm_e = np.ones_like(energies) * 0.5
    marker_sizes = 3 + norm_e * 12

    # Build per-hit hover text
    df["_hover"] = (
        "<b>" + df["hit_type"].astype(str) + "</b> hit #" + df["hit_index"].astype(int).astype(str) + "<br>"
        + "Pandora cluster: " + df["pandora_cluster_id"].astype(int).astype(str) + "<br>"
        + "Energy: " + df["hit_energy"].map(lambda v: f"{v:.4f}") + "<br>"
        + "x: " + df["x"].map(lambda v: f"{v:.2f}")
        + ", y: " + df["y"].map(lambda v: f"{v:.2f}")
        + ", z: " + df["z"].map(lambda v: f"{v:.2f}")
    )

    pandora_ids = df["pandora_cluster_id"].values
    unique_clusters = sorted(set(int(c) for c in pandora_ids))

    fig = go.Figure()
    # One trace per Pandora PFO; -1 means the hit was not assigned by Pandora.
    for cid in unique_clusters:
        mask = pandora_ids == cid
        subset = df[mask]
        sizes = marker_sizes[mask].tolist()
        label = "unassigned" if cid == -1 else f"PFO {cid}"
        fig.add_trace(go.Scatter3d(
            x=subset["x"].tolist(),
            y=subset["y"].tolist(),
            z=subset["z"].tolist(),
            mode="markers",
            name=label,
            marker=dict(size=sizes, opacity=0.8),
            hovertext=subset["_hover"].tolist(),
            hoverinfo="text",
        ))

    fig.update_layout(
        title="Hit → Pandora Cluster 3D Map",
        scene=dict(xaxis_title="x", yaxis_title="y", zaxis_title="z"),
        legend_title="Pandora PFO",
        height=600,
        margin=dict(l=0, r=0, t=40, b=0),
    )
    return fig
304
+
305
+
306
def _build_clustering_space_plot(hit_cluster_df: pd.DataFrame) -> go.Figure:
    """Build an interactive 3D scatter plot of hits in the learned clustering space.

    Uses the model-regressed coordinates ``cluster_x/cluster_y/cluster_z``
    instead of detector positions; grouping/colors follow ``cluster_id`` as in
    :func:`_build_cluster_plot`. Returns a placeholder figure when the
    clustering-space columns are absent or no valid rows remain.
    """
    if hit_cluster_df.empty or "cluster_x" not in hit_cluster_df.columns:
        fig = go.Figure()
        fig.update_layout(title="No clustering-space data available", height=600)
        return fig

    df = hit_cluster_df.copy()

    # Coerce to numeric and drop rows with NaN/Inf clustering-space coordinates.
    for col in ("cluster_x", "cluster_y", "cluster_z", "hit_energy"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.replace([np.inf, -np.inf], np.nan).dropna(
        subset=["cluster_x", "cluster_y", "cluster_z", "hit_energy"]
    )
    if df.empty:
        fig = go.Figure()
        fig.update_layout(title="No valid clustering-space data (all NaN/Inf)", height=600)
        return fig

    # Normalize hit energies for marker sizes (3–15 px)
    energies = df["hit_energy"].values.astype(float)
    e_min, e_max = float(energies.min()), float(energies.max())
    if e_max > e_min:
        norm_e = (energies - e_min) / (e_max - e_min)
    else:
        norm_e = np.ones_like(energies) * 0.5
    marker_sizes = 3 + norm_e * 12

    # Build per-hit hover text
    df["_hover"] = (
        "<b>" + df["hit_type"].astype(str) + "</b> hit #" + df["hit_index"].astype(int).astype(str) + "<br>"
        + "Cluster: " + df["cluster_id"].astype(int).astype(str) + "<br>"
        + "Energy: " + df["hit_energy"].map(lambda v: f"{v:.4f}") + "<br>"
        + "cluster_x: " + df["cluster_x"].map(lambda v: f"{v:.4f}")
        + ", cluster_y: " + df["cluster_y"].map(lambda v: f"{v:.4f}")
        + ", cluster_z: " + df["cluster_z"].map(lambda v: f"{v:.4f}")
    )

    cluster_ids = df["cluster_id"].values
    unique_clusters = sorted(set(int(c) for c in cluster_ids))

    fig = go.Figure()
    # One trace per cluster; cluster id 0 is rendered as the "noise" class.
    for cid in unique_clusters:
        mask = cluster_ids == cid
        subset = df[mask]
        sizes = marker_sizes[mask].tolist()
        label = "noise" if cid == 0 else f"cluster {cid}"
        fig.add_trace(go.Scatter3d(
            x=subset["cluster_x"].tolist(),
            y=subset["cluster_y"].tolist(),
            z=subset["cluster_z"].tolist(),
            mode="markers",
            name=label,
            marker=dict(size=sizes, opacity=0.8),
            hovertext=subset["_hover"].tolist(),
            hoverinfo="text",
        ))

    fig.update_layout(
        title="Clustering Space 3D Map (GATr regressed coordinates)",
        scene=dict(
            xaxis_title="cluster_x",
            yaxis_title="cluster_y",
            zaxis_title="cluster_z",
        ),
        legend_title="Cluster",
        height=600,
        margin=dict(l=0, r=0, t=40, b=0),
    )
    return fig
377
+
378
+
379
+ # ---------------------------------------------------------------------------
380
+ # Main inference entry point for the UI
381
+ # ---------------------------------------------------------------------------
382
+
383
+ def _compute_inv_mass(df, e_col, px_col, py_col, pz_col):
384
+ """Compute the invariant mass of a system of particles in GeV.
385
+
386
+ Returns the scalar invariant mass m = sqrt(max((ΣE)²−(Σpx)²−(Σpy)²−(Σpz)², 0)),
387
+ or *None* when *df* is empty or the required columns are absent.
388
+ """
389
+ if df.empty:
390
+ return None
391
+ for col in (e_col, px_col, py_col, pz_col):
392
+ if col not in df.columns:
393
+ return None
394
+ E = float(df[e_col].sum())
395
+ px = float(df[px_col].sum())
396
+ py = float(df[py_col].sum())
397
+ pz = float(df[pz_col].sum())
398
+ m2 = E ** 2 - px ** 2 - py ** 2 - pz ** 2
399
+ return float(np.sqrt(max(m2, 0.0)))
400
+
401
+
402
+ def _fmt_mass(val):
403
+ """Format an invariant-mass value (float or None) as a GeV string."""
404
+ return f"{val:.4f} GeV" if val is not None else "N/A"
405
+
406
+
407
def run_inference_ui(
    parquet_path: str,
    event_index: int,
    csv_hits: str,
    csv_tracks: str,
    csv_particles: str,
    csv_pandora: str,
    csv_pfo_links: str = "",
):
    """Run inference on a single event, return predicted particles, 3D plots, MC particles and Pandora particles.

    Pasted CSV hits take precedence over the parquet path when both are given.
    Every return path produces the same 7-tuple shape so each Gradio output
    component always receives something renderable; errors are surfaced inside
    the first DataFrame rather than raised.

    Returns
    -------
    particles_df : pandas.DataFrame
    cluster_fig : plotly.graph_objects.Figure
    clustering_space_fig : plotly.graph_objects.Figure
    pandora_cluster_fig : plotly.graph_objects.Figure
    mc_particles_df : pandas.DataFrame
    pandora_particles_df : pandas.DataFrame
    inv_mass_summary : str
    """
    global _MODEL, _ARGS, _DEVICE

    empty_fig = go.Figure()

    if _MODEL is None:
        return (
            pd.DataFrame({"error": ["Model not loaded. Please load a model first."]}),
            empty_fig,
            empty_fig,
            empty_fig,
            pd.DataFrame(),
            pd.DataFrame(),
            "",
        )

    try:
        from src.inference import load_event_from_parquet, run_single_event_inference

        # Decide input source: CSV text wins over the parquet path.
        use_parquet = parquet_path and os.path.isfile(parquet_path)
        use_csv = bool(csv_hits and csv_hits.strip())

        if not use_parquet and not use_csv:
            return (
                pd.DataFrame({"error": ["Provide a parquet file or paste CSV hit data."]}),
                empty_fig,
                empty_fig,
                empty_fig,
                pd.DataFrame(),
                pd.DataFrame(),
                "",
            )

        if use_csv:
            event = _parse_csv_event(csv_hits, csv_tracks, csv_particles, csv_pandora, csv_pfo_links)
        elif use_parquet:
            event = load_event_from_parquet(parquet_path, int(event_index))

        particles_df, hit_cluster_df, mc_particles_df, pandora_particles_df = run_single_event_inference(
            event, _MODEL, _ARGS, device=_DEVICE,
        )
        if particles_df.empty:
            # Keep the table component populated with an explanation.
            particles_df = pd.DataFrame({"info": ["Event produced no clusters (empty graph)."]})

        cluster_fig = _build_cluster_plot(hit_cluster_df)
        clustering_space_fig = _build_clustering_space_plot(hit_cluster_df)
        pandora_cluster_fig = _build_pandora_cluster_plot(hit_cluster_df)

        # Compute invariant masses [GeV] of the summed 4-vectors per algorithm.
        m_true = _compute_inv_mass(mc_particles_df, "energy", "px", "py", "pz")
        # HitPF uses corrected_energy when available, otherwise energy_sum_hits
        hitpf_e_col = "corrected_energy" if "corrected_energy" in particles_df.columns else "energy_sum_hits"
        m_reco_hitpf = _compute_inv_mass(particles_df, hitpf_e_col, "px", "py", "pz")
        m_reco_pandora = _compute_inv_mass(pandora_particles_df, "energy", "px", "py", "pz")

        # Markdown comparison table shown under the plots.
        inv_mass_summary = (
            f"**Invariant mass (sum of all particle 4-vectors)**\n\n"
            f"| Algorithm | m [GeV] |\n"
            f"|---|---|\n"
            f"| m_true (MC truth) | {_fmt_mass(m_true)} |\n"
            f"| m_reco (HitPF) | {_fmt_mass(m_reco_hitpf)} |\n"
            f"| m_reco (Pandora) | {_fmt_mass(m_reco_pandora)} |"
        )

        return particles_df, cluster_fig, clustering_space_fig, pandora_cluster_fig, mc_particles_df, pandora_particles_df, inv_mass_summary

    except Exception:
        # Show the full traceback in the particles table instead of crashing the UI.
        err = traceback.format_exc()
        return (
            pd.DataFrame({"error": [err]}),
            empty_fig,
            empty_fig,
            empty_fig,
            pd.DataFrame(),
            pd.DataFrame(),
            "",
        )
505
+
506
+
507
+ def _parse_csv_event(csv_hits: str, csv_tracks: str, csv_particles: str, csv_pandora: str = "", csv_pfo_links: str = ""):
508
+ """Parse user-provided CSV text into the dict-of-arrays format expected by
509
+ ``create_graph``.
510
+
511
+ Expected CSV columns for hits (X_hit) — 11 columns:
512
+ 0: hit_x — hit position x [mm]
513
+ 1: hit_y — hit position y [mm]
514
+ 2: hit_z — hit position z [mm]
515
+ 3: hit_px — hit momentum px [GeV] (0 for calo hits)
516
+ 4: hit_py — hit momentum py [GeV] (0 for calo hits)
517
+ 5: hit_energy — hit energy deposit [GeV]
518
+ 6: hit_x_calo — hit position x at calorimeter surface [mm] (used as 3D position by the model)
519
+ 7: hit_y_calo — hit position y at calorimeter surface [mm]
520
+ 8: hit_z_calo — hit position z at calorimeter surface [mm]
521
+ 9: (unused) — reserved column (set to 0)
522
+ 10: hit_type — hit sub-detector type: 1 = ECAL, 2 = HCAL, 3 = muon system
523
+
524
+ Expected CSV columns for tracks (X_track) — 25 columns (padded with
525
+ zeros if fewer are provided; minimum 17):
526
+ 0: elemtype — element type (always 1 for tracks)
527
+ 1–4: (unused) — reserved columns (set to 0)
528
+ 5: p — track momentum magnitude |p| [GeV]
529
+ 6: px_IP — track px at interaction point [GeV]
530
+ 7: py_IP — track py at interaction point [GeV]
531
+ 8: pz_IP — track pz at interaction point [GeV]
532
+ 9–11: (unused) — reserved columns (set to 0)
533
+ 12: ref_x_calo — track reference-point x at calorimeter [mm]
534
+ 13: ref_y_calo — track reference-point y at calorimeter [mm]
535
+ 14: ref_z_calo — track reference-point z at calorimeter [mm]
536
+ 15: chi2 — track-fit chi-squared
537
+ 16: ndf — track-fit number of degrees of freedom
538
+ 17–21: (unused) — reserved columns (set to 0)
539
+ 22: px_calo — track momentum x component at calorimeter [GeV]
540
+ 23: py_calo — track momentum y component at calorimeter [GeV]
541
+ 24: pz_calo — track momentum z component at calorimeter [GeV]
542
+
543
+ Expected CSV columns for particles / MC truth (X_gen) — 18 columns:
544
+ 0: pid — PDG particle ID (e.g. 211, 22, 11, 13)
545
+ 1: gen_status — generator status code
546
+ 2: isDecayedInCalo — 1 if decayed in calorimeter, else 0
547
+ 3: isDecayedInTracker — 1 if decayed in tracker, else 0
548
+ 4: theta — polar angle [rad]
549
+ 5: phi — azimuthal angle [rad]
550
+ 6: (unused) — reserved (set to 0)
551
+ 7: (unused) — reserved (set to 0)
552
+ 8: energy — true particle energy [GeV]
553
+ 9: (unused) — reserved (set to 0)
554
+ 10: mass — particle mass [GeV]
555
+ 11: momentum — momentum magnitude |p| [GeV]
556
+ 12: px — momentum x component [GeV]
557
+ 13: py — momentum y component [GeV]
558
+ 14: pz — momentum z component [GeV]
559
+ 15: vx — production vertex x [mm]
560
+ 16: vy — production vertex y [mm]
561
+ 17: vz — production vertex z [mm]
562
+
563
+ PFO links (csv_pfo_links) — two lines of comma-separated integers:
564
+ Line 1: pfo_calohit — one PFO index per calorimeter hit (-1 = unassigned)
565
+ Line 2: pfo_track — one PFO index per track (-1 = unassigned)
566
+ """
567
+ import io
568
+ import awkward as ak
569
+
570
+ def _read(text, min_cols=1):
571
+ if not text or not text.strip():
572
+ return np.zeros((0, min_cols), dtype=np.float64)
573
+ df = pd.read_csv(io.StringIO(text), header=None)
574
+ return df.values.astype(np.float64)
575
+
576
+ hits_arr = _read(csv_hits, 11)
577
+ tracks_arr = _read(csv_tracks, 25)
578
+ particles_arr = _read(csv_particles, 18)
579
+ pandora_arr = _read(csv_pandora, 9)
580
+
581
+ # Pad tracks to 25 columns if needed
582
+ if tracks_arr.shape[1] < 25 and tracks_arr.shape[0] > 0:
583
+ pad = np.zeros((tracks_arr.shape[0], 25 - tracks_arr.shape[1]))
584
+ tracks_arr = np.concatenate([tracks_arr, pad], axis=1)
585
+
586
+ # Build ygen_hit / ygen_track (particle link per hit — use -1 for unknown)
587
+ ygen_hit = np.full(len(hits_arr), -1, dtype=np.int64)
588
+ ygen_track = np.full(len(tracks_arr), -1, dtype=np.int64)
589
+
590
+ # Parse PFO link arrays (hit → Pandora cluster mapping)
591
+ pfo_calohit = np.array([], dtype=np.int64)
592
+ pfo_track = np.array([], dtype=np.int64)
593
+ if csv_pfo_links and csv_pfo_links.strip():
594
+ lines = csv_pfo_links.strip().split("\n")
595
+ if len(lines) >= 1 and lines[0].strip():
596
+ pfo_calohit = np.array(
597
+ [int(v) for v in lines[0].strip().split(",")], dtype=np.int64
598
+ )
599
+ if len(lines) >= 2 and lines[1].strip():
600
+ pfo_track = np.array(
601
+ [int(v) for v in lines[1].strip().split(",")], dtype=np.int64
602
+ )
603
+
604
+ event = {
605
+ "X_hit": hits_arr,
606
+ "X_track": tracks_arr,
607
+ "X_gen": particles_arr,
608
+ "X_pandora": pandora_arr,
609
+ "ygen_hit": ygen_hit,
610
+ "ygen_track": ygen_track,
611
+ "pfo_calohit": pfo_calohit,
612
+ "pfo_track": pfo_track,
613
+ }
614
+ return event
615
+
616
+
617
+ # ---------------------------------------------------------------------------
618
+ # Build the Gradio interface
619
+ # ---------------------------------------------------------------------------
620
+
621
def build_app():
    """Construct and return the Gradio Blocks UI for single-event MLPF inference.

    The page has three accordion sections:
      1. checkpoint loading (wired to `load_model_ui`),
      2. event selection — either from a parquet file (`_count_events_in_parquet`,
         `_load_event_into_csv`) or from pasted CSV text boxes,
      3. results — tables and 3D plots filled by `run_inference_ui`.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    with gr.Blocks(title="HitPF — Single-event MLPF Inference") as demo:
        gr.Markdown(
            "# HitPF — Single-event MLPF Inference\n"
            "Run the GATr-based particle-flow reconstruction on a single event.\n\n"
            "**Steps:** 1) Load model checkpoints 2) Select an event 3) Run inference"
        )

        # ---- Model loading ----
        with gr.Accordion("1 · Load Model", open=True):
            with gr.Row():
                clustering_ckpt = gr.Textbox(
                    label="Clustering checkpoint (.ckpt)",
                    value="model_clustering.ckpt",
                    placeholder="/path/to/clustering.ckpt",
                )
                energy_pid_ckpt = gr.Textbox(
                    label="Energy / PID checkpoint (.ckpt) — optional",
                    value="model_e_pid.ckpt",
                    placeholder="/path/to/energy_pid.ckpt",
                )
                device_dd = gr.Dropdown(
                    choices=["cpu", "cuda:0", "cuda:1"],
                    value="cpu",
                    label="Device",
                )
            load_btn = gr.Button("Load model")
            load_status = gr.Markdown("")
            load_btn.click(
                fn=load_model_ui,
                inputs=[clustering_ckpt, energy_pid_ckpt, device_dd],
                outputs=load_status,
            )

        # ---- Event selection ----
        with gr.Accordion("2 · Select Event", open=True):
            gr.Markdown("**Option A** — from a parquet file:")
            with gr.Row():
                parquet_path = gr.Textbox(
                    label="Parquet file path",
                    value="test_data.parquet",
                    placeholder="/path/to/events.parquet",
                )
                event_idx = gr.Number(label="Event index", value=0, precision=0)
            parquet_info = gr.Markdown("")
            # Show the event count as soon as the user edits the path.
            parquet_path.change(
                fn=_count_events_in_parquet,
                inputs=parquet_path,
                outputs=parquet_info,
            )
            load_event_btn = gr.Button("Load event from parquet")
            load_event_status = gr.Markdown("")

            gr.Markdown(
                "---\n**Option B** — paste CSV data (one row per hit/track/particle, "
                "no header, comma-separated):\n"
            )

            csv_hits = gr.Textbox(
                label="Hits CSV (11 columns)",
                lines=4,
                placeholder=(
                    "Example (one ECAL hit, one HCAL hit):\n"
                    "0,0,0,0,0,1.23,1800.5,200.3,100.1,0,1\n"
                    "0,0,0,0,0,0.45,1900.2,-50.1,300.7,0,2"
                ),
            )

            csv_tracks = gr.Textbox(
                label="Tracks CSV (25 columns; leave empty if none)",
                lines=3,
                placeholder=(
                    "Example (one track with p≈5 GeV):\n"
                    "1,0,0,0,0,5.0,3.0,2.0,3.3,0,0,0,1800.0,150.0,90.0,12.5,8,0,0,0,0,0,2.9,1.9,3.2"
                ),
            )

            csv_particles = gr.Textbox(
                label="Particles (MC truth) CSV (18 columns; optional)",
                lines=3,
                placeholder=(
                    "Example (one pion, one photon):\n"
                    "211,1,0,0,1.2,0.5,0,0,5.2,0,0.1396,5.198,3.1,2.0,3.3,0,0,0\n"
                    "22,1,0,0,0.8,2.1,0,0,1.5,0,0,1.5,0.5,-0.3,1.38,0,0,0"
                ),
            )

            csv_pandora = gr.Textbox(
                label="Pandora PFOs CSV (9 columns; optional)",
                lines=3,
                placeholder=(
                    "Columns: pid, px, py, pz, ref_x, ref_y, ref_z, energy, momentum\n"
                    "Example (one charged pion PFO):\n"
                    "211,3.0,2.0,3.3,1800.0,150.0,90.0,5.2,5.198"
                ),
            )

            csv_pfo_links = gr.Textbox(
                label="Hit → Pandora Cluster links (optional; loaded from parquet)",
                lines=2,
                placeholder=(
                    "Line 1: PFO index per calo hit (comma-separated, -1 = unassigned)\n"
                    "Line 2: PFO index per track (comma-separated, -1 = unassigned)"
                ),
            )

            # Loading an event from parquet fills all five CSV boxes so the
            # user can inspect/edit the data before running inference.
            load_event_btn.click(
                fn=_load_event_into_csv,
                inputs=[parquet_path, event_idx],
                outputs=[csv_hits, csv_tracks, csv_particles, csv_pandora, csv_pfo_links, load_event_status],
            )

        # ---- Run inference ----
        with gr.Accordion("3 · Results", open=True):
            run_btn = gr.Button("▶ Run Inference", variant="primary")
            inv_mass_output = gr.Markdown("")
            gr.Markdown("### Predicted Particles (HitPF)")
            particles_table = gr.Dataframe(label="Predicted particles")
            gr.Markdown("### MC Truth Particles")
            mc_particles_table = gr.Dataframe(label="MC truth particles (for comparison)")
            gr.Markdown("### Pandora Particles")
            pandora_particles_table = gr.Dataframe(label="Pandora PFO particles (for comparison)")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Hit → HitPF Cluster 3D Map")
                    cluster_plot = gr.Plot(label="Hit-cluster 3D scatter (color = HitPF cluster, size = energy)")
                with gr.Column():
                    gr.Markdown("### Hit → Pandora Cluster 3D Map")
                    pandora_cluster_plot = gr.Plot(label="Hit-cluster 3D scatter (color = Pandora PFO, size = energy)")
            gr.Markdown("### Clustering Space 3D Map")
            clustering_space_plot = gr.Plot(label="Clustering space 3D scatter (GATr regressed coordinates)")

            # Inference takes both the parquet selection and the CSV boxes;
            # the handler decides which source to use.
            run_btn.click(
                fn=run_inference_ui,
                inputs=[parquet_path, event_idx, csv_hits, csv_tracks, csv_particles, csv_pandora, csv_pfo_links],
                outputs=[particles_table, cluster_plot, clustering_space_plot, pandora_cluster_plot, mc_particles_table, pandora_particles_table, inv_mass_output],
            )

    return demo
760
+
761
+
762
+ # ---------------------------------------------------------------------------
763
+
764
if __name__ == "__main__":
    # CLI entry point: parse options, select the device, build and serve the UI.
    parser = argparse.ArgumentParser(description="HitPF Gradio UI")
    parser.add_argument("--device", default="cpu", help="Default device (cpu / cuda:0 / …)")
    parser.add_argument("--share", action="store_true", help="Create a public Gradio link")
    opts = parser.parse_args()

    _set_device(opts.device)
    app = build_app()
    app.launch(share=opts.share)
config_files/config_hits_track_v4.yaml ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This one uses px, py, pz instead of theta, phi, to avoid possible errors
2
+
3
+ graph_config:
4
+ only_hits: false
5
+ prediction: true
6
+ muons: true
7
+ custom_model_kwargs:
8
+ # add custom model kwargs here
9
+ n_postgn_dense_blocks: 4
10
+ clust_space_norm: none
11
+
12
+
13
+
14
+ #treename:
15
+ selection:
16
+ ### use `&`, `|`, `~` for logical operations on numpy arrays
17
+ ### can use functions from `math`, `np` (numpy), and `awkward` in the expression
18
+ #(jet_tightId==1) & (jet_no<2) & (fj_pt>200) & (fj_pt<2500) & (((sample_isQCD==0) & (fj_isQCD==0)) | ((sample_isQCD==1) & (fj_isQCD==1))) & (event_no%7!=0)
19
+ #(recojet_e>=5)
20
+
21
+ test_time_selection:
22
+ ### selection to apply at test time (i.e., when running w/ --predict)
23
+ #(jet_tightId==1) & (jet_no<2) & (fj_pt>200) & (fj_pt<2500) & (((sample_isQCD==0) & (fj_isQCD==0)) | ((sample_isQCD==1) & (fj_isQCD==1))) & (event_no%7==0)
24
+ #(recojet_e<5)
25
+
26
+ new_variables:
27
+ ### [format] name: formula
28
+ ### can use functions from `math`, `np` (numpy), and `awkward` in the expression
29
+ #pfcand_mask: awkward.JaggedArray.ones_like(pfcand_etarel)
30
+ #sv_mask: awkward.JaggedArray.ones_like(sv_etarel)
31
+ #pfcand_mask: awkward.JaggedArray.ones_like(pfcand_e)
32
+ hit_mask: ak.ones_like(hit_e)
33
+ part_mask: ak.ones_like(part_p)
34
+ hit_e_nn: hit_e
35
+ part_p1: part_p
36
+ part_theta1: part_theta
37
+ part_phi1: part_phi
38
+ part_m1: part_m
39
+ part_pid1: part_pid
40
+
41
+ preprocess:
42
+ ### method: [manual, auto] - whether to use manually specified parameters for variable standardization
43
+ ### [note]: `[var]_mask` will not be transformed even if `method=auto`
44
+ method: auto
45
+ ### data_fraction: fraction of events to use when calculating the mean/scale for the standardization
46
+ data_fraction: 0.1
47
+
48
+ inputs:
49
+ pf_points:
50
+ pad_mode: wrap
51
+ length: 25000
52
+ vars:
53
+ - [hit_x, null]
54
+ - [hit_y, null]
55
+ - [hit_z, null]
56
+ - [hit_px, null]
57
+ - [hit_py, null]
58
+ - [hit_pz, null]
59
+ pf_points_pfo:
60
+ pad_mode: wrap
61
+ length: 25000
62
+ vars:
63
+ - [hit__pandora_px, null]
64
+ - [hit__pandora_py, null]
65
+ - [hit__pandora_pz, null]
66
+ - [hit__pandora_x, null]
67
+ - [hit__pandora_y, null]
68
+ - [hit__pandora_z, null]
69
+ - [pandora_pid, null]
70
+ pf_features:
71
+ pad_mode: wrap
72
+ length: 25000
73
+ vars:
74
+ ### [format 1]: var_name (no transformation)
75
+ ### [format 2]: [var_name,
76
+ ### subtract_by(optional, default=None, no transf. if preprocess.method=manual, auto transf. if preprocess.method=auto),
77
+ ### multiply_by(optional, default=1),
78
+ ### clip_min(optional, default=-5),
79
+ ### clip_max(optional, default=5),
80
+ ### pad_value(optional, default=0)]
81
+
82
+ - [hit_p, null]
83
+ - [hit_e, null]
84
+ - [part_theta , null]
85
+ - [part_phi , null]
86
+ - [part_p , null]
87
+ - [part_m, null]
88
+ - [part_pid, null]
89
+ - [part_isDecayedInCalorimeter, null]
90
+ - [part_isDecayedInTracker, null]
91
+ - [hit_pandora_cluster_energy, null]
92
+ - [hit_pandora_pfo_energy, null]
93
+ - [hit_chis, null]
94
+ - [part_px , null]
95
+ - [part_py , null]
96
+ - [part_pz , null]
97
+ - [part_vertex_x, null]
98
+ - [part_vertex_y, null]
99
+ - [part_vertex_z, null]
100
+
101
+
102
+ pf_vectors:
103
+ length: 25000
104
+ pad_mode: wrap
105
+ vars:
106
+ - [hit_type, null] #0
107
+ - [hit_e_nn, null] #1
108
+ # #labels
109
+ # - [part_p1, null] #2
110
+ # - [part_theta1, null] #3
111
+ # - [part_phi1, null] #4
112
+ # - [part_m1, null] #15
113
+ # - [part_pid1, null] #6
114
+ pf_vectoronly:
115
+ length: 25000
116
+ pad_mode: wrap
117
+ vars:
118
+ - [hit_genlink0, null] # hit link to MC
119
+ - [hit_genlink1, null] # pandora_cluster if val data otherwise 0
120
+ - [hit_genlink2, null] # pandora_index_pfo if val data otherwise 0
121
+ - [hit_genlink3, null] # hit link to daughter
122
+
123
+
124
+ pf_mask:
125
+ length: 25000
126
+ pad_mode: constant
127
+ vars:
128
+ - [hit_mask, null]
129
+ - [part_mask, null]
130
+
131
+
132
+ labels:
133
+ ### type can be `simple`, `custom`
134
+ ### [option 1] use `simple` for binary/multi-class classification, then `value` is a list of 0-1 labels
135
+ #type: simple
136
+ #value: [
137
+ # hit_ty
138
+ # ]
139
+ ### [option 2] otherwise use `custom` to define the label, then `value` is a map
140
+ # type: custom
141
+ # value:
142
+ # target_mass: np.where(fj_isQCD, fj_genjet_sdmass, fj_gen_mass)
143
+
144
+ observers:
145
+
146
+
scripts/evaluation.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python -m src.train_lightning1 \
2
+ --data-test /eos/experiment/fcc/users/m/mgarciam/mlpf/CLD/train/Z_uds_CLD_o2_v05_eval_v1/05/pf_tree_10100.parquet \
3
+ --data-config config_files/config_hits_track_v4.yaml \
4
+ --network-config src/models/wrapper/example_mode_gatr_noise.py \
5
+ --model-prefix /eos/user/m/mgarciam/datasets_mlpf/models_trained_CLD/041225_arc_05/ \
6
+ --load-model-weights-clustering /eos/user/m/mgarciam/datasets_mlpf/models_trained_CLD/041225_arc_05/_epoch=9_step=120000.ckpt \
7
+ --load-model-weights /eos/user/m/mgarciam/datasets_mlpf/models_trained_CLD/040226_basic_ecor/_epoch=2_step=24000.ckpt \
8
+ --wandb-displayname eval_gun_drlog \
9
+ --gpus 2 \
10
+ --batch-size 20 \
11
+ --num-workers 4 \
12
+ --start-lr 1e-3 \
13
+ --num-epochs 100 \
14
+ --fetch-step 1 \
15
+ --fetch-by-files \
16
+ --log-wandb \
17
+ --wandb-projectname mlpf_debug_eval \
18
+ --wandb-entity fcc_ml \
19
+ --frac_cluster_loss 0 \
20
+ --qmin 1 \
21
+ --use-average-cc-pos 0.99 \
22
+ --correction \
23
+ --freeze-clustering \
24
+ --predict \
25
+ --name-output test_plot_hitpf2 \
26
+ --pandora
scripts/train_clustering.sh ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python -m src.train_lightning1 \
2
+ --data-train /eos/experiment/fcc/users/m/mgarciam/mlpf/CLD/train/Z_uds_clustering_dataset_3/05/ \
3
+ --data-config config_files/config_hits_track_v4.yaml \
4
+ --network-config src/models/wrapper/example_mode_gatr_noise.py \
5
+ --model-prefix /eos/user/m/mgarciam/datasets_mlpf/models_trained_CLD/test_hitpf/ \
6
+ --num-workers 4 \
7
+ --gpus 0,1 \
8
+ --batch-size 5 \
9
+ --num-epochs 100 \
10
+ --fetch-step 1 \
11
+ --log-wandb \
12
+ --wandb-displayname CLD_clustering_training \
13
+ --wandb-projectname mlpf_debug \
14
+ --wandb-entity ml4hep \
15
+ --frac_cluster_loss 0 \
16
+ --qmin 3 \
17
+ --use-average-cc-pos 0.98 \
18
+ --train-val-split 0.98 \
19
+ --fetch-by-files \
20
+ --train-batches 10
scripts/train_energy_pid.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python -m src.train_lightning1 \
2
+ --data-train /eos/experiment/fcc/users/m/mgarciam/mlpf/CLD/train/gun_ecort/05/ \
3
+ --data-config config_files/config_hits_track_v4.yaml \
4
+ --network-config src/models/wrapper/example_mode_gatr_noise.py \
5
+ --model-prefix /eos/user/m/mgarciam/datasets_mlpf/models_trained_CLD/test_hitpf_ecor/ \
6
+ --wandb-displayname E_PID_05_basicecor_v1_1 \
7
+ --gpus 0 \
8
+ --batch-size 20 \
9
+ --num-workers 4 \
10
+ --start-lr 1e-3 \
11
+ --num-epochs 100 \
12
+ --fetch-step 1 \
13
+ --fetch-by-files \
14
+ --train-val-split 0.98 \
15
+ --train-batches 8000 \
16
+ --log-wandb \
17
+ --wandb-projectname mlpf_debug \
18
+ --wandb-entity ml4hep \
19
+ --frac_cluster_loss 0 \
20
+ --qmin 1 \
21
+ --use-average-cc-pos 0.99 \
22
+ --correction \
23
+ --freeze-clustering \
24
+ --use-gt-clusters
src/data/config.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import yaml
3
+ import copy
4
+
5
+ from src.logger.logger import _logger
6
+ from src.data.tools import _get_variable_names
7
+
8
+
9
+ def _as_list(x):
10
+ if x is None:
11
+ return None
12
+ elif isinstance(x, (list, tuple)):
13
+ return x
14
+ else:
15
+ return [x]
16
+
17
+
18
+ def _md5(fname):
19
+ '''https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file'''
20
+ import hashlib
21
+ hash_md5 = hashlib.md5()
22
+ with open(fname, "rb") as f:
23
+ for chunk in iter(lambda: f.read(4096), b""):
24
+ hash_md5.update(chunk)
25
+ return hash_md5.hexdigest()
26
+
27
+
28
class DataConfig(object):
    r"""Data loading configuration.

    Parses the (YAML-derived) options describing event selections, derived
    variables (``new_variables``), input groups, standardization parameters,
    observers/monitor variables, and precomputes which branches must be
    loaded, kept or dropped.
    """

    def __init__(self, print_info=True, **kwargs):
        opts = {
            'treename': None,
            'selection': None,
            'test_time_selection': None,
            'preprocess': {'method': 'manual', 'data_fraction': 0.1, 'params': None},
            'new_variables': {},
            'inputs': {},
            'labels': {},
            'observers': [],
            'monitor_variables': [],
            'weights': None,
            'graph_config': {},
            'custom_model_kwargs': {}
        }
        for k, v in kwargs.items():
            if v is not None:
                if isinstance(opts[k], dict):
                    opts[k].update(v)
                else:
                    opts[k] = v
        # only information in ``self.options'' will be persisted when exporting to YAML
        self.options = opts
        if print_info:
            _logger.debug(opts)

        self.selection = opts['selection']
        # fall back to the training-time selection when no test-time one is given
        self.test_time_selection = opts['test_time_selection'] if opts['test_time_selection'] else self.selection
        self.var_funcs = copy.deepcopy(opts['new_variables'])
        # preprocessing config
        self.preprocess = opts['preprocess']
        self._auto_standardization = opts['preprocess']['method'].lower().startswith('auto')
        self._missing_standardization_info = False
        self.preprocess_params = opts['preprocess']['params'] if opts['preprocess']['params'] is not None else {}
        # inputs
        self.input_names = tuple(opts['inputs'].keys())
        self.input_dicts = {k: [] for k in self.input_names}
        self.input_shapes = {}
        for k, o in opts['inputs'].items():
            self.input_shapes[k] = (-1, len(o['vars']), o['length'])
            for v in o['vars']:
                v = _as_list(v)
                self.input_dicts[k].append(v[0])

                if opts['preprocess']['params'] is None:

                    def _get(idx, default):
                        # positional entries of the per-variable spec list are optional
                        try:
                            return v[idx]
                        except IndexError:
                            return default

                    params = {'length': o['length'], 'pad_mode': o.get('pad_mode', 'constant').lower(),
                              'center': _get(1, 'auto' if self._auto_standardization else None),
                              'scale': _get(2, 1), 'min': _get(3, -5), 'max': _get(4, 5), 'pad_value': _get(5, 0)}

                    if v[0] in self.preprocess_params and params != self.preprocess_params[v[0]]:
                        raise RuntimeError(
                            'Incompatible info for variable %s, had: \n %s\nnow got:\n %s' %
                            (v[0], str(self.preprocess_params[v[0]]), str(params)))
                    if k.endswith('_mask') and params['pad_mode'] != 'constant':
                        raise RuntimeError('The `pad_mode` must be set to `constant` for the mask input `%s`' % k)
                    if params['center'] == 'auto':
                        # standardization info must be computed from data later (AutoStandardizer)
                        self._missing_standardization_info = True
                    self.preprocess_params[v[0]] = params

        # observers
        self.observer_names = tuple(opts['observers'])
        # monitor variables
        self.monitor_variables = tuple(opts['monitor_variables'])
        # Z variables: returned as `Z` in the dataloader (use monitor_variables for training, observers for eval)
        self.z_variables = self.observer_names if len(self.observer_names) > 0 else self.monitor_variables

        # remove self mappings (name: name) from var_funcs.
        # Iterate over a snapshot: deleting from a dict while iterating it
        # raises "dictionary changed size during iteration".
        for k, v in list(self.var_funcs.items()):
            if k == v:
                del self.var_funcs[k]

        if print_info:
            def _log(msg, *args, **kwargs):
                _logger.info(msg, *args, color='lightgray', **kwargs)
            _log('preprocess config: %s', str(self.preprocess))
            _log('selection: %s', str(self.selection))
            _log('test_time_selection: %s', str(self.test_time_selection))
            _log('var_funcs:\n - %s', '\n - '.join(str(it) for it in self.var_funcs.items()))
            _log('input_names: %s', str(self.input_names))
            _log('input_dicts:\n - %s', '\n - '.join(str(it) for it in self.input_dicts.items()))
            _log('input_shapes:\n - %s', '\n - '.join(str(it) for it in self.input_shapes.items()))
            _log('preprocess_params:\n - %s', '\n - '.join(str(it) for it in self.preprocess_params.items()))
            #_log('label_names: %s', str(self.label_names))
            _log('observer_names: %s', str(self.observer_names))
            _log('monitor_variables: %s', str(self.monitor_variables))
            if opts['weights'] is not None:
                # NOTE(review): `use_precomputed_weights`, `weight_name` etc. are
                # resolved through `__getattr__` from `options` — they must be
                # present in the `weights` config for this branch to work.
                if self.use_precomputed_weights:
                    _log('weight: %s' % self.var_funcs[self.weight_name])
                else:
                    for k in ['reweight_method', 'reweight_basewgt', 'reweight_branches', 'reweight_bins',
                              'reweight_classes', 'class_weights', 'reweight_threshold',
                              'reweight_discard_under_overflow']:
                        _log('%s: %s' % (k, getattr(self, k)))

        # parse config: figure out which branches to load / keep / drop
        self.keep_branches = set()
        aux_branches = set()
        # selection
        if self.selection:
            aux_branches.update(_get_variable_names(self.selection))
        # test time selection
        if self.test_time_selection:
            aux_branches.update(_get_variable_names(self.test_time_selection))
        # var_funcs
        self.keep_branches.update(self.var_funcs.keys())
        for expr in self.var_funcs.values():
            aux_branches.update(_get_variable_names(expr))
        # inputs
        for names in self.input_dicts.values():
            self.keep_branches.update(names)
        # labels
        #self.keep_branches.update(self.label_names)
        # weight
        #if self.weight_name:
        #    self.keep_branches.add(self.weight_name)
        #    if not self.use_precomputed_weights:
        #        aux_branches.update(self.reweight_branches)
        #        aux_branches.update(self.reweight_classes)
        # observers
        self.keep_branches.update(self.observer_names)
        # monitor variables
        self.keep_branches.update(self.monitor_variables)
        # keep and drop
        self.drop_branches = (aux_branches - self.keep_branches)
        self.load_branches = (aux_branches | self.keep_branches) - set(self.var_funcs.keys())  #- {self.weight_name, }
        if print_info:
            _logger.debug('drop_branches:\n %s', ','.join(self.drop_branches))
            _logger.debug('load_branches:\n %s', ','.join(self.load_branches))

    def __getattr__(self, name):
        """Expose YAML options as attributes.

        Raises AttributeError (not KeyError) for unknown names so that
        `hasattr`, `copy` and pickling behave correctly, and guards against
        infinite recursion when `options` itself is not yet set
        (e.g. during unpickling).
        """
        opts = self.__dict__.get('options')
        if opts is not None and name in opts:
            return opts[name]
        raise AttributeError(name)

    def dump(self, fp):
        """Persist `self.options` to a YAML file at path `fp`."""
        with open(fp, 'w') as f:
            yaml.safe_dump(self.options, f, sort_keys=False)

    @classmethod
    def load(cls, fp, load_observers=True, load_reweight_info=True, extra_selection=None, extra_test_selection=None):
        """Build a DataConfig from a YAML file, optionally AND-ing extra selections."""
        with open(fp) as f:
            options = yaml.safe_load(f)
        if not load_observers:
            options['observers'] = None
        if not load_reweight_info:
            options['weights'] = None
        if extra_selection:
            options['selection'] = '(%s) & (%s)' % (options['selection'], extra_selection)
        if extra_test_selection:
            if 'test_time_selection' not in options:
                raise RuntimeError('`test_time_selection` is not defined in the yaml file!')
            options['test_time_selection'] = '(%s) & (%s)' % (options['test_time_selection'], extra_test_selection)
        return cls(**options)

    def copy(self):
        """Return a fresh DataConfig re-built from a deep copy of the options."""
        return self.__class__(print_info=False, **copy.deepcopy(self.options))

    def __copy__(self):
        return self.copy()

    def __deepcopy__(self, memo):
        return self.copy()

    def export_json(self, fp):
        """Export variable info (center/scale/clip/pad) as JSON for deployment.

        NOTE(review): uses `self.label_value`, which is not set anywhere in this
        class — it must come from `options` written by another config; verify
        before relying on this method.
        """
        import json
        j = {'output_names': self.label_value, 'input_names': self.input_names}
        for k, v in self.input_dicts.items():
            j[k] = {'var_names': v, 'var_infos': {}}
            for var_name in v:
                j[k]['var_length'] = self.preprocess_params[var_name]['length']
                info = self.preprocess_params[var_name]
                j[k]['var_infos'][var_name] = {
                    'median': 0 if info['center'] is None else info['center'],
                    'norm_factor': info['scale'],
                    'replace_inf_value': 0,
                    'lower_bound': -1e32 if info['center'] is None else info['min'],
                    'upper_bound': 1e32 if info['center'] is None else info['max'],
                    'pad': info['pad_value']
                }
        with open(fp, 'w') as f:
            json.dump(j, f, indent=2)
218
+
src/data/fileio.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import awkward as ak
3
+ import tqdm
4
+ import traceback
5
+ from src.data.tools import _concat, _concat_records
6
+
7
+
8
+
9
def _read_hdf5(filepath, branches, load_range=None):
    """Read the given `branches` from an HDF5 file into an awkward Array.

    `load_range` is a fractional (start, stop) pair of the total number of
    rows; at least one row is always kept.
    """
    import tables
    tables.set_blosc_max_threads(4)
    with tables.open_file(filepath) as f:
        columns = {name: getattr(f.root, name)[:] for name in branches}
    lo, hi = (0, 1) if load_range is None else load_range
    total = len(columns[branches[0]])
    begin = math.trunc(lo * total)
    end = max(begin + 1, math.trunc(hi * total))
    return ak.Array({name: col[begin:end] for name, col in columns.items()})
21
+
22
+
23
def _read_root(filepath, branches, load_range=None, treename=None):
    """Read `branches` from a ROOT TTree into an awkward Array.

    If `treename` is None, the file must contain exactly one TTree, which is
    auto-detected. `load_range` is a fractional (start, stop) pair of the
    entry count; at least one entry is always read.

    Raises:
        RuntimeError: if `treename` is None and the file holds several TTrees.
    """
    import uproot
    with uproot.open(filepath) as f:
        if treename is None:
            treenames = set([k.split(';')[0] for k, v in f.items() if getattr(v, 'classname', '') == 'TTree'])
            if len(treenames) == 1:
                treename = treenames.pop()
            else:
                # Report the candidate tree names (previously this wrongly
                # printed the requested `branches` instead).
                raise RuntimeError(
                    'Need to specify `treename` as more than one trees are found in file %s: %s' %
                    (filepath, str(treenames)))
        tree = f[treename]
        if load_range is not None:
            start = math.trunc(load_range[0] * tree.num_entries)
            stop = max(start + 1, math.trunc(load_range[1] * tree.num_entries))
        else:
            start, stop = None, None
        outputs = tree.arrays(filter_name=branches, entry_start=start, entry_stop=stop)
    return outputs
42
+
43
+
44
def _read_awkd(filepath, branches, load_range=None):
    """Read `branches` from an awkward0 file, converting to awkward1 arrays."""
    import awkward0
    with awkward0.load(filepath) as f:
        columns = {name: f[name] for name in branches}
    lo, hi = (0, 1) if load_range is None else load_range
    total = len(columns[branches[0]])
    begin = math.trunc(lo * total)
    end = max(begin + 1, math.trunc(hi * total))
    converted = {name: ak.from_awkward0(col[begin:end]) for name, col in columns.items()}
    return ak.Array(converted)
55
+
56
+
57
def _slice_record(record, start, stop):
    """Return a new ak.Record whose fields are the [start:stop] slice of each field."""
    return ak.Record({field: record[field][start:stop] for field in record.fields})
62
+
63
def _read_parquet(filepath, load_range=None):
    """Read a parquet event record; optionally keep a fractional event range.

    The number of events is taken from the "X_track" field; at least one
    event is always kept when `load_range` is given.
    """
    record = ak.from_parquet(filepath)
    if load_range is not None:
        total = len(record["X_track"])
        begin = math.trunc(load_range[0] * total)
        end = max(begin + 1, math.trunc(load_range[1] * total))
        record = _slice_record(record, begin, end)
    return record
72
+
73
+
74
+ def _read_files(filelist, load_range=None, show_progressbar=False, **kwargs):
75
+ import os
76
+ table = []
77
+ if show_progressbar:
78
+ filelist = tqdm.tqdm(filelist)
79
+ for filepath in filelist:
80
+ ext = os.path.splitext(filepath)[1]
81
+ if ext not in ('.h5', '.root', '.awkd', '.parquet'):
82
+ raise RuntimeError('File %s of type `%s` is not supported!' % (filepath, ext))
83
+ a = _read_parquet(filepath, load_range=load_range)
84
+ if a is not None:
85
+ table.append(a)
86
+ table = _concat_records(table) # ak.Array
87
+ if len(table["X_track"]) == 0:
88
+ raise RuntimeError(f'Zero entries loaded when reading files {filelist} with `load_range`={load_range}.')
89
+ return table
90
+
91
+
92
def _write_root(file, table, treename='Events', compression=-1, step=1048576):
    """Write a dict of flat arrays to a ROOT file as TTree `treename`.

    Entries are written in chunks of `step`; `compression=-1` selects LZ4(4).
    """
    import uproot
    if compression == -1:
        compression = uproot.LZ4(4)
    with uproot.recreate(file, compression=compression) as fout:
        tree = fout.mktree(treename, {k: v.dtype for k, v in table.items()})
        num_entries = len(next(iter(table.values())))
        start = 0
        # loop while `start < num_entries` (not `num_entries - 1`): the old
        # condition silently dropped the final entry whenever it fell alone
        # in the last chunk (e.g. a 1-entry table wrote nothing).
        while start < num_entries:
            tree.extend({k: v[start:start + step] for k, v in table.items()})
            start += step
src/data/preprocess.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import glob
3
+ import copy
4
+ import numpy as np
5
+ import awkward as ak
6
+
7
+ from src.data.tools import _get_variable_names, _eval_expr
8
+ from src.data.fileio import _read_files
9
+
10
+
11
+ def _apply_selection(table, selection):
12
+ if selection is None:
13
+ return table
14
+ selected = ak.values_astype(_eval_expr(selection, table), 'bool')
15
+ return table[selected]
16
+
17
+
18
+ def _build_new_variables(table, funcs):
19
+ if funcs is None:
20
+ return table
21
+ for k, expr in funcs.items():
22
+ if k in table.fields:
23
+ continue
24
+ table[k] = _eval_expr(expr, table)
25
+ return table
26
+
27
+
28
+ def _clean_up(table, drop_branches):
29
+ columns = [k for k in table.fields if k not in drop_branches]
30
+ return table[columns]
31
+
32
+
33
def _build_weights(table, data_config, reweight_hists=None):
    """Compute per-event reweighting factors.

    Either returns precomputed per-event weights
    (`data_config.use_precomputed_weights`) or looks up weights from 2D
    histograms binned in the two `reweight_branches` variables, one
    histogram per reweighting class. Events belonging to no class (or
    outside the bin range when `reweight_discard_under_overflow` is set)
    keep weight 0.

    Raises:
        RuntimeError: if `data_config.weight_name` is None.
    """
    import warnings

    if data_config.weight_name is None:
        raise RuntimeError('Error when building weights: `weight_name` is None!')
    if data_config.use_precomputed_weights:
        return ak.to_numpy(table[data_config.weight_name])
    else:
        x_var, y_var = data_config.reweight_branches
        x_bins, y_bins = data_config.reweight_bins
        rwgt_sel = None
        if data_config.reweight_discard_under_overflow:
            rwgt_sel = (table[x_var] >= min(x_bins)) & (table[x_var] <= max(x_bins)) & \
                (table[y_var] >= min(y_bins)) & (table[y_var] <= max(y_bins))
        # init w/ wgt=0: events not belonging to any class in `reweight_classes` will get a weight of 0 at the end
        wgt = np.zeros(len(table), dtype='float32')
        sum_evts = 0
        if reweight_hists is None:
            reweight_hists = data_config.reweight_hists
        for label, hist in reweight_hists.items():
            pos = table[label] == 1
            if rwgt_sel is not None:
                pos = (pos & rwgt_sel)
            rwgt_x_vals = ak.to_numpy(table[x_var][pos])
            rwgt_y_vals = ak.to_numpy(table[y_var][pos])
            x_indices = np.clip(np.digitize(
                rwgt_x_vals, x_bins) - 1, a_min=0, a_max=len(x_bins) - 2)
            y_indices = np.clip(np.digitize(
                rwgt_y_vals, y_bins) - 1, a_min=0, a_max=len(y_bins) - 2)
            wgt[pos] = hist[x_indices, y_indices]
            sum_evts += np.sum(pos)
        if sum_evts != len(table):
            # The original called a bare `warn(...)` that was never imported,
            # raising NameError exactly when the warning should fire.
            warnings.warn(
                'Not all selected events used in the reweighting. '
                'Check consistency between `selection` and `reweight_classes` definition, or with the `reweight_vars` binnings '
                '(under- and overflow bins are discarded by default, unless `reweight_discard_under_overflow` is set to `False` in the `weights` section).',
            )
        if data_config.reweight_basewgt:
            wgt *= ak.to_numpy(table[data_config.basewgt_name])
        return wgt
71
+
72
+
73
class AutoStandardizer(object):
    r"""AutoStandardizer.
    Class to compute the variable standardization information.
    Arguments:
        filelist (list): list of files to be loaded.
        data_config (DataConfig): object containing data format information.
    """

    def __init__(self, filelist, data_config):
        if isinstance(filelist, dict):
            filelist = sum(filelist.values(), [])
        self._filelist = filelist if isinstance(
            filelist, (list, tuple)) else glob.glob(filelist)
        self._data_config = data_config.copy()
        # only a fraction of the events is needed to estimate median/scale
        self.load_range = (0, data_config.preprocess.get('data_fraction', 0.1))

    def read_file(self, filelist):
        """Load the branches needed for auto-standardization and build the
        derived variables that require it."""
        self.keep_branches = set()
        self.load_branches = set()
        for k, params in self._data_config.preprocess_params.items():
            if params['center'] == 'auto':
                self.keep_branches.add(k)
                if k in self._data_config.var_funcs:
                    expr = self._data_config.var_funcs[k]
                    self.load_branches.update(_get_variable_names(expr))
                else:
                    self.load_branches.add(k)
        if self._data_config.selection:
            self.load_branches.update(_get_variable_names(self._data_config.selection))

        # `_read_files` no longer takes a `branches` positional argument
        # (parquet files are read whole); the previous positional call bound
        # `load_branches` to `load_range` and raised a TypeError on the
        # duplicated `show_progressbar`. Pass everything by keyword instead.
        table = _read_files(filelist, load_range=self.load_range,
                            show_progressbar=True, treename=self._data_config.treename)
        table = _apply_selection(table, self._data_config.selection)
        table = _build_new_variables(
            table, {k: v for k, v in self._data_config.var_funcs.items() if k in self.keep_branches})
        table = _clean_up(table, self.load_branches - self.keep_branches)
        return table

    def make_preprocess_params(self, table):
        """Compute `center`/`scale` for every variable marked 'auto'.

        center = median; scale = 1 / max(|q84 - median|, |median - q16|)
        (robust to outliers; scale falls back to 1 when the spread is 0).
        Mask variables (`*_mask`) get no transformation. Returns a new
        params dict; the stored config is NOT mutated (the original code
        modified `self._data_config.preprocess_params` in place).
        """
        import warnings

        preprocess_params = copy.deepcopy(self._data_config.preprocess_params)
        for k, params in self._data_config.preprocess_params.items():
            if params['center'] != 'auto':
                continue
            params = copy.deepcopy(params)
            if k.endswith('_mask'):
                params['center'] = None
            else:
                a = ak.to_numpy(ak.flatten(table[k], axis=None))
                if np.any(np.isnan(a)):
                    # previously this only slept for 10 s and silently zeroed the NaNs
                    warnings.warn('Found NaN in %s; replacing with 0 for standardization.' % k)
                    a = np.nan_to_num(a)
                low, center, high = np.percentile(a, [16, 50, 84])
                scale = max(high - center, center - low)
                scale = 1 if scale == 0 else 1. / scale
                params['center'] = float(center)
                params['scale'] = float(scale)
            preprocess_params[k] = params

        return preprocess_params

    def produce(self, output=None):
        """Run the full pipeline; optionally dump the updated config to `output`."""
        table = self.read_file(self._filelist)
        preprocess_params = self.make_preprocess_params(table)
        self._data_config.preprocess_params = preprocess_params
        # must also propagate the changes to `data_config.options` so it can be persisted
        self._data_config.options['preprocess']['params'] = preprocess_params
        if output:
            self._data_config.dump(output)
        return self._data_config
143
+
144
+
145
class WeightMaker(object):
    r"""WeightMaker.
    Class to make reweighting information.
    Arguments:
        filelist (list): list of files to be loaded.
        data_config (DataConfig): object containing data format information.
    """

    def __init__(self, filelist, data_config):
        if isinstance(filelist, dict):
            # flatten {name: [files]} into one list
            filelist = sum(filelist.values(), [])
        self._filelist = filelist if isinstance(filelist, (list, tuple)) else glob.glob(filelist)
        self._data_config = data_config.copy()

    def read_file(self, filelist):
        """Load only the branches needed for reweighting, apply the selection
        and derived-variable definitions, and return the resulting table."""
        self.keep_branches = set(self._data_config.reweight_branches + self._data_config.reweight_classes +
                                 (self._data_config.basewgt_name,))
        self.load_branches = set()
        for k in self.keep_branches:
            if k in self._data_config.var_funcs:
                # derived variable: load the inputs of its expression instead
                expr = self._data_config.var_funcs[k]
                self.load_branches.update(_get_variable_names(expr))
            else:
                self.load_branches.add(k)
        if self._data_config.selection:
            self.load_branches.update(_get_variable_names(self._data_config.selection))
        table = _read_files(filelist, self.load_branches, show_progressbar=True, treename=self._data_config.treename)
        table = _apply_selection(table, self._data_config.selection)
        table = _build_new_variables(
            table, {k: v for k, v in self._data_config.var_funcs.items() if k in self.keep_branches})
        table = _clean_up(table, self.load_branches - self.keep_branches)
        return table

    def make_weights(self, table):
        """Compute per-class 2D reweighting histograms.

        Supports the 'flat' method (flatten each class distribution) and the
        'ref' method (match every class to class 0), then equalizes the
        effective event yields across classes.
        """
        import warnings

        x_var, y_var = self._data_config.reweight_branches
        x_bins, y_bins = self._data_config.reweight_bins
        if not self._data_config.reweight_discard_under_overflow:
            # clip variables to be within bin ranges
            x_min, x_max = min(x_bins), max(x_bins)
            y_min, y_max = min(y_bins), max(y_bins)
            table[x_var] = np.clip(table[x_var], x_min, x_max)
            table[y_var] = np.clip(table[y_var], y_min, y_max)
        sum_evts = 0
        max_weight = 0.9
        raw_hists = {}
        class_events = {}
        result = {}
        for label in self._data_config.reweight_classes:
            pos = (table[label] == 1)
            x = ak.to_numpy(table[x_var][pos])
            y = ak.to_numpy(table[y_var][pos])
            hist, _, _ = np.histogram2d(x, y, bins=self._data_config.reweight_bins)
            sum_evts += hist.sum()
            if self._data_config.reweight_basewgt:
                # re-fill the histogram weighted by the base event weight
                w = ak.to_numpy(table[self._data_config.basewgt_name][pos])
                hist, _, _ = np.histogram2d(x, y, weights=w, bins=self._data_config.reweight_bins)

            raw_hists[label] = hist.astype('float32')
            result[label] = hist.astype('float32')
        if sum_evts != len(table):
            # was: time.sleep(10) -- make the mismatch visible instead
            warnings.warn(
                'Only %d (out of %d) events fall into the reweighting class '
                'definitions; check `reweight_classes`.' % (int(sum_evts), len(table)))

        if self._data_config.reweight_method == 'flat':
            for label, classwgt in zip(self._data_config.reweight_classes, self._data_config.class_weights):
                hist = result[label]
                threshold_ = np.median(hist[hist > 0]) * 0.01
                nonzero_vals = hist[hist > threshold_]
                ref_val = np.percentile(nonzero_vals, self._data_config.reweight_threshold)
                # wgt: bins w/ 0 elements will get a weight of 0; bins w/ content<ref_val will get 1
                wgt = np.clip(np.nan_to_num(ref_val / hist, posinf=0), 0, 1)
                result[label] = wgt
                # dividing by classwgt here will effectively increase the weight later
                class_events[label] = np.sum(raw_hists[label] * wgt) / classwgt
        elif self._data_config.reweight_method == 'ref':
            # use class 0 as the reference
            hist_ref = raw_hists[self._data_config.reweight_classes[0]]
            for label, classwgt in zip(self._data_config.reweight_classes, self._data_config.class_weights):
                # wgt: bins w/ 0 elements will get a weight of 0; bins w/ content<ref_val will get 1
                ratio = np.nan_to_num(hist_ref / result[label], posinf=0)
                upper = np.percentile(ratio[ratio > 0], 100 - self._data_config.reweight_threshold)
                wgt = np.clip(ratio / upper, 0, 1)  # -> [0,1]
                result[label] = wgt
                # dividing by classwgt here will effectively increase the weight later
                class_events[label] = np.sum(raw_hists[label] * wgt) / classwgt
        # ``equalize'' all classes
        # multiply by max_weight (<1) to add some randomness in the sampling
        min_nevt = min(class_events.values()) * max_weight
        for label in self._data_config.reweight_classes:
            class_wgt = float(min_nevt) / class_events[label]
            result[label] *= class_wgt

        if self._data_config.reweight_basewgt:
            # rescale so that the combined (hist * base) weights stay in range
            wgts = _build_weights(table, self._data_config, reweight_hists=result)
            wgt_ref = np.percentile(wgts, 100 - self._data_config.reweight_threshold)
            for label in self._data_config.reweight_classes:
                result[label] /= wgt_ref

        return result

    def produce(self, output=None):
        """Compute the reweight histograms, store them on the data config
        (and its persistable ``options``), optionally dump to ``output``."""
        table = self.read_file(self._filelist)
        wgts = self.make_weights(table)
        self._data_config.reweight_hists = wgts
        # must also propagate the changes to `data_config.options` so it can be persisted
        self._data_config.options['weights']['reweight_hists'] = {k: v.tolist() for k, v in wgts.items()}
        if output:
            self._data_config.dump(output)
        return self._data_config
src/data/tools.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import math
3
+
4
+ import awkward as ak
5
+
6
def build_dummy_array(num, dtype=np.int64):
    """Return a jagged awkward Array with `num` empty sublists of `dtype`."""
    offsets = ak.index.Index64(np.zeros(num + 1, dtype=np.int64))
    content = ak.from_numpy(np.array([], dtype=dtype), highlevel=False)
    return ak.Array(ak.contents.ListOffsetArray(offsets, content))
13
+
14
def _concat_records(table):
    """Merge a list of per-file record arrays into a single ak.Record.

    Events from every input record are chained per field; any field that
    ends up completely empty is replaced with a dummy jagged float32 array so
    downstream code still sees one (empty) list per event.
    NOTE(review): the per-event Python loop inside ak.from_iter can be slow
    for large inputs.
    """
    table1 = {k : ak.from_iter([record[k][event] for record in table for event in range(len(record[k])) ]) for k in table[0].fields}
    for k in table1.keys():
        if len(ak.flatten(table1[k])) == 0:
            table1[k] = build_dummy_array(len(table1[k]), np.float32)
    table1 = ak.Record(table1)
    return table1
21
+
22
+ def _concat(arrays, axis=0):
23
+ if len(arrays) == 0:
24
+ return np.array([])
25
+ if isinstance(arrays[0], np.ndarray):
26
+ return np.concatenate(arrays, axis=axis)
27
+ else:
28
+ return ak.concatenate(arrays, axis=axis)
29
+
30
+
31
+ def _stack(arrays, axis=1):
32
+ if len(arrays) == 0:
33
+ return np.array([])
34
+ if isinstance(arrays[0], np.ndarray):
35
+ return np.stack(arrays, axis=axis)
36
+ else:
37
+ return ak.concatenate(arrays, axis=axis)
38
+
39
+
40
+ def _pad_vector(a, value=-1, dtype="float32"):
41
+ maxlen = 2000
42
+ maxlen2 = 5
43
+
44
+ x = (np.ones((len(a), maxlen, maxlen2)) * value).astype(dtype)
45
+ for idx, s in enumerate(a):
46
+ for idx_vec, s_vec in enumerate(s):
47
+ x[idx, idx_vec, : len(s_vec)] = s_vec
48
+ return x
49
+
50
+
51
+ def _pad(a, maxlen, value=0, dtype="float32"):
52
+ if isinstance(a, np.ndarray) and a.ndim >= 2 and a.shape[1] == maxlen:
53
+ return a
54
+ elif isinstance(a, ak.Array):
55
+ if a.ndim == 1:
56
+ a = ak.unflatten(a, 1)
57
+ a = ak.fill_none(ak.pad_none(a, maxlen, clip=True), value)
58
+ return ak.values_astype(a, dtype)
59
+ else:
60
+ x = (np.ones((len(a), maxlen)) * value).astype(dtype)
61
+ for idx, s in enumerate(a):
62
+ if not len(s):
63
+ continue
64
+ trunc = s[:maxlen].astype(dtype)
65
+ x[idx, : len(trunc)] = trunc
66
+ return x
67
+
68
+
69
def _repeat_pad(a, maxlen, shuffle=False, dtype="float32"):
    """Pad each row of `a` to `maxlen` by recycling values drawn from `a` itself.

    The flattened content of `a` is tiled (optionally shuffled) to provide
    filler values; `_pad` on a zeros-like array supplies a mask that is 1
    exactly in the padded slots, so real values are kept and only the padding
    is filled. NOTE(review): assumes the flattened array is non-empty --
    empty input would divide by zero in the tiling step.
    """
    x = ak.to_numpy(ak.flatten(a))
    x = np.tile(x, int(np.ceil(len(a) * maxlen / len(x))))
    if shuffle:
        np.random.shuffle(x)
    x = x[: len(a) * maxlen].reshape((len(a), maxlen))
    # mask: 1 where `_pad(a, maxlen)` had to pad, 0 where real values sit
    mask = _pad(ak.zeros_like(a), maxlen, value=1)
    x = _pad(a, maxlen) + mask * x
    return ak.values_astype(x, dtype)
78
+
79
+
80
+ def _clip(a, a_min, a_max):
81
+ try:
82
+ return np.clip(a, a_min, a_max)
83
+ except ValueError:
84
+ return ak.unflatten(np.clip(ak.flatten(a), a_min, a_max), ak.num(a))
85
+
86
+
87
+ def _knn(support, query, k, n_jobs=1):
88
+ from scipy.spatial import cKDTree
89
+
90
+ kdtree = cKDTree(support)
91
+ d, idx = kdtree.query(query, k, n_jobs=n_jobs)
92
+ return idx
93
+
94
+
95
def _batch_knn(supports, queries, k, maxlen_s, maxlen_q=None, n_jobs=1):
    """Per-event kNN lookup.

    For each (support, query) pair, fills a (maxlen_q, k) index array with
    the nearest-support indices; query slots beyond the event's length keep
    the sentinel index maxlen_s - 1. Both supports and queries are truncated
    to their respective maximum lengths before the search.
    """
    assert len(supports) == len(queries)
    if maxlen_q is None:
        maxlen_q = maxlen_s
    out = np.full((len(supports), maxlen_q, k), maxlen_s - 1, dtype="int32")
    for i, (sup, qry) in enumerate(zip(supports, queries)):
        qry = qry[:maxlen_q]
        idx = _knn(sup[:maxlen_s], qry, k, n_jobs=n_jobs).reshape((-1, k))
        out[i, : len(qry), :] = idx
    return out
109
+
110
+
111
+ def _batch_permute_indices(array, maxlen):
112
+ batch_permute_idx = np.tile(np.arange(maxlen), (len(array), 1))
113
+ for i, a in enumerate(array):
114
+ batch_permute_idx[i, : len(a)] = np.random.permutation(len(a[:maxlen]))
115
+ return batch_permute_idx
116
+
117
+
118
+ def _batch_argsort(array, maxlen):
119
+ batch_argsort_idx = np.tile(np.arange(maxlen), (len(array), 1))
120
+ for i, a in enumerate(array):
121
+ batch_argsort_idx[i, : len(a)] = np.argsort(a[:maxlen])
122
+ return batch_argsort_idx
123
+
124
+
125
+ def _batch_gather(array, indices):
126
+ out = array.zeros_like()
127
+ for i, (a, idx) in enumerate(zip(array, indices)):
128
+ maxlen = min(len(a), len(idx))
129
+ out[i][:maxlen] = a[idx[:maxlen]]
130
+ return out
131
+
132
+
133
def _p4_from_pxpypze(px, py, pz, energy):
    """Build Lorentz-vector records from Cartesian momentum components + energy."""
    import vector

    vector.register_awkward()
    return vector.zip({"px": px, "py": py, "pz": pz, "energy": energy})
138
+
139
+
140
def _p4_from_ptetaphie(pt, eta, phi, energy):
    """Build Lorentz-vector records from (pt, eta, phi) + energy."""
    import vector

    vector.register_awkward()
    return vector.zip({"pt": pt, "eta": eta, "phi": phi, "energy": energy})
145
+
146
+
147
def _p4_from_ptetaphim(pt, eta, phi, mass):
    """Build Lorentz-vector records from (pt, eta, phi) + mass."""
    import vector

    vector.register_awkward()
    return vector.zip({"pt": pt, "eta": eta, "phi": phi, "mass": mass})
152
+
153
+
154
+ def _get_variable_names(expr, exclude=["awkward", "ak", "np", "numpy", "math"]):
155
+ import ast
156
+
157
+ root = ast.parse(expr)
158
+ return sorted(
159
+ {
160
+ node.id
161
+ for node in ast.walk(root)
162
+ if isinstance(node, ast.Name) and not node.id.startswith("_")
163
+ }
164
+ - set(exclude)
165
+ )
166
+
167
+
168
def _eval_expr(expr, table):
    """Evaluate a data-config expression against `table`.

    The expression sees the table branches it references, the math/numpy/
    awkward modules (under their usual aliases), and the helper functions
    defined in this module.
    WARNING: this uses `eval`, so expressions from the data config are
    executed as arbitrary Python -- only load configs from trusted sources.
    """
    tmp = {k: table[k] for k in _get_variable_names(expr)}
    tmp.update(
        {
            "math": math,
            "np": np,
            "numpy": np,
            "ak": ak,
            "awkward": ak,
            "_concat": _concat,
            "_stack": _stack,
            "_pad": _pad,
            "_repeat_pad": _repeat_pad,
            "_clip": _clip,
            "_batch_knn": _batch_knn,
            "_batch_permute_indices": _batch_permute_indices,
            "_batch_argsort": _batch_argsort,
            "_batch_gather": _batch_gather,
            "_p4_from_pxpypze": _p4_from_pxpypze,
            "_p4_from_ptetaphie": _p4_from_ptetaphie,
            "_p4_from_ptetaphim": _p4_from_ptetaphim,
        }
    )
    return eval(expr, tmp)
src/dataset/dataclasses.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, List, Optional
3
+ import torch
4
+ import numpy as np
5
+
6
+
7
@dataclass
class PandoraFeatures:
    # Per-hit features of the matched Pandora PFO; all optional and only
    # filled (in Hits.from_data) when running with the pandora flag.
    pandora_cluster: Optional[Any] = None
    pandora_cluster_energy: Optional[Any] = None
    pfo_energy: Optional[Any] = None  # PFO energy assigned to each hit
    pandora_mom: Optional[Any] = None  # |p| of the PFO linked to each hit
    pandora_ref_point: Optional[Any] = None  # PFO reference point (x, y, z)
    pandora_pid: Optional[Any] = None  # PFO particle-ID code
    pandora_pfo_link: Optional[Any] = None  # hit -> PFO index, -1 if unmatched
    pandora_mom_components: Optional[Any] = None  # PFO momentum components
18
+
19
+
20
@dataclass
class Hits:
    # Per-node (calorimeter hit + track) tensors for one event; tracks are
    # always concatenated after the hits along dim 0.
    pos_xyz_hits: Any  # (N, 3) positions (track rows use the calo reference point)
    pos_pxpypz: Any  # (N, 3) momentum at the vertex; zero for calorimeter hits
    pos_pxpypz_calo: Any  # (N, 3) momentum at the calorimeter; zero for hits
    p_hits: Any  # (N, 1) |p|; zero for calorimeter hits
    e_hits: Any  # (N, 1) energy; zero for track rows
    hit_particle_link: Any  # (N,) index of the generated particle per node
    pandora_features: Any # type PandoraFeatures (None when pandora disabled)
    hit_type_feature: Any  # (N,) integer node type (tracks are type 1)
    chi_squared_tracks: Any  # (N,) chi2/ndf for tracks, zero for hits
    hit_type_one_hot: Any  # (N, 5) one-hot encoding of hit_type_feature


    @classmethod
    def from_data(cls, output, number_hits, args, number_part):
        """Build a Hits container from one event's raw arrays.

        `output` holds per-event numpy arrays keyed by branch name; the
        column indices used below encode the parquet feature layout --
        NOTE(review): confirm them against the producer of X_hit/X_track/
        X_pandora. `number_hits`/`number_part` are accepted but unused here.
        """
        hit_particle_link_hits = torch.tensor(output["ygen_hit"])
        # track links are appended after hit links (events may have no tracks)
        if len(output["ygen_track"])>0:
            hit_particle_link_tracks= torch.tensor(output["ygen_track"])
            hit_particle_link = torch.cat((hit_particle_link_hits, hit_particle_link_tracks), dim=0)
        else:
            hit_particle_link = hit_particle_link_hits
        # hit_particle_link_calomother = torch.cat((hit_particle_link_hits_calomother, hit_particle_link_tracks), dim=0)
        if args.pandora:
            # attach Pandora PFO features per node, zeroed where no PFO matched
            pandora_features = PandoraFeatures()
            X_pandora = torch.tensor(output["X_pandora"])
            pfo_link_hits = torch.tensor(output["pfo_calohit"])
            if len(output["pfo_track"])>0:
                pfo_link_tracks = torch.tensor(output["pfo_track"])
                pfo_link = torch.cat((pfo_link_hits, pfo_link_tracks), dim=0)
            else:
                pfo_link = pfo_link_hits
            pandora_features.pandora_pfo_link = pfo_link
            # unmatched nodes (-1) temporarily point at PFO 0 for safe indexing
            pfo_link_temp = pfo_link.clone()
            pfo_link_temp[pfo_link_temp==-1]=0

            # X_pandora columns: 0 pid, 1:4 momentum components, 4:7 reference
            # point, 7 energy, 8 |p| -- presumably; TODO confirm layout
            pandora_features.pandora_mom = X_pandora[pfo_link_temp, 8]
            pandora_features.pandora_ref_point = X_pandora[pfo_link_temp, 4:7]
            pandora_features.pandora_mom_components = X_pandora[pfo_link_temp, 1:4]
            pandora_features.pandora_pid = X_pandora[pfo_link_temp, 0]
            pandora_features.pfo_energy = X_pandora[pfo_link_temp, 7]
            # zero out everything that was gathered through the fake index 0
            pandora_features.pandora_mom[pfo_link==-1]=0
            pandora_features.pandora_mom_components[pfo_link==-1]=0
            pandora_features.pandora_ref_point[pfo_link==-1]=0
            pandora_features.pandora_pid[pfo_link==-1]=0
            pandora_features.pfo_energy[pfo_link==-1]=0

        else:
            pandora_features = None
        X_hit = torch.tensor(output["X_hit"])
        if len(output["X_track"])>0:
            X_track = torch.tensor(output["X_track"])
        # obtain hit type

        hit_type_feature_hit = X_hit[:,10]+1 # hit type column shifted to 1..4
        if len(output["X_track"])>0:
            hit_type_feature_track = X_track[:,0] # elemtype (1 for tracks)
            hit_type_feature = torch.cat((hit_type_feature_hit, hit_type_feature_track), dim=0).to(torch.int64)
        else:
            hit_type_feature = hit_type_feature_hit.to(torch.int64)
        # obtain the position of the hits and the energies and p
        pos_xyz_hits_hits = X_hit[:,6:9]
        e_hits = X_hit[:,5]
        p_hits = X_hit[:,5]*0  # calorimeter hits carry no momentum

        if len(output["X_track"])>0:
            pos_xyz_hits_tracks = X_track[:,12:15] #(referencePoint_calo.i)
            pos_xyz_hits = torch.cat((pos_xyz_hits_hits, pos_xyz_hits_tracks), dim=0)
            e_tracks =X_track[:,5]*0  # track rows carry no energy deposit
            e = torch.cat((e_hits, e_tracks), dim=0).view(-1,1)
            p_tracks =X_track[:,5]
            pos_pxpypz_hits_tracks = X_track[:,6:9]
            pos_pxpypz = torch.cat((pos_xyz_hits_hits*0, pos_pxpypz_hits_tracks), dim=0)
            pos_pxpypz_hits_tracks = X_track[:,22:]
            pos_pxpypz_calo = torch.cat((pos_xyz_hits_hits*0, pos_pxpypz_hits_tracks), dim=0)
            p = torch.cat((p_hits, p_tracks), dim=0).view(-1,1)
        else:
            # no tracks in this event: momentum-like tensors stay zero
            pos_xyz_hits = pos_xyz_hits_hits
            e = e_hits.view(-1,1)
            pos_pxpypz = pos_xyz_hits_hits*0
            pos_pxpypz_calo = pos_pxpypz
            p = p_hits.view(-1,1)


        if len(output["X_track"])>0:
            # chi2 / ndf for tracks; calorimeter rows reuse the zero p_hits
            chi_tracks = X_track[:,15]/ X_track[:,16]
            chi_squared_tracks = torch.cat((p_hits, chi_tracks), dim=0)
        else:
            chi_squared_tracks = p_hits
        hit_type_one_hot = torch.nn.functional.one_hot(
            hit_type_feature, num_classes=5
        )

        return cls(
            pos_xyz_hits=pos_xyz_hits,
            pos_pxpypz=pos_pxpypz,
            pos_pxpypz_calo = pos_pxpypz_calo,
            p_hits=p,
            e_hits=e,
            hit_particle_link=hit_particle_link,
            pandora_features= pandora_features,
            hit_type_feature=hit_type_feature,
            chi_squared_tracks=chi_squared_tracks,
            hit_type_one_hot = hit_type_one_hot,
        )
125
+
126
+
src/dataset/dataset.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file contains a modified version of the dataloader originally from:
3
+
4
+ weaver-core
5
+ https://github.com/hqucms/weaver-core
6
+
7
+ The original implementation has been adapted and extended for the needs of this project.
8
+ Please refer to the original repository for the base implementation and license details.
9
+ Changes in this version:
10
+ - Adapted to read parquet files
11
+ - Modified batching logic to build graphs on the fly
12
+ - No reweighting or standardization of dataset
13
+ """
14
+ import os
15
+ import copy
16
+ import json
17
+ import numpy as np
18
+ import awkward as ak
19
+ import torch.utils.data
20
+ import time
21
+
22
+ from functools import partial
23
+ from concurrent.futures.thread import ThreadPoolExecutor
24
+ from src.data.tools import _pad
25
+ from src.data.fileio import _read_files
26
+ from src.data.preprocess import (
27
+ AutoStandardizer,
28
+ WeightMaker,
29
+ )
30
+ from src.dataset.functions_graph import create_graph
31
+
32
+ def _preprocess(table, options):
33
+ indices = np.arange(
34
+ len(table["X_track"])
35
+ )
36
+ if options["shuffle"]:
37
+ np.random.shuffle(indices)
38
+ return table, indices
39
+
40
+
41
def _load_next(filelist, load_range, options):
    """Read the next chunk of files and preprocess it.

    Returns (table, shuffled_indices).
    NOTE(review): `_read_files` is called here with `load_range` as the
    second positional argument, while `src/data/preprocess.py` passes a
    branch set in that slot -- confirm the parquet-reader signature in
    `src/data/fileio.py` matches this call.
    """
    table = _read_files(
        filelist, load_range,
    )
    table, indices = _preprocess(table, options)
    return table, indices
47
+
48
+
49
class _SimpleIter(object):
    r"""_SimpleIter
    Iterator object for ``SimpleIterDataset''.

    Pulls chunks of events from disk (optionally prefetching asynchronously
    in a one-thread executor), shuffles them when training, and yields one
    graph per call, skipping empty graphs.
    """

    def __init__(self, **kwargs):
        # inherit all properties from SimpleIterDataset
        self.__dict__.update(**kwargs)
        self.iter_count = 0

        # executor to read files and run preprocessing asynchronously
        self.executor = ThreadPoolExecutor(max_workers=1) if self._async_load else None

        # init: prefetch holds table and indices for the next fetch
        self.prefetch = None
        self.table = None
        self.indices = []
        self.cursor = 0

        self._seed = None
        worker_info = torch.utils.data.get_worker_info()
        file_dict = self._init_file_dict.copy()
        if worker_info is not None:
            # in a worker process: derive a per-worker name/seed and split
            # the workload by files (round-robin over the worker id)
            self._name += "_worker%d" % worker_info.id
            self._seed = worker_info.seed & 0xFFFFFFFF
            np.random.seed(self._seed)
            new_file_dict = {}
            for name, files in file_dict.items():
                new_files = files[worker_info.id :: worker_info.num_workers]
                assert len(new_files) > 0
                new_file_dict[name] = new_files
            file_dict = new_file_dict
        self.worker_file_dict = file_dict
        self.worker_filelist = sum(file_dict.values(), [])
        self.worker_info = worker_info
        self.restart()

    def restart(self):
        """(Re)shuffle the file list / load range and prefetch the first chunk."""
        print("=== Restarting DataIter %s, seed=%s ===" % (self._name, self._seed))
        # re-shuffle filelist and load range if for training
        filelist = self.worker_filelist.copy()
        if self._sampler_options["shuffle"]:
            np.random.shuffle(filelist)
        if self._file_fraction < 1:
            num_files = int(len(filelist) * self._file_fraction)
            filelist = filelist[:num_files]
        self.filelist = filelist

        if self._init_load_range_and_fraction is None:
            self.load_range = (0, 1)
        else:
            (start_pos, end_pos), load_frac = self._init_load_range_and_fraction
            interval = (end_pos - start_pos) * load_frac
            if self._sampler_options["shuffle"]:
                # randomly place the loading window inside [start_pos, end_pos]
                offset = np.random.uniform(start_pos, end_pos - interval)
                self.load_range = (offset, offset + interval)
            else:
                self.load_range = (start_pos, start_pos + interval)

        self.ipos = 0 if self._fetch_by_files else self.load_range[0]
        # prefetch the first entry asynchronously
        self._try_get_next(init=True)

    def __next__(self):
        """Return the next non-empty graph, fetching new chunks as needed."""
        graph_empty = True
        self.iter_count += 1

        while graph_empty:
            if len(self.filelist) == 0:
                raise StopIteration
            try:
                i = self.indices[self.cursor]
            except IndexError:
                # case 1: first entry, `self.indices` is still empty
                # case 2: running out of entries, `self.indices` is not empty
                while True:
                    if self.prefetch is None:
                        # reaching the end as prefetch got nothing
                        self.table = None
                        if self._async_load:
                            self.executor.shutdown(wait=False)
                        raise StopIteration
                    # get result from prefetch
                    if self._async_load:
                        self.table, self.indices = self.prefetch.result()
                    else:
                        self.table, self.indices = self.prefetch
                    # try to load the next ones asynchronously
                    self._try_get_next()
                    # check if any entries are fetched (i.e., passing selection) -- if not, do another fetch
                    if len(self.indices) > 0:
                        break
                # reset cursor
                self.cursor = 0
                i = self.indices[self.cursor]
            self.cursor += 1
            data, graph_empty = self.get_data(i)
        return data

    def _try_get_next(self, init=False):
        """Kick off loading of the next chunk, or mark the end of the data."""
        end_of_list = (
            self.ipos >= len(self.filelist)
            if self._fetch_by_files
            else self.ipos >= self.load_range[1]
        )
        if end_of_list:
            if init:
                # BUGFIX: the conditional previously bound to the whole
                # RuntimeError argument ('msg' % 0 if ... else id), so worker
                # processes raised RuntimeError(<worker id>) with no message.
                raise RuntimeError(
                    "Nothing to load for worker %d"
                    % (0 if self.worker_info is None else self.worker_info.id)
                )
            if self._infinity_mode and not self._in_memory:
                # infinity mode: re-start
                # NOTE(review): `_in_memory` is not set by SimpleIterDataset in
                # this file -- confirm it is injected via kwargs.
                self.restart()
                return
            else:
                # finite mode: set prefetch to None, exit
                self.prefetch = None
                return
        if self._fetch_by_files:
            filelist = self.filelist[int(self.ipos) : int(self.ipos + self._fetch_step)]
            load_range = self.load_range
        else:
            filelist = self.filelist
            load_range = (
                self.ipos,
                min(self.ipos + self._fetch_step, self.load_range[1]),
            )
        print('Start fetching next batch, len(filelist)=%d, load_range=%s'%(len(filelist), load_range))
        if self._async_load:
            self.prefetch = self.executor.submit(
                _load_next,
                filelist,
                load_range,
                self._sampler_options,
            )
        else:
            self.prefetch = _load_next(
                filelist, load_range, self._sampler_options
            )
        self.ipos += self._fetch_step

    def get_data(self, i):
        """Build the graph for event index `i`.

        Returns ([graph, ground_truth_particles], graph_empty).
        """
        self.args_parse.prediction = (not self.for_training)
        X = {k: self.table[k][i] for k in self.table.fields}
        [g, features_partnn], graph_empty = create_graph(
            X, self.for_training, self.args_parse
        )

        return [g, features_partnn], graph_empty
205
+
206
+
207
class SimpleIterDataset(torch.utils.data.IterableDataset):
    r"""Base IterableDataset.
    Handles dataloading.
    Arguments:
        file_dict (dict): dictionary of lists of files to be loaded.
        data_config_file (str): YAML file containing data format information.
        for_training (bool): flag indicating whether the dataset is used for training or testing.
            When set to ``True``, will enable shuffling and sampling-based reweighting.
            When set to ``False``, will disable shuffling and reweighting, but will load the observer variables.
        load_range_and_fraction (tuple of tuples, ``((start_pos, end_pos), load_frac)``): fractional range of events to load from each file.
            E.g., setting load_range_and_fraction=((0, 0.8), 0.5) will randomly load 50% out of the first 80% events from each file (so load 50%*80% = 40% of the file).
        fetch_by_files (bool): flag to control how events are retrieved each time we fetch data from disk.
            When set to ``True``, will read only a small number (set by ``fetch_step``) of files each time, but load all the events in these files.
            When set to ``False``, will read from all input files, but load only a small fraction (set by ``fetch_step``) of events each time.
            Default is ``False``, which results in a more uniform sample distribution but reduces the data loading speed.
        fetch_step (float or int): fraction of events (when ``fetch_by_files=False``) or number of files (when ``fetch_by_files=True``) to load each time we fetch data from disk.
            Event shuffling and reweighting (sampling) is performed each time after we fetch data.
            So set this to a large enough value to avoid getting an imbalanced minibatch (due to reweighting/sampling), especially when ``fetch_by_files`` set to ``True``.
            Will load all events (files) at once if set to non-positive value.
        file_fraction (float): fraction of files to load.
    """

    def __init__(
        self,
        file_dict,
        data_config_file,
        for_training=True,
        load_range_and_fraction=None,
        extra_selection=None,
        fetch_by_files=False,
        fetch_step=0.01,
        file_fraction=1,
        remake_weights=False,
        up_sample=True,
        weight_scale=1,
        max_resample=10,
        async_load=True,
        infinity_mode=False,
        name="",
        args_parse=None
    ):
        # NOTE(review): `data_config_file`, `extra_selection` and
        # `remake_weights` are accepted but not used in this adaptation.
        self._iters = {} if infinity_mode else None
        # snapshot of the attribute names present *before* the config
        # attributes are assigned; the set difference (computed at the end of
        # __init__) is exactly what gets deep-copied into each _SimpleIter
        _init_args = set(self.__dict__.keys())
        self._init_file_dict = file_dict
        self._init_load_range_and_fraction = load_range_and_fraction
        self._fetch_by_files = fetch_by_files
        self._fetch_step = fetch_step
        self._file_fraction = file_fraction
        self._async_load = async_load
        self._infinity_mode = infinity_mode
        self._name = name
        self.for_training = for_training
        self.args_parse = args_parse
        # ==== sampling parameters ====
        self._sampler_options = {
            "up_sample": up_sample,
            "weight_scale": weight_scale,
            "max_resample": max_resample,
        }

        if for_training:
            self._sampler_options.update(training=True, shuffle=True, reweight=True)
        else:
            self._sampler_options.update(training=False, shuffle=False, reweight=False)
        self._init_args = set(self.__dict__.keys()) - _init_args



    def __iter__(self):
        # finite mode: a fresh iterator per epoch;
        # infinity mode: one persistent iterator per dataloader worker
        if self._iters is None:
            kwargs = {k: copy.deepcopy(self.__dict__[k]) for k in self._init_args}
            return _SimpleIter(**kwargs)
        else:
            worker_info = torch.utils.data.get_worker_info()
            worker_id = worker_info.id if worker_info is not None else 0
            try:
                return self._iters[worker_id]
            except KeyError:
                kwargs = {k: copy.deepcopy(self.__dict__[k]) for k in self._init_args}
                self._iters[worker_id] = _SimpleIter(**kwargs)
                return self._iters[worker_id]
src/dataset/functions_data.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
def calculate_distance_to_boundary(g):
    """Attach each hit's distance to the detector boundary to the graph.

    Hits with |z| beyond the endcap plane (2307) are projected onto that
    plane; all other ("barrel") hits are projected radially onto a cylinder
    of radius 2150 around the z axis. The distance is stored in
    ``g.ndata["radial_distance"]`` and exp(-d/1000) in
    ``g.ndata["radial_distance_exp"]``; the graph is returned.
    """
    barrel_radius = 2150
    endcap_z = 2307
    pos = g.ndata["pos_hits_xyz"]
    in_endcap = (torch.abs(pos[:, 2]) - endcap_z) > 0
    in_barrel = ~in_endcap
    dist = torch.ones_like(pos[:, 0])
    z_axis = torch.tensor([0, 0, 1], dtype=pos.dtype, device=pos.device)
    # project onto the barrel cylinder: scale pos by r / |z_axis x pos|
    radial_scale = barrel_radius / torch.norm(
        torch.cross(z_axis.view(1, -1), pos, dim=-1), dim=1
    ).unsqueeze(1)
    barrel_proj = radial_scale * pos
    # project onto the endcap plane at |z| = endcap_z
    endcap_proj = torch.abs(endcap_z / pos[:, 2].unsqueeze(1)) * pos
    dist[in_barrel] = torch.norm(barrel_proj - pos, dim=1)[in_barrel]
    dist[in_endcap] = torch.norm(endcap_proj[in_endcap] - pos[in_endcap], dim=1)
    g.ndata["radial_distance"] = dist
    g.ndata["radial_distance_exp"] = torch.exp(-(dist / 1000))
    return g
26
+
src/dataset/functions_graph.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import dgl
4
+ from src.dataset.functions_data import (
5
+ calculate_distance_to_boundary,
6
+ )
7
+ import time
8
+ from src.dataset.functions_particles import concatenate_Particles_GT, Particles_GT
9
+
10
+ from src.dataset.dataclasses import Hits
11
+
12
def create_inputs_from_table(
    output, prediction=False, args=None
):
    """Build the ground-truth particle container and the Hits container
    from one event's raw arrays; returns [particles, hits]."""
    n_nodes = np.int32(len(output["X_track"]) + len(output["X_hit"]))
    n_particles = np.int32(len(output["X_gen"]))

    hits = Hits.from_data(output, n_nodes, args, n_particles)

    particles = Particles_GT()
    particles.fill(output, prediction, args)

    return [particles, hits]
33
+
34
+
35
+
36
+
37
def create_graph(
    output,
    for_training =True, args=None
):
    """Build a DGL graph for one event.

    Returns ([graph, ground_truth_particles], graph_empty); graph_empty is
    True when the event should be skipped (fewer than 10 nodes, or a
    training event that contains only noise hits).
    NOTE(review): `create_inputs_from_table` always returns two elements, so
    the `len(result) == 1` branch appears unreachable -- confirm.
    """
    prediction = not for_training
    graph_empty = False

    result = create_inputs_from_table(
        output,
        prediction=prediction,
        args=args
    )

    if len(result) == 1:
        graph_empty = True
        return [0, 0], graph_empty
    else:
        (y_data_graph,hits) = result

        g = dgl.graph(([], []))
        g.add_nodes(hits.pos_xyz_hits.shape[0])
        # per-node input features: position, one-hot hit type, energy, momentum
        g.ndata["h"] = torch.cat(
            (hits.pos_xyz_hits, hits.hit_type_one_hot, hits.e_hits, hits.p_hits), dim=1
        ).float()
        g.ndata["p_hits"] = hits.p_hits.float()
        g.ndata["pos_hits_xyz"] = hits.pos_xyz_hits.float()
        g.ndata["pos_pxpypz_at_vertex"] = hits.pos_pxpypz.float()
        g.ndata["pos_pxpypz"] = hits.pos_pxpypz #TrackState::AtIP
        g.ndata["pos_pxpypz_at_calo"] = hits.pos_pxpypz_calo #TrackState::AtCalorimeter
        g = calculate_distance_to_boundary(g)
        g.ndata["hit_type"] = hits.hit_type_feature.float()
        g.ndata["e_hits"] = hits.e_hits.float()

        g.ndata["chi_squared_tracks"] = hits.chi_squared_tracks.float()
        g.ndata["particle_number"] = hits.hit_particle_link.float()+1 #(noise idx is 0 and particle MC 0 starts at 1)


        if prediction and (args.pandora):
            # attach Pandora baseline quantities for evaluation
            g.ndata["pandora_pfo"] = hits.pandora_features.pandora_pfo_link.float()
            g.ndata["pandora_pfo_energy"] = hits.pandora_features.pfo_energy.float()
            g.ndata["pandora_momentum"] = hits.pandora_features.pandora_mom_components.float()
            g.ndata["pandora_reference_point"] = hits.pandora_features.pandora_ref_point.float()
            g.ndata["pandora_pid"] = hits.pandora_features.pandora_pid.float()
        graph_empty = False
        unique_links = torch.unique(hits.hit_particle_link)
        if not prediction and unique_links.shape[0] == 1 and unique_links[0] == -1:
            # training event containing only noise hits -> skip
            graph_empty = True
        if hits.pos_xyz_hits.shape[0] < 10:
            graph_empty = True

        return [g, y_data_graph], graph_empty
88
+
89
+
90
+
91
+
92
+
93
def graph_batch_func(list_graphs):
    """collator function for graph dataloader

    Args:
        list_graphs (list): list of graphs from the iterable dataset

    Returns:
        batch dgl: dgl batch of graphs
    """
    ys = concatenate_Particles_GT(list_graphs)
    graphs_only = [sample[0] for sample in list_graphs]
    batched = dgl.batch(graphs_only)
    return batched, ys
src/dataset/functions_particles.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from sklearn.preprocessing import StandardScaler
4
+ from dataclasses import dataclass
5
+ from typing import Any, List, Optional
6
+
7
+
8
@dataclass
class Particles_GT():
    """Container for ground-truth (MC) particle properties of one event.

    Each field holds a tensor with one row per particle, or ``None`` when
    the quantity has not been filled yet.
    """

    angle: Optional[Any] = None        # (N, 2) theta/phi of the particle direction
    coord: Optional[Any] = None        # (N, 3) momentum components px, py, pz
    E: Optional[Any] = None            # (N, 1) true energy
    E_corrected: Optional[Any] = None  # (N, 1) corrected energy (initialised to E)
    m: Optional[Any] = None            # (N, 1) momentum magnitude
    mass: Optional[Any] = None         # (N, 1) invariant mass
    pid: Optional[Any] = None          # (N,) PDG particle ID
    vertex: Optional[Any] = None       # (N, 3) production vertex
    gen_status: Optional[Any] = None   # (N,) generator status code
    batch_number: Optional[Any] = None # (N, 1) event index within a batch
    endpoint: Optional[Any] = None     # optional decay endpoint, filled externally

    def fill(self, output, prediction, args):
        """Populate all fields from the raw ``X_gen`` feature matrix.

        Parameters
        ----------
        output : dict-like
            Must contain ``"X_gen"``, an (N, >=18) array whose columns hold
            pid (0), generator status (1), theta/phi (4-5), energy (8),
            mass (10), momentum (11), px/py/pz (12-14), vertex xyz (15-17).
        prediction, args
            Unused here; kept for interface compatibility with callers.
        """
        features = torch.tensor(output["X_gen"])
        self.angle = features[:, 4:6]
        self.coord = features[:, 12:15]
        self.vertex = features[:, 15:18]
        energy = features[:, 8].view(-1).unsqueeze(1)
        self.E = energy
        # No correction has been applied yet, so start E_corrected at E.
        self.E_corrected = energy
        self.m = features[:, 11].view(-1).unsqueeze(1)
        self.mass = features[:, 10].view(-1).unsqueeze(1)
        self.pid = features[:, 0]
        self.gen_status = features[:, 1]

    def __len__(self):
        """Number of particles (rows of ``E``)."""
        return len(self.E)

    def mask(self, mask):
        """Apply a boolean/index *mask* in place to every non-None field."""
        for name in self.__dict__:
            value = getattr(self, name)
            if value is None:
                continue
            # isinstance instead of type(...) == list (idiomatic type check).
            if isinstance(value, list):
                if value[0] is not None:
                    setattr(self, name, value[mask])
            else:
                setattr(self, name, value[mask])

    def copy(self):
        """Return a shallow copy sharing the underlying tensors."""
        obj = type(self).__new__(self.__class__)
        obj.__dict__.update(self.__dict__)
        return obj
63
+
64
+
65
+
66
def concatenate_Particles_GT(list_of_Particles_GT):
    """Concatenate the Particles_GT records of several events into one.

    Parameters
    ----------
    list_of_Particles_GT : list
        List of ``[graph, Particles_GT]`` pairs, one per event.

    Returns
    -------
    Particles_GT
        A single record with all per-particle tensors concatenated along
        dim 0 and a ``batch_number`` column identifying the source event.
    """
    particles = [pair[1] for pair in list_of_Particles_GT]
    first = particles[0]

    list_angle = torch.cat([p.angle for p in particles], dim=0)
    list_coord = torch.cat([p.coord for p in particles], dim=0)
    list_E = torch.cat([p.E for p in particles], dim=0)
    list_E_corr = torch.cat([p.E_corrected for p in particles], dim=0)
    list_m = torch.cat([p.m for p in particles], dim=0)
    list_mass = torch.cat([p.mass for p in particles], dim=0)
    list_pid = torch.cat([p.pid for p in particles], dim=0)
    list_genstatus = torch.cat([p.gen_status for p in particles], dim=0)

    # BUGFIX: the optional-attribute checks used to be performed on the
    # [graph, particles] *pair* (a plain list), which never carries these
    # attributes, so endpoint / decayed_in_* were always silently dropped.
    # Check the particle record itself, and require a non-None value since
    # the Particles_GT dataclass always defines ``endpoint`` (default None).
    if getattr(first, "endpoint", None) is not None:
        list_endpoint = torch.cat([p.endpoint for p in particles], dim=0)
    else:
        list_endpoint = None

    list_vertex = [p.vertex for p in particles]
    if list_vertex[0] is not None:
        list_vertex = torch.cat(list_vertex, dim=0)

    if getattr(first, "decayed_in_calo", None) is not None:
        list_dec_calo = torch.cat([p.decayed_in_calo for p in particles], dim=0)
        list_dec_track = torch.cat([p.decayed_in_tracker for p in particles], dim=0)
    else:
        list_dec_calo = None
        list_dec_track = None

    batch_number = add_batch_number(list_of_Particles_GT)

    particle_batch = Particles_GT()
    particle_batch.angle = list_angle
    particle_batch.coord = list_coord
    particle_batch.E = list_E
    particle_batch.E_corrected = list_E_corr
    particle_batch.m = list_m
    # BUGFIX: mass was concatenated but never assigned to the batch record.
    particle_batch.mass = list_mass
    particle_batch.pid = list_pid
    particle_batch.vertex = list_vertex
    particle_batch.decayed_in_calo = list_dec_calo
    particle_batch.decayed_in_tracker = list_dec_track
    particle_batch.batch_number = batch_number
    particle_batch.gen_status = list_genstatus
    particle_batch.endpoint = list_endpoint
    return particle_batch
114
+
115
def add_batch_number(list_graphs):
    """Label each ground-truth particle with the index of its source event.

    Args:
        list_graphs (list): list of ``[graph, particles]`` pairs; only the
            row count of each ``particles.E`` is used.

    Returns:
        torch.Tensor: (total_particles, 1) float tensor of event indices.
    """
    per_event = [
        torch.full((pair[1].E.shape[0], 1), float(event_idx))
        for event_idx, pair in enumerate(list_graphs)
    ]
    return torch.cat(per_event, dim=0)
src/inference.py ADDED
@@ -0,0 +1,735 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Standalone single-event MLPF inference.
3
+
4
+ Provides :func:`run_single_event_inference` which takes raw event data
5
+ (from a parquet file or as an awkward record) and model checkpoint paths,
6
+ runs the full particle-flow pipeline (graph construction → GATr forward
7
+ pass → density-peak clustering → energy correction & PID), and returns:
8
+
9
+ * a ``pandas.DataFrame`` of predicted particles with their properties
10
+ * a hit→cluster mapping as a ``pandas.DataFrame``
11
+ """
12
+
13
+ import argparse
14
+ import types
15
+ from typing import Optional
16
+ import numpy as np
17
+ import pandas as pd
18
+ import torch
19
+ import dgl
20
+ import awkward as ak
21
+
22
+ from src.data.fileio import _read_parquet
23
+ from src.dataset.functions_graph import create_graph
24
+ from src.dataset.functions_particles import Particles_GT, add_batch_number
25
+ from src.layers.clustering import DPC_custom_CLD, remove_bad_tracks_from_cluster
26
+ from src.utils.pid_conversion import pid_conversion_dict
27
+
28
+
29
+ # -- CPU-compatible attention patch ------------------------------------------
30
+
31
def _patch_gatr_attention_for_cpu():
    """Swap GATr's xformers attention for a pure-PyTorch fallback.

    ``xformers.ops.fmha.memory_efficient_attention`` ships no CPU kernel,
    so GATr cannot run on CPU out of the box. This monkey-patches
    ``gatr.primitives.attention.scaled_dot_product_attention`` with a naive
    softmax attention that works on any device (slower on GPU, but
    correct). Idempotent: repeated calls are no-ops.
    """
    import gatr.primitives.attention as attn_module

    if getattr(attn_module, "_cpu_patched", False):
        return

    def _naive_attention(q, k, v, attn_mask=None):
        # Shapes: (batch, heads, items, dim)
        batch, heads, n_items, dim = q.shape
        inv_sqrt_dim = float(dim) ** -0.5

        q_flat = q.reshape(batch * heads, n_items, dim)
        k_flat = k.reshape(batch * heads, n_items, dim)
        v_flat = v.reshape(batch * heads, n_items, dim)

        scores = torch.bmm(q_flat * inv_sqrt_dim, k_flat.transpose(1, 2))  # (B*H, N, N)

        if attn_mask is not None:
            allowed = _block_diag_mask_to_dense(attn_mask, n_items, q.device)
            if allowed is not None:
                scores = scores.masked_fill(~allowed.unsqueeze(0), float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        # Fully-masked rows become NaN after softmax; force them to zero.
        weights = weights.nan_to_num(0.0)

        return torch.bmm(weights, v_flat).reshape(batch, heads, n_items, dim)

    attn_module.scaled_dot_product_attention = _naive_attention
    attn_module._cpu_patched = True
70
+
71
+
72
+ def _block_diag_mask_to_dense(attn_mask, total_len, device):
73
+ """Convert an ``xformers.ops.fmha.BlockDiagonalMask`` to a dense bool mask."""
74
+ try:
75
+ from xformers.ops.fmha.attn_bias import BlockDiagonalMask
76
+ if not isinstance(attn_mask, BlockDiagonalMask):
77
+ return None
78
+ except ImportError:
79
+ return None
80
+
81
+ # Extract per-sequence start offsets
82
+ try:
83
+ seqstarts = attn_mask.q_seqinfo.seqstart_py
84
+ except AttributeError:
85
+ try:
86
+ seqstarts = attn_mask.q_seqinfo.seqstart.cpu().tolist()
87
+ except Exception:
88
+ return None
89
+
90
+ mask = torch.zeros(total_len, total_len, dtype=torch.bool, device=device)
91
+ for i in range(len(seqstarts) - 1):
92
+ s, e = seqstarts[i], seqstarts[i + 1]
93
+ mask[s:e, s:e] = True
94
+ return mask
95
+
96
+
97
+ # -- PID label → human-readable name ----------------------------------------
98
+
99
# Internal PID class index (as predicted by the model's PID head) →
# human-readable particle-class name.
_PID_LABELS = {
    0: "electron",
    1: "charged hadron",
    2: "neutral hadron",
    3: "photon",
    4: "muon",
}

# |PDG code| → short particle name, used for the MC-truth and Pandora tables.
_ABS_PDG_NAME = {
    11: "electron",
    13: "muon",
    22: "photon",
    130: "K_L",
    211: "pion±",
    321: "kaon±",
    2112: "neutron",
    2212: "proton",
    310: "K_S",
}
118
+
119
+
120
+ # -- Minimal args namespace for inference ------------------------------------
121
+
122
+ def _default_args(**overrides):
123
+ """Return a minimal ``argparse.Namespace`` with defaults the model expects."""
124
+ d = dict(
125
+ correction=True,
126
+ freeze_clustering=True,
127
+ predict=True,
128
+ pandora=False,
129
+ use_gt_clusters=False,
130
+ use_average_cc_pos=0.99,
131
+ qmin=1.0,
132
+ data_config="config_files/config_hits_track_v4.yaml",
133
+ network_config="src/models/wrapper/example_mode_gatr_noise.py",
134
+ model_prefix="/tmp/mlpf_eval",
135
+ start_lr=1e-3,
136
+ frac_cluster_loss=0,
137
+ local_rank=0,
138
+ gpus="0",
139
+ batch_size=1,
140
+ num_workers=0,
141
+ prefetch_factor=1,
142
+ num_epochs=1,
143
+ steps_per_epoch=None,
144
+ samples_per_epoch=None,
145
+ steps_per_epoch_val=None,
146
+ samples_per_epoch_val=None,
147
+ train_val_split=0.8,
148
+ data_train=[],
149
+ data_val=[],
150
+ data_test=[],
151
+ data_fraction=1,
152
+ file_fraction=1,
153
+ fetch_by_files=True,
154
+ fetch_step=1,
155
+ log_wandb=False,
156
+ wandb_displayname="",
157
+ wandb_projectname="",
158
+ wandb_entity="",
159
+ name_output="gradio",
160
+ train_batches=100,
161
+ )
162
+ d.update(overrides)
163
+ return argparse.Namespace(**d)
164
+
165
+
166
+ # -- Model loading -----------------------------------------------------------
167
+
168
def load_model(
    clustering_ckpt: str,
    energy_pid_ckpt: Optional[str] = None,
    device: str = "cpu",
    args_overrides: Optional[dict] = None,
):
    """Load the full MLPF model (clustering + optional energy/PID correction).

    Parameters
    ----------
    clustering_ckpt : str
        Path to the clustering checkpoint (``.ckpt``).
    energy_pid_ckpt : str or None
        Path to the energy-correction / PID checkpoint (``.ckpt``).
        If *None*, only clustering is performed (no energy correction / PID).
    device : str
        ``"cpu"`` or ``"cuda:0"`` etc.
    args_overrides : dict or None
        Extra key-value pairs forwarded to :func:`_default_args`.

    Returns
    -------
    model : ExampleWrapper
        The model in eval mode, moved to *device*.
    args : argparse.Namespace
        The arguments namespace used to build the model.
    """
    from src.models.Gatr_pf_e_noise import ExampleWrapper

    overrides = dict(args_overrides or {})
    overrides["correction"] = energy_pid_ckpt is not None
    args = _default_args(**overrides)
    target_device = torch.device(device)

    if energy_pid_ckpt is not None:
        # Start from the energy/PID checkpoint, then graft the clustering
        # layers from the clustering checkpoint on top of it.
        checkpoint = torch.load(energy_pid_ckpt, map_location=target_device)
        model = ExampleWrapper(args=args, dev=0)
        model.load_state_dict(checkpoint["state_dict"], strict=False)
        clustering_model = ExampleWrapper.load_from_checkpoint(
            clustering_ckpt, args=args, dev=0, strict=False, map_location=target_device,
        )
        model.gatr = clustering_model.gatr
        model.ScaledGooeyBatchNorm2_1 = clustering_model.ScaledGooeyBatchNorm2_1
        model.clustering = clustering_model.clustering
        model.beta = clustering_model.beta
    else:
        model = ExampleWrapper.load_from_checkpoint(
            clustering_ckpt, args=args, dev=0, strict=False, map_location=target_device,
        )

    model = model.to(target_device)
    model.eval()
    return model, args
225
+
226
+
227
def load_random_model(
    device: str = "cpu",
    args_overrides: Optional[dict] = None,
):
    """Create a GATr model with randomly initialised weights (no checkpoint).

    Useful for debugging: comparing against this model verifies that
    checkpoint weights are actually being loaded and used.

    Parameters
    ----------
    device : str
        ``"cpu"`` or ``"cuda:0"`` etc.
    args_overrides : dict or None
        Extra key-value pairs forwarded to :func:`_default_args`.

    Returns
    -------
    model : ExampleWrapper
        The model (random weights) in eval mode, on *device*.
    args : argparse.Namespace
        The arguments namespace used.
    """
    from src.models.Gatr_pf_e_noise import ExampleWrapper

    overrides = dict(args_overrides or {})
    # Random models never carry the energy-correction branch.
    overrides["correction"] = False
    args = _default_args(**overrides)

    model = ExampleWrapper(args=args, dev=0).to(torch.device(device))
    model.eval()
    return model, args
262
+
263
+
264
+ # -- Single-event data loading -----------------------------------------------
265
+
266
def load_event_from_parquet(parquet_path: str, event_index: int = 0):
    """Read a single event from a parquet file.

    Parameters
    ----------
    parquet_path : str
        Path to the parquet file produced by preprocessing.
    event_index : int
        Zero-based index of the event to extract.

    Returns
    -------
    dict
        Awkward record with fields ``X_hit``, ``X_track``, ``X_gen``,
        ``ygen_hit``, ``ygen_track``, etc. for the selected event.

    Raises
    ------
    IndexError
        If *event_index* is beyond the number of events in the file.
    """
    table = _read_parquet(parquet_path)
    n_events = len(table["X_track"])
    if event_index >= n_events:
        raise IndexError(
            f"event_index {event_index} out of range (file has {n_events} events)"
        )
    return {name: table[name][event_index] for name in table.fields}
280
+
281
+
282
+ # -- Core inference function --------------------------------------------------
283
+
284
@torch.no_grad()
def run_single_event_inference(
    event,
    model,
    args,
    device: str = "cpu",
):
    """Run full MLPF inference on a single event.

    Pipeline: graph construction → GATr forward pass → density-peak
    clustering → (optional) energy correction & PID.

    Parameters
    ----------
    event : dict-like
        A single event record (from :func:`load_event_from_parquet`).
    model : ExampleWrapper
        The loaded model (from :func:`load_model`).
    args : argparse.Namespace
        The arguments namespace (from :func:`load_model`).
    device : str
        Device string.

    Returns
    -------
    particles_df : pandas.DataFrame
        One row per predicted particle with columns:
        ``cluster_id``, ``energy``, ``pid_class``, ``pid_label``,
        ``px``, ``py``, ``pz``, ``is_charged``.
    hit_cluster_df : pandas.DataFrame
        One row per hit with columns:
        ``hit_index``, ``cluster_id``, ``pandora_cluster_id``,
        ``hit_type_id``, ``hit_type``, ``x``, ``y``, ``z``,
        ``hit_energy``, ``cluster_x``, ``cluster_y``, ``cluster_z``.
        ``pandora_cluster_id`` is -1 when pandora data is not available
        or when the hit has no matching entry (e.g. CSV was modified after
        loading from parquet).
    mc_particles_df : pandas.DataFrame
        One row per MC truth particle with columns:
        ``pid``, ``energy``, ``momentum``, ``px``, ``py``, ``pz``,
        ``mass``, ``theta``, ``phi``, ``vx``, ``vy``, ``vz``,
        ``gen_status``, ``pdg_name``.
    pandora_particles_df : pandas.DataFrame
        One row per Pandora PFO with columns:
        ``pfo_idx``, ``pid``, ``pdg_name``, ``energy``, ``momentum``,
        ``px``, ``py``, ``pz``, ``ref_x``, ``ref_y``, ``ref_z``.
        Empty when pandora data is not available in the input.
    """
    dev = torch.device(device)

    # Ensure eval mode so that BatchNorm layers use running statistics from
    # training instead of computing batch statistics from the current
    # (single-event) input. Without this, inference with batch_size=1
    # produces incorrect normalization.
    model.eval()

    # xformers attention has no CPU kernel — swap in the naive fallback.
    if dev.type == "cpu":
        _patch_gatr_attention_for_cpu()

    # 0. Extract MC truth particles table and pandora particles
    mc_particles_df = _extract_mc_particles(event)
    pandora_particles_df, pfo_calohit, pfo_track = _extract_pandora_particles(event)

    # 1. Build DGL graph from the event
    [g, y_data], graph_empty = create_graph(event, for_training=False, args=args)
    if graph_empty:
        # Too few hits / no valid particles: return empty prediction tables
        # but still expose the truth and Pandora tables.
        return pd.DataFrame(), pd.DataFrame(), mc_particles_df, pandora_particles_df

    g = g.to(dev)
    # Prepare batch metadata expected by the model (single event → batch 0)
    y_data.batch_number = torch.zeros(y_data.E.shape[0], 1)

    # 2. Forward pass through the GATr clustering backbone
    inputs = g.ndata["pos_hits_xyz"].float().to(dev)
    inputs_scalar = g.ndata["hit_type"].float().view(-1, 1).to(dev)

    from gatr.interface import embed_point, embed_scalar
    from xformers.ops.fmha import BlockDiagonalMask

    # Embed hit positions as multivector points plus the hit type as scalar.
    inputs_normed = model.ScaledGooeyBatchNorm2_1(inputs)
    embedded_inputs = embed_point(inputs_normed) + embed_scalar(inputs_scalar)
    embedded_inputs = embedded_inputs.unsqueeze(-2)
    # Single event → one attention block covering all nodes.
    mask = BlockDiagonalMask.from_seqlens([g.num_nodes()])
    scalars = torch.cat(
        (g.ndata["e_hits"].float().to(dev), g.ndata["p_hits"].float().to(dev)), dim=1
    )

    from gatr.interface import extract_point, extract_scalar

    embedded_outputs, scalar_outputs = model.gatr(
        embedded_inputs, scalars=scalars, attention_mask=mask
    )
    points = extract_point(embedded_outputs[:, 0, :])
    nodewise_outputs = extract_scalar(embedded_outputs)
    x_point = points
    x_scalar = torch.cat(
        (nodewise_outputs.view(-1, 1), scalar_outputs.view(-1, 1)), dim=1
    )
    # Learned clustering coordinates and per-hit condensation strength beta.
    x_cluster_coord = model.clustering(x_point)
    beta = model.beta(x_scalar)

    g.ndata["final_cluster"] = x_cluster_coord
    g.ndata["beta"] = beta.view(-1)

    # 3. Density-peak clustering (label 0 = noise, clusters start at 1)
    labels = DPC_custom_CLD(x_cluster_coord, g, dev)
    labels, _ = remove_bad_tracks_from_cluster(g, labels)

    # 4. Build hit→cluster table
    n_hits = g.num_nodes()
    hit_types_raw = g.ndata["hit_type"].cpu().numpy()
    hit_type_names = {1: "track", 2: "ECAL", 3: "HCAL", 4: "muon"}

    # Build pandora cluster ID per node (hits first, then tracks)
    # Use min of array lengths for graceful handling when CSV was modified
    n_calo = len(np.asarray(event.get("X_hit", [])))
    pandora_cluster_ids = np.full(n_hits, -1, dtype=np.int64)
    if len(pfo_calohit) > 0:
        n_assign = min(len(pfo_calohit), n_calo)
        pandora_cluster_ids[:n_assign] = pfo_calohit[:n_assign]
    n_tracks = n_hits - n_calo
    if n_tracks > 0 and len(pfo_track) > 0:
        n_assign = min(len(pfo_track), n_tracks)
        pandora_cluster_ids[n_calo:n_calo + n_assign] = pfo_track[:n_assign]

    hit_cluster_df = pd.DataFrame({
        "hit_index": np.arange(n_hits),
        "cluster_id": labels.cpu().numpy(),
        "pandora_cluster_id": pandora_cluster_ids,
        "hit_type_id": hit_types_raw,
        "hit_type": [hit_type_names.get(int(t), str(int(t))) for t in hit_types_raw],
        "x": g.ndata["pos_hits_xyz"][:, 0].cpu().numpy(),
        "y": g.ndata["pos_hits_xyz"][:, 1].cpu().numpy(),
        "z": g.ndata["pos_hits_xyz"][:, 2].cpu().numpy(),
        "hit_energy": g.ndata["e_hits"].view(-1).cpu().numpy(),
        "cluster_x": x_cluster_coord[:, 0].cpu().numpy(),
        "cluster_y": x_cluster_coord[:, 1].cpu().numpy(),
        "cluster_z": x_cluster_coord[:, 2].cpu().numpy(),
    })

    # 5. Per-cluster summary (basic, before energy correction)
    unique_labels = torch.unique(labels)
    # cluster 0 = noise
    cluster_ids = unique_labels[unique_labels > 0].cpu().numpy()

    from torch_scatter import scatter_add

    # Per-cluster sums of hit energy, track momentum and hit count.
    e_per_cluster = scatter_add(
        g.ndata["e_hits"].view(-1).to(dev), labels.to(dev)
    )
    p_per_cluster = scatter_add(
        g.ndata["p_hits"].view(-1).to(dev), labels.to(dev)
    )
    n_hits_per_cluster = scatter_add(
        torch.ones(n_hits, device=dev), labels.to(dev)
    )
    # Check if any cluster has a track (→ charged)
    is_track_per_cluster = scatter_add(
        (g.ndata["hit_type"].to(dev) == 1).float(), labels.to(dev)
    )

    rows = []
    for cid in cluster_ids:
        mask_c = labels == cid
        e_sum = e_per_cluster[cid].item()
        p_sum = p_per_cluster[cid].item()
        n_h = int(n_hits_per_cluster[cid].item())
        has_track = is_track_per_cluster[cid].item() >= 1
        # Mean position
        pos_mean = g.ndata["pos_hits_xyz"][mask_c].mean(dim=0).cpu().numpy()
        rows.append({
            "cluster_id": int(cid),
            "energy_sum_hits": round(e_sum, 4),
            "p_track": round(p_sum, 4) if has_track else 0.0,
            "n_hits": n_h,
            "is_charged": has_track,
            "mean_x": round(float(pos_mean[0]), 2),
            "mean_y": round(float(pos_mean[1]), 2),
            "mean_z": round(float(pos_mean[2]), 2),
        })

    particles_df = pd.DataFrame(rows)

    # 6. If energy correction is available, run it
    if args.correction and hasattr(model, "energy_correction"):
        try:
            particles_df = _run_energy_correction(
                model, g, x_cluster_coord, beta, labels, y_data, particles_df, dev
            )
        except Exception as e:
            # Attach a note but don't crash – the basic table is still useful
            particles_df["note"] = f"Energy correction failed: {e}"

    return particles_df, hit_cluster_df, mc_particles_df, pandora_particles_df
475
+
476
+
477
def _extract_mc_particles(event):
    """Build a DataFrame of MC truth particles from the event's ``X_gen``.

    Returns an empty DataFrame when ``X_gen`` is missing, has no rows, or
    has fewer than the 18 expected columns.
    """
    x_gen = np.asarray(event.get("X_gen", []))
    if x_gen.ndim != 2 or x_gen.shape[0] == 0 or x_gen.shape[1] < 18:
        return pd.DataFrame()

    def _row(idx, p):
        # Column layout of X_gen: pid(0), status(1), theta/phi(4-5),
        # E(8), mass(10), |p|(11), px/py/pz(12-14), vertex xyz(15-17).
        pdg = int(p[0])
        return {
            "particle_idx": idx,
            "pid": pdg,
            "pdg_name": _ABS_PDG_NAME.get(abs(pdg), str(pdg)),
            "gen_status": int(p[1]),
            "energy": round(float(p[8]), 4),
            "momentum": round(float(p[11]), 4),
            "px": round(float(p[12]), 4),
            "py": round(float(p[13]), 4),
            "pz": round(float(p[14]), 4),
            "mass": round(float(p[10]), 4),
            "theta": round(float(p[4]), 4),
            "phi": round(float(p[5]), 4),
            "vx": round(float(p[15]), 4),
            "vy": round(float(p[16]), 4),
            "vz": round(float(p[17]), 4),
        }

    return pd.DataFrame([_row(i, x_gen[i]) for i in range(x_gen.shape[0])])
504
+
505
+
506
def _extract_pandora_particles(event):
    """Build a DataFrame of Pandora PFO particles from the event's ``X_pandora``.

    ``X_pandora`` columns (per PFO):
        0: pid (PDG ID)
        1–3: px, py, pz (momentum components at reference point)
        4–6: ref_x, ref_y, ref_z (reference point)
        7: energy
        8: momentum magnitude

    Returns
    -------
    tuple
        ``(pandora_particles_df, pfo_hit_links, pfo_track_links)`` where the
        link arrays map each calorimeter hit / track to a 0-based PFO index
        (-1 = unassigned). The DataFrame is empty when ``X_pandora`` is
        missing or malformed.
    """
    x_pandora = np.asarray(event.get("X_pandora", []))
    pfo_calohit = np.asarray(event.get("pfo_calohit", []), dtype=np.int64)
    pfo_track = np.asarray(event.get("pfo_track", []), dtype=np.int64)

    if x_pandora.ndim != 2 or x_pandora.shape[0] == 0 or x_pandora.shape[1] < 9:
        return pd.DataFrame(), pfo_calohit, pfo_track

    records = []
    for idx in range(x_pandora.shape[0]):
        pfo = x_pandora[idx]
        pdg = int(pfo[0])
        records.append({
            "pfo_idx": idx,
            "pid": pdg,
            "pdg_name": _ABS_PDG_NAME.get(abs(pdg), str(pdg)),
            "energy": round(float(pfo[7]), 4),
            "momentum": round(float(pfo[8]), 4),
            "px": round(float(pfo[1]), 4),
            "py": round(float(pfo[2]), 4),
            "pz": round(float(pfo[3]), 4),
            "ref_x": round(float(pfo[4]), 2),
            "ref_y": round(float(pfo[5]), 2),
            "ref_z": round(float(pfo[6]), 2),
        })
    return pd.DataFrame(records), pfo_calohit, pfo_track
544
+
545
+
546
def _run_energy_correction(model, g, x_cluster_coord, beta, labels, y_data, particles_df, dev):
    """Run the energy correction & PID branch and enrich *particles_df*.

    Matches predicted clusters to truth particles, builds one sub-graph per
    cluster (matched + fakes), computes post-clustering features, and runs
    the charged/neutral energy-correction and PID heads.

    Parameters
    ----------
    model : ExampleWrapper
        Model exposing ``energy_correction.model_charged`` /
        ``model_neutral`` and their PID class lists.
    g : dgl.DGLGraph
        The full event graph (node data: ``h``, ``e_hits``,
        ``particle_number``, ``chi_squared_tracks``, ...).
    x_cluster_coord, beta
        Learned clustering coordinates and condensation strengths.
    labels : torch.Tensor
        Per-hit cluster labels (0 = noise).
    y_data : Particles_GT
        Ground-truth particle record for this event.
    particles_df : pandas.DataFrame
        Basic per-cluster summary; returned unchanged if there is nothing
        to correct.
    dev : torch.device

    Returns
    -------
    pandas.DataFrame
        One row per cluster with corrected energy, PID and direction.
    """
    from src.layers.shower_matching import match_showers, obtain_intersection_matrix, obtain_union_matrix
    from torch_scatter import scatter_add, scatter_mean
    from src.utils.post_clustering_features import (
        get_post_clustering_features, get_extra_features, calculate_eta, calculate_phi,
    )

    x = torch.cat((x_cluster_coord, beta.view(-1, 1)), dim=1)

    # Re-create per-cluster sub-graphs expected by the correction pipeline
    particle_ids = torch.unique(g.ndata["particle_number"])
    shower_p_unique = torch.unique(labels)
    model_output_dummy = x  # used only for device by match_showers

    shower_p_unique_m, row_ind, col_ind, i_m_w, _ = match_showers(
        labels, {"graph": g, "part_true": y_data},
        particle_ids, model_output_dummy, 0, 0, None,
    )
    row_ind = torch.Tensor(row_ind).to(dev).long()
    col_ind = torch.Tensor(col_ind).to(dev).long()
    # particle id 0 is the noise label; shift truth row indices when present.
    if torch.sum(particle_ids == 0) > 0:
        row_ind_ = row_ind - 1
    else:
        row_ind_ = row_ind
    # +1 converts matched column indices back to cluster labels (0 = noise).
    index_matches = (col_ind + 1).to(dev).long()

    # Build per-cluster sub-graphs (matched + fakes)
    graphs_matched = []
    true_energies = []
    reco_energies = []
    pids_matched = []
    coords_matched = []
    e_true_daughters = []

    for j, sh_label in enumerate(index_matches):
        # Skip labels matched more than once (ambiguous assignments).
        if torch.sum(sh_label == index_matches) == 1:
            mask = labels == sh_label
            sg = dgl.graph(([], []))
            sg.add_nodes(int(mask.sum()))
            sg = sg.to(dev)
            sg.ndata["h"] = g.ndata["h"][mask]
            if "pos_pxpypz" in g.ndata:
                sg.ndata["pos_pxpypz"] = g.ndata["pos_pxpypz"][mask]
            if "pos_pxpypz_at_vertex" in g.ndata:
                sg.ndata["pos_pxpypz_at_vertex"] = g.ndata["pos_pxpypz_at_vertex"][mask]
            sg.ndata["chi_squared_tracks"] = g.ndata["chi_squared_tracks"][mask]
            energy_t = y_data.E.to(dev)
            true_e = energy_t[row_ind_[j]]
            pids_matched.append(y_data.pid[row_ind_[j]].item())
            coords_matched.append(y_data.coord[row_ind_[j]].detach().cpu().numpy())
            e_true_daughters.append(y_data.m[row_ind_[j]].to(dev))
            reco_e = torch.sum(g.ndata["e_hits"].view(-1).to(dev)[mask])
            graphs_matched.append(sg)
            true_energies.append(true_e.view(-1))
            reco_energies.append(reco_e.view(-1))

    # Add fakes
    # Clusters that were not matched to any truth particle (excluding the
    # noise slot 0) are treated as fake showers.
    pred_showers = shower_p_unique_m.clone()
    pred_showers[index_matches] = -1
    pred_showers[0] = -1
    fakes_mask = pred_showers != -1
    fakes_idx = torch.where(fakes_mask)[0]

    graphs_fakes = []
    reco_fakes = []
    for fi in fakes_idx:
        mask = labels == fi
        sg = dgl.graph(([], []))
        sg.add_nodes(int(mask.sum()))
        sg = sg.to(dev)
        sg.ndata["h"] = g.ndata["h"][mask]
        if "pos_pxpypz" in g.ndata:
            sg.ndata["pos_pxpypz"] = g.ndata["pos_pxpypz"][mask]
        if "pos_pxpypz_at_vertex" in g.ndata:
            sg.ndata["pos_pxpypz_at_vertex"] = g.ndata["pos_pxpypz_at_vertex"][mask]
        sg.ndata["chi_squared_tracks"] = g.ndata["chi_squared_tracks"][mask]
        graphs_fakes.append(sg)
        reco_fakes.append(torch.sum(g.ndata["e_hits"].view(-1).to(dev)[mask]).view(-1))

    if not graphs_matched and not graphs_fakes:
        return particles_df

    all_graphs = dgl.batch(graphs_matched + graphs_fakes)
    sum_e = torch.cat(reco_energies + reco_fakes, dim=0)

    # Compute high-level features
    batch_num_nodes = all_graphs.batch_num_nodes()
    batch_idx = []
    for i, n in enumerate(batch_num_nodes):
        batch_idx.extend([i] * n)
    batch_idx = torch.tensor(batch_idx).to(dev)

    # Normalise positions; 3300 presumably matches the detector's outer
    # radius in mm used during training — confirm against the training code.
    all_graphs.ndata["h"][:, 0:3] = all_graphs.ndata["h"][:, 0:3] / 3300
    graphs_sum_features = scatter_add(all_graphs.ndata["h"], batch_idx, dim=0)
    graphs_sum_features = graphs_sum_features[batch_idx]
    betas = torch.sigmoid(all_graphs.ndata["h"][:, -1])
    all_graphs.ndata["h"] = torch.cat(
        (all_graphs.ndata["h"], graphs_sum_features), dim=1
    )

    high_level = get_post_clustering_features(all_graphs, sum_e)
    extra_features = get_extra_features(all_graphs, betas)

    n_clusters = high_level.shape[0]
    pred_energy = torch.ones(n_clusters, device=dev)
    pred_pos = torch.ones(n_clusters, 3, device=dev)
    pred_pid = torch.ones(n_clusters, device=dev).long()

    node_features_avg = scatter_mean(all_graphs.ndata["h"], batch_idx, dim=0)[:, 0:3]
    eta = calculate_eta(node_features_avg[:, 0], node_features_avg[:, 1], node_features_avg[:, 2])
    phi = calculate_phi(node_features_avg[:, 0], node_features_avg[:, 1])
    high_level = torch.cat(
        (high_level, node_features_avg, eta.view(-1, 1), phi.view(-1, 1)), dim=1
    )

    # Column 7 of the post-clustering features is taken as the track count;
    # NOTE(review): this index is defined by get_post_clustering_features —
    # verify it stays in sync with that function.
    num_tracks = high_level[:, 7]
    charged_idx = torch.where(num_tracks >= 1)[0]
    neutral_idx = torch.where(num_tracks < 1)[0]

    def zero_nans(t):
        # Replace NaNs with 0 (x != x is only true for NaN).
        out = t.clone()
        out[out != out] = 0
        return out

    feats_charged = zero_nans(high_level[charged_idx])
    feats_neutral = zero_nans(high_level[neutral_idx])

    # Run charged prediction
    charged_energies = model.energy_correction.model_charged.charged_prediction(
        all_graphs, charged_idx, feats_charged,
    )
    # Run neutral prediction
    neutral_energies, neutral_pxyz_avg = model.energy_correction.model_neutral.neutral_prediction(
        all_graphs, neutral_idx, feats_neutral,
    )

    pids_charged = model.energy_correction.pids_charged
    pids_neutral = model.energy_correction.pids_neutral

    # The heads return an extra PID-logits tensor only when a PID class
    # list is configured.
    if len(pids_charged):
        ch_e, ch_pos, ch_pid_logits, ch_ref = charged_energies
    else:
        ch_e, ch_pos, _ = charged_energies
        ch_pid_logits = None

    if len(pids_neutral):
        ne_e, ne_pos, ne_pid_logits, ne_ref = neutral_energies
    else:
        ne_e, ne_pos, _ = neutral_energies
        ne_pid_logits = None

    pred_energy[charged_idx.flatten()] = ch_e if len(charged_idx) else pred_energy[charged_idx.flatten()]
    pred_energy[neutral_idx.flatten()] = ne_e if len(neutral_idx) else pred_energy[neutral_idx.flatten()]

    if ch_pid_logits is not None and len(charged_idx):
        ch_labels = np.array(pids_charged)[np.argmax(ch_pid_logits.cpu().detach().numpy(), axis=1)]
        pred_pid[charged_idx.flatten()] = torch.tensor(ch_labels).long().to(dev)
    if ne_pid_logits is not None and len(neutral_idx):
        ne_labels = np.array(pids_neutral)[np.argmax(ne_pid_logits.cpu().detach().numpy(), axis=1)]
        pred_pid[neutral_idx.flatten()] = torch.tensor(ne_labels).long().to(dev)

    # Clamp unphysical negative energies to zero.
    pred_energy[pred_energy < 0] = 0.0

    # Direction
    if len(charged_idx):
        pred_pos[charged_idx.flatten()] = ch_pos.float().to(dev)
    if len(neutral_idx):
        pred_pos[neutral_idx.flatten()] = ne_pos.float().to(dev)

    # Build enriched output DataFrame
    # Matched clusters come first in the batch, fakes after them.
    n_matched = len(graphs_matched)
    rows = []
    for k in range(n_clusters):
        is_fake = k >= n_matched
        pid_cls = int(pred_pid[k].item())
        rows.append({
            "cluster_id": k + 1,
            "corrected_energy": round(pred_energy[k].item(), 4),
            "raw_energy": round(sum_e[k].item(), 4),
            "pid_class": pid_cls,
            "pid_label": _PID_LABELS.get(pid_cls, str(pid_cls)),
            "px": round(pred_pos[k, 0].item(), 4),
            "py": round(pred_pos[k, 1].item(), 4),
            "pz": round(pred_pos[k, 2].item(), 4),
            "is_charged": bool(k in charged_idx),
            "is_fake": is_fake,
        })

    return pd.DataFrame(rows)
src/layers/clustering.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Clustering algorithms for particle-flow reconstruction.
2
+
3
+ Adapted from densitypeakclustering (https://github.com/lanbing510/DensityPeakCluster).
4
+ """
5
+ import torch
6
+ import numpy as np
7
+ from torch_scatter import scatter_add
8
+ import densitypeakclustering as dc
9
+
10
+
11
def local_density_energy(D, d_c, energies, normalize=False):
    """Energy-weighted local density for every point.

    For each point, hits within the cutoff distance ``d_c`` contribute their
    energy, down-weighted by a Gaussian kernel of width ``d_c`` in distance.

    Args:
        D: (n, n) pairwise distance matrix.
        d_c: cutoff distance for the density estimate.
        energies: (n,) per-hit energies.
        normalize: if True, scale densities so the maximum is 1.

    Returns:
        (n,) array of local densities.
    """
    n_points = D.shape[0]
    within_cutoff = D < d_c
    rho = np.zeros((n_points,))
    for idx in range(n_points):
        neighbours = within_cutoff[idx, :]
        weights = np.exp(-((D[idx, neighbours] / d_c) ** 2))
        rho[idx] = np.sum(energies[neighbours] * weights)
    return rho / np.max(rho) if normalize else rho
19
+
20
+
21
def DPC_custom_CLD(X, g, device):
    """Density-peak clustering of hits in the learned cluster space.

    Densities are energy-weighted (via ``local_density_energy``); cluster
    centers are points with both high density and large separation from any
    denser point. Only "core" members closer than 0.5 to their center keep
    the cluster id; everything else is labelled noise.

    Args:
        X: per-hit coordinates in cluster space (torch tensor).
        g: graph providing ``ndata["e_hits"]`` energies.
        device: device for the returned label tensor.

    Returns:
        Long tensor of per-hit labels, 0 = noise, clusters start at 1.
    """
    cutoff_distance = 0.1   # d_c for the density kernel
    min_density = 0.05      # rho threshold for a center
    min_separation = 0.4    # delta threshold for a center
    dist = dc.distance_matrix(X.detach().cpu())
    hit_energies = g.ndata["e_hits"].view(-1).cpu().numpy()
    rho = local_density_energy(dist, cutoff_distance, hit_energies)
    delta, nearest = dc.distance_to_larger_density(dist, rho)
    centers = dc.cluster_centers(rho, delta, rho_min=min_density, delta_min=min_separation)
    ids = dc.assign_cluster_id(rho, nearest, centers)
    core_ids = np.full(len(X), -1)
    dist[np.isnan(dist)] = 0
    for center_no, center in enumerate(centers):
        core_members = np.where((ids == center_no) & (dist[:, center] < 0.5))[0]
        core_ids[core_members] = center_no
    # shift so that -1 (unassigned) becomes the 0 noise label
    return (torch.Tensor(core_ids) + 1).long().to(device)
37
+
38
+
39
def remove_bad_tracks_from_cluster(g, labels_hdb):
    """Detach energy-incompatible tracks from predicted clusters.

    For every non-noise cluster containing at least one track hit
    (``hit_type == 1``), the summed calorimeter energy of the cluster is
    compared to each track momentum.  A track is removed (re-labelled as
    noise, label 0) when its relative energy/momentum difference exceeds a
    4-sigma window with sigma = 0.5 / sqrt(p), unless the cluster also
    contains muon hits (``hit_type == 4``).

    Args:
        g: graph with node data ``"hit_type"``, ``"e_hits"`` and ``"p_hits"``.
        labels_hdb: per-node cluster labels (0 = noise).

    Returns:
        Tuple ``(corrected_labels, changed_mask)``: the corrected labels and a
        per-node indicator set to 1 on every node of a cluster from which at
        least one track was removed.
    """
    mask_hit_type_t2 = g.ndata["hit_type"] == 1  # tracks
    mask_hit_type_t4 = g.ndata["hit_type"] == 4  # muon hits
    labels_hdb_corrected_tracks = labels_hdb.clone()
    labels_changed_tracks = 0.0 * (labels_hdb.clone())
    # Guard the empty-graph case: torch.max on an empty tensor raises.
    if labels_hdb.numel() == 0:
        return labels_hdb_corrected_tracks, labels_changed_tracks
    for i in range(0, int(torch.max(labels_hdb)) + 1):
        mask_labels_i = labels_hdb == i
        # label 0 is the noise cluster: nothing to correct there
        if torch.sum(mask_hit_type_t2[mask_labels_i]) > 0 and i > 0:
            e_cluster = torch.sum(g.ndata["e_hits"][mask_labels_i])
            p_track = g.ndata["p_hits"][mask_labels_i * mask_hit_type_t2]
            number_of_hits_muon = torch.sum(mask_labels_i * mask_hit_type_t4)
            diffs = torch.abs(e_cluster - p_track) / p_track
            diffs = diffs.view(-1)
            # 4 * sigma, with sigma = 0.5 / sqrt(p)
            sigma_4 = 4 * 0.5 / torch.sqrt(p_track).view(-1)
            bad_diffs = diffs > sigma_4
            # keep tracks when the cluster contains muon hits
            bad_tracks = bad_diffs * (number_of_hits_muon < 1)
            cluster_t2_nodes = torch.nonzero(mask_labels_i & mask_hit_type_t2).view(-1)
            bad_tracks_nodes = cluster_t2_nodes[bad_tracks]
            labels_hdb_corrected_tracks[bad_tracks_nodes] = 0
            # Fix: flag the cluster when ANY track was removed.  The previous
            # check summed the node indices, which silently missed the case
            # where the only removed track is node 0.
            if len(bad_tracks_nodes) > 0:
                labels_changed_tracks[mask_labels_i] = 1
    return labels_hdb_corrected_tracks, labels_changed_tracks
62
+
63
+
64
def remove_labels_of_double_showers(labels, g):
    """Resolve clusters that contain exactly two track hits.

    When a predicted shower holds two tracks, one of them is assumed to be a
    mis-assignment; this picks one of the two tracks (based on distance to
    the cluster core, energy compatibility, and track chi-squared) and
    re-labels it as noise (label 0). Mutates and returns ``labels``.

    Reads node data: "hit_type", "e_hits", "h" (last column used as the
    track energy), "chi_squared_tracks", "radial_distance", "pos_hits_xyz".
    """
    # per-label number of track hits and summed hit energy
    is_track_per_shower = scatter_add(1 * (g.ndata["hit_type"] == 1), labels).int()
    e_hits_sum = scatter_add(g.ndata["e_hits"].view(-1), labels.view(-1).long()).int()
    mask_tracks = g.ndata["hit_type"] == 1
    for i, label_i in enumerate(torch.unique(labels)):
        # only act on non-noise clusters with exactly two tracks
        if is_track_per_shower[label_i] == 2:
            if label_i > 0:
                sum_pred_2 = e_hits_sum[label_i]
                mask_labels_i = labels == label_i
                mask_label_i_and_is_track = mask_labels_i * mask_tracks
                # last feature column of "h" is used as the track energy here
                tracks_E = g.ndata['h'][:, -1][mask_label_i_and_is_track]
                chi_tracks = g.ndata['chi_squared_tracks'][mask_label_i_and_is_track]
                # NOTE(review): despite the name, this is the track whose energy
                # differs MOST from the cluster energy (argmax) — confirm intent.
                ind_min_E = torch.argmax(torch.abs(tracks_E - sum_pred_2))
                ind_min_chi = torch.argmax(chi_tracks)
                # masks restricted to this cluster's nodes
                mask_hit_type_t1 = g.ndata["hit_type"][mask_labels_i] == 2
                mask_hit_type_t2 = g.ndata["hit_type"][mask_labels_i] == 1
                mask_all = mask_hit_type_t1
                # mean position of the ~10 innermost calorimeter hits of the cluster
                index_sorted = torch.argsort(g.ndata["radial_distance"][mask_labels_i][mask_hit_type_t1])
                mask_sorted_ind = index_sorted < 10
                mean_pos_cluster = torch.mean(
                    g.ndata["pos_hits_xyz"][mask_labels_i][mask_all][mask_sorted_ind], dim=0
                )
                pos_track = g.ndata["pos_hits_xyz"][mask_labels_i][mask_hit_type_t2]
                # distance of each track to the cluster core, in metres (pos in mm)
                distance_track_cluster = torch.norm(pos_track - mean_pos_cluster, dim=1) / 1000
                ind_max_dtc = torch.argmax(distance_track_cluster)
                # Decision cascade: prefer geometric evidence, then agreement of
                # the energy and chi-squared criteria, then chi-squared spread.
                if torch.min(distance_track_cluster) < 0.4:
                    ind_min = ind_max_dtc
                elif ind_min_E == ind_min_chi:
                    ind_min = ind_min_E
                elif torch.max(chi_tracks - torch.min(chi_tracks)) < 2:
                    ind_min = ind_min_E
                else:
                    ind_min = ind_min_chi
                ind_change = torch.argwhere(mask_label_i_and_is_track)[ind_min]
                labels[ind_change] = 0
    return labels
src/layers/inference_oc.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file includes code adapted from:
3
+
4
+ densitypeakclustering
5
+ https://github.com/lanbing510/DensityPeakCluster
6
+
7
+ The original implementation has been modified and integrated into this project.
8
+ Please refer to the original repository for authorship, documentation,
9
+ and license information.
10
+ """
11
+ import dgl
12
+ import torch
13
+ import pandas as pd
14
+ import numpy as np
15
+ import wandb
16
+
17
+ from src.layers.clustering import (
18
+ local_density_energy,
19
+ DPC_custom_CLD,
20
+ remove_bad_tracks_from_cluster,
21
+ remove_labels_of_double_showers,
22
+ )
23
+ from src.layers.shower_matching import (
24
+ CachedIndexList,
25
+ get_labels_pandora,
26
+ obtain_intersection_matrix,
27
+ obtain_union_matrix,
28
+ obtain_intersection_values,
29
+ match_showers,
30
+ )
31
+ from src.layers.shower_dataframe import (
32
+ get_correction_per_shower,
33
+ distance_to_true_cluster_of_track,
34
+ distance_to_cluster_track,
35
+ generate_showers_data_frame,
36
+ )
37
+
38
+ # Re-export everything so existing callers (utils_training, Gatr_pf_e_noise, …)
39
+ # that do `from src.layers.inference_oc import X` continue to work unchanged.
40
+ __all__ = [
41
+ "local_density_energy",
42
+ "DPC_custom_CLD",
43
+ "remove_bad_tracks_from_cluster",
44
+ "remove_labels_of_double_showers",
45
+ "CachedIndexList",
46
+ "get_labels_pandora",
47
+ "obtain_intersection_matrix",
48
+ "obtain_union_matrix",
49
+ "obtain_intersection_values",
50
+ "match_showers",
51
+ "get_correction_per_shower",
52
+ "distance_to_true_cluster_of_track",
53
+ "distance_to_cluster_track",
54
+ "generate_showers_data_frame",
55
+ "log_efficiency",
56
+ "store_at_batch_end",
57
+ "create_and_store_graph_output",
58
+ ]
59
+
60
+
61
def log_efficiency(df, pandora=False, clustering=False):
    """Log the shower reconstruction efficiency of a batch to wandb.

    Efficiency = fraction of truth showers (rows with a valid
    ``reco_showers_E``) that received a predicted shower
    (non-NaN ``pred_showers_E``). The wandb key depends on which
    reconstruction produced ``df``.
    """
    valid = ~np.isnan(df["reco_showers_E"])
    preds = df["pred_showers_E"][valid].values
    eff = np.sum(~np.isnan(preds)) / len(preds)
    if pandora:
        key = "efficiency validation pandora"
    elif clustering:
        key = "efficiency validation clustering"
    else:
        key = "efficiency validation"
    wandb.log({key: eff})
72
+
73
+
74
+ def _make_save_path(path_save, local_rank, step, epoch, suffix=""):
75
+ return path_save + str(local_rank) + "_" + str(step) + "_" + str(epoch) + suffix + ".pt"
76
+
77
+
78
def store_at_batch_end(
    path_save,
    df_batch1,
    df_batch_pandora,
    local_rank=0,
    step=0,
    epoch=None,
    predict=False,
    store=False,
    pandora_available=False,
):
    """Persist per-batch shower dataframes and log efficiencies to wandb.

    Dataframes are pickled only when both ``store`` and ``predict`` are set.
    The model efficiency is always logged; the Pandora efficiency only when
    Pandora output is available during prediction.
    """
    should_write = store and predict
    if should_write:
        df_batch1.to_pickle(_make_save_path(path_save, local_rank, step, epoch))
    has_pandora = predict and pandora_available
    if has_pandora and should_write:
        df_batch_pandora.to_pickle(
            _make_save_path(path_save, local_rank, step, epoch, "_pandora")
        )
    log_efficiency(df_batch1)
    if has_pandora:
        log_efficiency(df_batch_pandora, pandora=True)
99
+
100
+
101
def create_and_store_graph_output(
    batch_g,
    model_output,
    y,
    local_rank,
    step,
    epoch,
    path_save,
    store=False,
    predict=False,
    e_corr=None,
    ec_x=None,
    store_epoch=False,
    total_number_events=0,
    pred_pos=None,
    pred_ref_pt=None,
    use_gt_clusters=False,
    pred_pid=None,
    number_of_fakes=None,
    extra_features=None,
    fakes_labels=None,
    pandora_available=False,
    truth_tracks=False,
):
    """Cluster every event of a batched graph and build shower dataframes.

    For each event: cluster the model's coordinate output (or reuse ground-truth
    labels when ``use_gt_clusters``), optionally clean bad tracks, match the
    resulting showers to truth particles, and accumulate one per-event dataframe.
    When ``predict`` and ``pandora_available``, the same matching is repeated for
    the Pandora labels. Results may be pickled via ``store_at_batch_end``.

    Returns ``(df_pandora, df_model, total_number_events)`` when ``predict``,
    otherwise just the model dataframe.
    """
    number_of_showers_total = 0  # (unused here)
    number_of_showers_total1 = 0
    number_of_fake_showers_total1 = 0
    # First three output columns are cluster-space coords, fourth the beta.
    batch_g.ndata["coords"] = model_output[:, 0:3]
    batch_g.ndata["beta"] = model_output[:, 3]
    if e_corr is None:
        # no external energy correction: take it from the model head
        batch_g.ndata["correction"] = model_output[:, 4]
    graphs = dgl.unbatch(batch_g)
    batch_id = y.batch_number.view(-1)
    df_list1 = []
    df_list_pandora = []
    for i in range(0, len(graphs)):
        mask = batch_id == i
        dic = {}
        dic["graph"] = graphs[i]
        # restrict the truth container to this event's particles
        y1 = y.copy()
        y1.mask(mask)
        dic["part_true"] = y1
        X = dic["graph"].ndata["coords"]
        labels_clusters_removed_tracks = torch.zeros(
            dic["graph"].num_nodes(), device=model_output.device
        )
        if use_gt_clusters:
            labels_hdb = dic["graph"].ndata["particle_number"].type(torch.int64)
        else:
            labels_hdb = DPC_custom_CLD(X, dic["graph"], model_output.device)
        if not truth_tracks:
            labels_hdb, labels_clusters_removed_tracks = remove_bad_tracks_from_cluster(
                dic["graph"], labels_hdb
            )
        if predict and pandora_available:
            labels_pandora = get_labels_pandora(dic, model_output.device)
        particle_ids = torch.unique(dic["graph"].ndata["particle_number"])

        # match predicted showers to truth showers (Hungarian-style matching
        # happens inside match_showers)
        shower_p_unique_hdb, row_ind_hdb, col_ind_hdb, i_m_w_hdb, iou_m = match_showers(
            labels_hdb,
            dic,
            particle_ids,
            model_output,
            local_rank,
            i,
            path_save,
            hdbscan=True,
        )
        if predict and pandora_available:
            (
                shower_p_unique_pandora,
                row_ind_pandora,
                col_ind_pandora,
                i_m_w_pandora,
                iou_m_pandora,
            ) = match_showers(
                labels_pandora,
                dic,
                particle_ids,
                model_output,
                local_rank,
                i,
                path_save,
                pandora=True,
            )

        # more than just the noise cluster was found
        if len(shower_p_unique_hdb) > 1:
            df_event1, number_of_showers_total1, number_of_fake_showers_total1 = generate_showers_data_frame(
                labels_hdb,
                dic,
                shower_p_unique_hdb,
                particle_ids,
                row_ind_hdb,
                col_ind_hdb,
                i_m_w_hdb,
                e_corr=e_corr,
                number_of_showers_total=number_of_showers_total1,
                step=step,
                number_in_batch=total_number_events,
                ec_x=ec_x,
                pred_pos=pred_pos,
                pred_ref_pt=pred_ref_pt,
                pred_pid=pred_pid,
                number_of_fakes=number_of_fakes,
                number_of_fake_showers_total=number_of_fake_showers_total1,
                extra_features=extra_features,
                labels_clusters_removed_tracks=labels_clusters_removed_tracks,
            )
            if len(df_event1) > 1:
                df_list1.append(df_event1)
        if predict and pandora_available:
            df_event_pandora = generate_showers_data_frame(
                labels_pandora,
                dic,
                shower_p_unique_pandora,
                particle_ids,
                row_ind_pandora,
                col_ind_pandora,
                i_m_w_pandora,
                pandora=True,
                step=step,
                number_in_batch=total_number_events,
            )
            # a tuple return signals the degenerate case — skip it
            if df_event_pandora is not None and type(df_event_pandora) is not tuple:
                df_list_pandora.append(df_event_pandora)
            else:
                print("Not appending to df_list_pandora")
        total_number_events = total_number_events + 1

    df_batch1 = pd.concat(df_list1)
    if predict and pandora_available:
        df_batch_pandora = pd.concat(df_list_pandora)
    else:
        df_batch = []
        df_batch_pandora = []
    if store:
        store_at_batch_end(
            path_save,
            df_batch1,
            df_batch_pandora,
            local_rank,
            step,
            epoch,
            predict=predict,
            store=store_epoch,
            pandora_available=pandora_available,
        )
    if predict:
        return df_batch_pandora, df_batch1, total_number_events
    else:
        return df_batch1
src/layers/object_cond.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The loss implementation in this file is adapted from the HGCalML repository:
3
+
4
+ Repository: https://github.com/jkiesele/HGCalML
5
+ File: modules/lossLayers.py
6
+
7
+ Original author: Jan Kieseler
8
+ License: See the original repository for license details.
9
+
10
+ The implementation has been modified and integrated into this project.
11
+ """
12
+
13
+ from typing import Tuple, Union
14
+ import numpy as np
15
+ import torch
16
+ from torch_scatter import scatter_max, scatter_add, scatter_mean
17
+ import dgl
18
+
19
def safe_index(arr, index):
    """Return the 1-based position of ``index`` in ``arr``, or 0 when absent."""
    try:
        return arr.index(index) + 1
    except ValueError:
        return 0
25
+
26
+
27
def assert_no_nans(x):
    """Raise ``AssertionError`` (printing the tensor first) if ``x`` contains NaN."""
    has_nan = torch.isnan(x).any()
    if has_nan:
        print(x)
    assert not has_nan
34
+
35
+
36
def calc_LV_Lbeta(
    original_coords,
    g,
    y,
    distance_threshold,
    energy_correction,
    beta: torch.Tensor,
    cluster_space_coords: torch.Tensor,  # Predicted by model
    cluster_index_per_event: torch.Tensor,  # Truth hit->cluster index
    batch: torch.Tensor,
    predicted_pid=None,  # predicted PID embeddings - will be aggregated by summing up the clusters and applying the post_pid_pool_module MLP afterwards
    # From here on just parameters
    qmin: float = 0.1,
    s_B: float = 1.0,
    noise_cluster_index: int = 0,  # cluster_index entries with this value are noise/noise
    frac_combinations=0,  # fraction of the all possible pairs to be used for the clustering loss
    use_average_cc_pos=0.0,
    loss_type="hgcalimplementation",
) -> Union[Tuple[torch.Tensor, torch.Tensor], dict]:
    """
    Calculates the L_V and L_beta object condensation losses.
    Concepts:
    - A hit belongs to exactly one cluster (cluster_index_per_event is (n_hits,)),
      and to exactly one event (batch is (n_hits,))
    - A cluster index of `noise_cluster_index` means the cluster is a noise cluster.
      There is typically one noise cluster per event. Any hit in a noise cluster
      is a 'noise hit'. A hit in an object is called a 'signal hit' for lack of a
      better term.
    - An 'object' is a cluster that is *not* a noise cluster.
    beta_stabilizing: Choices are ['paper', 'clip', 'soft_q_scaling']:
        paper: beta is sigmoid(model_output), q = beta.arctanh()**2 + qmin
        clip: beta is clipped to 1-1e-4, q = beta.arctanh()**2 + qmin
        soft_q_scaling: beta is sigmoid(model_output), q = (clip(beta)/1.002).arctanh()**2 + qmin
    huberize_norm_for_V_attractive: Huberizes the norms when used in the attractive potential
    beta_term_option: Choices are ['paper', 'short-range-potential']:
        Choosing 'short-range-potential' introduces a short range potential around high
        beta points, acting like V_attractive.
    Note this function has modifications w.r.t. the implementation in 2002.03605:
    - The norms for V_repulsive are now Gaussian (instead of linear hinge)

    NOTE(review): `g`, `y`, `distance_threshold`, `energy_correction` and
    `predicted_pid` are accepted but never referenced in this code path.
    """
    # remove dummy rows added for dataloader #TODO think of better way to do this
    device = beta.device
    if torch.isnan(beta).any():
        print("There are nans in beta! L198", len(beta[torch.isnan(beta)]))

    # sanitize betas before the arctanh-based charge computation below
    beta = torch.nan_to_num(beta, nan=0.0)
    assert_no_nans(beta)
    # ________________________________
    # Calculate a bunch of needed counts and indices locally

    # cluster_index: unique index over events
    # E.g. cluster_index_per_event=[ 0, 0, 1, 2, 0, 0, 1], batch=[0, 0, 0, 0, 1, 1, 1]
    # -> cluster_index=[ 0, 0, 1, 2, 3, 3, 4 ]
    cluster_index, n_clusters_per_event = batch_cluster_indices(
        cluster_index_per_event, batch
    )
    n_clusters = n_clusters_per_event.sum()
    n_hits, cluster_space_dim = cluster_space_coords.size()
    batch_size = batch.max() + 1  # (unused)
    n_hits_per_event = scatter_count(batch)  # (unused)

    # Index of cluster -> event (n_clusters,)
    batch_cluster = scatter_counts_to_indices(n_clusters_per_event)

    # Per-hit boolean, indicating whether hit is sig or noise
    is_noise = cluster_index_per_event == noise_cluster_index
    is_sig = ~is_noise
    n_hits_sig = is_sig.sum()
    n_sig_hits_per_event = scatter_count(batch[is_sig])  # (unused)

    # Per-cluster boolean, indicating whether cluster is an object or noise
    is_object = scatter_max(is_sig.long(), cluster_index)[0].bool()
    is_noise_cluster = ~is_object  # (unused)

    # object indexing below assumes noise is cluster 0 (objects start at 1)
    if noise_cluster_index != 0:
        raise NotImplementedError
    object_index_per_event = cluster_index_per_event[is_sig] - 1
    object_index, n_objects_per_event = batch_cluster_indices(
        object_index_per_event, batch[is_sig]
    )
    n_hits_per_object = scatter_count(object_index)
    # print("n_hits_per_object", n_hits_per_object)
    batch_object = batch_cluster[is_object]  # (unused)
    n_objects = is_object.sum()

    assert object_index.size() == (n_hits_sig,)
    assert is_object.size() == (n_clusters,)
    assert torch.all(n_hits_per_object > 0)
    assert object_index.max() + 1 == n_objects

    # ________________________________
    # L_V term

    # Calculate q: the per-hit "charge", growing with beta
    q = (beta.clip(0.0, 1 - 1e-4).arctanh() / 1.01) ** 2 + qmin
    assert_no_nans(q)
    assert q.device == device
    assert q.size() == (n_hits,)

    # Calculate q_alpha, the max q per object, and the indices of said maxima
    # assert hit_energies.shape == q.shape
    # q_alpha, index_alpha = scatter_max(hit_energies[is_sig], object_index)
    q_alpha, index_alpha = scatter_max(q[is_sig], object_index)
    assert q_alpha.size() == (n_objects,)

    # Get the cluster space coordinates and betas for these maxima hits too
    x_alpha = cluster_space_coords[is_sig][index_alpha]
    x_alpha_original = original_coords[is_sig][index_alpha]
    if use_average_cc_pos > 0:
        # Blend the condensation-point position with the q-weighted mean
        # position of the object's hits.
        x_alpha_sum = scatter_add(
            q[is_sig].view(-1, 1).repeat(1, 3) * cluster_space_coords[is_sig],
            object_index,
            dim=0,
        )  # * beta[is_sig].view(-1, 1).repeat(1, 3)
        qbeta_alpha_sum = scatter_add(q[is_sig], object_index) + 1e-9  # * beta[is_sig]
        div_fac = 1 / qbeta_alpha_sum
        div_fac = torch.nan_to_num(div_fac, nan=0)
        x_alpha_mean = torch.mul(x_alpha_sum, div_fac.view(-1, 1).repeat(1, 3))
        x_alpha = use_average_cc_pos * x_alpha_mean + (1 - use_average_cc_pos) * x_alpha

    beta_alpha = beta[is_sig][index_alpha]
    assert x_alpha.size() == (n_objects, cluster_space_dim)
    assert beta_alpha.size() == (n_objects,)

    # Connectivity matrix from hit (row) -> cluster (column)
    # Index to matrix, e.g.:
    # [1, 3, 1, 0] --> [
    #     [0, 1, 0, 0],
    #     [0, 0, 0, 1],
    #     [0, 1, 0, 0],
    #     [1, 0, 0, 0]
    #     ]
    M = torch.nn.functional.one_hot(cluster_index).long()

    # Anti-connectivity matrix; be sure not to connect hits to clusters in different events!
    M_inv = get_inter_event_norms_mask(batch, n_clusters_per_event) - M

    # Throw away noise cluster columns; we never need them
    M = M[:, is_object]
    M_inv = M_inv[:, is_object]
    assert M.size() == (n_hits, n_objects)
    assert M_inv.size() == (n_hits, n_objects)

    # Calculate all norms
    # Warning: Should not be used without a mask!
    # Contains norms between hits and objects from different events
    # (n_hits, 1, cluster_space_dim) - (1, n_objects, cluster_space_dim)
    #   gives (n_hits, n_objects, cluster_space_dim)
    norms = (cluster_space_coords.unsqueeze(1) - x_alpha.unsqueeze(0)).norm(dim=-1)
    assert norms.size() == (n_hits, n_objects)
    L_clusters = torch.tensor(0.0).to(device)
    if frac_combinations != 0:
        # optional sampled pairwise hinge term
        L_clusters = L_clusters_calc(
            batch, cluster_space_coords, cluster_index, frac_combinations, q
        )

    # -------
    # Attractive potential term
    # First get all the relevant norms: We only want norms of signal hits
    # w.r.t. the object they belong to, i.e. no noise hits and no noise clusters.
    # First select all norms of all signal hits w.r.t. all objects, mask out later

    N_k = torch.sum(M, dim=0)  # number of hits per object
    # overwrite `norms` with the SQUARED distances from here on
    norms = torch.sum(
        torch.square(cluster_space_coords.unsqueeze(1) - x_alpha.unsqueeze(0)),
        dim=-1,
    )  # take the norm squared
    norms_att = norms[is_sig]
    # att func as in line 159 of object condensation
    norms_att = torch.log(
        torch.exp(torch.Tensor([1]).to(norms_att.device)) * norms_att / 2 + 1
    )

    assert norms_att.size() == (n_hits_sig, n_objects)

    # Now apply the mask to keep only norms of signal hits w.r.t. to the object
    # they belong to
    norms_att *= M[is_sig]

    # Sum over hits, then sum per event, then divide by n_hits_per_event, then sum over events

    V_attractive = (q[is_sig]).unsqueeze(-1) * q_alpha.unsqueeze(0) * norms_att
    V_attractive = V_attractive.sum(dim=0)  # K objects
    V_attractive = V_attractive.view(-1) / (N_k.view(-1) + 1e-3)
    L_V_attractive = torch.mean(V_attractive)

    # repulsive hinge: only hits within unit distance of a foreign object repel
    norms_rep = torch.relu(1. - torch.sqrt(norms + 1e-6))* M_inv

    # (n_sig_hits, 1) * (1, n_objects) * (n_sig_hits, n_objects)
    V_repulsive = q.unsqueeze(1) * q_alpha.unsqueeze(0) * norms_rep

    # No need to apply a V = max(0, V); by construction V>=0
    assert V_repulsive.size() == (n_hits, n_objects)

    # Sum over hits, then sum per event, then divide by n_hits_per_event, then sum up events
    nope = n_objects_per_event - 1  # (unused beyond this normalisation guard)
    nope[nope == 0] = 1

    L_V_repulsive = V_repulsive.sum(dim=0)
    number_of_repulsive_terms_per_object = torch.sum(M_inv, dim=0)
    L_V_repulsive = L_V_repulsive.view(
        -1
    ) / number_of_repulsive_terms_per_object.view(-1)
    L_V_repulsive = torch.mean(L_V_repulsive)
    L_V_repulsive2 = L_V_repulsive

    L_V = (
        L_V_attractive
        + L_V_repulsive

    )

    # L_beta noise term: push betas of noise hits towards 0 (per-event mean)
    n_noise_hits_per_event = scatter_count(batch[is_noise])
    n_noise_hits_per_event[n_noise_hits_per_event == 0] = 1
    L_beta_noise = (
        s_B
        * (
            (scatter_add(beta[is_noise], batch[is_noise])) / n_noise_hits_per_event
        ).sum()
    )

    # L_beta signal term

    beta_per_object_c = scatter_add(beta[is_sig], object_index)
    beta_alpha = beta[is_sig][index_alpha]
    # hit_type_mask = (g.ndata["hit_type"]==1)*(g.ndata["particle_number"]>0)
    # beta_alpha_track = beta[is_sig*hit_type_mask]
    L_beta_sig = torch.mean(
        1 - beta_alpha + 1 - torch.clip(beta_per_object_c, 0, 1)
    )

    # empirical down-weighting of the noise term
    L_beta_noise = L_beta_noise / 4

    L_beta = L_beta_noise + L_beta_sig

    # distance between condensation points in input vs cluster space
    L_alpha_coordinates = torch.mean(torch.norm(x_alpha_original - x_alpha, p=2, dim=1))

    L_exp = L_beta
    # NOTE(review): for any other loss_type the function implicitly returns None.
    if (loss_type == "hgcalimplementation") or (loss_type == "vrepweighted") or (loss_type == "baseline"):
        return (
            L_V,
            L_beta,
            L_beta_sig,
            L_beta_noise,
            0,
            0,
            0,
            None,
            None,
            0,
            L_clusters,
            0,
            L_V_attractive,
            L_V_repulsive,
            L_alpha_coordinates,
            L_exp,
            norms_rep,
            norms_att,
            L_V_repulsive2,
            0
        )
308
+
309
+
310
def object_condensation_loss2(
    batch,
    pred,
    pred_2,
    y,
    q_min=0.1,
    use_average_cc_pos=0.0,
    output_dim=4,
    clust_space_norm="none",
):
    """Compute the object-condensation loss for one batched graph.

    :param batch: batched DGL graph (uses ndata "h" and "particle_number")
    :param pred: per-hit model output; first 3 columns are cluster-space
        coordinates, column 3 the raw beta logit
    :param pred_2: energy-correction output, forwarded to calc_LV_Lbeta
    :param y: truth container, forwarded to calc_LV_Lbeta
    :return: (loss, components) where components is the tuple returned by
        calc_LV_Lbeta
    """
    _, S = pred.shape
    clust_space_dim = 3

    # Column `clust_space_dim` holds the raw beta logit; squash to (0, 1).
    betas = torch.sigmoid(torch.reshape(pred[:, clust_space_dim], [-1, 1]))
    input_coords = batch.ndata["h"][:, 0:clust_space_dim]
    cluster_coords = pred[:, 0:clust_space_dim]
    if clust_space_norm == "twonorm":
        cluster_coords = torch.nn.functional.normalize(cluster_coords, dim=1)
    elif clust_space_norm == "tanh":
        cluster_coords = torch.tanh(cluster_coords)
    elif clust_space_norm == "none":
        pass
    else:
        raise NotImplementedError

    dev = batch.device
    truth_cluster_index = batch.ndata["particle_number"]

    # per-hit event id, derived from the batched graph's node counts
    n_graphs = len(batch.batch_num_nodes())
    event_of_hit = torch.repeat_interleave(
        torch.arange(0, n_graphs).to(dev), batch.batch_num_nodes()
    ).to(dev)

    a = calc_LV_Lbeta(
        input_coords,
        batch,
        y,
        0,       # distance_threshold
        pred_2,  # energy_correction
        beta=betas.view(-1),
        cluster_space_coords=cluster_coords,
        cluster_index_per_event=truth_cluster_index.view(-1).long(),
        batch=event_of_hit.long(),
        qmin=q_min,
        use_average_cc_pos=use_average_cc_pos,
    )

    # total loss = L_V + L_beta
    loss = a[0] + a[1]
    return loss, a
377
+
378
def formatted_loss_components_string(components: dict) -> str:
    """Render the loss components from calc_LV_Lbeta as a multi-line string.

    Each component is shown with its value and its percentage of the total
    (L_V + L_beta). Optional components are appended when present.
    """
    total_loss = components["L_V"] + components["L_beta"]
    fractions = {k: v / total_loss for k, v in components.items()}

    def fkey(key):
        return f"{components[key]:+.4f} ({100.*fractions[key]:.1f}%)"

    formatted = {k: fkey(k) for k in components}
    s = (
        " L_V                 = {L_V}"
        "\n   L_V_attractive    = {L_V_attractive}"
        "\n   L_V_repulsive     = {L_V_repulsive}"
        "\n L_beta              = {L_beta}"
        "\n   L_beta_noise      = {L_beta_noise}"
        "\n   L_beta_sig        = {L_beta_sig}".format(
            L=total_loss, **formatted
        )
    )
    if "L_beta_norms_term" in components:
        s += (
            "\n     L_beta_norms_term = {L_beta_norms_term}"
            "\n     L_beta_logbeta_term = {L_beta_logbeta_term}".format(
                **formatted
            )
        )
    if "L_noise_filter" in components:
        s += f'\n L_noise_filter = {fkey("L_noise_filter")}'
    return s
405
+
406
+
407
def huber(d, delta):
    """
    See: https://en.wikipedia.org/wiki/Huber_loss#Definition
    Multiplied by 2 w.r.t Wikipedia version (aligning with Jan's definition)
    """
    quadratic = d**2
    linear = 2.0 * delta * (torch.abs(d) - delta)
    return torch.where(torch.abs(d) <= delta, quadratic, linear)
415
+
416
+
417
def batch_cluster_indices(
    cluster_id: torch.Tensor, batch: torch.Tensor
) -> Tuple[torch.LongTensor, torch.LongTensor]:
    """
    Turns cluster indices per event to an index in the whole batch
    Example:
    cluster_id = torch.LongTensor([0, 0, 1, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1])
    batch = torch.LongTensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2])
    -->
    offset = torch.LongTensor([0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 5, 5, 5])
    output = torch.LongTensor([0, 0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6])
    """
    device = cluster_id.device
    assert cluster_id.device == batch.device
    # number of clusters per event = max cluster id per event + 1
    n_clusters_per_event = scatter_max(cluster_id, batch, dim=-1)[0] + 1
    # per-event offsets: cumulative cluster counts, with a leading zero
    cumulative = n_clusters_per_event[:-1].cumsum(dim=-1)
    offset_values = torch.cat((torch.zeros(1, device=device), cumulative))
    # broadcast each event's offset onto its hits
    per_hit_offset = torch.gather(offset_values, 0, batch).long()
    return per_hit_offset + cluster_id, n_clusters_per_event
440
+
441
+
442
def get_clustering(betas: torch.Tensor, X: torch.Tensor, tbeta=0.1, td=1.0):
    """
    Returns a clustering of hits -> cluster_index, based on the GravNet model
    output (predicted betas and cluster space coordinates) and the clustering
    parameters tbeta and td.
    Takes torch.Tensors as input.
    """
    n_points = betas.size(0)
    # condensation points: beta above threshold, visited in decreasing beta
    is_seed = betas > tbeta
    seeds = is_seed.nonzero()
    seeds = seeds[(-betas[is_seed]).argsort()]
    # greedily assign each point to the first (highest-beta) seed within td;
    # points never claimed stay background (-1)
    remaining = torch.arange(n_points)
    cluster_of = -1 * torch.ones(n_points, dtype=torch.long).to(betas.device)
    for seed in seeds:
        dist = torch.norm(X[remaining] - X[seed][0], dim=-1)
        close = dist < td
        cluster_of[remaining[close]] = seed[0]
        remaining = remaining[~close]
    return cluster_of
466
+
467
+
468
def scatter_count(input: torch.Tensor):
    """
    Returns ordered counts over an index array
    Example:
    >>> scatter_count(torch.Tensor([0, 0, 0, 1, 1, 2, 2]))  # input
    >>> [3, 2, 2]
    Index assumptions work like in torch_scatter, so:
    >>> scatter_count(torch.Tensor([1, 1, 1, 2, 2, 4, 4]))
    >>> tensor([0, 3, 2, 0, 2])
    """
    # summing a 1 per entry, grouped by index, yields the group sizes
    ones = torch.ones_like(input, dtype=torch.long)
    return scatter_add(ones, input.long())
479
+
480
+
481
def scatter_counts_to_indices(input: torch.LongTensor) -> torch.LongTensor:
    """
    Converts counts to indices. This is the inverse operation of scatter_count
    Example:
        input:  [3, 2, 2]
        output: [0, 0, 0, 1, 1, 2, 2]
    """
    group_ids = torch.arange(input.size(0), device=input.device)
    return torch.repeat_interleave(group_ids, input).long()
491
+
492
+
493
def get_inter_event_norms_mask(
    batch: torch.LongTensor, nclusters_per_event: torch.LongTensor
):
    """
    Creates mask of (nhits x nclusters) that is only 1 if hit i is in the same event as cluster j
    Example:
    cluster_id_per_event = torch.LongTensor([0, 0, 1, 1, 2, 0, 0, 1, 1, 1, 0, 0, 1])
    batch = torch.LongTensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2])
    Should return:
    torch.LongTensor([
        [1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 1, 1],
        ])
    """
    device = batch.device
    # (nevents x nhits) one-hot event membership of each hit
    event_ids = torch.arange(batch.max() + 1, dtype=torch.long, device=device)
    per_event_rows = (batch == event_ids.unsqueeze(-1)).long()
    # duplicate each event row once per cluster of that event, then transpose
    # to obtain the (nhits x nclusters) mask
    return per_event_rows.repeat_interleave(nclusters_per_event, dim=0).T
530
+
531
+
532
def isin(ar1, ar2):
    """To be replaced by torch.isin for newer releases of torch"""
    matches = ar1[..., None] == ar2
    return matches.any(-1)
535
+
536
+
537
def L_clusters_calc(batch, cluster_space_coords, cluster_index, frac_combinations, q):
    """Sampled pairwise hinge loss over cluster-space coordinates.

    For every event, a random sample (a ``frac_combinations`` fraction of all
    possible pairs) of same-cluster ("positive") and cross-cluster
    ("negative") hit pairs is drawn; their distances feed a q-weighted
    HingeEmbeddingLoss that pulls positives together and pushes negatives
    apart. Returns the pair-count-normalised total as a scalar tensor
    (0.0 when no usable pairs exist).

    NOTE(review): the sampled indices are generated relative to the
    cluster-local coordinate arrays but later index the event-level array
    ``cluster_space_coords_filtered`` — confirm this is intended.
    """
    number_of_pairs = 0
    # Fix: initialise the accumulator ONCE, before the per-event loop.  The
    # previous in-loop initialisation reset the sum for every event and left
    # the name undefined (NameError at return) when every event was skipped.
    L_clusters = torch.tensor(0.0).to(q.device)
    for batch_id in batch.unique():
        bmask = batch == batch_id
        clust_space_filt = cluster_space_coords[bmask]
        pos_pairs_all = []
        neg_pairs_all = []
        # need at least two clusters in the event to form negative pairs
        if len(cluster_index[bmask].unique()) <= 1:
            continue
        for cluster in cluster_index[bmask].unique():
            coords_pos = clust_space_filt[cluster_index[bmask] == cluster]
            coords_neg = clust_space_filt[cluster_index[bmask] != cluster]
            if len(coords_neg) == 0:
                continue
            # sample frac_combinations of the ~n^2/2 possible positive pairs
            total_num = (len(coords_pos) ** 2) / 2
            num = int(frac_combinations * total_num)
            pos_pairs = []
            for _ in range(num):
                pos_pairs.append(
                    [
                        np.random.randint(len(coords_pos)),
                        np.random.randint(len(coords_pos)),
                    ]
                )
            # one negative pair per positive pair
            neg_pairs = []
            for _ in range(len(pos_pairs)):
                neg_pairs.append(
                    [
                        np.random.randint(len(coords_pos)),
                        np.random.randint(len(coords_neg)),
                    ]
                )
            pos_pairs_all += pos_pairs
            neg_pairs_all += neg_pairs
        pos_pairs = torch.tensor(pos_pairs_all)
        neg_pairs = torch.tensor(neg_pairs_all)
        assert pos_pairs.shape == neg_pairs.shape
        if len(pos_pairs) == 0:
            continue
        cluster_space_coords_filtered = cluster_space_coords[bmask]
        qs_filtered = q[bmask]
        pos_norms = (
            cluster_space_coords_filtered[pos_pairs[:, 0]]
            - cluster_space_coords_filtered[pos_pairs[:, 1]]
        ).norm(dim=-1)
        neg_norms = (
            cluster_space_coords_filtered[neg_pairs[:, 0]]
            - cluster_space_coords_filtered[neg_pairs[:, 1]]
        ).norm(dim=-1)
        q_pos = qs_filtered[pos_pairs[:, 0]]
        q_neg = qs_filtered[neg_pairs[:, 0]]
        q_s = torch.cat([q_pos, q_neg])
        norms_pos = torch.cat([pos_norms, neg_norms])
        # targets: +1 for same-cluster pairs, -1 for cross-cluster pairs
        ys = torch.cat([torch.ones_like(pos_norms), -torch.ones_like(neg_norms)])
        L_clusters += torch.sum(
            q_s * torch.nn.HingeEmbeddingLoss(reduce=None)(norms_pos, ys)
        )
        number_of_pairs += norms_pos.shape[0]
    if number_of_pairs > 0:
        L_clusters = L_clusters / number_of_pairs

    return L_clusters
605
+
606
+
607
+
608
+
609
+
src/layers/regression/loss_regression.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import numpy as np
4
+
5
+
6
+
7
def obtain_PID_charged(dic,pid_true_matched, pids_charged, args, pid_conversion_dict):
    """Build one-hot PID targets for the charged-particle candidates.

    Parameters
    ----------
    dic : dict with "charged_PID_pred" (logits tensor, one row per charged
        candidate) and "charged_idx" (indices into ``pid_true_matched``).
    pid_true_matched : sequence of true PDG ids for the matched showers.
    pids_charged : class list; column ``j`` of the one-hot encodes class
        ``pids_charged[j]``.
    args : unused, kept for interface compatibility.
    pid_conversion_dict : maps a raw PDG id to its class index (unknown ids
        fall back to class 3).

    Returns
    -------
    (predictions, one_hot_targets, valid_mask) — ``valid_mask[i]`` is 0 when
    the converted class is not present in ``pids_charged``.
    """
    predictions = dic["charged_PID_pred"]
    true_pids = np.array(pid_true_matched)[dic["charged_idx"].cpu().tolist()]
    # One row per candidate, one column per known charged class.
    one_hot = torch.zeros(
        len(true_pids), len(pids_charged)
    ).to(predictions.device)
    valid_mask = torch.ones(len(true_pids))
    class_ids = np.array(pids_charged)
    for row, raw_pid in enumerate(true_pids):
        if raw_pid not in pid_conversion_dict:
            print("Unknown PID", raw_pid)
        class_idx = pid_conversion_dict.get(raw_pid, 3)
        columns = np.where(class_ids == class_idx)[0]
        if len(columns) == 0:
            # Class not modelled for charged particles: exclude from the loss.
            valid_mask[row] = 0
        else:
            one_hot[row, columns[0]] = 1
    return predictions, one_hot, valid_mask
26
+
27
+
28
+
29
+
30
+
31
+
32
def obtain_PID_neutral(dic,pid_true_matched,pids_neutral, args, pid_conversion_dict):
    """Build one-hot PID targets for the neutral-particle candidates.

    Parameters
    ----------
    dic : dict with "neutral_PID_pred" (logits tensor) and "neutrals_idx"
        (index tensor into ``pid_true_matched``; may be 0-dim for a single
        candidate).
    pid_true_matched : sequence of true PDG ids for the matched showers.
    pids_neutral : class list; column ``j`` of the one-hot encodes class
        ``pids_neutral[j]``.
    args : unused, kept for interface compatibility.
    pid_conversion_dict : maps a raw PDG id to its class index (unknown ids
        fall back to class 3).

    Returns
    -------
    (predictions, one_hot_targets, valid_mask) — ``valid_mask[i]`` is 0 when
    the converted class is not present in ``pids_neutral``.
    """
    neutral_PID_pred = dic["neutral_PID_pred"]
    neutral_idx = dic["neutrals_idx"]
    # A 0-dim index tensor makes numpy return a scalar; np.atleast_1d restores
    # a 1-D array for ANY dtype (the old code only special-cased np.float64,
    # so a single integer PID would have crashed on len()).
    neutral_PID_true = np.atleast_1d(np.array(pid_true_matched)[neutral_idx.cpu()])
    # One-hot encoded targets, one row per neutral candidate.
    neutral_PID_true_onehot = torch.zeros(
        len(neutral_PID_true), len(pids_neutral)
    ).to(neutral_PID_pred.device)
    mask_neutral = torch.ones(len(neutral_PID_true))

    # convert from true PID to int list PID (4-class encoding)
    pids_neutral_arr = np.array(pids_neutral)
    for i, pid in enumerate(neutral_PID_true):
        if pid not in pid_conversion_dict:
            print("Unknown PID", pid)
        true_idx = pid_conversion_dict.get(pid, 3)
        col = np.where(pids_neutral_arr == true_idx)[0]
        if len(col) == 0:
            # Class not modelled for neutrals: exclude from the loss.
            mask_neutral[i] = 0
        else:
            neutral_PID_true_onehot[i, col[0]] = 1
    neutral_PID_true_onehot = neutral_PID_true_onehot.to(neutral_idx.device)
    return neutral_PID_pred, neutral_PID_true_onehot, mask_neutral
57
+
58
+
59
+
src/layers/shower_dataframe.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """DataFrame construction and shower-level helpers for particle-flow reconstruction."""
2
+ import torch
3
+ import pandas as pd
4
+ from torch_scatter import scatter_add, scatter_mean, scatter_max
5
+
6
+ from src.layers.clustering import remove_labels_of_double_showers
7
+ from src.layers.shower_matching import obtain_intersection_values
8
+
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Small tensor helpers
12
+ # ---------------------------------------------------------------------------
13
+
14
def nan_like(t):
    """Return a tensor with the shape of ``t`` filled with NaN (float)."""
    nan_filled = torch.zeros_like(t)
    return nan_filled * float("nan")
16
+
17
+
18
def nan_tensor(*size, device):
    """Allocate a NaN-filled float tensor of the given ``size`` on ``device``."""
    blank = torch.zeros(*size, device=device)
    return blank * float("nan")
20
+
21
+
22
+ def _window(tensor, start, count):
23
+ return tensor[start : start + count]
24
+
25
+
26
def _compute_pandora_momentum(labels, g):
    """Scatter-mean the pandora momentum/reference-point node features per cluster.

    Returns (pxyz, ref_pt, pandora_pid, calc_pandora_momentum). All three
    tensor outputs are None when the graph does not carry 'pandora_momentum'.
    """
    if "pandora_momentum" not in g.ndata:
        return None, None, None, False
    momentum = g.ndata["pandora_momentum"]
    reference = g.ndata["pandora_reference_point"]
    # Average each cartesian component over the hits of every cluster label.
    pxyz = torch.stack(
        [scatter_mean(momentum[:, axis], labels) for axis in range(3)], dim=1
    )
    ref_pt = torch.stack(
        [scatter_mean(reference[:, axis], labels) for axis in range(3)], dim=1
    )
    pandora_pid = scatter_mean(g.ndata["pandora_pid"], labels)
    return pxyz, ref_pt, pandora_pid, True
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Per-shower correction
49
+ # ---------------------------------------------------------------------------
50
+
51
def get_correction_per_shower(labels, dic):
    """Pick, for each predicted cluster, the energy correction of its
    highest-beta hit.

    When no noise cluster (label 0) is present, a zero correction is
    prepended so the output stays aligned with the label indexing used
    downstream.
    """
    corrections_all = dic["graph"].ndata["correction"]
    betas_all = dic["graph"].ndata["beta"]
    per_label = []
    for position, label in enumerate(torch.unique(labels)):
        if position == 0:
            if label != 0:
                # No noise cluster: keep slot 0 occupied with a zero correction.
                per_label.append(corrections_all[0].view(-1) * 0)
        in_cluster = labels == label
        best_hit = torch.argmax(betas_all[in_cluster])
        per_label.append(corrections_all[in_cluster][best_hit].view(-1))
    return torch.cat(per_label, dim=0)
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Track–cluster distance helpers
68
+ # ---------------------------------------------------------------------------
69
+
70
def distance_to_true_cluster_of_track(dic, labels):
    """Per predicted cluster, measure how far the cluster's first track hit is
    from the barycenter of the MC cluster that track truly belongs to.

    Returns (distances, number_of_tracks), both indexed by cluster label;
    entries stay 0 for clusters without any track hit.
    NOTE(review): distances are divided by 3300 — presumably a detector-scale
    normalisation constant; confirm against the geometry used elsewhere.
    """
    g = dic["graph"]
    # hit_type == 1 marks track hits (hit_type == 2/3 are used as calo hits
    # elsewhere in this module).
    mask_hit_type_t2 = g.ndata["hit_type"] == 1
    # Reserve one extra slot when there is no noise cluster (label 0) so the
    # tensors can still be indexed directly by label value.
    if torch.sum(labels.unique() == 0) == 0:
        distances = torch.zeros(len(labels.unique()) + 1).float().to(labels.device)
        number_of_tracks = torch.zeros(len(labels.unique()) + 1).int()
    else:
        distances = torch.zeros(len(labels.unique())).float().to(labels.device)
        number_of_tracks = torch.zeros(len(labels.unique())).int()
    for i, label in enumerate(labels.unique()):
        mask_labels_i = labels == label
        # Track hits belonging to this predicted cluster.
        mask = mask_labels_i * mask_hit_type_t2
        if mask.sum() == 0:
            continue
        # Only the first track hit of the cluster is used.
        pos_track = g.ndata["pos_hits_xyz"][mask][0]
        if pos_track.shape[0] == 0:
            continue
        # MC particle the track hit is assigned to, and that particle's hits.
        true_part_idx_track = g.ndata["particle_number"][mask_labels_i * mask_hit_type_t2][0].int()
        mask_labels_i_true = g.ndata["particle_number"] == true_part_idx_track
        mean_pos_cluster_true = torch.mean(
            g.ndata["pos_hits_xyz"][mask_labels_i_true], dim=0
        )
        # Count of track hits attached to the same MC particle.
        number_of_tracks[label] = torch.sum(mask_labels_i_true * mask_hit_type_t2)
        distances[label] = torch.norm(mean_pos_cluster_true - pos_track) / 3300
    return distances, number_of_tracks
95
+
96
+
97
def distance_to_cluster_track(dic, is_track_in_MC):
    """Per MC particle, distance between its track hit and the barycenter of
    its calorimeter hits.

    Returns a tensor aligned with ``is_track_in_MC`` (indexed by particle
    number); particles without tracks keep the ``is_track_in_MC`` value.
    NOTE(review): the variable names are swapped relative to the sibling
    function — here ``mask_hit_type_t1`` selects hit_type == 2 (calo) and
    ``mask_hit_type_t2`` selects hit_type == 1 (tracks). Also the /1000
    normalisation differs from the /3300 used in
    distance_to_true_cluster_of_track — confirm which scale is intended.
    """
    g = dic["graph"]
    mask_hit_type_t1 = g.ndata["hit_type"] == 2
    mask_hit_type_t2 = g.ndata["hit_type"] == 1
    pos_track = g.ndata["pos_hits_xyz"][mask_hit_type_t2]
    particle_track = g.ndata["particle_number"][mask_hit_type_t2]
    if len(particle_track) > 0:
        # Barycenter of the hit_type==2 hits of each track's MC particle
        # (particle 0 = noise gets the origin).
        mean_pos_cluster_all = []
        for i in particle_track:
            if i == 0:
                mean_pos_cluster_all.append(torch.zeros((1, 3)).view(-1, 3).to(particle_track.device))
            else:
                mask_labels_i = g.ndata["particle_number"] == i
                mean_pos_cluster = torch.mean(g.ndata["pos_hits_xyz"][mask_labels_i * mask_hit_type_t1], dim=0)
                mean_pos_cluster_all.append(mean_pos_cluster.view(-1, 3))
        mean_pos_cluster_all = torch.cat(mean_pos_cluster_all, dim=0)
        distance_track_cluster = torch.norm(mean_pos_cluster_all - pos_track, dim=1) / 1000
        # Several tracks may point to the same particle: keep the closest one.
        if len(particle_track) > len(torch.unique(particle_track)):
            distance_track_cluster_unique = []
            for i in torch.unique(particle_track):
                mask_tracks = particle_track == i
                distance_track_cluster_unique.append(torch.min(distance_track_cluster[mask_tracks]).view(-1))
            distance_track_cluster_unique = torch.cat(distance_track_cluster_unique, dim=0)
            unique_particle_track = torch.unique(particle_track)
        else:
            distance_track_cluster_unique = distance_track_cluster
            unique_particle_track = particle_track
        # Scatter the distances into a per-particle tensor; untouched entries
        # keep the is_track_in_MC track counts as placeholder values.
        distance_to_cluster_all = is_track_in_MC.clone().float()
        distance_to_cluster_all[unique_particle_track.long()] = distance_track_cluster_unique
        return distance_to_cluster_all
    else:
        return is_track_in_MC.clone().float()
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # Main DataFrame builder
133
+ # ---------------------------------------------------------------------------
134
+
135
def generate_showers_data_frame(
    labels,
    dic,
    shower_p_unique,
    particle_ids,
    row_ind,
    col_ind,
    i_m_w,
    pandora=False,
    e_corr=None,
    number_of_showers_total=None,
    step=0,
    number_in_batch=0,
    ec_x=None,
    pred_pos=None,
    pred_pid=None,
    pred_ref_pt=None,
    number_of_fake_showers_total=None,
    number_of_fakes=None,
    extra_features=None,
    labels_clusters_removed_tracks=None,
):
    """Assemble a per-shower pandas DataFrame comparing truth, reconstruction
    and (optionally) Pandora baselines for one event.

    Rows are the true MC particles followed by the unmatched ("fake")
    predicted showers; matched quantities are scattered in via
    ``row_ind``/``col_ind`` (the assignment from match_showers), and
    unmatched slots stay NaN.

    Returns just the DataFrame when ``number_of_showers_total`` is None,
    otherwise (df, number_of_showers_total, number_of_fake_showers_total)
    with the running counters advanced. Returns ([], 0, 0) when there are no
    matched pairs at all.
    NOTE(review): ``ec_x`` is accepted but never used in this body.
    """
    # --- per-predicted-cluster sums of hit energy / hit-type counts ---
    e_pred_showers = scatter_add(dic["graph"].ndata["e_hits"].view(-1), labels)
    e_pred_showers_ecal = scatter_add(1 * (dic["graph"].ndata["hit_type"].view(-1) == 2), labels)
    e_pred_showers_hcal = scatter_add(1 * (dic["graph"].ndata["hit_type"].view(-1) == 3), labels)
    if not pandora:
        removed_tracks = scatter_add(1 * labels_clusters_removed_tracks, labels)
    if pandora:
        # Pandora calibrated energies are node features: average per cluster.
        e_pred_showers_cali = scatter_mean(
            dic["graph"].ndata["pandora_pfo_energy"].view(-1), labels
        )
        e_pred_showers_pfo = scatter_mean(
            dic["graph"].ndata["pandora_pfo_energy"].view(-1), labels
        )
        pxyz_pred_pfo, ref_pt_pred_pfo, pandora_pid, calc_pandora_momentum = \
            _compute_pandora_momentum(labels, dic["graph"])
    else:
        if e_corr is None:
            # Correction comes from the per-hit model output (highest beta hit).
            corrections_per_shower = get_correction_per_shower(labels, dic)
            e_pred_showers_cali = e_pred_showers * corrections_per_shower
        else:
            # Externally supplied corrections: tail entries belong to fakes.
            corrections_per_shower = e_corr.view(-1)
            if number_of_fakes > 0:
                corrections_per_shower_fakes = corrections_per_shower[-number_of_fakes:]
                corrections_per_shower = corrections_per_shower[:-number_of_fakes]

    # --- per-true-particle aggregates (indexed by particle_number) ---
    e_reco_showers = scatter_add(
        dic["graph"].ndata["e_hits"].view(-1),
        dic["graph"].ndata["particle_number"].long(),
    )
    e_label_showers = scatter_max(
        labels.view(-1),
        dic["graph"].ndata["particle_number"].long(),
    )[0]
    is_track_in_MC = scatter_add(
        1 * (dic["graph"].ndata["hit_type"].view(-1) == 1),
        dic["graph"].ndata["particle_number"].long(),
    )
    track_chi = scatter_add(
        1 * (dic["graph"].ndata["chi_squared_tracks"].view(-1) == 1),
        dic["graph"].ndata["particle_number"].long(),
    )
    distance_to_cluster_all = distance_to_cluster_track(dic, is_track_in_MC)
    distances, number_of_tracks = distance_to_true_cluster_of_track(dic, labels)

    row_ind = torch.Tensor(row_ind).to(e_pred_showers.device).long()
    col_ind = torch.Tensor(col_ind).to(e_pred_showers.device).long()

    # When a noise particle (id 0) exists, row indices were shifted by one in
    # match_showers; undo that shift for truth-aligned indexing.
    if torch.sum(particle_ids == 0) > 0:
        row_ind_ = row_ind - 1
    else:
        row_ind_ = row_ind

    pred_showers = shower_p_unique
    # --- truth-level quantities, one entry per MC particle ---
    energy_t = (
        dic["part_true"].E_corrected.view(-1).to(e_pred_showers.device)
    ).float()
    gen_status = (
        dic["part_true"].gen_status.view(-1).to(e_pred_showers.device)
    ).float()
    vertex = dic["part_true"].vertex.to(e_pred_showers.device)
    pos_t = dic["part_true"].coord.to(e_pred_showers.device)
    pid_t = dic["part_true"].pid.to(e_pred_showers.device)
    if not pandora:
        labels = remove_labels_of_double_showers(labels, dic["graph"])
    is_track_per_shower = scatter_add(1 * (dic["graph"].ndata["hit_type"] == 1), labels).int()
    is_track = torch.zeros(energy_t.shape).to(e_pred_showers.device)

    # col_ind counts predicted showers excluding cluster 0, hence the +1.
    index_matches = col_ind + 1
    index_matches = index_matches.to(e_pred_showers.device).long()

    # --- NaN-initialised matched-quantity buffers (NaN == unmatched) ---
    dev = e_pred_showers.device
    matched_es = nan_like(energy_t)
    matched_ECAL = nan_like(energy_t)
    matched_HCAL = nan_like(energy_t)
    matched_positions = nan_tensor(energy_t.shape[0], 3, device=dev)
    matched_ref_pt = nan_tensor(energy_t.shape[0], 3, device=dev)
    matched_pid = nan_like(energy_t).long()
    matched_positions_pfo = nan_tensor(energy_t.shape[0], 3, device=dev)
    matched_pandora_pid = nan_tensor(energy_t.shape[0], device=dev)
    matched_ref_pts_pfo = nan_tensor(energy_t.shape[0], 3, device=dev)
    matched_extra_features = torch.zeros((energy_t.shape[0], 7)) * torch.nan

    matched_es[row_ind_] = e_pred_showers[index_matches]
    matched_ECAL[row_ind_] = 1.0 * e_pred_showers_ecal[index_matches]
    matched_HCAL[row_ind_] = 1.0 * e_pred_showers_hcal[index_matches]

    if pandora:
        matched_es_cali = matched_es.clone()
        matched_es_cali[row_ind_] = e_pred_showers_cali[index_matches]
        matched_es_cali_pfo = matched_es.clone()
        matched_es_cali_pfo[row_ind_] = e_pred_showers_pfo[index_matches]
        matched_pandora_pid[row_ind_] = pandora_pid[index_matches]
        if calc_pandora_momentum:
            matched_positions_pfo[row_ind_] = pxyz_pred_pfo[index_matches]
            matched_ref_pts_pfo[row_ind_] = ref_pt_pred_pfo[index_matches]
        is_track[row_ind_] = is_track_per_shower[index_matches].float()
    else:
        if e_corr is None:
            matched_es_cali = matched_es.clone()
            matched_es_cali[row_ind_] = e_pred_showers_cali[index_matches]
            calibration_per_shower = matched_es.clone()
            calibration_per_shower[row_ind_] = corrections_per_shower[index_matches]
            cluster_removed_tracks = matched_es.clone()
        else:
            # External corrections are a flat buffer across the whole batch:
            # take this event's window starting at number_of_showers_total.
            matched_es_cali = matched_es.clone()
            number_of_showers = e_pred_showers[index_matches].shape[0]
            matched_es_cali[row_ind_] = _window(
                corrections_per_shower, number_of_showers_total, number_of_showers
            )
            cluster_removed_tracks = matched_es.clone()
            cluster_removed_tracks[row_ind_] = 1.0 * removed_tracks[index_matches]

            if pred_pos is not None:
                matched_positions[row_ind_] = _window(pred_pos, number_of_showers_total, number_of_showers)
                matched_ref_pt[row_ind_] = _window(pred_ref_pt, number_of_showers_total, number_of_showers)
                matched_pid[row_ind_] = _window(pred_pid, number_of_showers_total, number_of_showers)
                if not pandora:
                    matched_extra_features[row_ind_] = torch.tensor(
                        _window(extra_features, number_of_showers_total, number_of_showers)
                    )

            calibration_per_shower = matched_es.clone()
            calibration_per_shower[row_ind_] = _window(
                corrections_per_shower, number_of_showers_total, number_of_showers
            )
            number_of_showers_total = number_of_showers_total + number_of_showers
        is_track[row_ind_] = is_track_per_shower[index_matches].float()

    # match the tracks to the particle
    # particle_number_u: same as particle_number but with noise (0) remapped
    # to 100 so scatter_max never picks noise as the track's particle.
    dic["graph"].ndata["particle_number_u"] = dic["graph"].ndata["particle_number"].clone()
    dic["graph"].ndata["particle_number_u"][dic["graph"].ndata["particle_number_u"] == 0] = 100
    tracks_label = scatter_max(
        (dic["graph"].ndata["hit_type"] == 1) * (dic["graph"].ndata["particle_number_u"]), labels
    )[0].int()
    tracks_label = tracks_label - 1
    tracks_label[tracks_label < 0] = 0
    matched_es_tracks = nan_like(energy_t)
    matched_es_tracks_1 = nan_like(energy_t)
    matched_es_tracks[row_ind_] = row_ind_.float()
    matched_es_tracks_1[row_ind_] = tracks_label[index_matches].float()
    # 1 when the track inside the matched cluster belongs to the matched MC
    # particle, 0 otherwise (and forced to 0 for trackless clusters).
    matched_es_tracks_1 = 1.0 * (matched_es_tracks == matched_es_tracks_1)
    matched_es_tracks_1 = matched_es_tracks_1 * is_track

    intersection_E = nan_like(energy_t)
    if len(col_ind) > 0:
        ie_e = obtain_intersection_values(i_m_w, row_ind, col_ind, dic)
        intersection_E[row_ind_] = ie_e.to(e_pred_showers.device)
        # Mark matched predictions (and the noise cluster) so the remainder
        # are the fake showers.
        pred_showers[index_matches] = -1
        pred_showers[0] = -1
        mask = pred_showers != -1
        fakes_in_event = mask.sum()
        fake_showers_e = e_pred_showers[mask]
        fake_showers_e_hcal = e_pred_showers_hcal[mask]
        fake_showers_e_ecal = e_pred_showers_ecal[mask]
        number_of_fake_showers = mask.sum()

        all_labels = labels.unique().to(e_pred_showers.device)
        number_of_fake_showers = mask.sum()
        fakes_labels = torch.where(mask)[0].to(e_pred_showers.device)
        fake_showers_distance_to_cluster = distances[fakes_labels.cpu()]
        fake_showers_num_tracks = number_of_tracks[fakes_labels.cpu()]

        if e_corr is None or pandora:
            fake_showers_e_cali = e_pred_showers_cali[mask]
        else:
            # Fake quantities live at the tail of the flat batch buffers; take
            # this event's slice of them.
            fakes_positions = pred_pos[-number_of_fakes:][number_of_fake_showers_total:number_of_fake_showers_total + number_of_fake_showers]
            fake_showers_e_cali = e_corr[-number_of_fakes:][number_of_fake_showers_total:number_of_fake_showers_total + number_of_fake_showers]
            fakes_pid_pred = pred_pid[-number_of_fakes:][number_of_fake_showers_total:number_of_fake_showers_total + number_of_fake_showers]
            fake_showers_e_reco = e_reco_showers[-number_of_fakes:][number_of_fake_showers_total:number_of_fake_showers_total + number_of_fake_showers]
            fakes_positions = fakes_positions.to(e_pred_showers.device)
            fakes_extra_features = extra_features[-number_of_fakes:][number_of_fake_showers_total:number_of_fake_showers_total + number_of_fake_showers]
            fake_showers_e_cali = fake_showers_e_cali.to(e_pred_showers.device)
            fakes_pid_pred = fakes_pid_pred.to(e_pred_showers.device)
            fake_showers_e_reco = fake_showers_e_reco.to(e_pred_showers.device)

        if pandora:
            fake_pandora_pid = (torch.zeros((fake_showers_e.shape[0], 3)) * torch.nan).to(dev)
            fake_pandora_pid = pandora_pid[mask]
            if calc_pandora_momentum:
                fake_positions_pfo = nan_tensor(fake_showers_e.shape[0], 3, device=dev)
                fake_positions_pfo = pxyz_pred_pfo[mask]
                fakes_positions_ref = nan_tensor(fake_showers_e.shape[0], 3, device=dev)
                fakes_positions_ref = ref_pt_pred_pfo[mask]
        if not pandora:
            if e_corr is None:
                fake_showers_e_cali_factor = corrections_per_shower[mask]
            else:
                fake_showers_e_cali_factor = fake_showers_e_cali
        # Fakes have no truth: NaN placeholders for the truth-level columns.
        fake_showers_showers_e_truw = nan_tensor(fake_showers_e.shape[0], device=dev)
        fake_showers_vertex = nan_tensor(fake_showers_e.shape[0], 3, device=dev)
        fakes_is_track = (torch.zeros((fake_showers_e.shape[0])) * torch.nan).to(dev)
        fakes_is_track = is_track_per_shower[mask]
        fakes_positions_t = nan_tensor(fake_showers_e.shape[0], 3, device=dev)
        if not pandora:
            number_of_fake_showers_total = number_of_fake_showers_total + number_of_fake_showers

        # --- append fake-shower rows after the truth rows ---
        energy_t = torch.cat((energy_t, fake_showers_showers_e_truw), dim=0)
        gen_status = torch.cat((gen_status, fake_showers_showers_e_truw), dim=0)
        vertex = torch.cat((vertex, fake_showers_vertex), dim=0)
        pid_t = torch.cat((pid_t.view(-1), fake_showers_showers_e_truw), dim=0)
        pos_t = torch.cat((pos_t, fakes_positions_t), dim=0)
        e_reco = torch.cat((e_reco_showers[1:], fake_showers_showers_e_truw), dim=0)
        e_labels = torch.cat((e_label_showers[1:], 0 * fake_showers_showers_e_truw), dim=0)
        is_track_in_MC = torch.cat((is_track_in_MC[1:], fake_showers_num_tracks.to(e_reco.device)), dim=0)
        track_chi = torch.cat((track_chi[1:], fake_showers_num_tracks.to(e_reco.device)), dim=0)
        distance_to_cluster_MC = torch.cat(
            (distance_to_cluster_all[1:], fake_showers_distance_to_cluster.to(e_reco.device)), dim=0
        )
        e_pred = torch.cat((matched_es, fake_showers_e), dim=0)
        e_pred_ECAL = torch.cat((matched_ECAL, fake_showers_e_ecal), dim=0)
        e_pred_HCAL = torch.cat((matched_HCAL, fake_showers_e_hcal), dim=0)
        e_pred_cali = torch.cat((matched_es_cali, fake_showers_e_cali), dim=0)
        if pred_pos is not None:
            e_pred_pos = torch.cat((matched_positions, fakes_positions), dim=0)
            e_pred_pid = torch.cat((matched_pid, fakes_pid_pred), dim=0)
            e_pred_ref_pt = torch.cat((matched_ref_pt, fakes_positions), dim=0)
            extra_features_all = torch.cat(
                (matched_extra_features, torch.tensor(fakes_extra_features)), dim=0
            )
        if pandora:
            e_pred_cali_pfo = torch.cat((matched_es_cali_pfo, fake_showers_e_cali), dim=0)
            positions_pfo = torch.cat((matched_positions_pfo, fake_positions_pfo), dim=0)
            pandora_pid = torch.cat((matched_pandora_pid, fake_pandora_pid), dim=0)
            ref_pts_pfo = torch.cat((matched_ref_pts_pfo, fakes_positions_ref), dim=0)
        else:
            cluster_removed_tracks = torch.cat((cluster_removed_tracks, 0 * fake_showers_e_cali), dim=0)
        if not pandora:
            calibration_factor = torch.cat((calibration_per_shower, fake_showers_e_cali_factor), dim=0)

        e_pred_t = torch.cat(
            (intersection_E, nan_like(fake_showers_e)),
            dim=0,
        )
        is_track = torch.cat((is_track, fakes_is_track.to(is_track.device)), dim=0)
        matched_es_tracks_1 = torch.cat(
            (matched_es_tracks_1, 0 * fakes_is_track.to(is_track.device)), dim=0
        )

        # Build shared base dict, then update with pandora- or non-pandora-specific keys
        d = {
            "true_showers_E": energy_t.detach().cpu(),
            "reco_showers_E": e_reco.detach().cpu(),
            "pred_showers_E": e_pred.detach().cpu(),
            "e_pred_and_truth": e_pred_t.detach().cpu(),
            "pid": pid_t.detach().cpu(),
            "step": torch.ones_like(energy_t.detach().cpu()) * step,
            "number_batch": torch.ones_like(energy_t.detach().cpu()) * number_in_batch,
            "is_track_in_cluster": is_track.detach().cpu(),
            "is_track_correct": matched_es_tracks_1.detach().cpu(),
            "is_track_in_MC": is_track_in_MC.detach().cpu(),
            "track_chi": track_chi.detach().cpu(),
            "distance_to_cluster_MC": distance_to_cluster_MC.detach().cpu(),
            "vertex": vertex.detach().cpu().tolist(),
            "ECAL_hits": e_pred_ECAL.detach().cpu(),
            "HCAL_hits": e_pred_HCAL.detach().cpu(),
            "gen_status": gen_status.detach().cpu(),
            "labels": e_labels.detach().cpu(),
        }
        if pandora:
            d.update({
                "pandora_calibrated_E": e_pred_cali.detach().cpu(),
                "pandora_calibrated_pfo": e_pred_cali_pfo.detach().cpu(),
                "pandora_calibrated_pos": positions_pfo.detach().cpu().tolist(),
                "pandora_ref_pt": ref_pts_pfo.detach().cpu().tolist(),
                "pandora_pid": pandora_pid.detach().cpu(),
            })
        else:
            d.update({
                "calibration_factor": calibration_factor.detach().cpu(),
                "calibrated_E": e_pred_cali.detach().cpu(),
                "cluster_removed_tracks": cluster_removed_tracks.detach().cpu(),
            })
            if pred_pos is not None:
                d["pred_pos_matched"] = e_pred_pos.detach().cpu().tolist()
                d["pred_pid_matched"] = e_pred_pid.detach().cpu().tolist()
                d["pred_ref_pt_matched"] = e_pred_ref_pt.detach().cpu().tolist()
                d["matched_extra_features"] = extra_features_all.detach().cpu().tolist()

        d["true_pos"] = pos_t.detach().cpu().tolist()
        df = pd.DataFrame(data=d)
        if number_of_showers_total is None:
            return df
        else:
            return df, number_of_showers_total, number_of_fake_showers_total
    else:
        return [], 0, 0
src/layers/shower_matching.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shower matching utilities for particle-flow reconstruction."""
2
+ import torch
3
+ import numpy as np
4
+ from torch_scatter import scatter_add
5
+ from scipy.optimize import linear_sum_assignment
6
+
7
+
8
class CachedIndexList:
    """List wrapper that memoizes ``list.index`` lookups.

    Useful when the same values are looked up repeatedly: each linear search
    is paid only once per distinct value.
    """

    def __init__(self, lst):
        self.lst = lst
        self.cache = {}

    def index(self, value):
        """Return the position of ``value`` in the wrapped list, caching it."""
        try:
            return self.cache[value]
        except KeyError:
            position = self.lst.index(value)
            self.cache[value] = position
            return position
20
+
21
+
22
def get_labels_pandora(dic, device):
    """Map raw Pandora PFO ids to dense cluster labels 0..K-1.

    Parameters
    ----------
    dic : dict holding the DGL graph under "graph"; node feature
        "pandora_pfo" carries the raw PFO id per hit.
    device : target device for the returned label tensor.

    Returns
    -------
    Long tensor, one label per node, relabelled to consecutive integers in
    ascending order of the raw ids.
    """
    labels_pandora = dic["graph"].ndata["pandora_pfo"].long()
    labels_pandora = labels_pandora + 1
    # torch.unique returns the sorted unique values together with, for each
    # element, its position in that sorted list — exactly the dense
    # relabelling the previous Python-level CachedIndexList loop produced,
    # but vectorized instead of O(n * n_clusters).
    _, cluster_id = torch.unique(labels_pandora.detach().cpu(), return_inverse=True)
    return cluster_id.long().to(device)
30
+
31
+
32
def obtain_intersection_matrix(shower_p_unique, particle_ids, labels, dic, e_hits):
    """Overlap between every predicted cluster and every true particle.

    Returns two (n_pred, n_true) matrices: plain hit counts and
    hit-energy-weighted counts.
    """
    n_pred = len(shower_p_unique)
    n_true = len(particle_ids)
    target_device = shower_p_unique.device
    intersection_matrix = torch.zeros((n_pred, n_true)).to(target_device)
    intersection_matrix_w = torch.zeros((n_pred, n_true)).to(target_device)
    for column, particle in enumerate(particle_ids):
        on_particle = dic["graph"].ndata["particle_number"] == particle
        # Indicator (1 per hit of this particle) and energy-weighted variant.
        hit_counts = torch.zeros_like(labels)
        hit_counts[on_particle] = 1
        weighted = e_hits.clone()
        weighted[~on_particle] = 0
        intersection_matrix[:, column] = scatter_add(hit_counts, labels)
        intersection_matrix_w[:, column] = scatter_add(weighted, labels.to(weighted.device))
    return intersection_matrix, intersection_matrix_w
49
+
50
+
51
def obtain_union_matrix(shower_p_unique, particle_ids, labels, dic):
    """Size of the union between every predicted cluster and true particle.

    Parameters
    ----------
    shower_p_unique : tensor of predicted cluster labels.
    particle_ids : tensor of true particle ids.
    labels : per-hit predicted cluster label.
    dic : dict holding the graph with per-hit "particle_number".

    Returns
    -------
    (n_pred, n_true) tensor where entry [p, t] counts the hits belonging to
    predicted cluster p OR true particle t.
    """
    len_pred_showers = len(shower_p_unique)
    union_matrix = torch.zeros((len_pred_showers, len(particle_ids)))
    for index, id in enumerate(particle_ids):
        mask_p = dic["graph"].ndata["particle_number"] == id
        for index_pred, id_pred in enumerate(shower_p_unique):
            mask_pred_p = labels == id_pred
            # Bool addition acts as logical OR; the sum counts the union.
            mask_union = mask_pred_p + mask_p
            union_matrix[index_pred, index] = torch.sum(mask_union)
    return union_matrix
62
+
63
+
64
def obtain_intersection_values(intersection_matrix_w, row_ind, col_ind, dic):
    """Pick the energy overlap of each matched (true, predicted) shower pair.

    Drops the first predicted row (noise cluster) and, when a noise particle
    (id 0) is present, also the first true column while shifting ``row_ind``
    accordingly. Returns a 1-D tensor, or the integer 0 when there are no
    matches.
    """
    particle_ids = torch.unique(dic["graph"].ndata["particle_number"])
    if torch.sum(particle_ids == 0) > 0:
        weights_t = torch.transpose(intersection_matrix_w[1:, 1:], 1, 0)
        row_ind = row_ind - 1
    else:
        weights_t = torch.transpose(intersection_matrix_w[1:, :], 1, 0)
    picked = [
        weights_t[row_ind[k], col_ind[k]].view(-1) for k in range(0, len(col_ind))
    ]
    if len(picked) > 0:
        return torch.cat(picked, dim=0)
    return 0
80
+
81
+
82
def match_showers(
    labels,
    dic,
    particle_ids,
    model_output,
    local_rank,
    i,
    path_save,
    pandora=False,
    hdbscan=False,
):
    """Match predicted clusters to true particles by IoU with the Hungarian
    algorithm.

    Pairs with IoU below 0.25 are rejected. Returns
    (shower_p_unique, row_ind, col_ind, i_m_w, iou_matrix) where
    ``row_ind``/``col_ind`` index the accepted (true, predicted) pairs and
    ``i_m_w`` is the energy-weighted intersection matrix.
    NOTE(review): ``local_rank``, ``i``, ``path_save``, ``pandora`` and
    ``hdbscan`` are accepted but unused in this body — presumably kept for a
    shared call signature; confirm with callers.
    """
    iou_threshold = 0.25
    shower_p_unique = torch.unique(labels)
    # Guarantee a slot for the noise cluster (label 0) even when no hit was
    # assigned to it, so downstream "+1" indexing stays valid.
    if torch.sum(labels == 0) == 0:
        shower_p_unique = torch.cat(
            (
                torch.Tensor([0]).to(shower_p_unique.device).view(-1),
                shower_p_unique.view(-1),
            ),
            dim=0,
        )
    e_hits = dic["graph"].ndata["e_hits"].view(-1)
    i_m, i_m_w = obtain_intersection_matrix(
        shower_p_unique, particle_ids, labels, dic, e_hits
    )
    i_m = i_m.to(model_output.device)
    i_m_w = i_m_w.to(model_output.device)
    u_m = obtain_union_matrix(shower_p_unique, particle_ids, labels, dic)
    u_m = u_m.to(model_output.device)
    iou_matrix = i_m / u_m
    # Drop the noise row (and noise column when particle id 0 exists) and
    # transpose so rows are true particles, columns predicted showers.
    if torch.sum(particle_ids == 0) > 0:
        iou_matrix_num = (
            torch.transpose(iou_matrix[1:, 1:], 1, 0).clone().detach().cpu().numpy()
        )
    else:
        iou_matrix_num = (
            torch.transpose(iou_matrix[1:, :], 1, 0).clone().detach().cpu().numpy()
        )
    iou_matrix_num[iou_matrix_num < iou_threshold] = 0
    # Maximise total IoU (linear_sum_assignment minimises, hence the sign flip).
    row_ind, col_ind = linear_sum_assignment(-iou_matrix_num)
    # Keep only assignments with non-zero (i.e. above-threshold) IoU.
    mask_matching_matrix = iou_matrix_num[row_ind, col_ind] > 0
    row_ind = row_ind[mask_matching_matrix]
    col_ind = col_ind[mask_matching_matrix]
    # Undo the column drop: shift row indices back to particle numbering.
    if torch.sum(particle_ids == 0) > 0:
        row_ind = row_ind + 1
    return shower_p_unique, row_ind, col_ind, i_m_w, iou_matrix
src/layers/tools_for_regression.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from torch_scatter import scatter_mean, scatter_sum
4
+
5
def pick_lowest_chi_squared(pxpypz, chi_s, batch_idx, xyz_nodes):
    """For each graph in the batch, keep the track with the smallest chi^2.

    Returns (momenta, positions): the (N_graphs, 3) momentum of the selected
    track per graph and the matching (N_graphs, 3) track positions.
    """
    selected_p = []
    selected_xyz = []
    for graph_id in torch.unique(batch_idx):
        in_graph = batch_idx == graph_id
        if torch.sum(in_graph) > 1:
            # Several track candidates: take the best-fit (lowest chi^2) one.
            best = torch.argmin(chi_s[in_graph])
            selected_p.append(pxpypz[in_graph][best].view(-1, 3))
            selected_xyz.append(xyz_nodes[in_graph][best].view(-1, 3))
        else:
            selected_p.append(pxpypz[in_graph].view(-1, 3))
            selected_xyz.append(xyz_nodes[in_graph].view(-1, 3))
    return torch.concat(selected_p, dim=0), torch.stack(selected_xyz)[:, 0]
21
+
22
+
23
+
24
class AverageHitsP(torch.nn.Module):
    # Same layout of the module as the GNN one, but just computes the average of the hits. Try to compare this + ML clustering with Pandora
    def __init__(self, ecal_only=False):
        super(AverageHitsP, self).__init__()
        # When True, HCAL hits are zeroed out for ECAL-dominated clusters
        # before the energy-weighted average (see predict()).
        self.ecal_only = ecal_only
    def predict(self, x_global_features, graphs_new=None, explain=False):
        """
        Forward, named 'predict' for compatibility reasons
        :param x_global_features: Global features of the graphs - to be concatenated to each node feature
        :param graphs_new:
        :return: (momentum magnitude, unit direction, reference point) per graph
        """
        assert graphs_new is not None
        batch_num_nodes = graphs_new.batch_num_nodes()  # Num. of hits in each graph
        batch_idx = []
        batch_bounds = []
        if self.ecal_only:
            mask_ecal_only = []  # whether to consider only ECAL or ECAL+HCAL
        # Expand graph index to one entry per node.
        for i, n in enumerate(batch_num_nodes):
            batch_idx.extend([i] * n)
            batch_bounds.append(n)
        batch_idx = np.array(batch_idx)
        for i in range(len(np.unique(batch_idx))):
            if self.ecal_only:
                # Columns 5/6 of ndata["h"] hold the ECAL/HCAL indicator —
                # assumed feature layout; TODO confirm against graph builder.
                n_ecal_hits = (graphs_new.ndata["h"][batch_idx == i, 5] > 0).sum()
                n_hcal_hits = (graphs_new.ndata["h"][batch_idx == i, 6] > 0).sum()
                # Broadcast the per-graph ECAL fraction to every node of it.
                for _ in range(batch_num_nodes[i]):
                    mask_ecal_only.append((n_ecal_hits / (n_hcal_hits + n_ecal_hits)).item())
        batch_idx = torch.tensor(batch_idx).to(graphs_new.device)
        if self.ecal_only:
            mask_ecal_only = torch.tensor(mask_ecal_only)  # round().int().bool().to(graphs_new.device)
            # Graphs with more than 5% ECAL hits are treated as ECAL-dominated.
            mask_ecal_only = (mask_ecal_only > 0.05).int().bool().to(graphs_new.device)
            #mask_ecal_only=torch.zeros(len(mask_ecal_only)).bool().to(graphs_new.device)
        # Columns 0-2: hit xyz, column 8: hit energy (assumed layout).
        xyz_hits = graphs_new.ndata["h"][:, :3]
        E_hits = graphs_new.ndata["h"][:, 8]
        if self.ecal_only:
            hcal_hits = graphs_new.ndata["h"][:, 6] > 0
            # Zero the HCAL hits of ECAL-dominated graphs so they do not
            # contribute to the energy-weighted direction.
            E_hits[mask_ecal_only & (hcal_hits)] = 0
        # Energy-weighted barycenter per graph.
        weighted_avg_hits = scatter_sum(xyz_hits * E_hits.unsqueeze(1), batch_idx, dim=0)
        E_total = scatter_sum(E_hits, batch_idx, dim=0)
        p_direction = weighted_avg_hits / E_total.unsqueeze(1)
        p_tracks = torch.norm(p_direction, dim=1)
        p_direction = p_direction / torch.norm(p_direction, dim=1).unsqueeze(1)
        # if self.pos_regression:
        # The * 3300 rescaling presumably converts back to detector units —
        # TODO confirm against the coordinate normalisation used upstream.
        return p_tracks, p_direction, weighted_avg_hits / E_total.unsqueeze(1) * 3300  # Reference point
        # return p_tracks
70
+
71
+
72
+
73
class PickPAtDCA(torch.nn.Module):
    # Same layout of the module as the GNN one, but just picks the track
    def __init__(self):
        super(PickPAtDCA, self).__init__()

    def predict(self, x_global_features, graphs_new=None, explain=False):
        """
        Forward, named 'predict' for compatibility reasons
        :param x_global_features: Global features of the graphs - to be concatenated to each node feature
        :param graphs_new:
        :return: (momentum magnitude, momentum direction, barycenter - track position)
        """
        assert graphs_new is not None
        batch_num_nodes = graphs_new.batch_num_nodes()
        # Per-node graph index (removed the unused batch_bounds/filt_hits
        # locals the original carried along).
        batch_idx = []
        for i, n in enumerate(batch_num_nodes):
            batch_idx.extend([i] * n)
        batch_idx = torch.tensor(batch_idx).to(graphs_new.device)

        # Columns 3-6 of ndata["h"] one-hot encode the hit type (assumed
        # layout); type 1 marks tracks.
        ht = graphs_new.ndata["h"][:, 3:7].argmax(dim=1)
        filt = ht == 1  # track

        # Best (lowest chi^2) track per graph: its momentum at the vertex and
        # its position.
        p_direction, p_xyz = pick_lowest_chi_squared(
            graphs_new.ndata["pos_pxpypz_at_vertex"][filt],
            graphs_new.ndata["chi_squared_tracks"][filt],
            batch_idx[filt],
            graphs_new.ndata["h"][filt, :3]
        )
        # Barycenters of clusters of hits
        xyz_hits = graphs_new.ndata["h"][:, :3]
        E_hits = graphs_new.ndata["h"][:, 8]
        weighted_avg_hits = scatter_sum(xyz_hits * E_hits.unsqueeze(1), batch_idx, dim=0)
        E_total = scatter_sum(E_hits, batch_idx, dim=0)
        barycenters = weighted_avg_hits / E_total.unsqueeze(1)
        p_tracks = torch.norm(p_direction, dim=1)
        return p_tracks, p_direction, barycenters - p_xyz
112
+
113
+
114
+
115
class ECNetWrapperAvg(torch.nn.Module):
    """Energy-correction wrapper that only returns a momentum direction,
    obtained from the ECAL-preferring energy-weighted hit average."""

    def __init__(self):
        super(ECNetWrapperAvg, self).__init__()
        self.AvgHits = AverageHitsP(ecal_only=True)

    def predict(self, x_global_features, graphs_new=None, explain=False):
        """
        Forward, named 'predict' for compatibility reasons
        :param x_global_features: Global features of the graphs - to be concatenated to each node feature
        :param graphs_new:
        :return: (None, unit momentum direction, None, None)
        """
        _, direction, _ = self.AvgHits.predict(x_global_features, graphs_new)
        unit_direction = direction / torch.norm(direction, dim=1).unsqueeze(1)
        return None, unit_direction.clone(), None, None
src/layers/utils_training.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from lightning.pytorch.callbacks import BaseFinetuning
3
+ import torch
4
+ import dgl
5
+ from src.layers.inference_oc import DPC_custom_CLD
6
+ from src.layers.inference_oc import match_showers
7
+ from src.layers.inference_oc import remove_bad_tracks_from_cluster
8
class FreezeClustering(BaseFinetuning):
    """Lightning finetuning callback that freezes the clustering stage.

    Used when training only the downstream energy-correction / PID heads:
    the input batch norm, the GATr backbone and both clustering heads are
    frozen before training starts and never unfrozen.
    """

    def __init__(self):
        super().__init__()

    def freeze_before_training(self, pl_module):
        # Freeze the whole clustering pipeline: input normalisation,
        # GATr backbone, cluster-coordinate head and beta head.
        for submodule in (
            pl_module.ScaledGooeyBatchNorm2_1,
            pl_module.gatr,
            pl_module.clustering,
            pl_module.beta,
        ):
            self.freeze(submodule)
        print("CLUSTERING HAS BEEN FROOOZEN")

    def finetune_function(self, pl_module, current_epoch, optimizer):
        # Intentionally a no-op: the frozen modules stay frozen forever.
        print("Not finetunning")
24
+
25
+
26
+
27
def obtain_batch_numbers(x, g):
    """Return a per-node tensor assigning each node its graph index in `g`.

    :param x: any tensor on the target device; only used to pick the device
        of the output.
    :param g: batched DGL graph.
    :return: 1-D float tensor of length ``g.number_of_nodes()`` where entry
        ``k`` is the index of the sub-graph node ``k`` belongs to.
    """
    device = x.device
    per_graph = [
        graph_index * torch.ones(subgraph.number_of_nodes()).to(device)
        for graph_index, subgraph in enumerate(dgl.unbatch(g))
    ]
    return torch.cat(per_graph, dim=0)
40
+
41
+
42
+
43
def obtain_clustering_for_matched_showers(
    batch_g, model_output, y_all, local_rank, use_gt_clusters=False, add_fakes=True
):
    """Cluster the hits of each event, match clusters to truth showers, and
    build one small DGL graph per matched (and optionally fake) shower.

    :param batch_g: batched DGL graph of hits for all events in the batch.
    :param model_output: per-node output of the clustering model; columns
        0:3 are cluster-space coordinates and column 3 is the beta logit
        (only read when ``use_gt_clusters`` is False).
    :param y_all: truth-particle container with ``batch_number``, ``E``,
        ``m``, ``pid``, ``coord`` and ``copy()``/``mask()`` methods.
    :param local_rank: rank forwarded to ``match_showers`` (logging/plots).
    :param use_gt_clusters: if True, cluster labels are taken from the
        ground-truth ``particle_number`` instead of running DPC clustering.
    :param add_fakes: if True, also build graphs for unmatched (fake)
        predicted showers and append them after the matched ones.
    :return: tuple of (batched shower graphs, true energies, reco energies,
        matched truth PIDs, true daughter-corrected energies, matched truth
        coordinates, number of fakes, fake cluster indices of the LAST
        event processed).
    """

    graphs_showers_matched = []
    graphs_showers_fakes = []
    true_energy_showers = []
    reco_energy_showers = []
    reco_energy_showers_fakes = []
    energy_true_daughters = []
    y_pids_matched = []
    y_coords_matched = []
    if not use_gt_clusters:
        # Stash model predictions on the graph so per-event slices carry them.
        batch_g.ndata["coords"] = model_output[:, 0:3]
        batch_g.ndata["beta"] = model_output[:, 3]
    graphs = dgl.unbatch(batch_g)
    batch_id = y_all.batch_number
    for i in range(0, len(graphs)):
        # Truth particles belonging to event i.
        mask = batch_id == i
        dic = {}
        dic["graph"] = graphs[i]
        y = y_all.copy()

        y.mask(mask.flatten())
        dic["part_true"] = y
        if not use_gt_clusters:
            betas = torch.sigmoid(dic["graph"].ndata["beta"])
            X = dic["graph"].ndata["coords"]

        if use_gt_clusters:
            labels = dic["graph"].ndata["particle_number"].type(torch.int64)
        else:
            # Density-peak style clustering in the learned coordinate space.
            labels = DPC_custom_CLD(X, dic["graph"], model_output.device)

        labels, _ = remove_bad_tracks_from_cluster(dic["graph"], labels)
        particle_ids = torch.unique(dic["graph"].ndata["particle_number"])
        shower_p_unique = torch.unique(labels)
        # Hungarian-style matching of predicted clusters to truth showers.
        shower_p_unique, row_ind, col_ind, i_m_w, _ = match_showers(
            labels, dic, particle_ids, model_output, local_rank, i, None
        )
        row_ind = torch.Tensor(row_ind).to(model_output.device).long()
        col_ind = torch.Tensor(col_ind).to(model_output.device).long()
        # particle_number 0 denotes noise; when present, truth indices are
        # shifted by one relative to the matching rows.
        if torch.sum(particle_ids == 0) > 0:
            row_ind_ = row_ind - 1
        else:
            # if there is no zero then index 0 corresponds to particle 1.
            row_ind_ = row_ind
        # Cluster labels start at 1 (0 is the noise cluster).
        index_matches = col_ind + 1
        index_matches = index_matches.to(model_output.device).long()

        for j, unique_showers_label in enumerate(index_matches):
            # Only keep one-to-one matches: skip labels matched more than once.
            if torch.sum(unique_showers_label == index_matches) == 1:
                index_in_matched = torch.argmax(
                    (unique_showers_label == index_matches) * 1
                )
                mask = labels == unique_showers_label
                sls_graph = graphs[i].ndata["pos_hits_xyz"][mask][:, 0:3]
                # Build an edgeless per-shower graph carrying the hit features.
                g = dgl.graph(([], []))
                g.add_nodes(sls_graph.shape[0])
                g = g.to(sls_graph.device)
                g.ndata["h"] = graphs[i].ndata["h"][mask]
                if "pos_pxpypz" in graphs[i].ndata:
                    g.ndata["pos_pxpypz"] = graphs[i].ndata["pos_pxpypz"][mask]
                if "pos_pxpypz_at_vertex" in graphs[i].ndata:
                    g.ndata["pos_pxpypz_at_vertex"] = graphs[i].ndata[
                        "pos_pxpypz_at_vertex"
                    ][mask]
                g.ndata["chi_squared_tracks"] = graphs[i].ndata["chi_squared_tracks"][mask]
                energy_t = dic["part_true"].E.to(model_output.device)
                # NOTE(review): `.m` is used as the daughter-corrected energy
                # here — presumably set upstream; confirm against the truth
                # container definition.
                energy_t_corr_daughters = dic["part_true"].m.to(
                    model_output.device
                )
                true_energy_shower = energy_t[row_ind_[j]]
                y_pids_matched.append(y.pid[row_ind_[j]].item())
                y_coords_matched.append(y.coord[row_ind_[j]].detach().cpu().numpy())
                energy_true_daughters.append(energy_t_corr_daughters[row_ind_[j]])
                reco_energy_shower = torch.sum(graphs[i].ndata["e_hits"][mask])
                graphs_showers_matched.append(g)
                true_energy_showers.append(true_energy_shower.view(-1))
                reco_energy_showers.append(reco_energy_shower.view(-1))
        # Mark matched clusters (and the noise cluster 0) so the remainder
        # are treated as fakes.
        pred_showers = shower_p_unique
        pred_showers[index_matches] = -1
        pred_showers[
            0
        ] = (
            -1
        )
        mask_fakes = pred_showers != -1
        fakes_idx = torch.where(mask_fakes)[0]
        if add_fakes:
            for j in fakes_idx:
                mask = labels == j
                sls_graph = graphs[i].ndata["pos_hits_xyz"][mask][:, 0:3]
                g = dgl.graph(([], []))
                g.add_nodes(sls_graph.shape[0])
                g = g.to(sls_graph.device)

                g.ndata["h"] = graphs[i].ndata["h"][mask]

                if "pos_pxpypz" in graphs[i].ndata:
                    g.ndata["pos_pxpypz"] = graphs[i].ndata["pos_pxpypz"][mask]
                if "pos_pxpypz_at_vertex" in graphs[i].ndata:
                    g.ndata["pos_pxpypz_at_vertex"] = graphs[i].ndata[
                        "pos_pxpypz_at_vertex"
                    ][mask]
                g.ndata["chi_squared_tracks"] = graphs[i].ndata["chi_squared_tracks"][mask]
                graphs_showers_fakes.append(g)
                reco_energy_shower = torch.sum(graphs[i].ndata["e_hits"][mask])
                reco_energy_showers_fakes.append(reco_energy_shower.view(-1))
    # NOTE(review): if no shower was matched in the whole batch these
    # torch.cat calls receive empty lists and raise — verify the caller
    # guards against that (see tests/test_energy_correction_no_matches.py).
    graphs_showers_matched = dgl.batch(graphs_showers_matched + graphs_showers_fakes)
    true_energy_showers = torch.cat(true_energy_showers, dim=0)
    reco_energy_showers = torch.cat(reco_energy_showers + reco_energy_showers_fakes, dim=0)
    e_true_corr_daughters = torch.cat(energy_true_daughters, dim=0)
    number_of_fakes = len(reco_energy_showers_fakes)
    return (
        graphs_showers_matched,
        true_energy_showers,
        reco_energy_showers,
        y_pids_matched,
        e_true_corr_daughters,
        y_coords_matched,
        number_of_fakes,
        fakes_idx
    )
src/models/E_correction_module.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import io
4
+ import pickle
5
+
6
class Net(nn.Module):
    """Small fully-connected regression/classification head.

    :param in_features: width of the input feature vector.
    :param out_features: number of outputs; when > 1 and ``return_raw`` is
        False, ``forward`` returns ``(x[:, 0], x[:, 1:])`` as two tensors.
    :param return_raw: if True, always return the raw output tensor.
    """

    def __init__(self, in_features=13, out_features=1, return_raw=True):
        super(Net, self).__init__()
        self.out_features = out_features
        self.return_raw = return_raw
        self.model = nn.ModuleList(
            [
                # nn.BatchNorm1d(13),
                nn.Linear(in_features, 64),
                nn.ReLU(),
                nn.Linear(64, 64),
                # nn.BatchNorm1d(64),
                nn.ReLU(),
                nn.Linear(64, 64),
                nn.ReLU(),
                nn.Linear(64, out_features),
            ]
        )
        # When True, forward() returns a numpy array (for SHAP explainers).
        self.explainer_mode = False

    def forward(self, x):
        # Accept raw numpy/list inputs (e.g. from SHAP explainers).
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x)
        for layer in self.model:
            x = layer(x)
        if self.out_features > 1 and not self.return_raw:
            return x[:, 0], x[:, 1:]
        if self.explainer_mode:
            # Fix: .numpy() raises on a grad-tracking (or CUDA) tensor;
            # detach and move to CPU first.
            return x.detach().cpu().numpy()
        return x

    def freeze_batchnorm(self):
        """Put the first BatchNorm1d layer (if any) into eval mode."""
        for layer in self.model:
            if isinstance(layer, nn.BatchNorm1d):
                layer.eval()
                print("Frozen batchnorm in 1st layer only - ", layer)
                break
43
+
src/models/Gatr_pf_e_noise.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file includes code adapted from:
3
+
4
+ Geometric Algebra Transformer (GATr)
5
+ https://github.com/Qualcomm-AI-research/geometric-algebra-transformer
6
+
7
+ The original implementation is by Qualcomm AI Research. It has been modified
8
+ and integrated into this project for particle-flow reconstruction at the
9
+ CLD detector (FCC-ee). Please refer to the original repository for
10
+ authorship, documentation, and license information.
11
+ """
12
+ import torch
13
+ import torch.nn as nn
14
+ import dgl
15
+ from src.layers.object_cond import object_condensation_loss2
16
+ from src.models.energy_correction_NN import EnergyCorrection
17
+ from src.layers.inference_oc import create_and_store_graph_output
18
+ import lightning as L
19
+ from torch.optim.lr_scheduler import CosineAnnealingLR
20
+ from xformers.ops.fmha import BlockDiagonalMask
21
+ import os
22
+ import wandb
23
+ from gatr import GATr, SelfAttentionConfig, MLPConfig
24
+ from gatr.interface import embed_point, extract_scalar, extract_point, embed_scalar
25
+ from src.utils.logger_wandb import log_losses_wandb
26
+
27
+
28
class ExampleWrapper(L.LightningModule):
    """GATr-based object-condensation model for particle-flow clustering,
    with an optional downstream energy-correction / PID stage.

    :param args: parsed command-line arguments (``correction``, ``predict``,
        ``qmin``, ``use_gt_clusters``, ``pandora``, learning-rate options...).
    :param dev: device handle stored for later use.
    :param blocks: number of GATr transformer blocks.
    :param hidden_mv_channels: hidden multivector channels of the GATr.
    :param hidden_s_channels: hidden scalar channels of the GATr.
    :param config: optional configuration object, stored as-is.
    """

    def __init__(
        self,
        args,
        dev,
        blocks=10,
        hidden_mv_channels=16,
        hidden_s_channels=64,
        config=None
    ):
        super().__init__()
        self.strict_loading = False
        self.input_dim = 3
        self.output_dim = 4
        # Running sum of training losses and batch count for the epoch log.
        self.loss_final = 0
        self.number_b = 0
        self.df_showers = []
        self.df_showers_pandora = []
        self.df_showers_db = []
        self.args = args
        self.dev = dev
        self.config = config
        self.gatr = GATr(
            in_mv_channels=1,
            out_mv_channels=1,
            hidden_mv_channels=hidden_mv_channels,
            in_s_channels=2,
            out_s_channels=1,
            hidden_s_channels=hidden_s_channels,
            num_blocks=blocks,
            attention=SelfAttentionConfig(),
            mlp=MLPConfig(),
        )
        # Input normalisation for the raw hit positions.
        self.ScaledGooeyBatchNorm2_1 = nn.BatchNorm1d(self.input_dim, momentum=0.1)
        # Heads mapping GATr outputs to cluster coordinates and beta.
        self.clustering = nn.Linear(3, self.output_dim - 1, bias=False)
        self.beta = nn.Linear(2, 1)
        if self.args.correction:
            self.energy_correction = EnergyCorrection(self)
            self.ec_model_wrapper_charged = self.energy_correction.model_charged
            self.ec_model_wrapper_neutral = self.energy_correction.model_neutral
            self.pids_neutral = self.energy_correction.pids_neutral
            self.pids_charged = self.energy_correction.pids_charged
        else:
            self.pids_neutral = []
            self.pids_charged = []

    def forward(self, g, y, step_count, eval="", return_train=False, use_gt_clusters=False):
        """Run clustering (unless ground-truth clusters are used) and,
        optionally, the energy-correction stage.

        :param g: batched DGL graph of hits.
        :param y: truth-particle container.
        :param step_count: step index (used only by callers for logging).
        :param use_gt_clusters: if True, skip the network and feed dummy
            cluster coordinates downstream.
        :return: correction-stage result when ``args.correction`` is set,
            otherwise ``(x, pred_energy_corr, 0, 0)``.
        """
        if not use_gt_clusters:
            inputs = g.ndata["pos_hits_xyz"].float()
            inputs_scalar = g.ndata["hit_type"].float().view(-1, 1)
            inputs = self.ScaledGooeyBatchNorm2_1(inputs)
            # Embed positions as GA points plus the hit type as a scalar.
            embedded_inputs = embed_point(inputs) + embed_scalar(inputs_scalar)
            embedded_inputs = embedded_inputs.unsqueeze(-2)  # (N, 1, 16)
            # Block-diagonal mask restricts attention to within each event.
            mask = self.build_attention_mask(g)
            scalars = torch.cat((g.ndata["e_hits"].float(), g.ndata["p_hits"].float()), dim=1)
            embedded_outputs, scalar_outputs = self.gatr(
                embedded_inputs, scalars=scalars, attention_mask=mask
            )
            points = extract_point(embedded_outputs[:, 0, :])
            nodewise_outputs = extract_scalar(embedded_outputs)  # (N, 1, 1)
            x_point = points
            x_scalar = torch.cat(
                (nodewise_outputs.view(-1, 1), scalar_outputs.view(-1, 1)), dim=1
            )
            x_cluster_coord = self.clustering(x_point)
            beta = self.beta(x_scalar)
            g.ndata["final_cluster"] = x_cluster_coord
            g.ndata["beta"] = beta.view(-1)
            # x: (N, 4) = 3 cluster coordinates + beta logit.
            x = torch.cat((x_cluster_coord, beta.view(-1, 1)), dim=1)
        else:
            x = torch.ones_like(g.ndata["h"][:, 0:4])

        if self.args.correction:
            result = self.energy_correction.forward_correction(g, x, y, return_train)
            return result
        else:
            # NOTE(review): `beta` is undefined here when use_gt_clusters is
            # True — this branch would raise NameError; presumably never hit
            # with that flag combination. Confirm before relying on it.
            pred_energy_corr = torch.ones_like(beta.view(-1, 1))
            return x, pred_energy_corr, 0, 0

    def build_attention_mask(self, g):
        """Block-diagonal attention mask so hits only attend within their event."""
        batch_numbers = obtain_batch_numbers(g)
        return BlockDiagonalMask.from_seqlens(
            torch.bincount(batch_numbers.long()).tolist()
        )

    def unfreeze_all(self):
        """Re-enable gradients on the correction-stage submodules."""
        for p in self.energy_correction.model_charged.parameters():
            p.requires_grad = True
        for p in self.energy_correction.model_neutral.gatr_pid.parameters():
            p.requires_grad = True
        for p in self.energy_correction.model_neutral.PID_head.parameters():
            p.requires_grad = True

    def training_step(self, batch, batch_idx):
        y = batch[1]
        batch_g = batch[0]
        # Only rank 0 passes the real batch index (used for logging/plots).
        if self.trainer.is_global_zero:
            result = self(batch_g, y, batch_idx)
        else:
            result = self(batch_g, y, 1)

        model_output = result[0]
        e_cor = result[1]
        (loss, losses) = object_condensation_loss2(
            batch_g,
            model_output,
            e_cor,
            y,
            q_min=self.args.qmin,
            use_average_cc_pos=self.args.use_average_cc_pos,
        )
        if self.args.correction:
            self.energy_correction.global_step = self.global_step
            # After epoch 0 the PID losses are treated as "fixed" (see get_loss).
            fixed = self.current_epoch > 0
            loss_EC, loss_pos, loss_neutral_pid, loss_charged_pid = self.energy_correction.get_loss(
                batch_g, y, result, self.stats, fixed
            )
            # NOTE(review): the condensation loss computed above is discarded
            # here — the clustering stage is assumed frozen during correction
            # training (see FreezeClustering).
            loss = loss_EC + loss_neutral_pid + loss_charged_pid

        if self.trainer.is_global_zero:
            log_losses_wandb(True, batch_idx, 0, losses, loss)
        self.loss_final = loss.item() + self.loss_final
        self.number_b = self.number_b + 1
        del model_output
        del e_cor
        del losses
        return loss

    def validation_step(self, batch, batch_idx):
        self.create_paths()
        y = batch[1]
        batch_g = batch[0]
        shap_vals, ec_x = None, None
        if self.args.correction:
            result = self(batch_g, y, 1, use_gt_clusters=self.args.use_gt_clusters)
            model_output = result[0]
            outputs = self.energy_correction.get_validation_step_outputs(batch_g, y, result)
            e_cor1, pred_pos, pred_ref_pt, pred_pid, num_fakes, extra_features, fakes_labels = outputs
            e_cor = e_cor1
        else:
            model_output, e_cor1, loss_ll, _ = self(batch_g, y, 1)
            # Without a correction stage, use unit energy corrections.
            e_cor1 = torch.ones_like(model_output[:, 0].view(-1, 1))
            e_cor = e_cor1
            pred_pos = None
            pred_pid = None
            pred_ref_pt = None
            num_fakes = None
            extra_features = None
            fakes_labels = None

        if self.args.predict:
            if self.args.correction:
                model_output1 = model_output
                e_corr = e_cor
            else:
                model_output1 = torch.cat((model_output, e_cor.view(-1, 1)), dim=1)
                e_corr = None

            # Build per-event shower dataframes (model and optionally Pandora).
            (
                df_batch_pandora,
                df_batch1,
                self.total_number_events,
            ) = create_and_store_graph_output(
                batch_g,
                model_output1,
                y,
                0,
                batch_idx,
                0,
                path_save=self.show_df_eval_path,
                store=True,
                predict=True,
                e_corr=e_corr,
                ec_x=ec_x,
                total_number_events=self.total_number_events,
                pred_pos=pred_pos,
                pred_ref_pt=pred_ref_pt,
                pred_pid=pred_pid,
                use_gt_clusters=self.args.use_gt_clusters,
                number_of_fakes=num_fakes,
                extra_features=extra_features,
                fakes_labels=fakes_labels,
                pandora_available=self.args.pandora,
            )
            self.df_showers_pandora.append(df_batch_pandora)
            self.df_showers_db.append(df_batch1)
        del model_output

    def create_paths(self):
        """Cache the output directory for evaluation dataframes."""
        show_df_eval_path = os.path.join(self.args.model_prefix, "showers_df_evaluation")
        self.show_df_eval_path = show_df_eval_path

    def on_train_epoch_end(self):
        self.log("train_loss_epoch", self.loss_final / self.number_b)

    def on_train_epoch_start(self):
        # Reset the running loss accumulators for the new epoch.
        self.loss_final = 0
        self.number_b = 0
        self.make_mom_zero()
        if self.current_epoch == 0:
            self.stats = {}
            self.stats["counts"] = {}
            self.stats["counts_pid_neutral"] = {}
            self.stats["counts_pid_charged"] = {}

    def on_validation_epoch_start(self):
        self.total_number_events = 0
        self.make_mom_zero()
        self.df_showers = []
        self.df_showers_pandora = []
        self.df_showers_db = []
        self.validation_step_outputs = []

    def make_mom_zero(self):
        """Freeze the input batch-norm statistics after warm-up epochs."""
        if self.current_epoch > 1 or self.args.predict:
            print("making momentum 0")
            self.ScaledGooeyBatchNorm2_1.momentum = 0

    def on_validation_epoch_end(self):
        if self.trainer.is_global_zero:
            if self.args.predict:
                from src.layers.inference_oc import store_at_batch_end
                import pandas as pd

                if self.args.pandora:
                    self.df_showers_pandora = pd.concat(self.df_showers_pandora)
                else:
                    self.df_showers_pandora = []
                self.df_showers_db = pd.concat(self.df_showers_db)
                store_at_batch_end(
                    path_save=os.path.join(
                        self.args.model_prefix, "showers_df_evaluation"
                    ) + "/" + self.args.name_output,
                    df_batch_pandora=self.df_showers_pandora,
                    df_batch1=self.df_showers_db,
                    step=0,
                    predict=True,
                    store=True,
                    pandora_available=self.args.pandora
                )

        self.validation_step_outputs = []
        self.df_showers = []
        self.df_showers_pandora = []
        self.df_showers_db = []

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.args.start_lr)
        # Cosine anneal for 2 epochs' worth of steps, then a fixed LR.
        scheduler = CosineAnnealingThenFixedScheduler(optimizer, T_max=int(36400 * 2), fixed_lr=1e-5)
        self.scheduler = scheduler
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step",
                "monitor": "train_loss_epoch",
                "frequency": 1,
            },
        }

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric=None):
        # Custom scheduler object: step it manually every interval.
        scheduler.step()
+ scheduler.step()
290
+
291
+
292
def obtain_batch_numbers(g):
    """Return per-node graph indices for a batched DGL graph.

    Entry ``k`` of the returned 1-D float tensor is the index of the
    sub-graph that node ``k`` belongs to.
    """
    chunks = []
    for graph_index, subgraph in enumerate(dgl.unbatch(g)):
        chunks.append(graph_index * torch.ones(subgraph.number_of_nodes()))
    return torch.cat(chunks, dim=0)
+ return torch.cat(batch_numbers, dim=0)
300
+
301
+
302
class CosineAnnealingThenFixedScheduler:
    """Cosine-anneal the learning rate for ``T_max`` steps, then hold it
    constant at ``fixed_lr``.

    :param optimizer: wrapped optimizer.
    :param T_max: number of steps of cosine annealing (also its eta_min is
        ``fixed_lr`` so the transition is continuous).
    :param fixed_lr: learning rate used forever after ``T_max`` steps.
    """

    def __init__(self, optimizer, T_max, fixed_lr):
        self.cosine_scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=fixed_lr)
        # Bug fix: this was hard-coded to 1e-6, silently ignoring the
        # `fixed_lr` argument (callers pass 1e-5, matching eta_min above).
        self.fixed_lr = fixed_lr
        self.T_max = T_max
        self.step_count = 0
        self.optimizer = optimizer

    def step(self):
        """Advance one step: cosine phase first, then pin the fixed LR."""
        if self.step_count < self.T_max:
            self.cosine_scheduler.step()
        else:
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = self.fixed_lr
        self.step_count += 1

    def get_last_lr(self):
        if self.step_count < self.T_max:
            return self.cosine_scheduler.get_last_lr()
        else:
            return [self.fixed_lr for _ in self.optimizer.param_groups]

    def state_dict(self):
        return {
            "step_count": self.step_count,
            "cosine_scheduler_state": self.cosine_scheduler.state_dict(),
        }

    def load_state_dict(self, state_dict):
        self.step_count = state_dict["step_count"]
        self.cosine_scheduler.load_state_dict(state_dict["cosine_scheduler_state"])
+ self.cosine_scheduler.load_state_dict(state_dict["cosine_scheduler_state"])
src/models/energy_correction_NN.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PID + energy correction module.
3
+ The model is called after object condensation clustering to correct
4
+ reconstructed energies and predict particle IDs.
5
+ """
6
+ import numpy as np
7
+ import wandb
8
+ import torch
9
+ from torch.nn import CrossEntropyLoss
10
+ from torch_scatter import scatter_add, scatter_mean
11
+ from typing import NamedTuple, Any
12
+
13
+ from src.layers.utils_training import obtain_clustering_for_matched_showers
14
+ from src.utils.post_clustering_features import (
15
+ get_post_clustering_features, get_extra_features, calculate_eta, calculate_phi,
16
+ )
17
+ from src.utils.pid_conversion import pid_conversion_dict
18
+ from src.layers.regression.loss_regression import obtain_PID_charged, obtain_PID_neutral
19
+ from src.models.energy_correction_charged import ChargedEnergyCorrection
20
+ from src.models.energy_correction_neutral import (
21
+ NeutralEnergyCorrection, criterion_E_cor, correct_mask_neutral,
22
+ )
23
+
24
+
25
class _ClusteringOutput(NamedTuple):
    """Structured return type for clustering_and_global_features.

    Bundles the per-shower graphs, index partitions (charged vs neutral),
    aggregate features and truth quantities consumed by forward_correction.
    """
    graphs: Any  # batched DGL graph (feature-augmented)
    batch_idx: torch.Tensor  # per-node shower index into the batched graph
    high_level_feats: torch.Tensor  # per-shower aggregate features
    charged_idx: torch.Tensor  # shower indices with >= 1 track
    neutral_idx: torch.Tensor  # shower indices with no track
    feats_charged: torch.Tensor  # NaN-zeroed high_level_feats[charged_idx]
    feats_neutral: torch.Tensor  # NaN-zeroed high_level_feats[neutral_idx]
    pred_energy: torch.Tensor  # ones placeholder, filled by forward_correction
    pred_pos: torch.Tensor  # ones placeholder, filled by forward_correction
    pred_pid: torch.Tensor  # ones placeholder, filled by forward_correction
    true: Any  # matched truth-particle container
    true_pid: torch.Tensor  # matched truth PIDs
    true_coords: torch.Tensor  # matched truth coordinates
    sum_e: torch.Tensor  # summed reco hit energy per shower
    e_true_daughters: torch.Tensor  # daughter-corrected true energies
    n_fakes: int  # number of unmatched (fake) showers appended at the end
    extra_features: torch.Tensor  # per-shower diagnostics (see get_extra_features)
    fakes_idx: torch.Tensor  # fake cluster labels from the clustering step
45
+
46
+
47
+ def _zero_nans(t: torch.Tensor) -> torch.Tensor:
48
+ out = t.clone()
49
+ out[out != out] = 0
50
+ return out
51
+
52
+
53
+ def _decode_pid(pred_pid: torch.Tensor, pids: list, logits: torch.Tensor, idx: torch.Tensor) -> None:
54
+ if pids and len(idx):
55
+ labels = np.array(pids)[np.argmax(logits.cpu().detach(), axis=1)]
56
+ pred_pid[idx.flatten()] = torch.tensor(labels).long().to(idx.device)
57
+
58
+
59
class EnergyCorrection:
    """Orchestrates the post-clustering stage: builds per-shower graphs and
    global features, runs charged/neutral energy-correction + PID models,
    and computes their losses.

    Note: plain Python object (not an nn.Module); the trainable submodels
    are exposed as ``model_charged`` / ``model_neutral`` and registered on
    the main model by its __init__.
    """

    def __init__(self, main_model):
        # main_model is the owning LightningModule; used for device/trainer.
        self.args = main_model.args
        self.get_PID_categories()
        self.get_energy_correction()
        self.pid_conversion_dict = pid_conversion_dict
        self.main_model = main_model
        self.global_step = 0

    def get_PID_categories(self):
        # PID class indices handled by each branch (see pid_conversion_dict).
        self.pids_neutral = [2, 3]
        self.pids_charged = [0, 1, 4]

    def get_energy_correction(self):
        # One correction model per branch: tracks present vs absent.
        self.model_charged = ChargedEnergyCorrection(args=self.args)
        self.model_neutral = NeutralEnergyCorrection(args=self.args)

    def clustering_and_global_features(self, g, x, y, add_fakes=True) -> _ClusteringOutput:
        """Cluster hits into showers, then derive per-shower global features
        and the charged/neutral partition.

        :param g: batched DGL graph of hits.
        :param x: per-node clustering output (coords + beta logit).
        :param y: truth-particle container.
        :param add_fakes: forward unmatched predicted showers as fakes.
        """
        (
            graphs_new, true_new, sum_e, true_pid,
            e_true_corr_daughters, true_coords, number_of_fakes, fakes_idx,
        ) = obtain_clustering_for_matched_showers(
            g, x, y, self.main_model.trainer.global_rank,
            use_gt_clusters=self.args.use_gt_clusters,
            add_fakes=add_fakes,
        )

        # Per-node index of the shower each hit belongs to.
        batch_num_nodes = graphs_new.batch_num_nodes()
        batch_idx = []
        for i, n in enumerate(batch_num_nodes):
            batch_idx.extend([i] * n)
        batch_idx = torch.tensor(batch_idx).to(self.main_model.device)

        # Normalise positions (3300 is a detector-scale constant — TODO
        # confirm its origin) and append shower-summed features per node.
        graphs_new.ndata["h"][:, 0:3] = graphs_new.ndata["h"][:, 0:3] / 3300
        graphs_sum_features = scatter_add(graphs_new.ndata["h"], batch_idx, dim=0)
        graphs_sum_features = graphs_sum_features[batch_idx]
        betas = torch.sigmoid(graphs_new.ndata["h"][:, -1])
        graphs_new.ndata["h"] = torch.cat(
            (graphs_new.ndata["h"], graphs_sum_features), dim=1
        )

        high_level = get_post_clustering_features(graphs_new, sum_e)
        extra_features = get_extra_features(graphs_new, betas)

        # Placeholders filled later by forward_correction.
        dev = graphs_new.ndata["h"].device
        n = high_level.shape[0]
        pred_energy = torch.ones(n, device=dev)
        pred_pos = torch.ones(n, 3, device=dev)
        pred_pid = torch.ones(n, device=dev).long()

        # Append shower barycenter position, eta and phi to the features.
        node_features_avg = scatter_mean(graphs_new.ndata["h"], batch_idx, dim=0)[:, 0:3]
        eta = calculate_eta(node_features_avg[:, 0], node_features_avg[:, 1], node_features_avg[:, 2])
        phi = calculate_phi(node_features_avg[:, 0], node_features_avg[:, 1])
        high_level = torch.cat(
            (high_level, node_features_avg, eta.view(-1, 1), phi.view(-1, 1)), dim=1
        )

        # Column 7 holds the number of tracks in the shower (from
        # get_post_clustering_features); it splits charged vs neutral.
        num_tracks = high_level[:, 7]
        charged_idx = torch.where(num_tracks >= 1)[0]
        neutral_idx = torch.where(num_tracks < 1)[0]
        assert len(charged_idx) + len(neutral_idx) == len(num_tracks)
        assert high_level.shape[0] == graphs_new.batch_num_nodes().shape[0]

        return _ClusteringOutput(
            graphs=graphs_new,
            batch_idx=batch_idx,
            high_level_feats=high_level,
            charged_idx=charged_idx,
            neutral_idx=neutral_idx,
            feats_charged=_zero_nans(high_level[charged_idx]),
            feats_neutral=_zero_nans(high_level[neutral_idx]),
            pred_energy=pred_energy,
            pred_pos=pred_pos,
            pred_pid=pred_pid,
            true=true_new,
            true_pid=true_pid,
            true_coords=true_coords,
            sum_e=sum_e,
            e_true_daughters=e_true_corr_daughters,
            n_fakes=number_of_fakes,
            extra_features=extra_features,
            fakes_idx=fakes_idx,
        )

    def forward_correction(self, g, x, y, return_train):
        """Run the charged and neutral branches and assemble predictions.

        :param return_train: selects the (shorter) training return tuple.
        """
        # Fakes are only added at prediction time.
        cf = self.clustering_and_global_features(g, x, y, add_fakes=self.args.predict)

        charged_energies = self.model_charged.charged_prediction(
            cf.graphs, cf.charged_idx, cf.feats_charged
        )
        neutral_energies, neutral_pxyz_avg = self.model_neutral.neutral_prediction(
            cf.graphs, cf.neutral_idx, cf.feats_neutral
        )

        # Branch outputs are 4-tuples when PID is enabled, 3-tuples otherwise.
        if len(self.pids_charged):
            charged_energies, charged_positions, charged_PID_pred, charged_ref_pt_pred = charged_energies
        else:
            charged_energies, charged_positions, _ = charged_energies
        if len(self.pids_neutral):
            neutral_energies, neutral_positions, neutral_PID_pred, neutral_ref_pt_pred = neutral_energies
        else:
            neutral_energies, neutral_positions, _ = neutral_energies

        cf.pred_energy[cf.charged_idx.flatten()] = charged_energies
        cf.pred_energy[cf.neutral_idx.flatten()] = neutral_energies

        _decode_pid(cf.pred_pid, self.pids_charged, charged_PID_pred, cf.charged_idx)
        _decode_pid(cf.pred_pid, self.pids_neutral, neutral_PID_pred, cf.neutral_idx)

        # Clamp unphysical negative energy predictions.
        cf.pred_energy[cf.pred_energy < 0] = 0.0

        pred_ref_pt = torch.ones_like(cf.pred_pos)
        if len(cf.charged_idx):
            pred_ref_pt[cf.charged_idx.flatten()] = charged_ref_pt_pred.to(pred_ref_pt.device)
            cf.pred_pos[cf.charged_idx.flatten()] = charged_positions.float().to(cf.pred_pos.device)
        if len(cf.neutral_idx):
            pred_ref_pt[cf.neutral_idx.flatten()] = neutral_ref_pt_pred.to(cf.neutral_idx.device)
            cf.pred_pos[cf.neutral_idx.flatten()] = neutral_positions.to(cf.neutral_idx.device).float()

        predictions = {
            "pred_energy_corr": cf.pred_energy,
            "pred_pos": cf.pred_pos,
            "neutrals_idx": cf.neutral_idx.flatten(),
            "charged_idx": cf.charged_idx.flatten(),
            "pred_ref_pt": pred_ref_pt,
            "extra_features": cf.extra_features,
            "fakes_labels": cf.fakes_idx,
        }
        if len(self.pids_charged) or len(self.pids_neutral):
            predictions["pred_PID"] = cf.pred_pid
            predictions["charged_PID_pred"] = charged_PID_pred
            predictions["neutral_PID_pred"] = neutral_PID_pred

        if return_train:
            return x, predictions, cf.true, cf.sum_e, cf.true_pid, cf.true, cf.true_coords, cf.n_fakes
        else:
            return (
                x, predictions, cf.true, cf.sum_e, cf.graphs, cf.batch_idx,
                cf.high_level_feats, cf.true_pid, cf.e_true_daughters,
                cf.true_coords, cf.n_fakes,
            )

    def get_loss(self, batch_g, y, result, stats, fixed):
        """Compute the correction-stage losses from a forward_correction
        result (the ``return_train=False`` tuple shape).

        :return: (neutral energy loss, 0 placeholder for position loss,
            neutral PID loss, charged PID loss).
        """
        (
            model_output, dic_e_cor, e_true, e_sum_hits, new_graphs, batch_id,
            graph_level_features, pid_true_matched, e_true_corr_daughters,
            part_coords_matched, num_fakes,
        ) = result

        e_cor = dic_e_cor["pred_energy_corr"]
        mask_neutral_for_loss = correct_mask_neutral(
            torch.tensor(pid_true_matched), dic_e_cor["neutrals_idx"]
        )

        e_true_neutrals = e_true[mask_neutral_for_loss]
        e_pred_neutrals = e_cor[mask_neutral_for_loss]
        e_reco_neutrals = e_sum_hits[mask_neutral_for_loss]
        # Only train on showers whose reco energy is within 60% of truth,
        # to exclude badly-clustered outliers from the regression loss.
        in_distribution = (torch.abs(e_true_neutrals - e_reco_neutrals) / e_true_neutrals) < 0.6
        ypred = e_pred_neutrals[in_distribution]
        ybatch = e_true_neutrals[in_distribution]

        loss_EC_neutrals = criterion_E_cor(ypred.flatten(), ybatch.flatten()) if len(ypred) > 0 else 0
        wandb.log({"loss_EC_neutrals": loss_EC_neutrals})

        loss_neutral_pid = 0
        loss_charged_pid = 0

        if len(self.pids_charged):
            charged_PID_pred, charged_PID_true_onehot, mask_charged = obtain_PID_charged(
                dic_e_cor, pid_true_matched, self.pids_charged, self.args, self.pid_conversion_dict
            )
            loss_charged_pid, acc_charged = pid_loss(
                charged_PID_pred, charged_PID_true_onehot,
                e_true[dic_e_cor["charged_idx"]], mask_charged, fixed, "charged",
            )
            wandb.log({"loss_charged_pid": loss_charged_pid})

        if len(self.pids_neutral):
            neutral_PID_pred, neutral_PID_true_onehot, mask_neutral = obtain_PID_neutral(
                dic_e_cor, pid_true_matched, self.pids_neutral, self.args, self.pid_conversion_dict
            )
            loss_neutral_pid, acc_neutral = pid_loss(
                neutral_PID_pred, neutral_PID_true_onehot,
                e_true, mask_neutral, fixed, "neutral",
            )
            wandb.log({"loss_neutral_pid": loss_neutral_pid})

        return loss_EC_neutrals, 0, loss_neutral_pid, loss_charged_pid

    def get_validation_step_outputs(self, batch_g, y, result):
        """Unpack a validation forward_correction result and assemble the
        per-shower PID-logit matrix appended to the extra features.

        :return: (corrected energies, positions, reference points, PIDs,
            number of fakes, extra features incl. PID logits, fake labels).
        """
        (
            model_output, e_cor, e_true, e_sum_hits,
            new_graphs, batch_id, graph_level_features,
            pid_true_matched, e_true_corr_daughters,
            coords_true, num_fakes,
        ) = result

        if len(self.pids_charged):
            charged_idx = e_cor["charged_idx"]
        if len(self.pids_neutral):
            neutral_idx = e_cor["neutrals_idx"]
        pred_pid = e_cor["pred_PID"]
        charged_PID_pred = e_cor["charged_PID_pred"]
        neutral_PID_pred = e_cor["neutral_PID_pred"]
        pred_pos = e_cor["pred_pos"]
        pred_ref_pt = e_cor["pred_ref_pt"]
        extra_features = e_cor["extra_features"]
        fakes_labels = e_cor["fakes_labels"]
        e_cor = e_cor["pred_energy_corr"]

        # Scatter the branch logits into a common (n_showers, 5) matrix;
        # columns follow pids_charged=[0,1,4] and pids_neutral=[2,3].
        PID_logits = torch.zeros(len(e_cor), len(self.pids_charged) + len(self.pids_neutral)).float()
        PID_logits[charged_idx.cpu(), 0] = charged_PID_pred.detach().cpu()[:, 0]
        PID_logits[charged_idx.cpu(), 1] = charged_PID_pred.detach().cpu()[:, 1]
        PID_logits[charged_idx.cpu(), 4] = charged_PID_pred.detach().cpu()[:, 2]
        PID_logits[neutral_idx.cpu(), 2] = neutral_PID_pred.detach().cpu()[:, 0]
        PID_logits[neutral_idx.cpu(), 3] = neutral_PID_pred.detach().cpu()[:, 1]

        extra_features = extra_features.detach().cpu()
        extra_features = torch.cat((extra_features, PID_logits), dim=1).numpy()

        return e_cor, pred_pos, pred_ref_pt, pred_pid, num_fakes, extra_features, fakes_labels
280
+
281
+
282
def pid_loss(
    pid_pred_all: torch.Tensor,
    pid_true_all: torch.Tensor,
    e_true: torch.Tensor,
    mask: torch.Tensor,
    frozen: bool = False,
    name: str = "",
) -> tuple:
    """Cross-entropy PID loss and accuracy over the masked subset.

    :param pid_pred_all: (N, C) raw logits.
    :param pid_true_all: (N, C) one-hot (or soft) class targets.
    :param e_true: unused here; kept for interface compatibility.
    :param mask: (N,) selection of rows that enter the loss.
    :param frozen: unused here; kept for interface compatibility.
    :param name: unused here; kept for interface compatibility.
    :return: ``(loss, accuracy)``, or ``(0, 0)`` when nothing is scored.
    """
    if not len(pid_pred_all):
        return 0, 0
    mask = mask.bool()
    pid_pred = pid_pred_all[mask]
    pid_true = pid_true_all[mask]
    if not len(pid_pred):
        return 0, 0
    # Bug fix: accuracy was computed as element-wise equality between raw
    # logits and one-hot targets (essentially always ~0); compare the
    # predicted and true class indices instead.
    pred_classes = pid_pred.argmax(dim=1)
    true_classes = pid_true.argmax(dim=1)
    acc = (pred_classes == true_classes).float().mean()
    # CrossEntropyLoss accepts class-probability targets (torch >= 1.10).
    loss = CrossEntropyLoss()(pid_pred, pid_true)
    return loss, acc
src/models/energy_correction_charged.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ energy_correction_charged.py
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch_scatter import scatter_sum
8
+ from xformers.ops.fmha import BlockDiagonalMask
9
+ import dgl
10
+
11
+ from gatr import GATr, SelfAttentionConfig, MLPConfig
12
+ from gatr.interface import embed_point, embed_scalar
13
+ from src.layers.tools_for_regression import PickPAtDCA
14
+
15
+
16
class ChargedEnergyCorrection(nn.Module):
    """Per-cluster correction head for charged particle candidates.

    A small GATr runs over the hits of each candidate cluster; its pooled
    output, concatenated with graph-level features, feeds an MLP PID
    classifier. Energy and direction are not regressed here: they are read
    off the matched track via ``PickPAtDCA``.
    """

    def __init__(self, args):
        super().__init__()
        # Width of the graph-level feature vector and of the pooled GATr
        # multivector output (16 = one 16-component multivector channel).
        self.in_features_global = 16
        self.in_features_gnn = 16  # GATr multivector output dim per batch
        # Columns of the caller's combined PID-logit tensor that this head
        # fills (the caller writes the 3 outputs into columns 0, 1 and 4).
        self.pid_channels = [0, 1, 4]
        n_layers = 3
        self.args = args

        self.gatr = GATr(
            in_mv_channels=1,
            out_mv_channels=1,
            hidden_mv_channels=4,
            in_s_channels=2,
            out_s_channels=None,
            hidden_s_channels=4,
            num_blocks=3,
            attention=SelfAttentionConfig(),
            mlp=MLPConfig(),
        )

        out_features_gnn = self.in_features_gnn
        in_features_global = self.in_features_global
        n_pid_classes = len(self.pid_channels)

        # PID MLP head; the "+ 1" accounts for the recovered-energy feature
        # appended in predict().
        # NOTE(review): the first two Linear layers are stacked without an
        # activation in between — confirm this is intentional.
        pid_layers = [nn.Linear(out_features_gnn + in_features_global + 1, 64)]
        for _ in range(n_layers - 1):
            pid_layers.append(nn.Linear(64, 64))
            pid_layers.append(nn.ReLU())
        pid_layers.append(nn.Linear(64, n_pid_classes))
        self.PID_head = nn.Sequential(*pid_layers)

        self.PickPAtDCA = PickPAtDCA()

    def charged_prediction(self, graphs_new, charged_idx, graphs_high_level_features):
        # Re-batch only the clusters flagged as charged and run the head on them.
        unbatched = dgl.unbatch(graphs_new)
        if len(charged_idx) > 0:
            charged_graphs = dgl.batch([unbatched[i] for i in charged_idx])
            charged_energies = self.predict(
                graphs_high_level_features,
                charged_graphs,
            )
        else:
            # No charged candidates: four empty tensors on the right device,
            # mirroring the (E, direction, pid, ref_pt) tuple of predict().
            empty = torch.tensor([]).to(graphs_new.ndata["h"].device)
            charged_energies = [empty, empty, empty, empty]
        return charged_energies

    def predict(self, x_global_features, graphs_new=None):
        """
        Forward pass for charged energy correction.
        :param x_global_features: Global graph-level features (batch, in_features_global)
        :param graphs_new: Batched DGL graph of hit-level data
        :return: (E, direction, pid_pred, ref_pt_pred)
        """
        if graphs_new is not None:
            # Per-node cluster index, used to pool node outputs per cluster.
            batch_num_nodes = graphs_new.batch_num_nodes()
            batch_idx = []
            for i, n in enumerate(batch_num_nodes):
                batch_idx.extend([i] * n)
            batch_idx = torch.tensor(batch_idx).to(graphs_new.device)

        # Node feature layout (inferred from usage here and in the neutral
        # head — TODO confirm upstream): [0:3] hit position, [4:8] one-hot
        # hit type, [8] hit energy, [9] track momentum.
        hits_points = graphs_new.ndata["h"][:, 0:3]
        hit_type = graphs_new.ndata["h"][:, 4:8].argmax(dim=1)
        p = graphs_new.ndata["h"][:, 9]
        e = graphs_new.ndata["h"][:, 8]

        # Geometric-algebra embedding: positions as points, hit type as a scalar;
        # momentum and energy travel through the auxiliary scalar channels.
        embedded_inputs = embed_point(hits_points) + embed_scalar(hit_type.view(-1, 1))
        extra_scalars = torch.cat([p.unsqueeze(1), e.unsqueeze(1)], dim=1)
        mask = self.build_attention_mask(graphs_new)
        embedded_inputs = embedded_inputs.unsqueeze(-2)

        embedded_outputs, _ = self.gatr(
            embedded_inputs, scalars=extra_scalars, attention_mask=mask
        )
        # Sum-pool the single multivector channel over each cluster.
        embedded_outputs_per_batch = scatter_sum(embedded_outputs[:, 0, :], batch_idx, dim=0)

        # Extra global feature: ratio of columns 6 and 3 — presumably an
        # energy recovered from per-graph sums; TODO confirm meaning and that
        # column 3 cannot be zero here.
        recovered_E = x_global_features[:, 6] / x_global_features[:, 3]
        x_global_features = torch.cat((x_global_features, recovered_E.view(-1, 1)), dim=1)
        model_x = torch.cat([x_global_features, embedded_outputs_per_batch], dim=1)

        pid_pred = self.PID_head(model_x)
        # Track kinematics at the distance of closest approach; energy is the
        # momentum magnitude, direction the unit vector.
        p_tracks, pos, ref_pt_pred = self.PickPAtDCA.predict(x_global_features, graphs_new)
        E = torch.norm(pos, dim=1)
        pos = (pos / torch.norm(pos, dim=1).unsqueeze(1)).clone()
        return E, pos, pid_pred, ref_pt_pred

    @staticmethod
    def obtain_batch_numbers(g):
        # Per-node graph index (float tensor) for a batched DGL graph.
        graphs_eval = dgl.unbatch(g)
        batch_numbers = []
        for index, gj in enumerate(graphs_eval):
            num_nodes = gj.number_of_nodes()
            batch_numbers.append(index * torch.ones(num_nodes))
        return torch.cat(batch_numbers, dim=0)

    def build_attention_mask(self, g):
        # Block-diagonal mask so attention never crosses cluster boundaries.
        batch_numbers = self.obtain_batch_numbers(g)
        return BlockDiagonalMask.from_seqlens(
            torch.bincount(batch_numbers.long()).tolist()
        )
src/models/energy_correction_neutral.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ energy_correction_neutral.py
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch_scatter import scatter_sum
9
+ from xformers.ops.fmha import BlockDiagonalMask
10
+ import dgl
11
+
12
+ from gatr import GATr, SelfAttentionConfig, MLPConfig
13
+ from gatr.interface import embed_point, embed_scalar
14
+ from src.models.E_correction_module import Net
15
+ from src.layers.tools_for_regression import ECNetWrapperAvg, AverageHitsP
16
+
17
+
18
class NeutralEnergyCorrection(nn.Module):
    """Per-cluster correction head for neutral particle candidates.

    Two independent GATr backbones with identical hyper-parameters run over
    the cluster hits: one feeds the energy regressor (``Net``), the other the
    PID classifier. Direction comes from an (ECAL-only) average of hit
    positions via ``AverageHitsP``.
    """

    def __init__(self, args):
        super().__init__()
        self.in_features_global = 16
        self.in_features_gnn = 16  # GATr multivector output dim per batch
        # Columns of the caller's combined PID-logit tensor that this head
        # fills (the caller writes the 2 outputs into columns 2 and 3).
        self.pid_channels = [2, 3]
        self.args = args
        n_layers = 3

        # Shared hyper-parameters for both backbones (energy and PID).
        gatr_kwargs = dict(
            in_mv_channels=1,
            out_mv_channels=1,
            hidden_mv_channels=4,
            in_s_channels=2,
            out_s_channels=None,
            hidden_s_channels=4,
            num_blocks=3,
            attention=SelfAttentionConfig(),
            mlp=MLPConfig(),
        )
        self.gatr = GATr(**gatr_kwargs)
        self.gatr_pid = GATr(**gatr_kwargs)

        out_features_gnn = self.in_features_gnn
        in_features_global = self.in_features_global
        n_pid_classes = len(self.pid_channels)
        out_f = 1  # Energy prediction (scalar)

        # PID MLP head.
        # NOTE(review): the first two Linear layers are stacked without an
        # activation in between — confirm this is intentional.
        pid_layers = [nn.Linear(out_features_gnn + in_features_global, 64)]
        for _ in range(n_layers - 1):
            pid_layers.append(nn.Linear(64, 64))
            pid_layers.append(nn.ReLU())
        pid_layers.append(nn.Linear(64, n_pid_classes))
        self.PID_head = nn.Sequential(*pid_layers)

        # Energy regressor over pooled GATr output + global features.
        self.model = Net(
            in_features=out_features_gnn + in_features_global,
            out_features=out_f,
            return_raw=True,
        )
        self.ec_model_wrapper_neutral_avg = ECNetWrapperAvg()
        self.AvgHits = AverageHitsP(ecal_only=True)

    def neutral_prediction(self, graphs_new, neutral_idx, features_neutral_no_nan):
        # Re-batch only the clusters flagged as neutral and run the head on them.
        unbatched = dgl.unbatch(graphs_new)
        if len(neutral_idx) > 0:
            neutral_graphs = dgl.batch([unbatched[i] for i in neutral_idx])
            neutral_energies = self.predict(
                features_neutral_no_nan,
                neutral_graphs,
            )
            # Secondary momentum estimate from the hit-averaging wrapper
            # (element [1] of its prediction tuple).
            neutral_pxyz_avg = self.ec_model_wrapper_neutral_avg.predict(
                features_neutral_no_nan,
                neutral_graphs,
            )[1]
        else:
            # No neutral candidates: empty tensors on the right device,
            # mirroring the (E, direction, pid, ref_pt) tuple of predict().
            empty = torch.tensor([]).to(graphs_new.ndata["h"].device)
            neutral_energies = [empty, empty, empty, empty]
            neutral_pxyz_avg = empty
        return neutral_energies, neutral_pxyz_avg

    def predict(self, x_global_features, graphs_new=None):
        """
        Forward pass for neutral energy correction.
        :param x_global_features: Global graph-level features (batch, in_features_global)
        :param graphs_new: Batched DGL graph of hit-level data
        :return: (E_pred, direction, pid_pred, ref_pt_pred)
        """
        if graphs_new is not None:
            # Per-node cluster index, used to pool node outputs per cluster.
            batch_num_nodes = graphs_new.batch_num_nodes()
            batch_idx = []
            for i, n in enumerate(batch_num_nodes):
                batch_idx.extend([i] * n)
            batch_idx = torch.tensor(batch_idx).to(graphs_new.device)

        # Node feature layout (inferred from usage — TODO confirm upstream):
        # [0:3] hit position, [4:8] one-hot hit type, [8] hit energy,
        # [9] track momentum.
        hits_points = graphs_new.ndata["h"][:, 0:3]
        hit_type = graphs_new.ndata["h"][:, 4:8].argmax(dim=1)
        p = graphs_new.ndata["h"][:, 9]
        e = graphs_new.ndata["h"][:, 8]

        # Same geometric-algebra embedding is fed to both backbones.
        embedded_inputs = embed_point(hits_points) + embed_scalar(hit_type.view(-1, 1))
        extra_scalars = torch.cat([p.unsqueeze(1), e.unsqueeze(1)], dim=1)
        mask = self.build_attention_mask(graphs_new)
        embedded_inputs = embedded_inputs.unsqueeze(-2)

        # Energy branch: GATr -> per-cluster sum-pool -> concat with globals.
        embedded_outputs, _ = self.gatr(
            embedded_inputs, scalars=extra_scalars, attention_mask=mask
        )
        embedded_outputs_per_batch = scatter_sum(embedded_outputs[:, 0, :], batch_idx, dim=0)
        model_x = torch.cat([x_global_features, embedded_outputs_per_batch], dim=1)

        # PID branch: independent backbone, same pooling scheme.
        embedded_outputs_pid, _ = self.gatr_pid(
            embedded_inputs, scalars=extra_scalars, attention_mask=mask
        )
        embedded_outputs_per_batch_pid = scatter_sum(
            embedded_outputs_pid[:, 0, :], batch_idx, dim=0
        )
        model_x_pid = torch.cat([x_global_features, embedded_outputs_per_batch_pid], dim=1)

        res = self.model(model_x)
        pid_pred = self.PID_head(model_x_pid)
        E_pred = res[:, 0]

        # Direction from averaged (ECAL) hit positions, normalized to a unit vector.
        _, p_pred, ref_pt_pred = self.AvgHits.predict(x_global_features, graphs_new)
        p_pred = (p_pred / torch.norm(p_pred, dim=1).unsqueeze(1)).clone()
        return E_pred, p_pred, pid_pred, ref_pt_pred

    @staticmethod
    def obtain_batch_numbers(g):
        # Per-node graph index (float tensor) for a batched DGL graph.
        graphs_eval = dgl.unbatch(g)
        batch_numbers = []
        for index, gj in enumerate(graphs_eval):
            num_nodes = gj.number_of_nodes()
            batch_numbers.append(index * torch.ones(num_nodes))
        return torch.cat(batch_numbers, dim=0)

    def build_attention_mask(self, g):
        # Block-diagonal mask so attention never crosses cluster boundaries.
        batch_numbers = self.obtain_batch_numbers(g)
        return BlockDiagonalMask.from_seqlens(
            torch.bincount(batch_numbers.long()).tolist()
        )
139
+
140
+
141
def correct_mask_neutral(pid_neutral, neural_mask):
    """
    Filter neutral-candidate indices, keeping only genuine neutral PIDs.

    :param pid_neutral: per-candidate PDG codes (sign is ignored)
    :param neural_mask: index tensor of candidates currently flagged neutral
    :return: subset of ``neural_mask`` whose |PDG| is a photon (22),
        K-long (130) or neutron (2112)
    """
    device = neural_mask.device
    abs_pids = torch.abs(pid_neutral.to(device))
    neutral_pdg_codes = torch.tensor([22, 130, 2112], device=abs_pids.device)
    is_neutral = torch.isin(abs_pids[neural_mask], neutral_pdg_codes)
    return neural_mask[is_neutral.to(device)]
151
+
152
+
153
def criterion_E_cor(ypred, ytrue):
    """Mean absolute error for energy correction; 0 for empty predictions."""
    if not len(ypred):
        return 0
    per_item = F.l1_loss(ypred, ytrue, reduction="none")
    return torch.mean(per_item)
src/models/wrapper/example_mode_gatr_noise.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from src.models.Gatr_pf_e_noise import ExampleWrapper
3
+
4
+
5
class GraphTransformerNetWrapper(torch.nn.Module):
    """Thin adapter exposing ``ExampleWrapper`` under the trainer's
    (graph, targets, step_count) forward interface."""

    def __init__(self, args, dev, **kwargs) -> None:
        super().__init__()
        # The actual GATr particle-flow model; all kwargs pass straight through.
        self.mod = ExampleWrapper(args, dev, **kwargs)

    def forward(self, g, y, step_count, **kwargs):
        # Pure delegation — no extra processing in the wrapper.
        return self.mod(g, y, step_count, **kwargs)
12
+
13
+
14
def get_model(data_config, args, dev, **kwargs):
    """Factory: build the wrapped GATr model.

    :return: (model, model_info) — ``model_info`` is currently empty.
    """
    wrapped = GraphTransformerNetWrapper(args, dev, **kwargs)
    return wrapped, {}
18
+
19
+
20
def get_loss(data_config, **kwargs):
    """Return the default training criterion (mean-squared error)."""
    criterion = torch.nn.MSELoss()
    return criterion
src/train_lightning1.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ import os
4
+ import sys
5
+ import glob
6
+ import torch
7
+ import lightning as L
8
+ from lightning.pytorch.loggers import WandbLogger
9
+
10
+ sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
11
+
12
+ from src.utils.parser_args import parser
13
+ from src.utils.train_utils import (
14
+ train_load,
15
+ test_load,
16
+ get_samples_steps_per_epoch,
17
+ model_setup,
18
+ set_gpus,
19
+ )
20
+ from src.utils.load_pretrained_models import (
21
+ load_train_model,
22
+ load_test_model,
23
+ )
24
+ from src.utils.callbacks import (
25
+ get_callbacks,
26
+ get_callbacks_eval,
27
+ )
28
+
29
+
30
+ # ----------------------------------------------------------------------
31
+ # Helpers
32
+ # ----------------------------------------------------------------------
33
+
34
def setup_wandb(args):
    """Create a WandbLogger from CLI options.

    ``log_model="all"`` uploads every checkpoint the trainer saves.
    """
    return WandbLogger(
        project=args.wandb_projectname,
        entity=args.wandb_entity,
        name=args.wandb_displayname,
        log_model="all",
    )
41
+
42
+
43
def build_trainer(args, gpus, logger, training=True):
    """Construct a Lightning Trainer for training or evaluation.

    :param args: parsed CLI namespace (uses correction, model_prefix,
        num_epochs, train_batches, freeze_clustering via callbacks)
    :param gpus: list of GPU indices for ``devices``
    :param logger: a Lightning logger (wandb)
    :param training: True builds the fit trainer, False the eval trainer
    """
    callbacks = get_callbacks(args) if training else get_callbacks_eval(args)

    # "auto" when training only the correction head, DDP for full training.
    # NOTE(review): when training=False and args.correction is False this
    # evaluates to None — confirm the installed Lightning version accepts
    # strategy=None (newer versions expect "auto" instead).
    strategy = "auto" if args.correction else "ddp" if training else None

    return L.Trainer(
        callbacks=callbacks,
        accelerator="gpu",
        devices=gpus,
        default_root_dir=args.model_prefix,
        logger=logger,
        max_epochs=args.num_epochs if training else None,
        strategy=strategy,
        # Validation capped at 5 batches during training for speed.
        limit_train_batches=args.train_batches if training else None,
        limit_val_batches=5 if training else None,
    )
59
+
60
+
61
+ # ----------------------------------------------------------------------
62
+ # Main
63
+ # ----------------------------------------------------------------------
64
+
65
def main():
    """Entry point: parse CLI args, build data loaders and model, then train
    and/or evaluate depending on ``--predict`` / ``--data-test``."""
    args = parser.parse_args()
    # NOTE(review): anomaly detection is a significant slowdown — presumably
    # left on for debugging; consider gating it behind a flag.
    torch.autograd.set_detect_anomaly(True)

    training_mode = not args.predict
    args.local_rank = 0

    # --------------------------------------------------
    # Data
    # --------------------------------------------------
    args = get_samples_steps_per_epoch(args)

    if training_mode:
        # Expand the first training path into its parquet files.
        # NOTE(review): assumes --data-train has at least one entry; an empty
        # list raises IndexError here.
        args.data_train = glob.glob(args.data_train[0] + "*.parquet")
        train_loader, val_loader, data_config, train_input_names = train_load(args)
    else:
        test_loaders, data_config = test_load(args)

    # --------------------------------------------------
    # Model & devices
    # --------------------------------------------------
    model = model_setup(args, data_config)
    gpus, dev = set_gpus(args)

    # Resume/fine-tune from a checkpoint when requested.
    if training_mode and args.load_model_weights:
        model = load_train_model(args, dev)

    # --------------------------------------------------
    # Logger
    # --------------------------------------------------
    wandb_logger = setup_wandb(args)

    # --------------------------------------------------
    # Training
    # --------------------------------------------------
    if training_mode:
        trainer = build_trainer(args, gpus, wandb_logger, training=True)
        args.local_rank = trainer.global_rank

        trainer.fit(
            model=model,
            train_dataloaders=val_loader if False else train_loader,
        ) if False else trainer.fit(
            model=model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader,
        )

    # --------------------------------------------------
    # Evaluation
    # --------------------------------------------------
    if args.data_test:
        if args.load_model_weights:
            model = load_test_model(args, dev)

        trainer = build_trainer(args, gpus, wandb_logger, training=False)

        # test_loaders maps dataset name -> lazy loader factory.
        # NOTE(review): evaluation runs through validate(), not test() —
        # presumably so the same validation hooks produce the dataframes.
        for name, get_test_loader in test_loaders.items():
            test_loader = get_test_loader()
            trainer.validate(
                model=model,
                dataloaders=test_loader,
            )
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()
src/utils/callbacks.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from lightning.pytorch.callbacks import (
3
+ TQDMProgressBar,
4
+ ModelCheckpoint,
5
+ LearningRateMonitor,
6
+ )
7
+ from src.layers.utils_training import FreezeClustering
8
+
9
def get_callbacks(args):
    """Callbacks for training runs: periodic checkpoints, LR monitoring, a
    progress bar, and (optionally) clustering-freeze.

    :param args: needs ``model_prefix`` and ``freeze_clustering``.
    """
    checkpoint_callback = ModelCheckpoint(
        dirpath=args.model_prefix,  # checkpoints_path, # <--- specify this on the trainer itself for version control
        filename="_{epoch}_{step}",
        # every_n_epochs=val_every_n_epochs,
        every_n_train_steps=500,  # checkpoint by step, not epoch
        save_top_k=-1,  # <--- this is important! keep every checkpoint
        save_weights_only=True,
    )
    lr_monitor = LearningRateMonitor(logging_interval="epoch")
    callbacks = [
        TQDMProgressBar(refresh_rate=10),
        checkpoint_callback,
        lr_monitor,
    ]
    # Optionally freeze the clustering sub-network while training the rest.
    if args.freeze_clustering:
        callbacks.append(FreezeClustering())
    return callbacks
27
+
28
def get_callbacks_eval(args):
    """Callbacks for evaluation runs: only a fast-refresh progress bar."""
    return [TQDMProgressBar(refresh_rate=1)]
src/utils/import_tools.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from importlib.util import spec_from_file_location, module_from_spec
2
+
3
+
4
def import_module(path, name='_mod'):
    """Dynamically load and execute a Python module from a file path.

    :param path: filesystem path to the .py file
    :param name: module name to register on the loaded module object
    :return: the executed module object
    """
    module_spec = spec_from_file_location(name, path)
    loaded = module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
src/utils/inference/pandas_helpers.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gzip
2
+ import pickle
3
+ import mplhep as hep
4
+ from src.utils.pid_conversion import pid_conversion_dict
5
+
6
+ #hep.style.use("CMS")
7
+ import matplotlib
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+
12
+
13
+
14
def open_mlpf_dataframe(path_mlpf, neutrals_only=False, charged_only=False):
    """Load a pickled shower dataframe and add a coarse true-PID class column.

    NOTE(review): ``neutrals_only`` and ``charged_only`` are currently
    unused — either implement the filtering or drop the parameters.

    :param path_mlpf: path to the pickled pandas DataFrame
    :return: the same DataFrame with a ``pid_4_class_true`` column added
    """
    data = pd.read_pickle(path_mlpf)
    sd = data
    # Map raw PDG codes to the project's coarse PID classes.
    sd["pid_4_class_true"] = sd["pid"].map(pid_conversion_dict)
    if "pred_pid_matched" in sd.columns:
        # Values below -1 presumably encode "no matched prediction";
        # convert to NaN so they drop out of metrics. TODO confirm sentinel.
        sd.loc[sd["pred_pid_matched"] < -1, "pred_pid_matched"] = np.nan
    return sd
21
+
22
def concat_with_batch_fix(dfs, batch_key="number_batch"):
    """Concatenate event dataframes while keeping batch numbers unique.

    Each dataframe's batch column is shifted by a running offset so that
    batch indices never collide across the inputs.

    :param dfs: iterable of DataFrames, each with a ``batch_key`` column
    :param batch_key: name of the batch-index column
    :return: one concatenated DataFrame with a fresh RangeIndex
    :raises KeyError: if any dataframe lacks ``batch_key``
    """
    shifted = []
    offset = 0
    for frame in dfs:
        frame = frame.copy()
        if batch_key not in frame.columns:
            raise KeyError(f"'{batch_key}' not found in one of the DataFrames.")
        frame[batch_key] = frame[batch_key] + offset
        offset = frame[batch_key].max() + 1
        shifted.append(frame)
    return pd.concat(shifted, ignore_index=True)
36
+
src/utils/load_pretrained_models.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+
4
def load_train_model(args, dev):
    """Load a pretrained GATr checkpoint to resume/fine-tune training.

    ``strict=False`` tolerates missing/extra keys so heads added after the
    checkpoint was saved are left randomly initialized.
    """
    from src.models.Gatr_pf_e_noise import ExampleWrapper as GravnetModel
    model = GravnetModel.load_from_checkpoint(
        args.load_model_weights, args=args, dev=0, map_location=dev, strict=False)
    return model
9
+
10
def load_test_model(args, dev):
    """Load a model for evaluation.

    Two modes:
    - plain clustering model (``not args.correction``): load the checkpoint
      directly;
    - correction mode: load the correction checkpoint non-strictly, then
      graft the clustering submodules (gatr, batch norm, clustering, beta)
      from a separate clustering checkpoint.

    NOTE(review): if ``args.load_model_weights`` is None neither branch
    assigns ``model`` and ``model.eval()`` raises UnboundLocalError —
    confirm callers always pass a checkpoint.
    """
    if args.load_model_weights is not None and (not args.correction):
        from src.models.Gatr_pf_e_noise import ExampleWrapper as GravnetModel
        model = GravnetModel.load_from_checkpoint(
            args.load_model_weights, args=args, dev=0, map_location=dev, strict=False
        )

    if args.load_model_weights is not None and args.correction:
        from src.models.Gatr_pf_e_noise import ExampleWrapper as GravnetModel
        ckpt = torch.load(args.load_model_weights, map_location=dev)

        # Non-strict load: correction checkpoints may lack clustering weights.
        state_dict = ckpt["state_dict"]
        model = GravnetModel( args=args, dev=0)
        model.load_state_dict(state_dict, strict=False)

        # Overwrite the clustering part with the dedicated clustering checkpoint.
        # NOTE(review): map_location is hard-coded to cuda:0 here, unlike the
        # ``dev`` used above — confirm this is intentional.
        model2 = GravnetModel.load_from_checkpoint(args.load_model_weights_clustering, args=args, dev=0, strict=False, map_location=torch.device("cuda:0"))
        model.gatr = model2.gatr
        model.ScaledGooeyBatchNorm2_1 = model2.ScaledGooeyBatchNorm2_1
        model.clustering = model2.clustering
        model.beta = model2.beta
    model.eval()
    return model
32
+
src/utils/logger_wandb.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wandb
2
+ import numpy as np
3
+ import torch
4
+ from sklearn.metrics import roc_curve, roc_auc_score
5
+ import json
6
+ import dgl
7
+ import matplotlib.pyplot as plt
8
+ from sklearn.decomposition import PCA
9
+ from torch_scatter import scatter_max
10
+ from matplotlib.cm import ScalarMappable
11
+ from matplotlib.colors import Normalize
12
+
13
+
14
def log_losses_wandb(
    logwandb, num_batches, local_rank, losses, loss, val=False
):
    """Log loss components to wandb every 10th batch on rank 0.

    :param losses: sequence of loss components; indices 0-3 are the
        condensation terms, 12/13 the attractive/repulsive terms
    :param loss: total regression loss
    :param val: appends " val" to every metric name when True
    """
    suffix = " val" if val else ""
    should_log = logwandb and ((num_batches - 1) % 10) == 0 and local_rank == 0
    if not should_log:
        return
    wandb.log(
        {
            "loss" + suffix + " regression": loss,
            "loss" + suffix + " lv": losses[0],
            "loss" + suffix + " beta": losses[1],
            "loss" + suffix + " beta sig": losses[2],
            "loss" + suffix + " beta noise": losses[3],
            "loss" + suffix + " attractive": losses[12],
            "loss" + suffix + " repulsive": losses[13],
        }
    )
+ )
33
+
src/utils/parser_args.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse

# Command-line interface shared by training and evaluation entry points.
parser = argparse.ArgumentParser()

# ---- model-freezing options -------------------------------------------------
parser.add_argument(
    "--freeze-clustering",
    action="store_true",
    default=False,
    help="Freeze the clustering part of the model",
)


# ---- data inputs ------------------------------------------------------------
parser.add_argument("-c", "--data-config", type=str, help="data config YAML file")

parser.add_argument(
    "-i",
    "--data-train",
    nargs="*",
    default=[],
    help="training files; supported syntax:"
    " (a) plain list, `--data-train /path/to/a/* /path/to/b/*`;"
    " (b) (named) groups [Recommended], `--data-train a:/path/to/a/* b:/path/to/b/*`,"
    " the file splitting (for each dataloader worker) will be performed per group,"
    " and then mixed together, to ensure a uniform mixing from all groups for each worker.",
)
parser.add_argument(
    "-l",
    "--data-val",
    nargs="*",
    default=[],
    help="validation files; when not set, will use training files and split by `--train-val-split`",
)
parser.add_argument(
    "-t",
    "--data-test",
    nargs="*",
    default=[],
    help="testing files; supported syntax:"
    " (a) plain list, `--data-test /path/to/a/* /path/to/b/*`;"
    " (b) keyword-based, `--data-test a:/path/to/a/* b:/path/to/b/*`, will produce output_a, output_b;"
    " (c) split output per N input files, `--data-test a%10:/path/to/a/*`, will split per 10 input files",
)

# ---- data sampling / loading behavior --------------------------------------
parser.add_argument(
    "--data-fraction",
    type=float,
    default=1,
    help="fraction of events to load from each file; for training, the events are randomly selected for each epoch",
)
parser.add_argument(
    "--file-fraction",
    type=float,
    default=1,
    help="fraction of files to load; for training, the files are randomly selected for each epoch",
)
parser.add_argument(
    "--fetch-by-files",
    action="store_true",
    default=False,
    help="When enabled, will load all events from a small number (set by ``--fetch-step``) of files for each data fetching. "
    "Otherwise (default), load a small fraction of events from all files each time, which helps reduce variations in the sample composition.",
)
parser.add_argument(
    "--fetch-step",
    type=float,
    default=0.01,
    help="fraction of events to load each time from every file (when ``--fetch-by-files`` is disabled); "
    "Or: number of files to load each time (when ``--fetch-by-files`` is enabled). Shuffling & sampling is done within these events, so set a large enough value.",
)

parser.add_argument(
    "--train-val-split",
    type=float,
    default=0.8,
    help="training/validation split fraction",
)


# ---- network / checkpointing ------------------------------------------------
parser.add_argument(
    "-n",
    "--network-config",
    type=str,
    help="network architecture configuration file; the path must be relative to the current dir",
)
parser.add_argument(
    "-m",
    "--model-prefix",
    type=str,
    default="models/{auto}/networkss",
    help="path to save or load the model; for training, this will be used as a prefix, so model snapshots "
    "will saved to `{model_prefix}_epoch-%d_state.pt` after each epoch, and the one with the best "
    "validation metric to `{model_prefix}_best_epoch_state.pt`; for testing, this should be the full path "
    "including the suffix, otherwise the one with the best validation metric will be used; "
    "for training, `{auto}` can be used as part of the path to auto-generate a name, "
    "based on the timestamp and network configuration",
)

parser.add_argument(
    "--load-model-weights",
    type=str,
    default=None,
    help="initialize model with pre-trained weights",
)
parser.add_argument(
    "--load-model-weights-clustering",
    type=str,
    default=None,
    help="initialize model with pre-trained weights for clustering part of the model",
)

# ---- optimization schedule --------------------------------------------------
parser.add_argument("--start-lr", type=float, default=5e-3, help="start learning rate")

parser.add_argument("--num-epochs", type=int, default=20, help="number of epochs")
parser.add_argument(
    "--steps-per-epoch",
    type=int,
    default=None,
    help="number of steps (iterations) per epochs; "
    "if neither of `--steps-per-epoch` or `--samples-per-epoch` is set, each epoch will run over all loaded samples",
)
parser.add_argument(
    "--steps-per-epoch-val",
    type=int,
    default=None,
    help="number of steps (iterations) per epochs for validation; "
    "if neither of `--steps-per-epoch-val` or `--samples-per-epoch-val` is set, each epoch will run over all loaded samples",
)
parser.add_argument(
    "--samples-per-epoch",
    type=int,
    default=None,
    help="number of samples per epochs; "
    "if neither of `--steps-per-epoch` or `--samples-per-epoch` is set, each epoch will run over all loaded samples",
)
parser.add_argument(
    "--samples-per-epoch-val",
    type=int,
    default=None,
    help="number of samples per epochs for validation; "
    "if neither of `--steps-per-epoch-val` or `--samples-per-epoch-val` is set, each epoch will run over all loaded samples",
)
parser.add_argument("--batch-size", type=int, default=128, help="batch size")

# ---- hardware / dataloading -------------------------------------------------
parser.add_argument(
    "--gpus",
    type=str,
    default="0",
    help='device for the training/testing; to use CPU, set to empty string (""); to use multiple gpu, set it as a comma separated list, e.g., `1,2,3,4`',
)

parser.add_argument(
    "--num-workers",
    type=int,
    default=1,
    help="number of threads to load the dataset; memory consumption and disk access load increases (~linearly) with this numbers",
)
parser.add_argument(
    "--prefetch-factor",
    type=int,
    default=1,
    help="How many items to prefetch in the dataloaders. Should be about the same order of magnitude as batch size for optimal performance.",
)
parser.add_argument(
    "--predict",
    action="store_true",
    default=False,
    help="run prediction instead of training",
)


# ---- wandb logging ----------------------------------------------------------
parser.add_argument(
    "--log-wandb", action="store_true", default=False, help="use wandb for loging"
)
parser.add_argument(
    "--wandb-displayname",
    type=str,
    help="give display name to wandb run, if not entered a random one is generated",
)
parser.add_argument(
    "--wandb-projectname", type=str, help="project where the run is stored inside wandb"
)
parser.add_argument(
    "--wandb-entity", type=str, help="username or team name where you are sending runs"
)


# ---- object-condensation hyper-parameters -----------------------------------
parser.add_argument(
    "--qmin", type=float, default=0.1, help="define qmin for condensation"
)


parser.add_argument(
    "--frac_cluster_loss",
    type=float,
    default=0,
    help="Fraction of total pairs to use for the clustering loss",
)


parser.add_argument(
    "--use-average-cc-pos",
    default=0.0,
    type=float,
    help="push the alpha to the mean of the coordinates in the object by this value",
)


# ---- training-mode switches -------------------------------------------------
parser.add_argument(
    "--correction",
    action="store_true",
    default=False,
    help="Train correction only",
)


parser.add_argument(
    "--use-gt-clusters",
    default=False,
    action="store_true",
    help="If toggled, uses ground-truth clusters instead of the predicted ones by the model. We can use this to simulate 'ideal' clustering.",
)


# ---- evaluation output ------------------------------------------------------
parser.add_argument(
    "--name-output",
    type=str,
    help="name of the dataframe stored during eval",
)
parser.add_argument(
    "--train-batches",
    default=100,
    type=int,
    help="number of train batches",
)
parser.add_argument(
    "--pandora",
    default=False,
    action="store_true",
    help="using pandora information",
)
src/utils/pid_conversion.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# A global variable, so it doesn't have to be modified in 10 different places when new particles are added

# PDG code -> coarse class. Judging from our_to_pandora_mapping below:
# 0 = electron, 1 = charged hadron, 2 = neutral hadron, 3 = photon, 4 = muon.
# Float keys are nucleus codes (10-digit PDG IDs) — NOTE(review): they only
# match if the source column stores them as floats; confirm dtype upstream.
pid_conversion_dict = {11: 0, -11: 0, 211: 1, -211: 1, 130: 2, -130: 2, 2112: 2, -2112: 2, 22: 3, 321: 1, -321: 1, 2212: 1, -2212: 1, 310: 2, -310: 2, 3122: 2, -3122: 2, 3212: 2, -3212: 2, 3112: 1, -3112: 1, 3222: 1, -3222: 1, 3224: 1, -3224: 1, 3312: 2, -3312: 2, 13: 4, -13: 4, 3322: 2, -3322: 2, 1000020030.0: 2, 1000010050.0: 2, 1000010048.0: 2, 3334: 1, -3334:1, 1000020032.0: 2, 1000080128.0: 2, 1000110208.0: 2, 1000040064.0: 2, 1000070144.0: 2, 1000010020.0:2, 1000010030.0:2, 1000020040.0:2}

# Pandora PDG hypothesis -> coarse class (only codes Pandora actually emits).
pandora_to_our_mapping = {211: 1, -211: 1, -13: 4, 13: 4, 11: 0, -11: 0, 22: 3, 2112: 2, 130: 2, -2112: 2}
# Inverse direction: coarse class -> list of PDG codes it may represent.
our_to_pandora_mapping = {0: [11, -11], 1: [211, -211,2212, -2212, 321, -321, 3222, 3112, 3224, -3112, -3224], 2: [2112, 130, 310, 3122, 3212], 3: [22], 4:[13,-13]}
+
src/utils/post_clustering_features.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch_scatter import scatter_sum, scatter_std
3
+
4
def calculate_phi(x, y, z=None):
    """Return the azimuthal angle of (x, y) in radians.

    The unused ``z`` argument keeps the signature interchangeable with
    ``calculate_eta``.
    """
    return torch.atan2(y, x)
6
+
7
def calculate_eta(x, y, z):
    """Pseudorapidity eta = -ln(tan(theta/2)), theta being the polar angle."""
    transverse = torch.sqrt(x * x + y * y)
    polar = torch.atan2(transverse, z)
    return -torch.log(torch.tan(polar / 2))
10
+
11
def get_post_clustering_features(graphs_new, sum_e):
    '''
    Obtain graph-level qualitative features that can then be used to regress the energy corr. factor.
    :param graph_batch: Output from the previous step - clustered, matched showers
    :param sum_e: per-graph summed hit energy (used for normalization)
    :return: (num_graphs, 11) feature tensor, NaNs replaced by 0
    '''
    batch_num_nodes = graphs_new.batch_num_nodes()  # Num. of hits in each graph
    # Per-node graph index for scatter aggregation.
    batch_idx = []
    for i, n in enumerate(batch_num_nodes):
        batch_idx.extend([i] * n)
    batch_idx = torch.tensor(batch_idx).to(graphs_new.device)
    # Node feature layout assumed: [5] ECAL flag, [6] HCAL flag, [7] muon-chamber
    # flag, [8] hit energy, [9] track momentum — TODO confirm upstream.
    e_hits = graphs_new.ndata["h"][:, 8]

    muon_hits = graphs_new.ndata["h"][:, 7]
    filter_muon = torch.where(muon_hits)[0]
    per_graph_e_hits_muon = scatter_sum(e_hits[filter_muon], batch_idx[filter_muon], dim_size=batch_idx.max() + 1)
    per_graph_n_hits_muon = scatter_sum((e_hits[filter_muon] > 0).type(torch.int), batch_idx[filter_muon], dim_size=batch_idx.max() + 1)
    ecal_hits = graphs_new.ndata["h"][:, 5]
    filter_ecal = torch.where(ecal_hits)[0]
    hcal_hits = graphs_new.ndata["h"][:, 6]
    filter_hcal = torch.where(hcal_hits)[0]
    per_graph_e_hits_ecal = scatter_sum(e_hits[filter_ecal], batch_idx[filter_ecal], dim_size=batch_idx.max() + 1)
    # similar as above but with scatter_std (squared -> variance of hit energies)
    per_graph_e_hits_ecal_dispersion = scatter_std(e_hits[filter_ecal], batch_idx[filter_ecal], dim_size=batch_idx.max() + 1) ** 2
    per_graph_e_hits_hcal = scatter_sum(e_hits[filter_hcal], batch_idx[filter_hcal], dim_size=batch_idx.max() + 1)
    # similar as above but with scatter_std -- !!!!! TODO: Retrain the base EC models using this definition !!!!!
    per_graph_e_hits_hcal_dispersion = scatter_std(e_hits[filter_hcal], batch_idx[filter_hcal], dim_size=batch_idx.max() + 1) ** 2
    # track_nodes =
    # Track quantities: mean momentum and mean chi^2 over the graph's track hits;
    # graphs without tracks get momentum 0 (chi^2 becomes NaN, cleaned below).
    track_p = scatter_sum(graphs_new.ndata["h"][:, 9], batch_idx)
    chis_tracks = scatter_sum(graphs_new.ndata["chi_squared_tracks"], batch_idx)
    num_tracks = scatter_sum((graphs_new.ndata["h"][:, 9] > 0).type(torch.int), batch_idx)
    track_p = track_p / num_tracks
    track_p[num_tracks == 0] = 0.
    chis_tracks = chis_tracks / num_tracks
    num_hits = graphs_new.batch_num_nodes()
    # print shapes of the below things

    # NOTE(review): num_hits/num_tracks are integer tensors stacked with float
    # ones — relies on torch.stack/cat dtype promotion; confirm on the
    # deployed torch version.
    return torch.nan_to_num(
        torch.stack([per_graph_e_hits_ecal / sum_e,
                     per_graph_e_hits_hcal / sum_e,
                     num_hits, track_p,
                     per_graph_e_hits_ecal_dispersion,
                     per_graph_e_hits_hcal_dispersion,
                     sum_e, num_tracks, torch.clamp(chis_tracks, -5, 5),
                     per_graph_e_hits_muon,
                     per_graph_n_hits_muon
                     ]).T
    )
59
+
60
+
61
+
62
def get_extra_features(graphs_new, betas):
    '''
    Obtain extra graph-level features for debugging of the fakes.

    :param graphs_new: batched DGL graph of clustered showers
    :param betas: per-node condensation beta values, aligned with the
        batched node order
    :return: (num_graphs, 1 + n_highest_betas) tensor of
        [num_nodes, top-k betas (zero-padded)] per graph
    '''
    batch_num_nodes = graphs_new.batch_num_nodes()  # Num. of hits in each graph
    batch_idx = []
    topk_highest_betas = []
    for i, n in enumerate(batch_num_nodes):
        batch_idx.extend([i] * n)
    batch_idx = torch.tensor(batch_idx).to(graphs_new.device)
    n_highest_betas = 1
    for i in range(len(batch_num_nodes)):
        betas_i = betas[batch_idx == i]
        # BUG FIX: torch.topk raises when k exceeds the number of elements,
        # which made the original zero-padding branch unreachable (and that
        # branch then called .values on an already-unwrapped tensor).
        # Clamp k to the available elements and pad explicitly instead.
        k = min(n_highest_betas, betas_i.numel())
        top_vals = torch.topk(betas_i, k).values if k > 0 else betas_i.new_zeros(0)
        if len(top_vals) < n_highest_betas:
            top_vals = torch.cat(
                [top_vals, top_vals.new_zeros(n_highest_betas - len(top_vals))]
            )
        topk_highest_betas.append(top_vals)
    topk_highest_betas = torch.stack(topk_highest_betas)
    # Concat with batch_num_nodes
    features = torch.cat([batch_num_nodes.view(-1, 1), topk_highest_betas], dim=1)
    return features
src/utils/train_utils.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import sys
4
+ import shutil
5
+ import glob
6
+ import functools
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from src.dataset.dataset import SimpleIterDataset
11
+ from src.utils.import_tools import import_module
12
+ from src.dataset.functions_graph import graph_batch_func
13
+
14
def set_gpus(args):
    """Parse ``args.gpus`` into a list of device ids and the primary device.

    Raises when no GPU list was supplied (the fallback to [0] is announced
    but deliberately aborted).
    """
    if not args.gpus:
        print("No GPUs flag provided - Setting GPUs to [0]")
        gpus = [0]
        dev = torch.device(gpus[0])
        raise Exception("Please provide GPU number")
    gpus = [int(g) for g in args.gpus.split(",")]
    dev = torch.device(gpus[0])
    print("Using GPUs:", gpus)
    return gpus, dev
25
+
26
+
27
+
28
def get_gpu_dev(args):
    """Map ``args.gpus`` onto a Lightning-style (accelerator, devices) pair.

    An empty GPU string yields (0, 0), i.e. no accelerator configured.
    """
    if args.gpus == "":
        return 0, 0
    return "gpu", args.gpus
36
+ # TODO change this to use it from config file
37
+
38
def model_setup(args, data_config):
    """
    Loads the model
    :param args:
    :param data_config:
    :return: the wrapped torch module (``model.mod``) of the network
    """
    network_module = import_module(args.network_config, name="_network_module")
    if args.gpus:
        gpu_ids = [int(g) for g in args.gpus.split(",")]
        dev = torch.device(gpu_ids[0])
        print("using GPUs:", gpu_ids)
    else:
        dev = torch.device("cpu")
    model, model_info = network_module.get_model(data_config, args=args, dev=dev)
    return model.mod
59
+
60
+
61
def get_samples_steps_per_epoch(args):
    """Derive the steps-per-epoch settings from samples-per-epoch flags.

    Mutates ``args`` in place and returns it. Supplying both a samples and a
    steps flag is an error; a missing validation step count is inferred from
    the train/val split; a negative value disables it (set to None).
    """
    if args.samples_per_epoch is not None:
        if args.steps_per_epoch is not None:
            raise RuntimeError(
                "Please use either `--steps-per-epoch` or `--samples-per-epoch`, but not both!"
            )
        args.steps_per_epoch = args.samples_per_epoch // args.batch_size
    if args.samples_per_epoch_val is not None:
        if args.steps_per_epoch_val is not None:
            raise RuntimeError(
                "Please use either `--steps-per-epoch-val` or `--samples-per-epoch-val`, but not both!"
            )
        args.steps_per_epoch_val = args.samples_per_epoch_val // args.batch_size
    if args.steps_per_epoch_val is None and args.steps_per_epoch is not None:
        args.steps_per_epoch_val = round(
            args.steps_per_epoch * (1 - args.train_val_split) / args.train_val_split
        )
    if args.steps_per_epoch_val is not None and args.steps_per_epoch_val < 0:
        args.steps_per_epoch_val = None
    return args
83
+
84
def to_filelist(args, mode="train"):
    """Expand the train/val file arguments into ({name: sorted files}, flat list).

    Entries may be keyword-based ('name:/glob/pattern'); unnamed entries go
    under key "_". With ``args.local_rank`` set, training files are sharded
    across ranks and shuffled.
    """
    if mode == "train":
        patterns = args.data_train
    elif mode == "val":
        patterns = args.data_val
    else:
        raise NotImplementedError("Invalid mode %s" % mode)

    # keyword-based: 'a:/path/to/a b:/path/to/b'
    file_dict = {}
    for entry in patterns:
        if ":" in entry:
            name, pattern = entry.split(":")
        else:
            name, pattern = "_", entry
        file_dict.setdefault(name, []).extend(glob.glob(pattern))

    # deterministic ordering before any sharding
    file_dict = {name: sorted(files) for name, files in file_dict.items()}

    if args.local_rank is not None:
        if mode == "train":
            gpus_list, _ = set_gpus(args)
            local_world_size = len(gpus_list)  # int(os.environ['LOCAL_WORLD_SIZE'])
            file_dict = {
                name: files[args.local_rank :: local_world_size]
                for name, files in file_dict.items()
            }
            for files in file_dict.values():
                assert len(files) > 0
                np.random.shuffle(files)
            print(args.local_rank, len(file_dict["_"]))

    filelist = sum(file_dict.values(), [])
    assert len(filelist) == len(set(filelist))
    return file_dict, filelist
126
+
127
+
128
def train_load(args):
    """
    Loads the training data.
    :param args:
    :return: train_loader, val_loader, data_config, train_inputs
    """
    train_file_dict, train_files = to_filelist(args, "train")
    if args.data_val:
        # Separate validation files: both datasets use their full range.
        val_file_dict, val_files = to_filelist(args, "val")
        train_range = val_range = (0, 1)
    else:
        # No validation files: carve the split out of the training files.
        val_file_dict, val_files = train_file_dict, train_files
        train_range = (0, args.train_val_split)
        val_range = (args.train_val_split, 1)

    rank_suffix = "" if args.local_rank is None else "_rank%d" % args.local_rank
    # Keyword arguments shared by the train and val dataset constructions.
    shared_ds_kwargs = dict(
        for_training=True,
        extra_selection=None,
        file_fraction=args.file_fraction,
        fetch_by_files=args.fetch_by_files,
        fetch_step=args.fetch_step,
        args_parse=args,
    )
    train_data = SimpleIterDataset(
        train_file_dict,
        args.data_config,
        remake_weights=False,
        load_range_and_fraction=(train_range, args.data_fraction),
        infinity_mode=args.steps_per_epoch is not None,
        name="train" + rank_suffix,
        **shared_ds_kwargs,
    )
    val_data = SimpleIterDataset(
        val_file_dict,
        args.data_config,
        load_range_and_fraction=(val_range, args.data_fraction),
        infinity_mode=args.steps_per_epoch_val is not None,
        name="val" + rank_suffix,
        **shared_ds_kwargs,
    )

    # prefetch_factor must stay None when no workers are used.
    prefetch_factor = args.prefetch_factor if args.num_workers > 0 else None
    train_loader = DataLoader(
        train_data,
        batch_size=args.batch_size,
        drop_last=True,
        pin_memory=True,
        num_workers=min(args.num_workers, int(len(train_files) * args.file_fraction)),
        collate_fn=graph_batch_func,
        persistent_workers=False,
        prefetch_factor=prefetch_factor,
    )
    val_loader = DataLoader(
        val_data,
        batch_size=args.batch_size,
        drop_last=True,
        pin_memory=True,
        collate_fn=graph_batch_func,
        num_workers=min(args.num_workers, int(len(val_files) * args.file_fraction)),
        persistent_workers=args.num_workers > 0
        and args.steps_per_epoch_val is not None,
        prefetch_factor=prefetch_factor,
    )

    # Data-config metadata is currently disabled; placeholders kept so the
    # return signature matches callers' expectations.
    data_config = 0  # train_data.config
    train_input_names = 0  # train_data.config.input_names
    return train_loader, val_loader, data_config, train_input_names
210
+
211
+
212
def test_load(args):
    """
    Loads the test data.
    :param args:
    :return: test_loaders, data_config
    """
    # keyword-based --data-test: 'a:/path/to/a b:/path/to/b'
    # split --data-test: 'a%10:/path/to/a/*'
    file_dict = {}
    split_dict = {}
    for entry in args.data_test:
        if ":" in entry:
            name, pattern = entry.split(":")
            if "%" in name:
                name, split = name.split("%")
                split_dict[name] = int(split)
        else:
            name, pattern = "", entry
        file_dict.setdefault(name, []).extend(glob.glob(pattern))

    # deterministic ordering
    for name in file_dict:
        file_dict[name] = sorted(file_dict[name])

    # apply splitting: 'a%10' turns into chunks a_0, a_1, ... of 10 files each
    for name, split in split_dict.items():
        files = file_dict.pop(name)
        n_chunks = (len(files) + split - 1) // split
        for i in range(n_chunks):
            file_dict[f"{name}_{i}"] = files[i * split : (i + 1) * split]

    def get_test_loader(name):
        # Build the loader lazily so each chunk is only opened when requested.
        filelist = file_dict[name]
        test_data = SimpleIterDataset(
            {name: filelist},
            args.data_config,
            for_training=False,
            extra_selection=None,
            load_range_and_fraction=((0, 1), args.data_fraction),
            fetch_by_files=True,
            fetch_step=1,
            name="test_" + name,
            args_parse=args,
        )
        return DataLoader(
            test_data,
            num_workers=min(args.num_workers, len(filelist)),
            batch_size=args.batch_size,
            drop_last=False,
            pin_memory=True,
            collate_fn=graph_batch_func,
        )

    test_loaders = {
        name: functools.partial(get_test_loader, name) for name in file_dict
    }
    # data_config = SimpleIterDataset({}, args.data_config, for_training=False).config
    data_config = 0
    return test_loaders, data_config
276
+
277
+
278
def count_parameters(model):
    """Return the number of trainable parameters of the wrapped torch module."""
    trainable = (p.numel() for p in model.mod.parameters() if p.requires_grad)
    return sum(trainable)
280
+
281
+
tests/test_cpu_attention.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the CPU-compatible attention patch in src/inference.py."""
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+
7
+ def _cpu_sdpa_under_test(q, k, v, attn_mask=None):
8
+ """Standalone copy of _cpu_sdpa (the patched attention) for testing.
9
+
10
+ Mirrors the implementation in src.inference._patch_gatr_attention_for_cpu.
11
+ """
12
+ B, H, N, D = q.shape
13
+ scale = float(D) ** -0.5
14
+
15
+ q2 = q.reshape(B * H, N, D)
16
+ k2 = k.reshape(B * H, N, D)
17
+ v2 = v.reshape(B * H, N, D)
18
+
19
+ attn = torch.bmm(q2 * scale, k2.transpose(1, 2))
20
+
21
+ if attn_mask is not None:
22
+ attn = attn.masked_fill(~attn_mask.unsqueeze(0), float("-inf"))
23
+
24
+ attn = torch.softmax(attn, dim=-1)
25
+ attn = attn.nan_to_num(0.0)
26
+
27
+ out = torch.bmm(attn, v2)
28
+ return out.reshape(B, H, N, D)
29
+
30
+
31
def test_cpu_sdpa_matches_reference():
    """The CPU SDPA must agree with PyTorch's reference implementation."""
    torch.manual_seed(42)
    shape = (2, 4, 16, 32)  # (B, H, N, D)
    q, k, v = (torch.randn(*shape) for _ in range(3))

    out_ours = _cpu_sdpa_under_test(q, k, v)
    out_ref = F.scaled_dot_product_attention(q, k, v)  # reference, no mask

    assert out_ours.shape == shape
    assert torch.allclose(out_ours, out_ref, atol=1e-5), (
        f"Max diff: {(out_ours - out_ref).abs().max().item()}"
    )
47
+
48
+
49
def test_cpu_sdpa_output_shape():
    """Output shape must be [B, H, N, D], matching the input convention."""
    shape = (1, 8, 64, 16)
    q, k, v = (torch.randn(*shape) for _ in range(3))
    assert _cpu_sdpa_under_test(q, k, v).shape == shape
57
+
58
+
59
def test_cpu_sdpa_single_head():
    """Single-head attention must work correctly."""
    torch.manual_seed(0)
    shape = (1, 1, 10, 8)
    q, k, v = (torch.randn(*shape) for _ in range(3))
    assert torch.allclose(
        _cpu_sdpa_under_test(q, k, v),
        F.scaled_dot_product_attention(q, k, v),
        atol=1e-5,
    )
71
+
72
+
73
def test_cpu_sdpa_asymmetric_heads_items():
    """Ensure heads and items dimensions are not confused.

    When H != N, swapping them would change the tensor layout and
    produce different (wrong) results.
    """
    torch.manual_seed(123)
    shape = (1, 3, 7, 16)  # H != N
    q, k, v = (torch.randn(*shape) for _ in range(3))

    ours = _cpu_sdpa_under_test(q, k, v)
    ref = F.scaled_dot_product_attention(q, k, v)

    assert ours.shape == shape
    assert torch.allclose(ours, ref, atol=1e-5), (
        f"Max diff: {(ours - ref).abs().max().item()}"
    )
92
+
93
+
94
if __name__ == "__main__":
    # Allow running this file directly, without pytest.
    for case in (
        test_cpu_sdpa_matches_reference,
        test_cpu_sdpa_output_shape,
        test_cpu_sdpa_single_head,
        test_cpu_sdpa_asymmetric_heads_items,
    ):
        case()
    print("All tests passed.")
tests/test_csv_priority.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests that CSV data takes priority over parquet when both are available.
2
+
3
+ This validates the fix for the issue where loading an event from parquet and
4
+ then modifying the CSV text fields (e.g. removing tracks) was ignored because
5
+ the code always re-loaded from the parquet file.
6
+ """
7
+
8
+ import os
9
+ import ast
10
+ import textwrap
11
+
12
+
13
+ def _extract_source_priority_logic():
14
+ """Extract and verify the input-source priority logic from app.py.
15
+
16
+ Reads the ``run_inference_ui`` function source and checks that CSV
17
+ is tested *before* parquet, so that user edits to the CSV text
18
+ fields are respected even when a parquet file path is present.
19
+ """
20
+ app_path = os.path.join(os.path.dirname(__file__), "..", "app.py")
21
+ with open(app_path) as f:
22
+ source = f.read()
23
+ return source
24
+
25
+
26
+ def test_csv_checked_before_parquet():
27
+ """In run_inference_ui, the ``if use_csv`` branch must come before
28
+ ``use_parquet`` so that CSV edits are not silently ignored."""
29
+ source = _extract_source_priority_logic()
30
+
31
+ # Find positions of the key branching statements
32
+ idx_csv = source.find("if use_csv:")
33
+ idx_parquet_elif = source.find("elif use_parquet:")
34
+ idx_parquet_if = source.find("if use_parquet:")
35
+
36
+ # "if use_csv:" must exist
37
+ assert idx_csv != -1, "Could not find 'if use_csv:' in app.py"
38
+
39
+ # "elif use_parquet:" must exist (parquet is the fallback)
40
+ assert idx_parquet_elif != -1, (
41
+ "Could not find 'elif use_parquet:' in app.py — parquet should be "
42
+ "a fallback after CSV"
43
+ )
44
+
45
+ # CSV check must come before the parquet fallback
46
+ assert idx_csv < idx_parquet_elif, (
47
+ "'if use_csv:' must appear before 'elif use_parquet:' so that "
48
+ "user CSV edits take priority over re-reading the parquet file"
49
+ )
50
+
51
+ # There should NOT be a standalone "if use_parquet:" that would take
52
+ # priority over CSV (the old buggy pattern)
53
+ if idx_parquet_if != -1:
54
+ # The only occurrence should be inside the guard for empty input
55
+ # (not use_parquet and not use_csv). A standalone "if use_parquet:"
56
+ # that dispatches to load_event_from_parquet before checking CSV is
57
+ # the bug we fixed.
58
+ # Make sure it's not followed by load_event_from_parquet before
59
+ # "if use_csv:" appears
60
+ assert idx_parquet_if > idx_csv or "load_event_from_parquet" not in source[idx_parquet_if:idx_csv], (
61
+ "Found 'if use_parquet:' with load_event_from_parquet before "
62
+ "'if use_csv:' — this is the bug where parquet takes priority "
63
+ "over CSV edits"
64
+ )
65
+
66
+
67
+ def test_parse_csv_event_logic():
68
+ """_parse_csv_event should correctly build event dicts from CSV text.
69
+
70
+ We inline the same parsing logic used by app.py to avoid importing
71
+ the module (which requires heavy dependencies like gradio).
72
+ """
73
+ import io
74
+ import numpy as np
75
+ import pandas as pd
76
+
77
+ def _read(text, min_cols=1):
78
+ if not text or not text.strip():
79
+ return np.zeros((0, min_cols), dtype=np.float64)
80
+ df = pd.read_csv(io.StringIO(text), header=None)
81
+ return df.values.astype(np.float64)
82
+
83
+ def _parse_csv_event(csv_hits, csv_tracks, csv_particles, csv_pandora=""):
84
+ hits_arr = _read(csv_hits, 11)
85
+ tracks_arr = _read(csv_tracks, 25)
86
+ particles_arr = _read(csv_particles, 18)
87
+ pandora_arr = _read(csv_pandora, 9)
88
+ if tracks_arr.shape[1] < 25 and tracks_arr.shape[0] > 0:
89
+ pad = np.zeros((tracks_arr.shape[0], 25 - tracks_arr.shape[1]))
90
+ tracks_arr = np.concatenate([tracks_arr, pad], axis=1)
91
+ ygen_hit = np.full(len(hits_arr), -1, dtype=np.int64)
92
+ ygen_track = np.full(len(tracks_arr), -1, dtype=np.int64)
93
+ return {
94
+ "X_hit": hits_arr,
95
+ "X_track": tracks_arr,
96
+ "X_gen": particles_arr,
97
+ "X_pandora": pandora_arr,
98
+ "ygen_hit": ygen_hit,
99
+ "ygen_track": ygen_track,
100
+ }
101
+
102
+ # Basic parse
103
+ csv_hits = "0,0,0,0,0,1.23,1800.5,200.3,100.1,0,1"
104
+ event = _parse_csv_event(csv_hits, "", "", "")
105
+ assert event["X_hit"].shape == (1, 11)
106
+ assert event["X_track"].shape == (0, 25)
107
+ assert np.isclose(event["X_hit"][0, 5], 1.23)
108
+
109
+ # Empty tracks after removing them
110
+ event2 = _parse_csv_event(csv_hits, "", "", "")
111
+ assert event2["X_track"].shape[0] == 0
112
+
113
+ # Two tracks vs one track
114
+ csv_tracks_two = (
115
+ "1,0,0,0,0,5.0,3.0,2.0,3.3,0,0,0,1800.0,150.0,90.0,12.5,8,0,0,0,0,0,2.9,1.9,3.2\n"
116
+ "1,0,0,0,0,3.0,1.0,1.5,2.1,0,0,0,1700.0,100.0,80.0,10.0,6,0,0,0,0,0,0.9,1.4,2.0"
117
+ )
118
+ csv_tracks_one = (
119
+ "1,0,0,0,0,5.0,3.0,2.0,3.3,0,0,0,1800.0,150.0,90.0,12.5,8,0,0,0,0,0,2.9,1.9,3.2"
120
+ )
121
+ event_two = _parse_csv_event(csv_hits, csv_tracks_two, "", "")
122
+ event_one = _parse_csv_event(csv_hits, csv_tracks_one, "", "")
123
+ assert event_two["X_track"].shape[0] == 2
124
+ assert event_one["X_track"].shape[0] == 1
125
+
126
+
127
+ def test_input_source_decision_logic():
128
+ """Simulate the decision logic from run_inference_ui and verify that
129
+ CSV is used even when a parquet path is present."""
130
+
131
+ def decide_source(parquet_path, csv_hits):
132
+ """Mirrors the decision logic in run_inference_ui."""
133
+ use_parquet = parquet_path and os.path.isfile(parquet_path)
134
+ use_csv = bool(csv_hits and csv_hits.strip())
135
+
136
+ if use_csv:
137
+ return "csv"
138
+ elif use_parquet:
139
+ return "parquet"
140
+ else:
141
+ return "none"
142
+
143
+ # CSV present + parquet path present → should use CSV
144
+ # (use this script as a stand-in for an existing file)
145
+ existing_file = os.path.abspath(__file__)
146
+ assert decide_source(existing_file, "some,csv,data") == "csv"
147
+
148
+ # CSV present + no parquet → should use CSV
149
+ assert decide_source("", "some,csv,data") == "csv"
150
+
151
+ # CSV empty + parquet present → should use parquet
152
+ assert decide_source(existing_file, "") == "parquet"
153
+
154
+ # Both empty → none
155
+ assert decide_source("", "") == "none"
156
+
157
+
158
+ if __name__ == "__main__":
159
+ test_csv_checked_before_parquet()
160
+ test_parse_csv_event_logic()
161
+ test_input_source_decision_logic()
162
+ print("All tests passed.")
tests/test_energy_correction_no_matches.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests that energy correction runs even when no MC-truth showers are matched.
2
+
3
+ The bug: ``_run_energy_correction`` returned early (``if not graphs_matched:
4
+ return particles_df``) whenever no predicted cluster could be matched to a
5
+ true particle. In pure inference mode (no MC truth) *all* clusters are
6
+ "fakes" and ``graphs_matched`` is always empty, so the correction was never
7
+ applied and the output table only contained the basic ``energy_sum_hits`` /
8
+ ``p_track`` columns.
9
+
10
+ The fix: only bail out when *both* ``graphs_matched`` **and** ``graphs_fakes``
11
+ are empty (i.e. there are literally no clusters to correct).
12
+ """
13
+
14
+ import ast
15
+ import os
16
+
17
+
18
+ def _get_function_source(path, func_name):
19
+ """Return the source of a top-level function from *path*."""
20
+ with open(path) as f:
21
+ source = f.read()
22
+ tree = ast.parse(source)
23
+ lines = source.splitlines(keepends=True)
24
+ for node in tree.body:
25
+ if isinstance(node, ast.FunctionDef) and node.name == func_name:
26
+ return "".join(lines[node.lineno - 1 : node.end_lineno])
27
+ raise ValueError(f"{func_name} not found in {path}")
28
+
29
+
30
# Path to src/inference.py, resolved relative to this test file so the
# source-inspection tests below work from any working directory.
INFERENCE_PATH = os.path.join(
    os.path.dirname(__file__), "..", "src", "inference.py"
)
33
+
34
+
35
def test_early_return_requires_both_empty():
    """The early return must check both graphs_matched *and* graphs_fakes.

    The old (buggy) guard was:
        if not graphs_matched:
            return particles_df

    The fixed guard must be:
        if not graphs_matched and not graphs_fakes:
            return particles_df
    """
    src = _get_function_source(INFERENCE_PATH, "_run_energy_correction")

    buggy_guard = "if not graphs_matched:\n return particles_df"
    assert buggy_guard not in src, (
        "Found the old single-condition early return 'if not graphs_matched'; "
        "energy correction would be skipped whenever no MC-truth matches exist."
    )

    fixed_guard = "if not graphs_matched and not graphs_fakes:"
    assert fixed_guard in src, (
        "Expected 'if not graphs_matched and not graphs_fakes:' in "
        "_run_energy_correction but did not find it."
    )
58
+ )
59
+
60
+
61
def test_true_energies_t_not_called_with_cat_on_empty():
    """``torch.cat(true_energies, dim=0)`` must not appear unconditionally.

    When ``graphs_matched`` is empty, ``true_energies`` is an empty list and
    ``torch.cat([], dim=0)`` raises a RuntimeError. The fixed code removes
    this line entirely (the variable was unused anyway).
    """
    src = _get_function_source(INFERENCE_PATH, "_run_energy_correction")

    needle = "true_energies_t = torch.cat(true_energies"
    if needle not in src:
        return  # assignment removed entirely — nothing left to check

    # If the assignment still exists it must be guarded by an if-statement
    # within the preceding few lines.
    lines = src.splitlines()
    for i, line in enumerate(lines):
        if needle not in line:
            continue
        guard_present = any(
            "if true_energies" in lines[j] or "if graphs_matched" in lines[j]
            for j in range(max(0, i - 5), i)
        )
        assert guard_present, (
            f"Line {i}: unguarded 'torch.cat(true_energies)' would "
            "raise RuntimeError on empty list when no showers match."
        )
85
+
86
+
87
if __name__ == "__main__":
    # Allow running this file directly, without pytest.
    for case in (
        test_early_return_requires_both_empty,
        test_true_energies_t_not_called_with_cat_on_empty,
    ):
        case()
    print("All tests passed.")
tests/test_pfo_links.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the hit → Pandora cluster mapping (PFO links) field.
2
+
3
+ Validates that:
4
+ 1. _parse_csv_event correctly parses the csv_pfo_links parameter.
5
+ 2. PFO links are gracefully handled when CSV is modified (partial matches).
6
+ 3. The _load_event_into_csv function includes PFO links output.
7
+ 4. The run_inference_ui function accepts the csv_pfo_links parameter.
8
+ """
9
+
10
+ import os
11
+ import io
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Inline the parsing logic to avoid importing app.py (heavy dependencies)
18
+ # ---------------------------------------------------------------------------
19
+
20
+ def _parse_csv_event(csv_hits, csv_tracks, csv_particles, csv_pandora="", csv_pfo_links=""):
21
+ """Mirror of the _parse_csv_event logic from app.py."""
22
+
23
+ def _read(text, min_cols=1):
24
+ if not text or not text.strip():
25
+ return np.zeros((0, min_cols), dtype=np.float64)
26
+ df = pd.read_csv(io.StringIO(text), header=None)
27
+ return df.values.astype(np.float64)
28
+
29
+ hits_arr = _read(csv_hits, 11)
30
+ tracks_arr = _read(csv_tracks, 25)
31
+ particles_arr = _read(csv_particles, 18)
32
+ pandora_arr = _read(csv_pandora, 9)
33
+ if tracks_arr.shape[1] < 25 and tracks_arr.shape[0] > 0:
34
+ pad = np.zeros((tracks_arr.shape[0], 25 - tracks_arr.shape[1]))
35
+ tracks_arr = np.concatenate([tracks_arr, pad], axis=1)
36
+ ygen_hit = np.full(len(hits_arr), -1, dtype=np.int64)
37
+ ygen_track = np.full(len(tracks_arr), -1, dtype=np.int64)
38
+
39
+ # Parse PFO link arrays
40
+ pfo_calohit = np.array([], dtype=np.int64)
41
+ pfo_track = np.array([], dtype=np.int64)
42
+ if csv_pfo_links and csv_pfo_links.strip():
43
+ lines = csv_pfo_links.strip().split("\n")
44
+ if len(lines) >= 1 and lines[0].strip():
45
+ pfo_calohit = np.array(
46
+ [int(v) for v in lines[0].strip().split(",")], dtype=np.int64
47
+ )
48
+ if len(lines) >= 2 and lines[1].strip():
49
+ pfo_track = np.array(
50
+ [int(v) for v in lines[1].strip().split(",")], dtype=np.int64
51
+ )
52
+
53
+ return {
54
+ "X_hit": hits_arr,
55
+ "X_track": tracks_arr,
56
+ "X_gen": particles_arr,
57
+ "X_pandora": pandora_arr,
58
+ "ygen_hit": ygen_hit,
59
+ "ygen_track": ygen_track,
60
+ "pfo_calohit": pfo_calohit,
61
+ "pfo_track": pfo_track,
62
+ }
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Tests
67
+ # ---------------------------------------------------------------------------
68
+
69
def test_parse_pfo_links_basic():
    """PFO links should be correctly parsed from csv_pfo_links."""
    hits = (
        "0,0,0,0,0,1.23,1800.5,200.3,100.1,0,1\n"
        "0,0,0,0,0,0.45,1900.2,-50.1,300.7,0,2"
    )
    event = _parse_csv_event(hits, "", "", "", "3,5\n7")

    assert "pfo_calohit" in event
    assert "pfo_track" in event
    np.testing.assert_array_equal(event["pfo_calohit"], [3, 5])
    np.testing.assert_array_equal(event["pfo_track"], [7])
80
+
81
+
82
def test_parse_pfo_links_empty():
    """Empty csv_pfo_links should produce empty arrays."""
    event = _parse_csv_event("0,0,0,0,0,1.23,1800.5,200.3,100.1,0,1", "", "", "", "")
    assert event["pfo_calohit"].size == 0
    assert event["pfo_track"].size == 0
89
+
90
+
91
def test_parse_pfo_links_calohit_only():
    """Only calohit line provided (no track line)."""
    event = _parse_csv_event("0,0,0,0,0,1.0,1.0,1.0,1.0,0,1", "", "", "", "1,2,-1,3")
    np.testing.assert_array_equal(event["pfo_calohit"], [1, 2, -1, 3])
    assert len(event["pfo_track"]) == 0
98
+
99
+
100
def test_parse_pfo_links_with_negatives():
    """PFO links should correctly handle -1 values (unassigned hits)."""
    event = _parse_csv_event("", "", "", "", "3,-1,5,-1\n-1,2")
    np.testing.assert_array_equal(event["pfo_calohit"], [3, -1, 5, -1])
    np.testing.assert_array_equal(event["pfo_track"], [-1, 2])
107
+
108
+
109
def test_pandora_cluster_partial_match():
    """When CSV is modified (fewer hits than PFO links), use min of lengths."""
    # Simulate the assignment logic from inference.py: event now has fewer
    # hits/tracks than the original PFO link arrays.
    n_calo, n_tracks = 3, 1
    pfo_calohit = np.array([0, 1, 2, 3, 4], dtype=np.int64)  # originally 5 hits
    pfo_track = np.array([5, 6], dtype=np.int64)  # originally 2 tracks

    ids = np.full(n_calo + n_tracks, -1, dtype=np.int64)
    if pfo_calohit.size:
        n = min(len(pfo_calohit), n_calo)
        ids[:n] = pfo_calohit[:n]
    if n_tracks and pfo_track.size:
        n = min(len(pfo_track), n_tracks)
        ids[n_calo : n_calo + n] = pfo_track[:n]

    # First 3 calo hits keep their PFO IDs; the track slot gets the first
    # track PFO.
    np.testing.assert_array_equal(ids, [0, 1, 2, 5])
129
+
130
+
131
def test_pandora_cluster_no_links():
    """When no PFO links are available, all pandora_cluster_ids should be -1."""
    n_calo, n_tracks = 3, 2
    pfo_calohit = np.array([], dtype=np.int64)
    pfo_track = np.array([], dtype=np.int64)

    ids = np.full(n_calo + n_tracks, -1, dtype=np.int64)
    if pfo_calohit.size:
        n = min(len(pfo_calohit), n_calo)
        ids[:n] = pfo_calohit[:n]
    if n_tracks and pfo_track.size:
        n = min(len(pfo_track), n_tracks)
        ids[n_calo : n_calo + n] = pfo_track[:n]

    np.testing.assert_array_equal(ids, [-1] * 5)
149
+
150
+
151
def test_pandora_cluster_more_hits_than_links():
    """When more hits exist than PFO links, extra hits get -1."""
    n_calo, n_tracks = 5, 2
    pfo_calohit = np.array([1, 2], dtype=np.int64)  # only 2 links for 5 hits
    pfo_track = np.array([3], dtype=np.int64)  # only 1 link for 2 tracks

    ids = np.full(n_calo + n_tracks, -1, dtype=np.int64)
    if pfo_calohit.size:
        n = min(len(pfo_calohit), n_calo)
        ids[:n] = pfo_calohit[:n]
    if n_tracks and pfo_track.size:
        n = min(len(pfo_track), n_tracks)
        ids[n_calo : n_calo + n] = pfo_track[:n]

    np.testing.assert_array_equal(ids, [1, 2, -1, -1, -1, 3, -1])
169
+
170
+
171
def test_app_source_has_csv_pfo_links_field():
    """app.py should have the csv_pfo_links text field wired up."""
    app_path = os.path.join(os.path.dirname(__file__), "..", "app.py")
    with open(app_path) as f:
        app_source = f.read()

    assert "csv_pfo_links" in app_source, "app.py should reference csv_pfo_links"
    assert "Hit → Pandora Cluster links" in app_source, (
        "app.py should have the PFO links text field label"
    )
180
+ )
181
+
182
+
183
def test_run_inference_ui_accepts_pfo_links():
    """run_inference_ui should accept csv_pfo_links as a parameter."""
    import ast
    app_path = os.path.join(os.path.dirname(__file__), "..", "app.py")
    with open(app_path) as f:
        tree = ast.parse(f.read())

    matches = [
        node
        for node in ast.walk(tree)
        if isinstance(node, ast.FunctionDef) and node.name == "run_inference_ui"
    ]
    if not matches:
        raise AssertionError("Could not find run_inference_ui function in app.py")
    arg_names = [arg.arg for arg in matches[0].args.args]
    assert "csv_pfo_links" in arg_names, (
        "run_inference_ui should accept csv_pfo_links parameter"
    )
198
+
199
+
200
def test_load_event_returns_pfo_links():
    """_load_event_into_csv error path should return 6 values (including PFO links)."""
    import ast
    app_path = os.path.join(os.path.dirname(__file__), "..", "app.py")
    with open(app_path) as f:
        tree = ast.parse(f.read())

    for node in ast.walk(tree):
        if not (isinstance(node, ast.FunctionDef) and node.name == "_load_event_into_csv"):
            continue
        # Check tuple-return statements in the function body
        for child in ast.walk(node):
            if isinstance(child, ast.Return) and isinstance(child.value, ast.Tuple):
                n_elts = len(child.value.elts)
                assert n_elts == 6, (
                    f"_load_event_into_csv should return 6 values, got {n_elts}"
                )
        return
    raise AssertionError("Could not find _load_event_into_csv function in app.py")
218
+
219
+
220
if __name__ == "__main__":
    # Allow running this file directly, without pytest.
    for case in (
        test_parse_pfo_links_basic,
        test_parse_pfo_links_empty,
        test_parse_pfo_links_calohit_only,
        test_parse_pfo_links_with_negatives,
        test_pandora_cluster_partial_match,
        test_pandora_cluster_no_links,
        test_pandora_cluster_more_hits_than_links,
        test_app_source_has_csv_pfo_links_field,
        test_run_inference_ui_accepts_pfo_links,
        test_load_event_returns_pfo_links,
    ):
        case()
    print("All PFO links tests passed.")