Yajur Preetham committed · Commit c4bb20c
Parent(s): 9328007

Added script to visualize all input variable distributions for a model.
root_gnn_dgl/root_gnn_base/visualize_input_distributions.py
ADDED
@@ -0,0 +1,582 @@
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uproot
import yaml
import argparse
import sys
from pathlib import Path
from array import array
import os
import awkward as ak
import math

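# Example invocation (the flags correspond to the argparse options defined in
# main() below; the config path, output directory, and label names here are
# placeholders, not values from this repository):
#
#   python visualize_input_distributions.py \
#       --config path/to/config.yaml \
#       --output_dir path/to/plots \
#       --label_names Background Signal \
#       --use_percentile_for_xlims
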
def tree_to_dataframe(tree_filepath, sort_by="", branches=None):
    """
    Convert a ROOT tree to a pandas DataFrame (assuming the data is columnar).
    Depends on the uproot and pandas libraries (import them beforehand).
    """
    data_dict = {}  # Use a dictionary instead of a list

    with uproot.open(tree_filepath) as file:
        if not branches:  # If no branches are requested, load everything
            keys = file.keys()
            for key in keys:
                try:
                    data_dict[key] = file[key].array(library="pd")
                except Exception as e:
                    print(f"Warning: Could not load branch '{key}': {e}")
        else:  # If specific branches are requested
            for branch in branches:
                try:
                    data_dict[branch] = file[branch].array(library="pd")
                except KeyError:
                    print(f"Warning: Branch '{branch}' not found in ROOT file")
                except Exception as e:
                    print(f"Warning: Could not load branch '{branch}': {e}")

    # Create DataFrame from dictionary
    data = pd.DataFrame(data_dict)

    if sort_by == "":
        return data
    else:
        if sort_by in data.columns:
            data.sort_values(by=[sort_by], inplace=True)
            data.reset_index(inplace=True, drop=True)
        else:
            print(f"Warning: Sort column '{sort_by}' not found in DataFrame")
        return data

def extract_dataset_info(yaml_file_path):
    """Extract the per-dataset settings from the 'Datasets' section of a YAML config."""
    with open(yaml_file_path, 'r') as file:
        config = yaml.safe_load(file)

    datasets_info = {}
    if "Datasets" in config:
        for dset_name, dset_config in config['Datasets'].items():
            if 'args' not in dset_config:
                continue
            args = dset_config["args"]
            dset_info = {}
            if "raw_dir" in args:
                dset_info["raw_dir"] = args["raw_dir"]
            if "file_names" in args:
                dset_info["file_names"] = args["file_names"]
            if "node_branch_names" in args:
                dset_info["node_branch_names"] = args["node_branch_names"]
            if "name" in args:
                dset_info["name"] = args["name"]
            if "node_feature_scales" in args:
                dset_info["node_feature_scales"] = args["node_feature_scales"]
            if "tree_name" in args:
                dset_info["tree_name"] = args["tree_name"]
            if "label" in args:
                dset_info["label"] = args["label"]
            # if "exclude_zeros" in args:
            #     dset_info["exclude_zeros"] = args["exclude_zeros"]
            # if "exclude_zeros" not in args:
            #     print("ERROR: Please add the following variable to your config, under args for each dataset.\n"
            #           "For example, exclude_zeros: [pt, phi, eta]\n"
            #           "exclude_zeros should be a list of the endings of the observable names for which "
            #           "the value 0 should be excluded while plotting histograms.")
            #     sys.exit()
            if dset_info:
                datasets_info[dset_name] = dset_info
    return datasets_info

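# Illustrative shape of the YAML section that extract_dataset_info() reads
# (the key names come from the function above; the values are placeholders):
#
#   Datasets:
#     my_dataset:
#       args:
#         raw_dir: /path/to/ntuples/
#         file_names: sample.root        # a single file name or a list of names
#         tree_name: nominal
#         node_branch_names: [[jet_pt, jet_eta, jet_phi]]
#         label: 0
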
def adaptive_bins(data, method='auto'):
    """Choose the number of histogram bins based on the data characteristics."""
    data = np.array([x for x in data if x is not None and not np.isnan(x)])

    if len(data) == 0:
        return 10

    if method == 'sturges':
        return int(np.ceil(np.log2(len(data)) + 1))
    elif method == 'scott':
        h = 3.5 * np.std(data) / (len(data) ** (1/3))
        return int(np.ceil((np.max(data) - np.min(data)) / h)) if h > 0 else 50
    elif method == 'freedman':
        iqr = np.percentile(data, 75) - np.percentile(data, 25)
        h = 2 * iqr / (len(data) ** (1/3))
        return int(np.ceil((np.max(data) - np.min(data)) / h)) if h > 0 else 50
    elif method == 'sqrt':
        return int(np.ceil(np.sqrt(len(data))))
    else:  # 'auto'
        return 'auto'  # Let matplotlib decide

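# Worked example of the 'freedman' branch above: for roughly 1000 values spread
# uniformly over [0, 1], the IQR is about 0.5, so h = 2 * 0.5 / 1000 ** (1/3) ≈ 0.1
# and the function returns ceil((1 - 0) / 0.1) = 10 bins.
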
def safe_clean_data(data, observable_name=""):
    """Safely clean data, handling different data types (zero-exclusion is currently disabled)."""
    if data is None or len(data) == 0:
        return []

    # Convert to numpy array if it isn't already
    if not isinstance(data, np.ndarray):
        data = np.array(data)

    # Check if we should ignore zeros
    # ignore_zeros = observable_name.lower().endswith(exclude_zeros)

    # Handle different data types
    if data.dtype.kind in ['i', 'f']:  # integer or float
        # Numeric data - can use isnan and isfinite
        if data.dtype.kind == 'f':  # float
            mask = ~np.isnan(data) & np.isfinite(data)
            clean_data = data[mask]
        else:  # integer
            clean_data = data  # integers don't have NaN/inf issues

        # Drop common sentinel values
        clean_data = clean_data[(clean_data != -999) & (clean_data != -1)]

        # Remove zeros if needed
        # if ignore_zeros:
        #     clean_data = clean_data[clean_data != 0]

        return clean_data
    else:
        # Non-numeric data - filter manually
        clean_list = []
        for item in data:
            if item is None:
                continue

            try:
                # Try to convert to float to check if it's numeric
                float_val = float(item)
                if not (np.isnan(float_val) or np.isinf(float_val)):
                    # Zero-exclusion is disabled here as well (see the commented-out
                    # ignore_zeros logic above)
                    clean_list.append(float_val)
            except (ValueError, TypeError):
                # Not convertible to float, skip
                continue
        return np.array(clean_list) if clean_list else np.array([])

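# For example, safe_clean_data(np.array([1.0, np.nan, -999.0, 5.0])) drops the NaN
# and the -999 sentinel and returns array([1., 5.]).
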
def make_distributions(dset_info, output_dir, exclude_zeros):
    """Plot every node-feature observable for each dataset and save one combined grid per dataset.

    exclude_zeros should be a tuple of observable-name endings; matching observables are
    flagged in the plot titles as having zeros excluded.
    """
    os.makedirs(output_dir, exist_ok=True)
    awk_type = ak.Array
    list_type = type([])

    for dset_name in dset_info:
        curr_dset_info = dset_info[dset_name]
        curr_df = tree_to_dataframe(f"{curr_dset_info['raw_dir']}{curr_dset_info['file_names']}:{curr_dset_info['tree_name']}")

        # Collect all observables and their data for this dataset
        observables_data = {}

        for branch in curr_dset_info["node_branch_names"]:
            if type(branch) != list_type:
                continue
            for observable in branch:
                if type(observable) != type("str"):
                    continue
                try:
                    data = curr_df[observable]
                    if type(data.iloc[0]) == awk_type or type(data.iloc[0]) == list_type:
                        appended_data = []
                        for i in range(len(data.iloc[0])):
                            try:
                                ith_obs_data = np.array([x[i] if x is not None and len(x) > i else None for x in data])
                                # Filter out None values
                                ith_obs_data = ith_obs_data[ith_obs_data != None]
                                if len(ith_obs_data) > 0:
                                    appended_data.append(ith_obs_data)
                            except (IndexError, TypeError):
                                continue
                        if appended_data:
                            plot_data = np.concatenate(appended_data)
                            observables_data[observable] = plot_data
                    else:
                        observables_data[observable] = data
                except KeyError:
                    continue

        # Create subplot grid for all observables in this dataset
        if not observables_data:
            print(f"No data found for {dset_name}")
            continue

        n_observables = len(observables_data)

        # Calculate grid dimensions (try to make it roughly square)
        n_cols = math.ceil(math.sqrt(n_observables))
        n_rows = math.ceil(n_observables / n_cols)

        # Create the figure with subplots
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows))
        fig.suptitle(f'All Distributions for {dset_name}', fontsize=16, y=0.98)

        # Handle case where there's only one subplot
        if n_observables == 1:
            axes = [axes]
        elif n_rows == 1:
            axes = axes.reshape(1, -1)
        elif n_cols == 1:
            axes = axes.reshape(-1, 1)

        # Flatten axes for easy iteration
        axes_flat = axes.flatten() if n_observables > 1 else axes

        # Plot each observable
        for idx, (observable, plot_data) in enumerate(observables_data.items()):
            ax = axes_flat[idx]

            # Clean data safely
            clean_data = safe_clean_data(plot_data, observable)

            if len(clean_data) > 0:
                try:
                    bins = adaptive_bins(clean_data, method="freedman")
                    # Plot histogram with label including event count
                    ax.hist(clean_data, histtype="step", density=True, bins=bins,
                            label=f'N = {len(clean_data):,}')
                    if observable.lower().endswith(exclude_zeros):
                        ax.set_title(f'{observable} (zeros excluded)', fontsize=10)
                    else:
                        ax.set_title(f'{observable}', fontsize=10)
                    ax.set_xlabel(f'{observable}', fontsize=8)
                    ax.set_ylabel('Density', fontsize=8)
                    ax.tick_params(axis='both', which='major', labelsize=7)
                    ax.grid(True, alpha=0.3)

                    # Add legend with event count
                    ax.legend(fontsize=8, loc='upper right')

                except Exception as e:
                    print(f"Error plotting {observable}: {e}")
                    ax.text(0.5, 0.5, f'Plot error:\n{str(e)[:50]}...', ha='center', va='center',
                            transform=ax.transAxes, fontsize=8)
                    ax.set_title(f'{observable} (Error)', fontsize=10)
            else:
                ax.text(0.5, 0.5, 'No valid data\nN = 0', ha='center', va='center',
                        transform=ax.transAxes)
                ax.set_title(f'{observable} (No Data)', fontsize=10)

        # Hide unused subplots
        for idx in range(n_observables, len(axes_flat)):
            axes_flat[idx].set_visible(False)

        # Adjust layout and save
        plt.tight_layout()
        plt.subplots_adjust(top=0.93)  # Make room for suptitle
        plt.savefig(f"{output_dir}/{dset_name}_all_distributions.png",
                    dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Created combined plot for {dset_name} with {n_observables} observables")

def make_distributions_comparison_grid_by_label(dset_info, output_dir, output_filename, label_names=None, use_percentile_for_xlims=False, xlim_adjustment=False):
    """Create comparison plots grouped by label instead of by dataset.

    Args:
        dset_info: Dictionary containing dataset information
        output_dir: Directory to save output plots
        output_filename: Name of the output image file
        label_names: Optional list of strings to use as label names in legends.
                     If provided, must have length equal to the number of unique labels.
                     Index corresponds to label number.
        use_percentile_for_xlims: If True, set the upper x-limit to the 98th percentile of the data
        xlim_adjustment: If True, set the x-limits from the mean and standard deviation of the data
    """
    # The two x-limit options are mutually exclusive, so check once up front.
    if use_percentile_for_xlims and xlim_adjustment:
        print("ERROR: Only provide one of the flags at a time, either --use_percentile_for_xlims or --xlim_adjustment")
        return

    os.makedirs(output_dir, exist_ok=True)
    awk_type = ak.Array
    list_type = type([])

    label_to_datasets = {}
    for dset_name, curr_dset_info in dset_info.items():
        dataset_label = curr_dset_info.get('label', 'Unknown')
        if dataset_label not in label_to_datasets:
            label_to_datasets[dataset_label] = []
        label_to_datasets[dataset_label].append(dset_name)

    # First, collect all data organized by observable and then by label
    observables_by_variable = {}

    for dset_name in dset_info:
        print(f"Processing dataset: {dset_name}")
        curr_dset_info = dset_info[dset_name]

        # Get the label for this dataset
        dataset_label = curr_dset_info.get('label', 'Unknown')
        print(f"  Label: {dataset_label}")

        if type(curr_dset_info['file_names']) == type("str"):
            curr_df = tree_to_dataframe(f"{curr_dset_info['raw_dir']}{curr_dset_info['file_names']}:{curr_dset_info['tree_name']}")
        else:
            curr_df_list = []
            for i in range(len(curr_dset_info['file_names'])):
                curr_name = curr_dset_info['file_names'][i]
                curr_curr_df = tree_to_dataframe(f"{curr_dset_info['raw_dir']}{curr_name}:{curr_dset_info['tree_name']}")
                curr_df_list.append(curr_curr_df)
            curr_df = pd.concat(curr_df_list, ignore_index=True)

        for branch in curr_dset_info["node_branch_names"]:
            if type(branch) != list_type:
                continue
            for observable in branch:
                if type(observable) != type("str"):
                    continue
                try:
                    data = curr_df[observable]

                    # Initialize observable dict if not exists
                    if observable not in observables_by_variable:
                        observables_by_variable[observable] = {}

                    # Initialize label dict if not exists
                    if dataset_label not in observables_by_variable[observable]:
                        observables_by_variable[observable][dataset_label] = []

                    if type(data.iloc[0]) == awk_type or type(data.iloc[0]) == list_type:
                        appended_data = []
                        # for i in range(len(data.iloc[0])):
                        #     try:
                        #         ith_obs_data = np.array([x[i] if x is not None and len(x) > i else None for x in data])
                        #         ith_obs_data = ith_obs_data[ith_obs_data != None]
                        #         if len(ith_obs_data) > 0:
                        #             appended_data.append(ith_obs_data)
                        #     except (IndexError, TypeError):
                        #         continue
                        for x in data:
                            row_data = []
                            for i in range(len(x)):
                                if x[i] == 0 or x[i] == 0.0:  # skip zero-padded entries
                                    continue
                                row_data.append(x[i])
                            row_data = np.array(row_data)
                            row_data = row_data[row_data != None]
                            if len(row_data) > 0:
                                appended_data.append(row_data)

                        if appended_data:
                            plot_data = np.concatenate(appended_data)
                            observables_by_variable[observable][dataset_label].append(plot_data)
                    else:
                        observables_by_variable[observable][dataset_label].append(data)

                except KeyError:
                    continue

    # Combine data for each label (since multiple datasets might have the same label)
    observables_by_label = {}
    for observable, labels_data in observables_by_variable.items():
        observables_by_label[observable] = {}
        for label, data_list in labels_data.items():
            if data_list:
                # Concatenate all data for this label
                combined_data = []
                for data in data_list:
                    clean_data = safe_clean_data(data, observable)
                    if len(clean_data) > 0:
                        combined_data.extend(clean_data)

                if combined_data:
                    observables_by_label[observable][label] = np.array(combined_data)

    # Filter out observables with no data
    observables_by_label = {k: v for k, v in observables_by_label.items() if v}

    if not observables_by_label:
        print("No observables found!")
        return

    # Get consistent colors for labels across all plots
    all_labels = set()
    for labels_data in observables_by_label.values():
        all_labels.update(labels_data.keys())
    all_labels = sorted(list(all_labels))  # Sort for consistency

    print(f"Found labels: {all_labels}")

    # Validate label_names parameter if provided
    if label_names is not None:
        if len(label_names) != len(all_labels):
            raise ValueError(f"label_names must have length {len(all_labels)} to match number of unique labels, but got {len(label_names)}")
        print(f"Using custom label names: {label_names}")

    # Calculate grid dimensions
    n_observables = len(observables_by_label)
    n_cols = math.ceil(math.sqrt(n_observables))
    n_rows = math.ceil(n_observables / n_cols)

    print(f"Creating comparison grid for {n_observables} observables ({n_rows}x{n_cols})")

    # Create the big figure
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4*n_rows))
    fig.suptitle('Distribution Comparisons Across All Labels', fontsize=20, y=0.98)

    # Handle different subplot configurations
    if n_observables == 1:
        axes = [axes]
    elif n_rows == 1:
        axes = axes.reshape(1, -1)
    elif n_cols == 1:
        axes = axes.reshape(-1, 1)

    # Flatten axes for easy iteration
    axes_flat = axes.flatten() if n_observables > 1 else axes

    # Create color map for labels
    colors = plt.cm.tab10(np.linspace(0, 1, len(all_labels)))
    label_colors = dict(zip(all_labels, colors))

    # Plot each observable
    for idx, (observable, labels_data) in enumerate(observables_by_label.items()):
        ax = axes_flat[idx]

        # Calculate consistent bins based on ALL data for this observable
        all_combined_data = []
        for label_data in labels_data.values():
            all_combined_data.extend(label_data)

        if not all_combined_data:
            ax.text(0.5, 0.5, 'No valid data', ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'{observable} (No Data)', fontsize=12)
            continue

        combined_array = np.array(all_combined_data)
        if observable == "ph_phi" or observable == "ph_eta":
            n_bins = 10
        elif observable == "m_jet_btag77":
            n_bins = 4
        else:
            n_bins = adaptive_bins(combined_array, method="freedman")
            if n_bins > 35:  ### CONTROL FINENESS OF BINNING HERE!!!!
                n_bins = 35
        bin_edges = np.histogram_bin_edges(combined_array, bins=n_bins)

        print(f"{observable}: Using {len(bin_edges)-1} consistent bins for {len(labels_data)} labels")

        # Plot each label's distribution for this observable
        for label, plot_data in labels_data.items():
            try:
                # Determine label for legend
                if label_names is not None:
                    # Use custom label name based on label index
                    label_idx = all_labels.index(label)
                    legend_label = f'{label_names[label_idx]} (N={len(plot_data):,})'
                else:
                    # Use original format
                    legend_label = f'Label {label} (N={len(plot_data):,})'

                ax.hist(plot_data, bins=bin_edges, histtype="step", density=True,
                        label=legend_label,
                        color=label_colors[label], linewidth=1.5, alpha=0.8)
            except Exception as e:
                print(f"Error plotting {observable} for label {label}: {e}")
                continue

        # Add title and labels
        title = f'{observable}'
        # if observable.lower().endswith(exclude_zeros):
        #     title += ' (zeros excluded)'

        if not use_percentile_for_xlims and not xlim_adjustment:
            ax.set_xlim(bin_edges[0], bin_edges[-1])
        elif use_percentile_for_xlims:
            combined_array = np.array(all_combined_data)
            ax.set_xlim(bin_edges[0], np.percentile(combined_array, 98))
        elif xlim_adjustment:
            combined_array = np.array(all_combined_data)
            min_edge = max(bin_edges[0], np.mean(combined_array) - 3*np.std(combined_array))
            max_edge = min(bin_edges[-1], np.mean(combined_array) + 3*np.std(combined_array))
            ax.set_xlim(min_edge, max_edge)

        ax.set_title(title, fontsize=12, pad=10)
        ax.set_xlabel(f'{observable}', fontsize=10)
        ax.set_ylabel('Density', fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.grid(True, alpha=0.3)

        # Create legend
        if len(labels_data) <= 5:
            if label_names is not None:
                # Simple legend with just custom names and counts
                ax.legend(fontsize=8, loc='best')
            else:
                # Create custom legend labels with dataset information
                legend_labels = []
                for label in labels_data.keys():
                    datasets = label_to_datasets.get(label, [])

                    if len(datasets) == 1:
                        # Single dataset
                        dataset_info = datasets[0]
                    elif len(datasets) <= 2:
                        # Few datasets - show all names
                        dataset_info = ', '.join(datasets)
                    else:
                        # Many datasets - show count
                        dataset_info = f"{datasets[0]}, +{len(datasets)-1} more"

                    legend_labels.append(f'Label {label} (N={len(labels_data[label]):,})\n{dataset_info}')

                # Get the legend handles and update their labels
                handles, _ = ax.get_legend_handles_labels()
                ax.legend(handles, legend_labels, fontsize=6, loc='best')
        else:
            total_events = sum(len(data) for data in labels_data.values())
            ax.set_title(f'{title}\n(Total N={total_events:,})', fontsize=11)

    # Hide unused subplots
    for idx in range(n_observables, len(axes_flat)):
        axes_flat[idx].set_visible(False)

    # Adjust layout and save
    plt.tight_layout()
    plt.subplots_adjust(top=0.94, right=0.85 if len(all_labels) > 5 else 0.95)

    output_path = f"{output_dir}/{output_filename}"
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

    print(f"Created comparison grid by label: {output_path}")
    print(f"Grid contains {n_observables} observables across {len(all_labels)} labels")

    # Print summary of what was combined
    print("\nLabel summary:")
    for label in all_labels:
        datasets_with_label = [dset for dset, info in dset_info.items() if info.get('label') == label]
        if label_names is not None:
            label_idx = all_labels.index(label)
            display_name = label_names[label_idx]
        else:
            display_name = f"Label {label}"
        print(f"  {display_name}: {len(datasets_with_label)} datasets ({', '.join(datasets_with_label)})")

def main():
    # NOTE: exclude_zeros is intentionally not specified here; it should be derived
    # from the config instead (see the commented-out block below).
    parser = argparse.ArgumentParser()
    add_arg = parser.add_argument

    add_arg("--config", type=str, required=True, help="The path to the config.")
    add_arg("--output_dir", type=str, required=True, help="The directory where the plots will be written.")
    add_arg("--label_names", nargs='+', default=["None"], help="A list of the names associated with each label, displayed in the histogram legends.")
    add_arg("--output_filename", type=str, default="input_var_distribution_comparisons.png", help="The name of the file the plots will be written to.")
    add_arg("--use_percentile_for_xlims", action="store_true", help="If this flag is provided, the xlims will be set as [first bin edge, 98th percentile] rather than [first bin edge, last bin edge].")
    add_arg("--xlim_adjustment", action="store_true", help="If this flag is provided, the xlims will be set using the mean and std of the data.")

    args = parser.parse_args()

    config_filepath = args.config
    output_dir = args.output_dir
    label_names = args.label_names
    output_filename = args.output_filename
    use_percentile = args.use_percentile_for_xlims
    xlim_adjustment = args.xlim_adjustment

    dset = extract_dataset_info(config_filepath)

    # exclude_zeros_list = []
    # for key in dset:
    #     exclude_zeros_list = dset[key]["exclude_zeros"]
    #     break

    # exclude_zeros = tuple(exclude_zeros_list)

    # make_distributions(dset, output_dir, exclude_zeros)
    if label_names[0] == "None":
        make_distributions_comparison_grid_by_label(dset, output_dir, output_filename, use_percentile_for_xlims=use_percentile, xlim_adjustment=xlim_adjustment)
    else:
        make_distributions_comparison_grid_by_label(dset, output_dir, output_filename, label_names, use_percentile, xlim_adjustment)


if __name__ == "__main__":
    main()