| """ |
| This generates gpu kernel analysis output from an nsys-rep file. It calls |
| `nsys stats -r cuda_gpu_trace`, computes the non-overlapped GPU time per |
| kernel, then generates csv and html output for analysis. |
| """ |
|
|
| import argparse |
| import logging |
| import os |
| import shlex |
|
|
| import regex as re |
|
|
# Module-level logger; its format and level come from the
# logging.basicConfig() call made in main().
logger = logging.getLogger(__name__)
|
|
|
|
| |
def load_engine_model(json_dir=None):
    """Build the engine -> model -> kernel-mapping dict from ``*.json`` files.

    Args:
        json_dir: Directory to scan for ``*.json`` files. Defaults to the
            directory containing this script (or "." when that is empty).

    Returns:
        A dict keyed by engine name. Files are read in sorted filename order
        so the result is deterministic. When several files define the same
        engine with dict values, their models are merged (a later file wins
        on a per-model basis) instead of the later file replacing the whole
        engine entry.
    """
    import glob
    import json

    if json_dir is None:
        json_dir = os.path.dirname(__file__) or "."

    engine_model = {}
    # Escape the directory so glob metacharacters in the path are literal.
    pattern = os.path.join(glob.escape(json_dir), "*.json")
    for fname in sorted(glob.glob(pattern)):
        with open(fname, encoding="utf-8") as f:
            loaded = json.load(f)
        for engine, models in loaded.items():
            existing = engine_model.get(engine)
            if isinstance(existing, dict) and isinstance(models, dict):
                existing.update(models)
            else:
                engine_model[engine] = models
    return engine_model
|
|
|
|
class GPUTrace2Graph:
    """
    Parses output of nsys report, generates csv and bar chart output
    """

    def __init__(self):
        """Import pandas lazily and keep the module on the instance as self.pd."""
        import pandas as pd

        self.pd = pd
        # This is a process-wide pandas option, not a per-instance setting.
        self.pd.options.mode.copy_on_write = True

    def gen_nonoverlapped_sum_from_gputrace(self, in_file, out_file):
        """Summarise a cuda_gpu_trace CSV into per-kernel-name totals.

        Args:
            in_file: CSV with at least the columns "Start (ns)",
                "Duration (ns)", "Device", "Strm" and "Name".
            out_file: Destination CSV. It gets the columns
                "Elapsed Time (sec)" (non-overlapped), "Total Time (sec)"
                (sum of durations), "Instances" and "Name", sorted by
                elapsed time, largest first.
        """
        logger.info("loading %s", in_file)
        df = self.pd.read_csv(
            in_file, usecols=["Start (ns)", "Duration (ns)", "Device", "Strm", "Name"]
        )
        df["End (ns)"] = df["Start (ns)"] + df["Duration (ns)"]
        df = self.sum_non_overlapping_intervals(df)

        # "Instances" only needs to exist so that "size" can count the rows
        # of each kernel name.
        df["Instances"] = 1
        df_sum = df.groupby("Name", as_index=False).agg(
            {"Elapsed Time (ns)": "sum", "Duration (ns)": "sum", "Instances": "size"}
        )

        df_sum["Total Time (sec)"] = df_sum["Duration (ns)"] / 1e9
        df_sum["Elapsed Time (sec)"] = df_sum["Elapsed Time (ns)"] / 1e9
        df_sum = df_sum.sort_values(by="Elapsed Time (sec)", ascending=False)
        df_sum[["Elapsed Time (sec)", "Total Time (sec)", "Instances", "Name"]].to_csv(
            out_file, index=False
        )

    def sum_non_overlapping_intervals(self, df):
        """Return a copy of df sorted by start time with "Elapsed Time (ns)".

        For each row, the elapsed time is the part of [start, end] that is
        not already covered by any row that starts earlier. A row fully
        inside earlier rows gets 0; a row starting after all earlier rows
        have ended gets its full duration.

        Args:
            df: Frame with "Start (ns)", "Duration (ns)" and "End (ns)"
                columns. It is not modified.

        Returns:
            A new frame, sorted by "Start (ns)" with a fresh 0..n-1 index and
            the added "Elapsed Time (ns)" column. An empty input yields an
            empty frame that still has the column.
        """
        logger.info("sorting %s trace records by start time", str(df.shape))

        df = df.sort_values(by="Start (ns)").reset_index(drop=True)
        if df.empty:
            df["Elapsed Time (ns)"] = df["Duration (ns)"]
            return df

        starts = df["Start (ns)"]
        ends = df["End (ns)"]

        # Latest end time among all earlier rows. The first row has no
        # predecessor, so its own start is used, which leaves its full
        # duration as elapsed time. An in-dtype fill value also avoids the
        # NaN/float conversion a plain shift() would introduce.
        covered_until = ends.cummax().shift(1, fill_value=starts.iloc[0])

        # The uncovered part of a row begins at whichever is later: its own
        # start or the end of the already-covered timeline.
        uncovered_start = starts.where(starts > covered_until, covered_until)
        df["Elapsed Time (ns)"] = (ends - uncovered_start).clip(lower=0)
        return df

    @staticmethod
    def _model_engine_order(label):
        """Sort key ordering Model_Engine labels by their trailing "_<idx>".

        Numeric suffixes sort numerically (so 2 comes before 10); labels
        without a numeric suffix sort after them, by suffix text.
        """
        suffix = str(label).split("_")[-1]
        try:
            return (0, int(suffix), "")
        except ValueError:
            return (1, 0, suffix)

    def make_html(self, df, output_dir, title):
        """Write result.csv and result.html (chart plus table) to output_dir.

        Args:
            df: Combined frame with "Model_Engine", "Category", "Instances",
                "Name" and "Elapsed Time (sec)" columns. Its "Model_Engine"
                column is replaced by a categorical whose category order
                follows the numeric suffix of the labels. Nothing is written
                when df is empty.
            output_dir: Existing directory that receives the output files.
            title: Chart title suffix; "Model_Engine" is used when falsy.
        """
        import plotly.express as px

        if df.empty:
            return
        output_name = os.path.join(output_dir, "result")
        if not title:
            title = "Model_Engine"
        x = "Model_Engine"
        y = "Elapsed Time (sec)"
        color = "Category"

        # Generate the kernel mapping table (result.csv).
        df["Model_Engine"] = self.pd.Categorical(
            df["Model_Engine"],
            sorted(df["Model_Engine"].unique(), key=self._model_engine_order),
        )
        df[["Model_Engine", color, "Instances", "Name", y]].sort_values(
            by=color
        ).to_csv(f"{output_name}.csv", index=False)
        graph = px.histogram(
            df.round(2),
            x=x,
            y=y,
            title=(f"{y} for {title}"),
            color=color,
            text_auto=True,
        )

        graph.update_xaxes(automargin=True)
        graph.write_html(f"{output_name}.html")

        # Generate a data table with one column per Model_Engine and append
        # it to result.html.
        pivot_df = df.pivot_table(
            values="Elapsed Time (sec)",
            index="Category",
            columns="Model_Engine",
            aggfunc="sum",
            observed=False,
        ).round(2)

        pivot_df.loc["total_elapsed_sec"] = pivot_df.sum()
        # to_html() without a buffer returns the markup, so no scratch file
        # in the working directory is needed.
        with open(f"{output_name}.html", "a", encoding="utf-8") as outfile:
            outfile.write(pivot_df.fillna("").to_html())

        print(
            f"Finished generating: \n"
            f" {output_name}.html for stack bar chart \n"
            f" {output_name}.csv for Kernel-Category mapping"
        )

    def anno_gpu_kernname(self, df, mapping):
        """Add a "Category" column to df, in place.

        Args:
            df: Frame with a "Name" column holding kernel names.
            mapping: Dict of regex pattern -> category. Patterns are tried in
                dict order and the first one that matches anywhere in the
                name wins; a name matching no pattern gets None.
        """
        # Compile once instead of looking each pattern up for every row.
        patterns = [
            (re.compile(pattern), category) for pattern, category in mapping.items()
        ]

        def anno_gpu_kernname_helper(name):
            for pattern, category in patterns:
                if pattern.search(name):
                    return category
            return None

        df["Category"] = df["Name"].apply(anno_gpu_kernname_helper)

    def make_nongpu_row(self, df, nongpu_sec):
        """Return a one-row frame holding the non-GPU time entry.

        The row starts as a copy of the last row of df (so it carries the
        same columns), then "Category" and "Name" are set to "CPU(non-GPU)",
        "Instances" to 1 and "Elapsed Time (sec)" to nongpu_sec. df itself is
        not modified and must contain at least one row.
        """
        nongpu_row = self.pd.DataFrame([df.iloc[-1]])
        nongpu_row["Category"] = nongpu_row["Name"] = "CPU(non-GPU)"
        nongpu_row["Instances"] = 1
        nongpu_row["Elapsed Time (sec)"] = nongpu_sec
        return nongpu_row

    def is_valid_file(self, base_file):
        """Raise AssertionError if base_file is missing or empty.

        The exception is raised explicitly rather than through an assert
        statement so the check still runs under ``python -O``.
        """
        if not (os.path.isfile(base_file) and os.path.getsize(base_file) > 0):
            raise AssertionError(f"{base_file} doesn't exist or is empty")

    def should_gen_file(self, new_file, base_file):
        """Decide whether new_file must be (re)generated from base_file.

        Returns False only when new_file exists, is non-empty and is strictly
        newer than base_file; otherwise True.

        Raises:
            AssertionError: If base_file is missing or empty.
        """
        self.is_valid_file(base_file)
        up_to_date = (
            os.path.exists(new_file)
            and os.path.getsize(new_file) > 0
            and os.path.getmtime(new_file) > os.path.getmtime(base_file)
        )
        if up_to_date:
            logger.info("reusing %s", new_file)
            return False
        logger.info("generating %s", new_file)
        return True

    def gen_sum_file(self, file, nsys_cmd):
        """
        generates sum file from nsys trace with times per kernel and
        returns the name of the sum file

        Runs ``nsys stats -r cuda_gpu_trace`` when the trace CSV is missing or
        older than the nsys-rep file, then rebuilds the per-kernel sum CSV
        when it is missing or older than the trace CSV. Both CSVs are written
        next to the input file.

        Raises:
            SystemExit: With status 1 when nsys cannot be started or fails.
            AssertionError: When an expected input or output file is missing
                or empty.
        """
        import subprocess

        file_dir = os.path.dirname(file) or "."
        file_name = os.path.basename(file)
        # The "-o" prefix and the expected CSV name share this prefix so the
        # file that is read back is the one nsys was asked to write.
        out_prefix = os.path.join(file_dir, file_name)

        nsys_stats_file = f"{out_prefix}_cuda_gpu_trace.csv"
        sum_file = f"{out_prefix}_cuda_gpu_kernel_tracesum.csv"
        if self.should_gen_file(nsys_stats_file, file):
            cmd = [
                nsys_cmd,
                "stats",
                "-r",
                "cuda_gpu_trace",
                file,
                "-o",
                out_prefix,
            ]
            cmd_str = shlex.join(cmd)
            logger.info("+ %s", cmd_str)

            # Rough estimate at 240 MB of nsys-rep per minute.
            file_size_mb = os.path.getsize(file) / 1e6
            logger.info(
                "nsys stats for %.2f MB file expected to take %.2f min",
                file_size_mb,
                file_size_mb / 240,
            )
            try:
                subprocess.run(cmd, check=True)
            except (FileNotFoundError, subprocess.CalledProcessError) as e:
                logger.error(
                    "'%s' failed: %s. Use --nsys_cmd to specify nsys path", cmd_str, e
                )
                raise SystemExit(1) from e

        # Checked separately from the nsys step so that a missing or stale
        # sum file is rebuilt even when the trace CSV itself is current.
        if self.should_gen_file(sum_file, nsys_stats_file):
            logger.info("generating non-overlapped sum %s", sum_file)
            self.gen_nonoverlapped_sum_from_gputrace(nsys_stats_file, sum_file)
        self.is_valid_file(sum_file)
        logger.info("Finished generating %s", sum_file)
        return sum_file

    def gen_graph(self, in_file, out_dir, title, nsys_cmd, engine_model):
        """generates graph and csv file from in_file into out_dir

        Args:
            in_file: Iterable of (nsys-rep path, engine, model, total_sec)
                entries; total_sec must be convertible to float.
            out_dir: Output directory, created if needed; "." when None.
            title: Chart title passed on to make_html.
            nsys_cmd: nsys executable name or path.
            engine_model: Dict of engine -> model -> kernel-regex mapping.

        Raises:
            AssertionError: For an unknown engine or model.
            ValueError: For a malformed entry or non-numeric total_sec.
        """
        # Validate every entry before any nsys run, so a typo in a later
        # entry does not surface only after the earlier traces were processed.
        entries = []
        for file, engine, model, total_sec in in_file:
            if not engine_model.get(engine):
                raise AssertionError(f"engine {engine} unknown")
            if not engine_model[engine].get(model):
                raise AssertionError(f"model {model} unknown")
            entries.append((file, engine, model, round(float(total_sec), 1)))

        frames = []
        for idx, (file, engine, model, total_sec) in enumerate(entries):
            sum_file = self.gen_sum_file(file, nsys_cmd)
            df = self.pd.read_csv(sum_file)

            file_name = os.path.basename(file).replace(".nsys-rep", "")
            df["Model_Engine"] = f"{model}_{engine}_{file_name}_{idx}"
            self.anno_gpu_kernname(df, engine_model[engine][model])

            gpu_sec = round(df["Elapsed Time (sec)"].sum(), 1)
            if total_sec < gpu_sec:
                logger.warning(
                    "Elapsed sec %.2f < GPU sec %.2f resetting Elapsed sec ",
                    total_sec,
                    gpu_sec,
                )
                total_sec = gpu_sec
            nongpu_row = self.make_nongpu_row(df, total_sec - gpu_sec)
            frames.append(self.pd.concat([df, nongpu_row], ignore_index=True))

        # One concat at the end instead of growing a frame inside the loop.
        if frames:
            combined_df = self.pd.concat(frames, ignore_index=True)
        else:
            combined_df = self.pd.DataFrame()

        if out_dir is None:
            out_dir = "."
        else:
            os.makedirs(out_dir, exist_ok=True)

        self.make_html(combined_df, out_dir, title)
|
|
|
|
def parse_tuple(s):
    """Parse one --in_file entry "nsys-rep,engine,model,elapsed_sec".

    Args:
        s: Comma-separated string with exactly four fields.

    Returns:
        A 4-tuple of strings with surrounding whitespace stripped. The last
        field is only checked to be numeric; it is returned as a string.

    Raises:
        argparse.ArgumentTypeError: If the entry does not have four non-empty
            fields or the last one is not a number. Raising here lets
            argparse report the problem up front instead of it surfacing as
            an unpacking or float() error after the trace was processed.
    """
    fields = tuple(part.strip() for part in s.split(","))
    if len(fields) != 4 or not all(fields):
        raise argparse.ArgumentTypeError(
            f"expected 'nsys-rep,engine,model,elapsed_sec' but got '{s}'"
        )
    try:
        float(fields[3])
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"elapsed_sec must be a number but got '{fields[3]}' in '{s}'"
        ) from None
    return fields
|
|
|
|
def main():
    """Command-line entry point.

    Configures logging, loads the supported engine/model table so it can be
    listed in the --in_file help text, parses the command line and hands the
    arguments to GPUTrace2Graph.gen_graph.
    """
    logging.basicConfig(
        format=("%(asctime)s - %(levelname)s - %(message)s"), level=logging.INFO
    )
    parser = argparse.ArgumentParser(
        description=(
            "Process nsys rep and generate kernel non-overlapped cycles. \n"
            "Example:\n"
            "gputrc2graph.py --in_file d1.nsys-rep,sglang,llama,100 \n"
            "d2.nsys-rep,sglang,gpt-oss,102 "
            '--out_dir results/ --title "Model=gpt-oss SGLANG chart"'
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    engine_model_supported = load_engine_model()

    # Rendered as "engine:[model, model], engine:[...]" for the help text.
    engine_model_supported_str = ", ".join(
        f"{engine}:[{', '.join(models)}]"
        for engine, models in engine_model_supported.items()
    )
    parser.add_argument(
        "--in_file",
        type=parse_tuple,
        nargs="+",
        help=(
            "list of (nsys-rep, engine, model, elapsed_nonprofiled_sec) "
            "separated by space. Elapsed_nonprofiled_sec is runtime without "
            "profiling used to calculate non-gpu time. Specify 0 to use "
            "elapsed time from nsys-rep but that might inflate non-gpu time. "
            f"Available engine:[model] are: {engine_model_supported_str} "
            "Example: --in_file d1.nsys-rep,sglang,llama,100 "
            "d2.nsys-rep,sglang,gpt-oss,102"
        ),
        required=True,
    )
    parser.add_argument("--out_dir", help=("output dir for result.csv/html"))
    parser.add_argument("--title", help=("title for html chart"))
    parser.add_argument(
        "--nsys_cmd",
        help=("nsys cmd, e.g. /usr/bin/nsys, Default: nsys"),
        default="nsys",
    )
    args = parser.parse_args()
    gputrace = GPUTrace2Graph()
    gputrace.gen_graph(
        args.in_file, args.out_dir, args.title, args.nsys_cmd, engine_model_supported
    )
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
|
|