# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. import json import logging import math from statistics import median logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) _NON_REQUEST_TOP_LEVEL_KEYS = { # System-level metrics "throughput", # Peak memory metrics (added by inference scripts; optionally checked if present in golden values) "mem-max-allocated-bytes", } def _median_as_float(value): """Convert scalar or list metric to a single float (median). For list metrics (e.g., per-request throughput), treat the first element as warmup if length > 1, matching existing throughput behavior. """ if isinstance(value, list): assert len(value) > 0, "Metric list is empty." values = [float(v) for v in value] if len(values) > 1: values = values[1:] return float(median(values)) return float(value) def _bytes_to_gib(num_bytes: float) -> float: return float(num_bytes) / (1024.0**3) def test_inference_pipeline(golden_values_path: str, test_values_path: str) -> None: with open(golden_values_path, 'r') as f1, open(test_values_path, 'r') as f2: golden_values_content = f1.read() tensorboard_content = f2.read() output_groundtruth = json.loads(golden_values_content) if isinstance(output_groundtruth, str): # Handle JSONL output, assume only one line in this case. output_groundtruth = json.loads(output_groundtruth) output_current = json.loads(tensorboard_content) if isinstance(output_current, str): # Handle JSONL output, assume only one line in this case. output_current = json.loads(output_current) groundtruth_request_ids = set(output_groundtruth.keys()) - _NON_REQUEST_TOP_LEVEL_KEYS current_request_ids = set(output_current.keys()) - _NON_REQUEST_TOP_LEVEL_KEYS assert groundtruth_request_ids.issuperset(current_request_ids), ( "Some request IDs from groundtruth are missing in current or current has unexpected IDs: " f"{sorted(groundtruth_request_ids)} vs {sorted(current_request_ids)}" ) if groundtruth_request_ids != current_request_ids: logger.warning( "Some request IDs from groundtruth are missing in output; only the subset of ids in groundtruth will be tested: " f"{sorted(groundtruth_request_ids)} vs {sorted(current_request_ids)}" ) assert len(output_groundtruth) > 0, "No test performed for output" # Throughput assertions. if "throughput" in output_groundtruth.keys(): # First warmup iteration is excluded from throughput statistics. throughput_sampled = median(output_current["throughput"][1:]) throughput_golden = median(output_groundtruth["throughput"][1:]) # 10% is empirically observed to be within hardware variance. assert ( throughput_sampled >= 0.9 * throughput_golden ), f"Throughput is slower than expected! Expected to be within 10% of ~{throughput_golden} tok/s but benchmarked {output_current['throughput']} tok/s" # If throughput is significantly improved (> 20%), update golden values accordingly. assert ( throughput_sampled < throughput_golden * 1.2 ), f"Throughput has been improved from expected ~{throughput_golden} tok/s to {output_current['throughput']} tok/s. Please update golden values in the functional tests." output_groundtruth.pop('throughput') # Peak memory regression checks (optional: only if present in golden values). if "mem-max-allocated-bytes" in output_groundtruth: assert "mem-max-allocated-bytes" in output_current, ( f"Golden values include mem-max-allocated-bytes but current output does not. " "Ensure the inference script records memory metrics to the output JSON." ) sampled = _median_as_float(output_current["mem-max-allocated-bytes"]) golden = _median_as_float(output_groundtruth["mem-max-allocated-bytes"]) assert golden > 0, f"Golden mem_max_allocated_bytes must be > 0, got {golden}." low = 0.95 * golden high = 1.05 * golden if sampled < low: raise AssertionError( f"Memory is too low for mem-max-allocated-bytes: " f"expected within 5% of {golden:.0f} bytes ({_bytes_to_gib(golden):.3f} GiB) " f"but got {sampled:.0f} bytes ({_bytes_to_gib(sampled):.3f} GiB). " "This is >5% lower than expected; please update golden values in the functional tests." ) if sampled > high: raise AssertionError( f"Memory is too high for mem-max-allocated-bytes: " f"expected within ±5% of {golden:.0f} bytes ({_bytes_to_gib(golden):.3f} GiB) " f"but got {sampled:.0f} bytes ({_bytes_to_gib(sampled):.3f} GiB). " "This is >5% higher than expected; this is likely a regression." ) output_groundtruth.pop("mem-max-allocated-bytes") for request_id, groundtruth_results in output_groundtruth.items(): current_results = output_current[request_id] at_least_one_test_loop = False if "generated_tokens" in groundtruth_results: at_least_one_test_loop = True tokens_groundtruth = groundtruth_results["generated_tokens"] tokens_current = current_results["generated_tokens"] # Check token equality assert ( tokens_groundtruth == tokens_current ), f"Token mismatch:\nGround truth: {tokens_groundtruth}\nCurrent: {tokens_current}" if "logprobs" in groundtruth_results: at_least_one_test_loop = True logprobs_groundtruth = groundtruth_results["logprobs"] logprobs_current = current_results["logprobs"] # Check logprobs length and tolerance assert len(logprobs_groundtruth) == len( logprobs_current ), f"Logprobs length mismatch: {len(logprobs_groundtruth)} vs {len(logprobs_current)}" for i, (lp1, lp2) in enumerate(zip(logprobs_groundtruth, logprobs_current)): assert math.isclose( lp1, lp2, abs_tol=0.001 ), f"Logprobs differ at index {i}: {lp1:.5f} vs {lp2:.5f}" if "generated_text" in groundtruth_results: at_least_one_test_loop = True generated_text_groundtruth = groundtruth_results["generated_text"] generated_text_current = current_results["generated_text"] min_len = min(len(generated_text_groundtruth), len(generated_text_current)) assert min_len > 0, ( "Generated text mismatch:" f"\nGround truth: {generated_text_groundtruth}\nCurrent: {generated_text_current}" ) assert generated_text_groundtruth[:min_len] == generated_text_current[:min_len], ( "Generated text mismatch:" f"\nGround truth (truncated to {min_len} chars): {generated_text_groundtruth[:min_len]}" f"\nCurrent (truncated to {min_len} chars): {generated_text_current[:min_len]}" ) if not at_least_one_test_loop: raise AssertionError(f"No test performed for output {groundtruth_results}")