# Hanrui/sglang: test/registered/debug_utils/test_crash_dump.py
# Upload metadata: Lekr0 - "Add files using upload-large-folder tool" (61ba51e, verified)
import glob
import os
import pickle
import shutil
import tempfile
import time
import unittest

import requests

from sglang.srt.environ import envs
from sglang.srt.utils import kill_process_tree
from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)
# Register this test module with the nightly single-GPU CI suites, once for
# CUDA and once for AMD. Both calls pass est_time=40 (unit not visible in this
# file -- presumably seconds; confirm in sglang.test.ci.ci_register).
register_cuda_ci(est_time=40, suite="nightly-1-gpu", nightly=True)
register_amd_ci(est_time=40, suite="nightly-amd-1-gpu", nightly=True)
class TestCrashDump(CustomTestCase):
    """Check that a server launched with ``--crash-dump-folder`` writes a dump.

    ``setUpClass`` starts a server whose crash dump folder is a fresh temporary
    directory, with ``SGLANG_TEST_CRASH_AFTER_STREAM_OUTPUTS`` overridden for
    the duration of the launch. The test then sends generate requests, waits
    for a ``crash_dump_*.pkl`` file to appear and validates its structure.
    """

    # Temporary directory passed to the server via --crash-dump-folder.
    crash_dump_folder = None
    MAX_NEW_TOKENS = 4
    NUM_REQUESTS_BEFORE_CRASH = 5
    # Upper bound, in seconds, on how long to wait for a readable dump file.
    CRASH_DUMP_WAIT_TIMEOUT = 30
    # Delay, in seconds, between two looks at the dump folder.
    CRASH_DUMP_POLL_INTERVAL = 0.5

    @classmethod
    def setUpClass(cls):
        """Create the dump folder and launch the server under test."""
        cls.crash_dump_folder = tempfile.mkdtemp(prefix="crash_dump_test_")
        # Presumably the number of streamed outputs after which the server
        # crashes on purpose -- the semantics live in sglang.srt.environ.
        crash_threshold = cls.NUM_REQUESTS_BEFORE_CRASH * cls.MAX_NEW_TOKENS + 10
        try:
            with envs.SGLANG_TEST_CRASH_AFTER_STREAM_OUTPUTS.override(
                crash_threshold
            ):
                cls.process = popen_launch_server(
                    "Qwen/Qwen3-0.6B",
                    DEFAULT_URL_FOR_TEST,
                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    other_args=[
                        "--crash-dump-folder",
                        cls.crash_dump_folder,
                        "--skip-server-warmup",
                    ],
                )
        except Exception:
            # unittest skips tearDownClass when setUpClass raises, so the
            # temporary directory has to be removed here.
            shutil.rmtree(cls.crash_dump_folder, ignore_errors=True)
            raise

    @classmethod
    def tearDownClass(cls):
        """Stop the server process tree and delete the temporary dump folder."""
        try:
            kill_process_tree(cls.process.pid)
        finally:
            # Remove the folder even if killing the process tree raised.
            shutil.rmtree(cls.crash_dump_folder, ignore_errors=True)

    def _load_crash_dump(self):
        """Wait for a crash dump file and return its unpickled content.

        Polls the dump folder until a file matching ``*/crash_dump_*.pkl``
        exists and can be unpickled, or until ``CRASH_DUMP_WAIT_TIMEOUT``
        seconds have elapsed, in which case the test fails. A file that is
        still being written (truncated pickle) is retried rather than treated
        as an immediate failure.
        """
        dump_pattern = os.path.join(self.crash_dump_folder, "*", "crash_dump_*.pkl")
        deadline = time.monotonic() + self.CRASH_DUMP_WAIT_TIMEOUT
        load_error = None
        while True:
            # Sort so the chosen file does not depend on directory order.
            dump_files = sorted(glob.glob(dump_pattern))
            if dump_files:
                try:
                    # The pickle comes from the server this test launched
                    # itself, so it is trusted input.
                    with open(dump_files[0], "rb") as f:
                        return pickle.load(f)
                except (EOFError, pickle.UnpicklingError) as err:
                    # Most likely the writer has not finished yet; try again.
                    load_error = err
            if time.monotonic() >= deadline:
                break
            time.sleep(self.CRASH_DUMP_POLL_INTERVAL)

        if load_error is not None:
            self.fail(
                f"Crash dump matching {dump_pattern} could not be unpickled: "
                f"{load_error!r}"
            )
        self.fail(
            f"No crash dump file found in {self.crash_dump_folder}. "
            f"Pattern: {dump_pattern}"
        )

    def test_crash_dump_generated(self):
        """Test that crash dump file is generated after server crash."""
        # Send multiple requests to trigger the crash
        for i in range(self.NUM_REQUESTS_BEFORE_CRASH * 2):
            try:
                requests.post(
                    DEFAULT_URL_FOR_TEST + "/generate",
                    json={
                        "text": f"Hello, this is request {i}.",
                        "sampling_params": {
                            "max_new_tokens": self.MAX_NEW_TOKENS,
                            "temperature": 0,
                        },
                    },
                    timeout=30,
                )
            except requests.exceptions.RequestException:
                # Connection error expected after crash
                pass

        # Wait for the crash dump to be written, then read it
        dump_data = self._load_crash_dump()

        # Verify the dump structure
        self.assertIn("server_args", dump_data)
        self.assertIn("requests", dump_data)

        # Check that there are more than NUM_REQUESTS_BEFORE_CRASH requests
        requests_list = dump_data["requests"]
        self.assertGreater(
            len(requests_list),
            self.NUM_REQUESTS_BEFORE_CRASH,
            f"Expected more than {self.NUM_REQUESTS_BEFORE_CRASH} requests in dump, but got {len(requests_list)}",
        )

        # Verify each request tuple has the expected structure
        # (obj, out, created_time, finish_time)
        for i, req_tuple in enumerate(requests_list):
            self.assertIsInstance(
                req_tuple,
                tuple,
                f"Request {i} should be a tuple, got {type(req_tuple)}",
            )
            self.assertGreaterEqual(
                len(req_tuple),
                4,
                f"Request {i} tuple should have at least 4 elements",
            )
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()