ChuxiJ committed on
Commit
6d3b89f
·
1 Parent(s): 033008e

resolve intermittent CUDA assertion error in concurrent serving scenarios

Browse files
acestep/third_parts/nano-vllm/nanovllm/engine/llm_engine.py CHANGED
@@ -1,4 +1,5 @@
1
  import atexit
 
2
  from dataclasses import fields
3
  from time import perf_counter
4
  from tqdm.auto import tqdm
@@ -20,6 +21,15 @@ class LLMEngine:
20
  config = Config(model, **config_kwargs)
21
  self.ps = []
22
  self.events = []
 
 
 
 
 
 
 
 
 
23
  ctx = mp.get_context("spawn")
24
  for i in range(1, config.tensor_parallel_size):
25
  event = ctx.Event()
@@ -108,6 +118,20 @@ class LLMEngine:
108
  sampling_params: SamplingParams | list[SamplingParams],
109
  use_tqdm: bool = True,
110
  unconditional_prompts: list[str] | list[list[int]] | None = None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  ) -> list[str]:
112
  # Clean up any residual state from previous interrupted generations
113
  # This prevents 'deque index out of range' errors from accumulated block leaks
 
1
  import atexit
2
+ import threading
3
  from dataclasses import fields
4
  from time import perf_counter
5
  from tqdm.auto import tqdm
 
21
  config = Config(model, **config_kwargs)
22
  self.ps = []
23
  self.events = []
24
+ # Thread-safety lock for generate().
25
+ # The scheduler, block manager, model runner, and CUDA graph buffers are all
26
+ # shared mutable state that is NOT thread-safe. In concurrent serving scenarios
27
+ # (API server with ThreadPoolExecutor, multiple queue workers, Gradio with
28
+ # concurrent requests), multiple threads can call generate() simultaneously.
29
+ # Without this lock, concurrent access corrupts scheduler state, block tables,
30
+ # and CUDA graph input buffers, leading to intermittent CUDA device-side
31
+ # assertion failures (illegal memory access in KV cache).
32
+ self._generate_lock = threading.Lock()
33
  ctx = mp.get_context("spawn")
34
  for i in range(1, config.tensor_parallel_size):
35
  event = ctx.Event()
 
118
  sampling_params: SamplingParams | list[SamplingParams],
119
  use_tqdm: bool = True,
120
  unconditional_prompts: list[str] | list[list[int]] | None = None,
121
+ ) -> list[str]:
122
+ # Serialize access to the engine to prevent concurrent corruption of
123
+ # scheduler state, block manager, CUDA graph buffers, and KV cache.
124
+ # This is the primary defense against the intermittent CUDA device-side
125
+ # assertion error that occurs in concurrent serving scenarios.
126
+ with self._generate_lock:
127
+ return self._generate_impl(prompts, sampling_params, use_tqdm, unconditional_prompts)
128
+
129
+ def _generate_impl(
130
+ self,
131
+ prompts: list[str] | list[list[int]],
132
+ sampling_params: SamplingParams | list[SamplingParams],
133
+ use_tqdm: bool = True,
134
+ unconditional_prompts: list[str] | list[list[int]] | None = None,
135
  ) -> list[str]:
136
  # Clean up any residual state from previous interrupted generations
137
  # This prevents 'deque index out of range' errors from accumulated block leaks
acestep/third_parts/nano-vllm/nanovllm/utils/context.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
  import torch
3
 
4
 
@@ -13,15 +14,34 @@ class Context:
13
  context_lens: torch.Tensor | None = None
14
  block_tables: torch.Tensor | None = None
15
 
16
- _CONTEXT = Context()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def get_context():
19
- return _CONTEXT
 
 
 
 
20
 
21
  def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None):
22
- global _CONTEXT
23
- _CONTEXT = Context(is_prefill, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables)
24
 
25
  def reset_context():
26
- global _CONTEXT
27
- _CONTEXT = Context()
 
1
  from dataclasses import dataclass
2
+ import threading
3
  import torch
4
 
5
 
 
14
  context_lens: torch.Tensor | None = None
15
  block_tables: torch.Tensor | None = None
16
 
17
+
18
+ # Thread-local storage for context.
19
+ #
20
+ # ROOT CAUSE FIX: The original implementation used a plain module-level global
21
+ # `_CONTEXT` variable. In concurrent serving scenarios (API server with
22
+ # ThreadPoolExecutor, multiple queue workers, or Gradio with concurrent requests),
23
+ # multiple threads can call set_context() / get_context() / reset_context()
24
+ # concurrently. This creates a race condition:
25
+ #
26
+ # Thread A: set_context(...) # sets slot_mapping, block_tables for request A
27
+ # Thread B: set_context(...) # OVERWRITES with request B's data
28
+ # Thread A: run_model(...) # reads Thread B's context → WRONG KV cache addresses
29
+ # # → CUDA illegal memory access / device-side assertion
30
+ #
31
+ # By using threading.local(), each thread gets its own independent Context,
32
+ # eliminating the race condition entirely.
33
+ _THREAD_LOCAL = threading.local()
34
+
35
 
36
  def get_context():
37
+ ctx = getattr(_THREAD_LOCAL, 'context', None)
38
+ if ctx is None:
39
+ ctx = Context()
40
+ _THREAD_LOCAL.context = ctx
41
+ return ctx
42
 
43
  def set_context(is_prefill, cu_seqlens_q=None, cu_seqlens_k=None, max_seqlen_q=0, max_seqlen_k=0, slot_mapping=None, context_lens=None, block_tables=None):
44
+ _THREAD_LOCAL.context = Context(is_prefill, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, slot_mapping, context_lens, block_tables)
 
45
 
46
  def reset_context():
47
+ _THREAD_LOCAL.context = Context()