llaa33219 commited on
Commit
3f88742
·
verified ·
1 Parent(s): 91ebbd3

Upload solar_open_logits_processor.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. solar_open_logits_processor.py +763 -0
solar_open_logits_processor.py ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 Upstage AI.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from enum import Enum
18
+ from typing import TYPE_CHECKING
19
+
20
+ import torch
21
+
22
+ from vllm.sampling_params import SamplingParams
23
+ from vllm.v1.sample.logits_processor import (
24
+ AdapterLogitsProcessor,
25
+ RequestLogitsProcessor,
26
+ )
27
+
28
+ if TYPE_CHECKING:
29
+ from vllm.config import VllmConfig
30
+
31
# Hardcoded token IDs for Solar tokenizer
# NOTE(review): these IDs are tokenizer-specific — assumed to match the Solar
# Open tokenizer's special-token vocabulary; verify against the tokenizer config.

# Special token IDs for chat template
BEGIN_TOKEN_ID = 20  # <|begin|>
END_TOKEN_ID = 21  # <|end|>
THINK_TOKEN_ID = 22  # <|think|>
CONTENT_TOKEN_ID = 23  # <|content|>
FLUSH_TOKEN_ID = 24  # <|flush|> (eos token)
ASSISTANT_TOKEN_ID = 163444  # assistant
'''
'assistant' is not a special token exactly, but is treated as one in the logits
processing.
'''

# Tool call related tokens
CALLS_TOKEN_ID = 25  # <|calls|> (eos token for tool calls)
TOOL_CALLS_TOKEN_ID = 30  # <|tool_calls|>
TOOL_CALL_BEGIN_TOKEN_ID = 31  # <|tool_call:begin|>
TOOL_CALL_END_TOKEN_ID = 32  # <|tool_call:end|>
TOOL_CALL_NAME_TOKEN_ID = 33  # <|tool_call:name|>
TOOL_CALL_ARGS_TOKEN_ID = 34  # <|tool_call:args|>

# =============================================================================
# Dynamic Reasoning Budget Configuration
# =============================================================================
# budget = min(max_budget, max(min_budget, max_tokens * ratio / 100))
# Priority: max_budget > min_budget > ratio
#
# Available environment variables:
#   HIGH effort:
#     SOLAR_REASONING_BUDGET_HIGH_MAX (default: 32768) - max_budget
#     SOLAR_REASONING_BUDGET_HIGH_MIN (default: 8192) - min_budget
#     SOLAR_REASONING_BUDGET_HIGH_RATIO (default: 60) - % of max_tokens
#
#   MEDIUM effort:
#     SOLAR_REASONING_BUDGET_MEDIUM_MAX (default: 16384) - max_budget
#     SOLAR_REASONING_BUDGET_MEDIUM_MIN (default: 4096) - min_budget
#     SOLAR_REASONING_BUDGET_MEDIUM_RATIO (default: 30) - % of max_tokens
#
#   Tool call:
#     SOLAR_TOOL_CALL_ID_BUDGET (default: 10) - Max tokens for tool call ID
# =============================================================================

# Effort level assumed when the request does not specify one.
DEFAULT_REASONING_EFFORT = "high"

# HIGH effort settings (1k = 1024 tokens)
DEFAULT_REASONING_BUDGET_HIGH_MAX = 32 * 1024
DEFAULT_REASONING_BUDGET_HIGH_MIN = 8 * 1024
DEFAULT_REASONING_BUDGET_HIGH_RATIO = 60

# MEDIUM effort settings
DEFAULT_REASONING_BUDGET_MEDIUM_MAX = 16 * 1024
DEFAULT_REASONING_BUDGET_MEDIUM_MIN = 4 * 1024
DEFAULT_REASONING_BUDGET_MEDIUM_RATIO = 30

# Tool call settings
DEFAULT_TOOL_CALL_ID_BUDGET = 10

# Pre-computed constant to avoid repeated string parsing
NEG_INF = float("-inf")
91
+
92
+
93
+ def is_reasoning_request(params: SamplingParams) -> bool:
94
+ """Check if the request is a reasoning request based on reasoning_effort."""
95
+ return (params.reasoning_effort is None) or (params.reasoning_effort in ("medium", "high"))
96
+
97
+
98
+ def is_structured_outputs(params: SamplingParams) -> bool:
99
+ """Check if the request has structured outputs constraints."""
100
+ return (
101
+ params.structured_outputs is not None
102
+ and not params.structured_outputs.all_constraints_none()
103
+ )
104
+
105
+
106
+ class GenerationState(Enum):
107
+ """Enum representing the current state of response generation."""
108
+
109
+ # Initial state - no tokens generated yet
110
+ INITIAL = "initial"
111
+
112
+ # New message states (after think_end)
113
+ NEW_MESSAGE_BEGIN = "new_message_begin" # <|begin|> token was just generated
114
+ NEW_MESSAGE_ASSISTANT = "new_message_assistant" # assistant token after <|begin|>
115
+
116
+ # Think mode states
117
+ THINK_BEGIN = "think_begin" # <|think|> token was just generated
118
+ THINK_IN_PROGRESS = "think_in_progress" # Generating think content
119
+ THINK_END = "think_end" # <|end|> after think content
120
+ THINK_FLUSH = "think_flush" # <|flush|> after think content
121
+
122
+ # Content states
123
+ CONTENT_BEGIN = "content_begin" # <|content|> token was just generated
124
+ CONTENT_IN_PROGRESS = "content_in_progress" # Generating content
125
+ CONTENT_END = "content_end" # <|end|> or <|flush|> after content
126
+ CONTENT_FLUSH = "content_flush" # <|flush|> after content
127
+
128
+ # Tool call states
129
+ # Flow: <|tool_calls|> -> (<|tool_call:begin|> -> id -> <|tool_call:name|> -> name -> <|tool_call:args|> -> args -> <|tool_call:end|>)+ -> <|calls|>
130
+ # Note: Think message can appear before <|tool_calls|>
131
+ TOOL_CALLS_BEGIN = "tool_calls_begin" # <|tool_calls|> token was just generated
132
+ TOOL_CALL_BEGIN = "tool_call_begin" # <|tool_call:begin|> token was just generated
133
+ TOOL_CALL_ID_IN_PROGRESS = "tool_call_id_in_progress" # Generating tool call ID
134
+ TOOL_CALL_NAME_BEGIN = "tool_call_name_begin" # <|tool_call:name|> token was just generated
135
+ TOOL_CALL_NAME_IN_PROGRESS = "tool_call_name_in_progress" # Generating tool name
136
+ TOOL_CALL_ARGS_BEGIN = "tool_call_args_begin" # <|tool_call:args|> token was just generated
137
+ TOOL_CALL_ARGS_IN_PROGRESS = "tool_call_args_in_progress" # Generating tool arguments (JSON)
138
+ TOOL_CALL_END = "tool_call_end" # <|tool_call:end|> token was just generated (can start another tool call or end)
139
+ CALLS = "calls" # <|calls|> token was just generated (eos token for tool calls)
140
+
141
+
142
def get_generation_state(
    output_token_ids: list[int],
    begin_token_id: int = BEGIN_TOKEN_ID,
    end_token_id: int = END_TOKEN_ID,
    flush_token_id: int = FLUSH_TOKEN_ID,
    think_token_id: int = THINK_TOKEN_ID,
    content_token_id: int = CONTENT_TOKEN_ID,
    tool_calls_token_id: int = TOOL_CALLS_TOKEN_ID,
    tool_call_begin_token_id: int = TOOL_CALL_BEGIN_TOKEN_ID,
    tool_call_name_token_id: int = TOOL_CALL_NAME_TOKEN_ID,
    tool_call_args_token_id: int = TOOL_CALL_ARGS_TOKEN_ID,
    tool_call_end_token_id: int = TOOL_CALL_END_TOKEN_ID,
    calls_token_id: int = CALLS_TOKEN_ID,
    assistant_token_id: int = ASSISTANT_TOKEN_ID,
) -> GenerationState:
    """Determine the current generation state from the generated token IDs.

    Replays the token sequence through a small state machine mirroring the
    Solar Open chat template:

    - think mode: <|think|>{{think-tokens}}<|end|><|begin|>assistant<|content|>{{content-tokens}}<|flush|>
    - tool mode: <|begin|>assistant<|tool_calls|><|tool_call:begin|>{{id}}<|tool_call:name|>{{name}}<|tool_call:args|>{{args}}<|tool_call:end|><|calls|>
    - tool mode (with think): <|think|>{{think-tokens}}<|end|><|begin|>assistant<|tool_calls|>...<|calls|>
    - no-think mode: <|content|>{{content-tokens}}<|flush|>

    Args:
        output_token_ids: List of token IDs generated so far.
        begin_token_id, end_token_id, flush_token_id, think_token_id,
        content_token_id, tool_calls_token_id, tool_call_begin_token_id,
        tool_call_name_token_id, tool_call_args_token_id,
        tool_call_end_token_id, calls_token_id, assistant_token_id:
            Overridable IDs of the template's special tokens (assumed
            pairwise distinct).

    Returns:
        GenerationState indicating the current phase of generation.
    """
    if not output_token_ids:
        return GenerationState.INITIAL

    # Special tokens whose resulting state does not depend on context.
    direct = {
        think_token_id: GenerationState.THINK_BEGIN,
        content_token_id: GenerationState.CONTENT_BEGIN,
        tool_calls_token_id: GenerationState.TOOL_CALLS_BEGIN,
        tool_call_begin_token_id: GenerationState.TOOL_CALL_BEGIN,
        tool_call_name_token_id: GenerationState.TOOL_CALL_NAME_BEGIN,
        tool_call_args_token_id: GenerationState.TOOL_CALL_ARGS_BEGIN,
        tool_call_end_token_id: GenerationState.TOOL_CALL_END,
        calls_token_id: GenerationState.CALLS,
        begin_token_id: GenerationState.NEW_MESSAGE_BEGIN,
    }
    # A *_BEGIN state advances to its *_IN_PROGRESS state on the first
    # regular token; *_IN_PROGRESS states absorb further regular tokens.
    on_regular = {
        GenerationState.THINK_BEGIN: GenerationState.THINK_IN_PROGRESS,
        GenerationState.CONTENT_BEGIN: GenerationState.CONTENT_IN_PROGRESS,
        GenerationState.TOOL_CALL_BEGIN: GenerationState.TOOL_CALL_ID_IN_PROGRESS,
        GenerationState.TOOL_CALL_NAME_BEGIN: GenerationState.TOOL_CALL_NAME_IN_PROGRESS,
        GenerationState.TOOL_CALL_ARGS_BEGIN: GenerationState.TOOL_CALL_ARGS_IN_PROGRESS,
    }

    state = GenerationState.INITIAL
    in_think = False
    in_content = False

    for tok in output_token_ids:
        if tok in direct:
            state = direct[tok]
            # Track which block (think/content) we are inside so that a
            # later <|end|>/<|flush|> can be attributed correctly.
            if tok == think_token_id:
                in_think, in_content = True, False
            elif tok == content_token_id:
                in_think, in_content = False, True
            elif tok == tool_calls_token_id:
                in_think = in_content = False
        elif tok == assistant_token_id:
            # 'assistant' is only meaningful immediately after <|begin|>;
            # anywhere else it leaves the state untouched.
            if state == GenerationState.NEW_MESSAGE_BEGIN:
                state = GenerationState.NEW_MESSAGE_ASSISTANT
        elif tok == end_token_id:
            if in_think:
                state, in_think = GenerationState.THINK_END, False
            elif in_content:
                state, in_content = GenerationState.CONTENT_END, False
        elif tok == flush_token_id:
            if in_think:
                state, in_think = GenerationState.THINK_FLUSH, False
            elif in_content:
                state, in_content = GenerationState.CONTENT_FLUSH, False
        else:
            # Regular token: advance a *_BEGIN state, otherwise stay put.
            state = on_regular.get(state, state)

    return state
272
+
273
+
274
# Pre-computed list of all special token IDs for batch indexing
# (indexing a logits tensor with this list masks every special token at once).
_ALL_SPECIAL_TOKEN_IDS = [
    BEGIN_TOKEN_ID,
    END_TOKEN_ID,
    THINK_TOKEN_ID,
    CONTENT_TOKEN_ID,
    FLUSH_TOKEN_ID,
    CALLS_TOKEN_ID,
    TOOL_CALLS_TOKEN_ID,
    TOOL_CALL_BEGIN_TOKEN_ID,
    TOOL_CALL_END_TOKEN_ID,
    TOOL_CALL_NAME_TOKEN_ID,
    TOOL_CALL_ARGS_TOKEN_ID,
]

# Pre-computed lists for state-specific batch indexing (excluding allowed tokens).
# Each list is "all special tokens minus the one(s) the state may emit".
_SPECIAL_EXCEPT_END = [  # For THINK states (allow END)
    BEGIN_TOKEN_ID, FLUSH_TOKEN_ID, THINK_TOKEN_ID, CONTENT_TOKEN_ID,
    TOOL_CALLS_TOKEN_ID, CALLS_TOKEN_ID, TOOL_CALL_BEGIN_TOKEN_ID,
    TOOL_CALL_END_TOKEN_ID, TOOL_CALL_NAME_TOKEN_ID, TOOL_CALL_ARGS_TOKEN_ID,
]

_SPECIAL_EXCEPT_CONTENT_TOOLCALLS = [  # For NEW_MESSAGE_ASSISTANT (allow CONTENT, TOOL_CALLS)
    THINK_TOKEN_ID, BEGIN_TOKEN_ID, END_TOKEN_ID, FLUSH_TOKEN_ID,
    CALLS_TOKEN_ID, TOOL_CALL_BEGIN_TOKEN_ID, TOOL_CALL_END_TOKEN_ID,
    TOOL_CALL_NAME_TOKEN_ID, TOOL_CALL_ARGS_TOKEN_ID,
]

_SPECIAL_EXCEPT_FLUSH = [  # For CONTENT states (allow FLUSH)
    BEGIN_TOKEN_ID, END_TOKEN_ID, THINK_TOKEN_ID, CONTENT_TOKEN_ID,
    TOOL_CALLS_TOKEN_ID, CALLS_TOKEN_ID, TOOL_CALL_BEGIN_TOKEN_ID,
    TOOL_CALL_END_TOKEN_ID, TOOL_CALL_NAME_TOKEN_ID, TOOL_CALL_ARGS_TOKEN_ID,
]

_SPECIAL_EXCEPT_TOOLCALL_NAME = [  # For TOOL_CALL_ID_IN_PROGRESS (allow TOOL_CALL_NAME)
    BEGIN_TOKEN_ID, END_TOKEN_ID, THINK_TOKEN_ID, CONTENT_TOKEN_ID,
    FLUSH_TOKEN_ID, CALLS_TOKEN_ID, TOOL_CALLS_TOKEN_ID,
    TOOL_CALL_BEGIN_TOKEN_ID, TOOL_CALL_END_TOKEN_ID, TOOL_CALL_ARGS_TOKEN_ID,
]

_SPECIAL_EXCEPT_TOOLCALL_ARGS = [  # For TOOL_CALL_NAME_IN_PROGRESS (allow TOOL_CALL_ARGS)
    BEGIN_TOKEN_ID, END_TOKEN_ID, THINK_TOKEN_ID, CONTENT_TOKEN_ID,
    FLUSH_TOKEN_ID, CALLS_TOKEN_ID, TOOL_CALLS_TOKEN_ID,
    TOOL_CALL_BEGIN_TOKEN_ID, TOOL_CALL_END_TOKEN_ID, TOOL_CALL_NAME_TOKEN_ID,
]

_SPECIAL_EXCEPT_TOOLCALL_END = [  # For TOOL_CALL_ARGS_IN_PROGRESS (allow TOOL_CALL_END)
    BEGIN_TOKEN_ID, END_TOKEN_ID, THINK_TOKEN_ID, CONTENT_TOKEN_ID,
    FLUSH_TOKEN_ID, CALLS_TOKEN_ID, TOOL_CALLS_TOKEN_ID,
    TOOL_CALL_BEGIN_TOKEN_ID, TOOL_CALL_NAME_TOKEN_ID, TOOL_CALL_ARGS_TOKEN_ID,
]
325
+
326
+
327
def _forbid_all_special_tokens(logits: torch.Tensor) -> None:
    """Set all special token logits to -inf.

    Mutates ``logits`` in place so that only regular (non-template)
    tokens remain sampleable.
    """
    logits[_ALL_SPECIAL_TOKEN_IDS] = NEG_INF
330
+
331
+
332
class SolarOpenTemplateEnforcer:
    """Request-level logits processor that enforces Solar Open chat template.

    Enforces the following generation rules:
    - think mode: <|think|>{{tokens}}<|end|><|begin|>assistant<|content|>{{tokens}}<|flush|>
    - tool mode: <|tool_calls|><|tool_call:begin|>{{id}}<|tool_call:name|>{{name}}<|tool_call:args|>{{args}}<|tool_call:end|><|calls|>
    - tool+think mode: <|think|>{{tokens}}<|end|><|begin|>assistant<|tool_calls|>...<|calls|>
    - no-think mode: <|content|>{{tokens}}<|flush|>

    Key constraints:
    - Think message can only appear first
    - Think message must be followed by another message
    - Content and tool messages cannot coexist
    - Maximum 2 messages (think + content/tool, or just content/tool)

    Performance optimization:
    - Uses incremental state tracking to avoid full token sequence scan on each call
    - Maintains local counters for budget tracking
    - Uses pre-computed constants to avoid repeated object creation
    """

    # Pre-computed frozenset for reasoning state check (avoids set creation per call)
    _REASONING_STATES = frozenset({
        GenerationState.INITIAL,
        GenerationState.THINK_BEGIN,
        GenerationState.THINK_IN_PROGRESS,
    })

    def __init__(
        self,
        is_reasoning_request: bool,
        is_structured_outputs: bool,
        reasoning_budget: int | None = None,
        tool_call_id_budget: int = DEFAULT_TOOL_CALL_ID_BUDGET,
    ):
        """Initialize the enforcer for a single request.

        Args:
            is_reasoning_request: Whether the request must start with a
                <|think|> block.
            is_structured_outputs: Whether structured-output constraints are
                active (limits enforcement to the reasoning phase).
            reasoning_budget: Max tokens allowed inside the think block, or
                None for no budget limit.
            tool_call_id_budget: Max tokens allowed for a tool call ID.
        """
        self._is_reasoning_request = is_reasoning_request
        self._is_structured_outputs = is_structured_outputs
        self._reasoning_budget = reasoning_budget
        self._tool_call_id_budget = tool_call_id_budget

        # Incremental state tracking
        self._state: GenerationState = GenerationState.INITIAL
        self._last_processed_len: int = 0  # tokens already consumed by the tracker
        self._in_think: bool = False
        self._in_content: bool = False

        # Budget counters
        self._think_token_count: int = 0
        self._tool_call_id_token_count: int = 0

    def _reset_state(self) -> None:
        """Reset all incremental state to initial values.

        Called when defensive reprocessing is needed (e.g., token sequence inconsistency).
        """
        self._state = GenerationState.INITIAL
        self._last_processed_len = 0
        self._in_think = False
        self._in_content = False
        self._think_token_count = 0
        self._tool_call_id_token_count = 0

    def _process_token(self, token_id: int) -> None:
        """Process a single token and update internal state incrementally.

        Mirrors the transition logic of ``get_generation_state`` but also
        maintains the think/tool-call-ID budget counters.

        Args:
            token_id: The token ID to process.
        """
        if token_id == THINK_TOKEN_ID:
            self._state = GenerationState.THINK_BEGIN
            self._in_think = True
            self._in_content = False
            self._think_token_count = 0  # Reset counter for new think block

        elif token_id == CONTENT_TOKEN_ID:
            self._state = GenerationState.CONTENT_BEGIN
            self._in_content = True
            self._in_think = False

        elif token_id == TOOL_CALLS_TOKEN_ID:
            self._state = GenerationState.TOOL_CALLS_BEGIN
            self._in_think = False
            self._in_content = False

        elif token_id == TOOL_CALL_BEGIN_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_BEGIN
            self._tool_call_id_token_count = 0  # Reset counter for new tool call

        elif token_id == TOOL_CALL_NAME_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_NAME_BEGIN

        elif token_id == TOOL_CALL_ARGS_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_ARGS_BEGIN

        elif token_id == TOOL_CALL_END_TOKEN_ID:
            self._state = GenerationState.TOOL_CALL_END

        elif token_id == CALLS_TOKEN_ID:
            self._state = GenerationState.CALLS

        elif token_id == BEGIN_TOKEN_ID:
            self._state = GenerationState.NEW_MESSAGE_BEGIN

        elif token_id == ASSISTANT_TOKEN_ID:
            # 'assistant' only matters directly after <|begin|>; elsewhere
            # it is an ordinary token and leaves the state untouched.
            if self._state == GenerationState.NEW_MESSAGE_BEGIN:
                self._state = GenerationState.NEW_MESSAGE_ASSISTANT

        elif token_id == END_TOKEN_ID:
            if self._in_think:
                self._state = GenerationState.THINK_END
                self._in_think = False
            elif self._in_content:
                self._state = GenerationState.CONTENT_END
                self._in_content = False

        elif token_id == FLUSH_TOKEN_ID:
            if self._in_think:
                self._state = GenerationState.THINK_FLUSH
                self._in_think = False
            elif self._in_content:
                self._state = GenerationState.CONTENT_FLUSH
                self._in_content = False

        else:
            # Regular token - update state and counters based on current context
            if self._state == GenerationState.THINK_BEGIN:
                self._state = GenerationState.THINK_IN_PROGRESS
                self._think_token_count += 1
            elif self._state == GenerationState.THINK_IN_PROGRESS:
                self._think_token_count += 1
            elif self._state == GenerationState.CONTENT_BEGIN:
                self._state = GenerationState.CONTENT_IN_PROGRESS
            elif self._state == GenerationState.CONTENT_IN_PROGRESS:
                pass  # Stay in content_in_progress
            elif self._state == GenerationState.TOOL_CALL_BEGIN:
                self._state = GenerationState.TOOL_CALL_ID_IN_PROGRESS
                self._tool_call_id_token_count += 1
            elif self._state == GenerationState.TOOL_CALL_ID_IN_PROGRESS:
                self._tool_call_id_token_count += 1
            elif self._state == GenerationState.TOOL_CALL_NAME_BEGIN:
                self._state = GenerationState.TOOL_CALL_NAME_IN_PROGRESS
            elif self._state == GenerationState.TOOL_CALL_NAME_IN_PROGRESS:
                pass  # Stay in tool_call_name_in_progress
            elif self._state == GenerationState.TOOL_CALL_ARGS_BEGIN:
                self._state = GenerationState.TOOL_CALL_ARGS_IN_PROGRESS
            elif self._state == GenerationState.TOOL_CALL_ARGS_IN_PROGRESS:
                pass  # Stay in tool_call_args_in_progress

    def _update_state_incremental(self, output_token_ids: list[int]) -> None:
        """Update internal state by processing only new tokens.

        Args:
            output_token_ids: Full list of output token IDs.
        """
        current_len = len(output_token_ids)

        # Defensive check: if token sequence is shorter than expected, reset and reprocess
        if current_len < self._last_processed_len:
            self._reset_state()

        # Process only new tokens
        for i in range(self._last_processed_len, current_len):
            self._process_token(output_token_ids[i])

        self._last_processed_len = current_len

    @staticmethod
    def _count_think_tokens(output_token_ids: list[int]) -> int:
        """Count the number of tokens generated after <|think|> token.

        Returns 0 if <|think|> token is not found (defensive).
        Note: This static method is kept for backward compatibility and testing.
        The incremental version uses _think_token_count instead.
        """
        try:
            think_index = output_token_ids.index(THINK_TOKEN_ID)
            return len(output_token_ids) - think_index - 1
        except ValueError:
            return 0

    @staticmethod
    def _count_tool_call_id_tokens(output_token_ids: list[int]) -> int:
        """Count the number of tokens generated after the last <|tool_call:begin|> token.

        Returns 0 if <|tool_call:begin|> token is not found (defensive).
        Note: This static method is kept for backward compatibility and testing.
        The incremental version uses _tool_call_id_token_count instead.
        """
        # Find the last occurrence of <|tool_call:begin|> for multi-tool-call support
        try:
            # Reverse search for the last <|tool_call:begin|>
            reversed_index = output_token_ids[::-1].index(TOOL_CALL_BEGIN_TOKEN_ID)
            last_begin_index = len(output_token_ids) - 1 - reversed_index
            return len(output_token_ids) - last_begin_index - 1
        except ValueError:
            return 0

    def __call__(
        self,
        output_token_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Apply template constraints to ``logits`` and return them.

        Mutates ``logits`` in place (masking/forcing tokens according to
        the current generation state) and returns the same tensor.

        Args:
            output_token_ids: All token IDs generated so far for this request.
            logits: Logits for the next token; assumed indexable by token ID
                along its first dimension.

        Returns:
            The (possibly modified) ``logits`` tensor.
        """
        # Update state incrementally (only process new tokens)
        self._update_state_incremental(output_token_ids)
        state = self._state

        # Handle structured outputs mode
        if self._is_structured_outputs:
            if not self._is_reasoning_request:
                # Non-reasoning request with structured outputs: no logit control
                return logits
            else:
                # Reasoning request with structured outputs:
                # Control logits only during reasoning phase
                if state not in self._REASONING_STATES:
                    # Reasoning finished, let structured outputs handle it
                    return logits

        if state == GenerationState.INITIAL:
            if self._is_reasoning_request:
                # Force: <|think|> only (reasoning request must start with think)
                think_logit = logits[THINK_TOKEN_ID].clone()
                logits.fill_(NEG_INF)
                logits[THINK_TOKEN_ID] = think_logit
            else:
                # Allow: <|content|>, <|tool_calls|> only
                content_logit = logits[CONTENT_TOKEN_ID].clone()
                tool_calls_logit = logits[TOOL_CALLS_TOKEN_ID].clone()
                logits.fill_(NEG_INF)
                logits[CONTENT_TOKEN_ID] = content_logit
                logits[TOOL_CALLS_TOKEN_ID] = tool_calls_logit

        elif state in (GenerationState.THINK_BEGIN, GenerationState.THINK_IN_PROGRESS):
            # Check if reasoning budget is exceeded (using incremental counter)
            if (
                self._reasoning_budget is not None
                and state == GenerationState.THINK_IN_PROGRESS
            ):
                if self._think_token_count >= self._reasoning_budget:
                    # Force <|end|> token to terminate reasoning
                    logits.fill_(NEG_INF)
                    logits[END_TOKEN_ID] = 0.0
                    return logits

            # Transform: <|flush|> -> <|end|>
            # Think must be followed by another message, so prevent early termination
            logits[END_TOKEN_ID] = torch.maximum(logits[END_TOKEN_ID], logits[FLUSH_TOKEN_ID])
            # Forbid all special tokens except <|end|>
            logits[_SPECIAL_EXCEPT_END] = NEG_INF

        elif state == GenerationState.THINK_END:
            # Force: <|begin|> only
            # Think must be followed by another message
            logits.fill_(NEG_INF)
            logits[BEGIN_TOKEN_ID] = 0.0

        elif state == GenerationState.NEW_MESSAGE_BEGIN:
            # Force: assistant token only
            logits.fill_(NEG_INF)
            logits[ASSISTANT_TOKEN_ID] = 0.0

        elif state == GenerationState.NEW_MESSAGE_ASSISTANT:
            # Allow: <|content|>, <|tool_calls|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_CONTENT_TOOLCALLS] = NEG_INF

        elif state in (GenerationState.CONTENT_BEGIN, GenerationState.CONTENT_IN_PROGRESS):
            # Transform: <|end|> -> <|flush|>
            # Content cannot be followed by another message
            logits[FLUSH_TOKEN_ID] = torch.maximum(logits[FLUSH_TOKEN_ID], logits[END_TOKEN_ID])
            # Forbid all special tokens except <|flush|>
            logits[_SPECIAL_EXCEPT_FLUSH] = NEG_INF

        elif state == GenerationState.TOOL_CALLS_BEGIN:
            # Force: <|tool_call:begin|> only
            tool_call_begin_logit = logits[TOOL_CALL_BEGIN_TOKEN_ID].clone()
            logits.fill_(NEG_INF)
            logits[TOOL_CALL_BEGIN_TOKEN_ID] = tool_call_begin_logit

        elif state == GenerationState.TOOL_CALL_BEGIN:
            # Allow: regular tokens only (ID generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_ID_IN_PROGRESS:
            # Check if tool call ID budget is exceeded (using incremental counter)
            if self._tool_call_id_token_count >= self._tool_call_id_budget:
                # Force <|tool_call:name|> token to terminate ID generation
                logits.fill_(NEG_INF)
                logits[TOOL_CALL_NAME_TOKEN_ID] = 0.0
                return logits

            # Allow: <|tool_call:name|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_NAME] = NEG_INF

        elif state == GenerationState.TOOL_CALL_NAME_BEGIN:
            # Allow: regular tokens only (function name generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_NAME_IN_PROGRESS:
            # Allow: <|tool_call:args|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_ARGS] = NEG_INF

        elif state == GenerationState.TOOL_CALL_ARGS_BEGIN:
            # Allow: regular tokens only (JSON args generation)
            # Forbid: all special tokens
            _forbid_all_special_tokens(logits)

        elif state == GenerationState.TOOL_CALL_ARGS_IN_PROGRESS:
            # Allow: <|tool_call:end|>, regular tokens
            # Forbid: all other special tokens
            logits[_SPECIAL_EXCEPT_TOOLCALL_END] = NEG_INF

        elif state == GenerationState.TOOL_CALL_END:
            # Allow: <|tool_call:begin|> (next tool call), <|calls|> (end)
            # Forbid: all other special tokens
            tool_call_begin_logit = logits[TOOL_CALL_BEGIN_TOKEN_ID].clone()
            calls_logit = logits[CALLS_TOKEN_ID].clone()
            logits.fill_(NEG_INF)
            logits[TOOL_CALL_BEGIN_TOKEN_ID] = tool_call_begin_logit
            logits[CALLS_TOKEN_ID] = calls_logit

        # CALLS state: no processing needed (EOS)

        return logits
660
+
661
class SolarOpenTemplateLogitsProcessor(AdapterLogitsProcessor):
    """
    Logits processor that enforces Solar Open chat template.

    This processor manages the generation flow according to the
    Solar Open chat template by tracking generation states. It creates a
    per-request :class:`SolarOpenTemplateEnforcer`, sized with a dynamic
    reasoning budget derived from the request's reasoning effort and
    ``max_tokens`` (overridable via the ``SOLAR_REASONING_BUDGET_*`` and
    ``SOLAR_TOOL_CALL_ID_BUDGET`` environment variables documented in the
    module header).
    """

    def __init__(
        self,
        vllm_config: "VllmConfig",
        device: torch.device,
        is_pin_memory: bool,
    ):
        super().__init__(vllm_config, device, is_pin_memory)

        # Dynamic reasoning budget settings for HIGH effort
        self._high_max = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_MAX", DEFAULT_REASONING_BUDGET_HIGH_MAX
        )
        self._high_min = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_MIN", DEFAULT_REASONING_BUDGET_HIGH_MIN
        )
        self._high_ratio = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_HIGH_RATIO", DEFAULT_REASONING_BUDGET_HIGH_RATIO
        )

        # Dynamic reasoning budget settings for MEDIUM effort
        self._medium_max = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_MAX", DEFAULT_REASONING_BUDGET_MEDIUM_MAX
        )
        self._medium_min = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_MIN", DEFAULT_REASONING_BUDGET_MEDIUM_MIN
        )
        self._medium_ratio = self._parse_env_int(
            "SOLAR_REASONING_BUDGET_MEDIUM_RATIO", DEFAULT_REASONING_BUDGET_MEDIUM_RATIO
        )

        # Max number of tokens allowed for a tool-call ID segment
        self._tool_call_id_budget: int = self._parse_env_int(
            "SOLAR_TOOL_CALL_ID_BUDGET", DEFAULT_TOOL_CALL_ID_BUDGET
        )

    @staticmethod
    def _parse_env_int(env_var: str, default: int) -> int:
        """Parse environment variable as integer, return default if not set or invalid."""
        value = os.environ.get(env_var)
        if value is None:
            return default
        try:
            return int(value)
        except ValueError:
            return default

    def _calculate_reasoning_budget(self, effort: str, max_tokens: int | None) -> int:
        """Calculate dynamic reasoning budget based on effort level and max_tokens.

        Priority (higher priority conditions are applied first):
        1. max_budget: Upper limit for reasoning tokens
        2. min_budget: Lower limit for reasoning tokens
        3. ratio: Percentage of max_tokens allocated for reasoning (e.g., 60 means 60%)

        budget = min(max_budget, max(min_budget, max_tokens * ratio / 100))

        Args:
            effort: Reasoning effort level ("high" or "medium"; unknown
                values fall back to the "high" settings).
            max_tokens: The request's token limit, or None when the request
                sets no explicit limit.

        Returns:
            The reasoning token budget.
        """
        if effort == "medium":
            max_budget = self._medium_max
            min_budget = self._medium_min
            ratio = self._medium_ratio
        else:
            # "high", and fallback for unknown effort levels
            max_budget = self._high_max
            min_budget = self._high_min
            ratio = self._high_ratio

        # Bug fix: SamplingParams.max_tokens may be None (no explicit limit),
        # which previously raised TypeError on the multiplication below.
        # With no limit, the ratio term is effectively unbounded, so the
        # budget formula collapses to max_budget.
        if max_tokens is None:
            return max_budget

        # Calculate ratio-based budget (ratio is percentage, e.g., 60 means 60%)
        ratio_budget = max_tokens * ratio // 100

        # Apply priority: max > min > ratio
        return min(max_budget, max(min_budget, ratio_budget))

    def is_argmax_invariant(self) -> bool:
        """This processor can change argmax result by forcing specific tokens."""
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> RequestLogitsProcessor | None:
        """Build the per-request template enforcer for ``params``."""
        reasoning_effort = params.reasoning_effort or DEFAULT_REASONING_EFFORT
        reasoning_budget = self._calculate_reasoning_budget(
            reasoning_effort, params.max_tokens
        )
        return SolarOpenTemplateEnforcer(
            is_reasoning_request=is_reasoning_request(params),
            is_structured_outputs=is_structured_outputs(params),
            reasoning_budget=reasoning_budget,
            tool_call_id_budget=self._tool_call_id_budget,
        )
763
+