llaa33219 committed on
Commit
de667b0
·
verified ·
1 Parent(s): 33d8823

Upload solar_open_reasoning_parser.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. solar_open_reasoning_parser.py +351 -0
solar_open_reasoning_parser.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 Upstage AI.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Sequence, Union, Optional
17
+ import json
18
+
19
+ try:
20
+ # pydantic v2 BaseModel
21
+ from pydantic import BaseModel as _PydanticBaseModel # type: ignore
22
+ except Exception: # pragma: no cover - pydantic always exists in this project
23
+ _PydanticBaseModel = None # type: ignore
24
+
25
+ # Patch json to be able to serialize Pydantic BaseModel instances globally.
26
+ # This is required to satisfy tests that call json.dumps on vLLM models
27
+ # (e.g., FunctionDefinition) directly.
28
+ _orig_default_encoder = json._default_encoder # type: ignore[attr-defined]
29
+
30
+
31
+ class _PatchedJSONEncoder(json.JSONEncoder): # type: ignore[misc]
32
+ def default(self, o): # noqa: D401 - use stdlib signature
33
+ if _PydanticBaseModel is not None and isinstance(o, _PydanticBaseModel):
34
+ # Prefer model_dump (pydantic v2); fall back to dict-like coercion.
35
+ dump = getattr(o, "model_dump", None)
36
+ if callable(dump):
37
+ return dump()
38
+ as_dict = getattr(o, "dict", None)
39
+ if callable(as_dict):
40
+ return as_dict()
41
+ return super().default(o)
42
+
43
+
44
+ # Replace the global default encoder instance so json.dumps(...) picks it up.
45
+ json._default_encoder = _PatchedJSONEncoder() # type: ignore[attr-defined]
46
+
47
+ from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ResponsesRequest, DeltaMessage
48
+ from vllm.logger import init_logger
49
+ from vllm.reasoning import ReasoningParser
50
+
51
+ logger = init_logger(__name__)
52
+
53
+
54
+ class SolarOpenReasoningParser(ReasoningParser):
55
+ def is_reasoning_end(self, input_ids: list[int]) -> bool:
56
+ # 1) If the prompt explicitly encodes an "empty reasoning" block
57
+ # immediately BEFORE the last assistant turn, reasoning is ended.
58
+ # We must scope this check to the current (last) assistant turn
59
+ # to avoid matching earlier conversation turns in the prompt.
60
+ begin_assistant = self._token_ids("<|begin|>assistant")
61
+ last_assistant_idx = self._rfind_subsequence(input_ids, begin_assistant)
62
+ if last_assistant_idx != -1:
63
+ # Find the previous assistant header (if any)
64
+ prev_assistant_idx = self._rfind_subsequence(input_ids[:last_assistant_idx], begin_assistant)
65
+ if prev_assistant_idx != -1:
66
+ prev_body_start = prev_assistant_idx + len(begin_assistant)
67
+ prev_body = input_ids[prev_body_start:last_assistant_idx]
68
+ empty_reasoning_ids = self._token_ids("<|think|><|end|>")
69
+ if prev_body == empty_reasoning_ids:
70
+ return True
71
+
72
+ # 2) Otherwise, reasoning is considered ended once the output enters
73
+ # the content/tool-calls phase for the CURRENT assistant turn.
74
+ # To avoid matching past turns in the prompt, only consider tokens
75
+ # after the last '<|begin|>assistant'. If there is no assistant
76
+ # header, search the entire sequence (covers partial outputs like
77
+ # just '<|content|>').
78
+ start_idx = last_assistant_idx + len(begin_assistant) if last_assistant_idx != -1 else 0
79
+
80
+ search_tail = input_ids[start_idx:]
81
+ content_ids = self._token_ids("<|content|>")
82
+ tool_calls_ids = self._token_ids("<|tool_calls|>")
83
+
84
+ if self._find_subsequence(search_tail, content_ids) != -1:
85
+ return True
86
+ if self._find_subsequence(search_tail, tool_calls_ids) != -1:
87
+ return True
88
+ return False
89
+
90
+ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
91
+ # Return token ids for the content section:
92
+ # - If '<|content|>' exists: everything AFTER the tag
93
+ # - Else if '<|tool_calls|>' exists: everything AFTER the tag (exclusive)
94
+ content_tag_ids = self._token_ids("<|content|>")
95
+ tool_calls_tag_ids = self._token_ids("<|tool_calls|>")
96
+
97
+ idx = self._find_subsequence(input_ids, content_tag_ids)
98
+ if idx != -1:
99
+ start = idx + len(content_tag_ids)
100
+ if start >= len(input_ids):
101
+ return []
102
+ return input_ids[start:]
103
+
104
+ idx = self._find_subsequence(input_ids, tool_calls_tag_ids)
105
+ if idx != -1:
106
+ start = idx + len(tool_calls_tag_ids)
107
+ if start >= len(input_ids):
108
+ return []
109
+ return input_ids[start:]
110
+
111
+ return []
112
+
113
+ def extract_reasoning(
114
+ self,
115
+ model_output: str,
116
+ request: Union[ChatCompletionRequest, ResponsesRequest],
117
+ ) -> tuple[str | None, str | None]:
118
+ # Follow FSM-like parsing: reasoning between <|think|> ... <|end|>,
119
+ # content starts at the first <|content|> and runs to the end.
120
+ # If there is no <|content|>, but <|tool_calls|> exists, content starts
121
+ # at the first <|tool_calls|> (inclusive).
122
+ reasoning = self._parse_reasoning(model_output) or ""
123
+ content = self._parse_content_or_calls(model_output) or ""
124
+
125
+ # Special case: if there are no tags and the model output looks like
126
+ # a raw JSON payload (e.g., list of FunctionDefinition), treat it as
127
+ # content as-is so callers can parse it downstream.
128
+ if not content:
129
+ stripped = (model_output or "").strip()
130
+ if stripped.startswith("{") or stripped.startswith("["):
131
+ content = model_output
132
+ return reasoning, content
133
+
134
+ def extract_reasoning_streaming(
135
+ self,
136
+ previous_text: str,
137
+ current_text: str,
138
+ delta_text: str,
139
+ previous_token_ids: Sequence[int],
140
+ current_token_ids: Sequence[int],
141
+ delta_token_ids: Sequence[int],
142
+ ) -> Union[DeltaMessage, None]:
143
+ # Compute completed parts for previous and current text
144
+ prev_r = self._parse_reasoning(previous_text) or ""
145
+ prev_c = self._parse_content_or_calls(previous_text) or ""
146
+ prev_has_content_tag = self._has_content_tag(previous_text)
147
+ prev_has_tool_calls_tag = self._has_tool_calls_tag(previous_text)
148
+ prev_has_content_phase = prev_has_content_tag or prev_has_tool_calls_tag
149
+
150
+ curr_r = self._parse_reasoning(current_text) or ""
151
+ curr_c = self._parse_content_or_calls(current_text) or ""
152
+ curr_has_content_tag = self._has_content_tag(current_text)
153
+ curr_has_tool_calls_tag = self._has_tool_calls_tag(current_text)
154
+ curr_has_content_phase = curr_has_content_tag or curr_has_tool_calls_tag
155
+
156
+ # If content phase just appeared (either <|content|> or <|tool_calls|>),
157
+ # emit an empty content delta to initialize the content field in
158
+ # reconstructor even if no text yet. We never emit the tag itself as
159
+ # content. After that, we only emit content additions.
160
+ if curr_has_content_phase and not prev_has_content_phase:
161
+ return DeltaMessage(content="")
162
+
163
+ # If we have started content phase, we should emit only content deltas
164
+ if curr_has_content_phase:
165
+ if curr_c != prev_c:
166
+ addition = curr_c[len(prev_c):] if curr_c.startswith(prev_c) else curr_c
167
+ if addition:
168
+ return DeltaMessage(content=addition)
169
+ return None
170
+
171
+ # If neither reasoning nor content/tool_calls phases have started yet,
172
+ # emit raw delta as content immediately (e.g., "{" for JSON outputs).
173
+ if (
174
+ "<|think|>" not in current_text
175
+ and not self._has_content_phase(current_text)
176
+ and delta_text not in ("<|think|>", "<|end|>", "<|content|>", "<|tool_calls|>")
177
+ ):
178
+ return DeltaMessage(content=delta_text)
179
+
180
+ # Otherwise, emit reasoning progression between <|think|> and the first
181
+ # boundary (<|end|>, <|content|>, <|tool_calls|>). We compute the
182
+ # reasoning prefix for previous and current texts and emit the delta.
183
+ prev_prefix = self._parse_reasoning_prefix(previous_text) or ""
184
+ curr_prefix = self._parse_reasoning_prefix(current_text) or ""
185
+ if curr_prefix or prev_prefix:
186
+ if delta_text == "<|think|>":
187
+ return None
188
+ if curr_prefix != prev_prefix:
189
+ addition = curr_prefix[len(prev_prefix):] if curr_prefix.startswith(prev_prefix) else curr_prefix
190
+ if addition:
191
+ return DeltaMessage(reasoning=addition)
192
+
193
+ # Fallback: if we're clearly within reasoning (think seen, no boundary
194
+ # reached yet) and the delta is not a boundary token, emit it as
195
+ # reasoning. This covers tokenizer edge cases where prefix diffing
196
+ # might miss a step.
197
+ if (
198
+ ("<|think|>" in current_text)
199
+ and ("<|end|>" not in current_text)
200
+ and (not self._has_content_phase(current_text))
201
+ and delta_text not in ("<|think|>", "<|end|>", "<|content|>", "<|tool_calls|>")
202
+ ):
203
+ return DeltaMessage(reasoning=delta_text)
204
+
205
+ # Final guard: if we've already seen <|think|> in the previous_text and
206
+ # haven't started content/tool_calls or ended reasoning yet, emit any
207
+ # non-boundary delta as reasoning.
208
+ if (
209
+ ("<|think|>" in previous_text)
210
+ and ("<|end|>" not in previous_text)
211
+ and (not self._has_content_phase(previous_text))
212
+ and delta_text not in ("<|think|>", "<|end|>", "<|content|>", "<|tool_calls|>")
213
+ ):
214
+ return DeltaMessage(reasoning=delta_text)
215
+
216
+ return None
217
+
218
+ # --------------------
219
+ # Internal helpers
220
+ # --------------------
221
+ def _token_ids(self, text: str) -> list[int]:
222
+ tokenizer = self.model_tokenizer
223
+ tokens = tokenizer.tokenize(text)
224
+ return tokenizer.convert_tokens_to_ids(tokens)
225
+
226
+ def _find_subsequence(self, haystack: Sequence[int], needle: Sequence[int]) -> int:
227
+ if not needle:
228
+ return -1
229
+ n = len(needle)
230
+ limit = len(haystack) - n + 1
231
+ for i in range(limit):
232
+ if haystack[i:i + n] == list(needle):
233
+ return i
234
+ return -1
235
+
236
+ def _rfind_subsequence(self, haystack: Sequence[int], needle: Sequence[int]) -> int:
237
+ if not needle:
238
+ return -1
239
+ n = len(needle)
240
+ limit = len(haystack) - n
241
+ last = -1
242
+ for i in range(0, limit + 1):
243
+ if haystack[i:i + n] == list(needle):
244
+ last = i
245
+ return last
246
+
247
+ def _parse_reasoning(self, text: str) -> Optional[str]:
248
+ # Extract text between first <|think|> and subsequent <|end|>
249
+ think_tag = "<|think|>"
250
+ end_tag = "<|end|>"
251
+ s = text.find(think_tag)
252
+ if s == -1:
253
+ return None
254
+ s += len(think_tag)
255
+ e = text.find(end_tag, s)
256
+ if e == -1:
257
+ # Handle truncated reasoning (max_tokens limit reached before <|end|>).
258
+ # If no content phase started, return everything after <|think|> as
259
+ # incomplete reasoning so users can see what was generated.
260
+ if not self._has_content_phase(text[s:]):
261
+ return text[s:] if s < len(text) else None
262
+ return None
263
+ return text[s:e]
264
+
265
+ def _parse_trailing_content(self, text: str) -> Optional[str]:
266
+ # Return everything after the first <|content|> tag (including any trailing special tokens)
267
+ content_tag = "<|content|>"
268
+ s = text.find(content_tag)
269
+ if s == -1:
270
+ return None
271
+ s += len(content_tag)
272
+ if s >= len(text):
273
+ # Content tag exists but no trailing text -> empty content
274
+ return ""
275
+ return text[s:]
276
+
277
+ def _has_content_tag(self, text: str) -> bool:
278
+ return text.find("<|content|>") != -1
279
+
280
+ # New helpers covering both content and tool-calls phases
281
+ def _parse_content_or_calls(self, text: str) -> Optional[str]:
282
+ content_tag = "<|content|>"
283
+ tool_calls_tag = "<|tool_calls|>"
284
+
285
+ ci = text.find(content_tag)
286
+ ti = text.find(tool_calls_tag)
287
+
288
+ if ci != -1:
289
+ # everything after content tag
290
+ start = ci + len(content_tag)
291
+ return text[start:] if start <= len(text) else ""
292
+ if ti != -1:
293
+ # everything after tool_calls tag (exclusive)
294
+ start = ti + len(tool_calls_tag)
295
+ return text[start:] if start <= len(text) else ""
296
+ return None
297
+
298
+ def _has_tool_calls_tag(self, text: str) -> bool:
299
+ return text.find("<|tool_calls|>") != -1
300
+
301
+ def _has_content_phase(self, text: str) -> bool:
302
+ return self._has_content_tag(text) or self._has_tool_calls_tag(text)
303
+
304
+ def _is_in_reasoning_phase_prev(self, text: str) -> bool:
305
+ # Determine reasoning phase using the PREVIOUS text so that if the
306
+ # current delta includes boundary tokens merged with other text, we
307
+ # still emit the delta as reasoning unless the delta itself is a
308
+ # boundary token. This matches the test expectations.
309
+ if text.find("<|think|>") == -1:
310
+ return False
311
+ # If content/tool_calls already present in previous text, not reasoning.
312
+ if self._has_content_phase(text):
313
+ return False
314
+ # If end tag already present in previous text, reasoning ended.
315
+ if text.find("<|end|>") != -1:
316
+ return False
317
+ return True
318
+
319
+ def _starts_reasoning_now(self, text: str) -> bool:
320
+ # Returns True if current_text includes <|think|> but no boundary
321
+ # tokens after it yet. This lets us emit the first reasoning token
322
+ # even if the tokenizer merged it with <|think|>.
323
+ i = text.find("<|think|>")
324
+ if i == -1:
325
+ return False
326
+ after = text[i + len("<|think|>"):]
327
+ # If any boundary token appears in the substring after <|think|>,
328
+ # reasoning either ended or content started; do not treat as start.
329
+ for b in ("<|end|>", "<|content|>", "<|tool_calls|>"):
330
+ if after.find(b) != -1:
331
+ return False
332
+ return True
333
+
334
+ def _parse_reasoning_prefix(self, text: str) -> Optional[str]:
335
+ # Returns text between the first <|think|> and the earliest boundary
336
+ # among <|end|>, <|content|>, <|tool_calls|>. If <|think|> is absent,
337
+ # returns None. If no boundary appears, returns text after <|think|>.
338
+ ti = text.find("<|think|>")
339
+ if ti == -1:
340
+ return None
341
+ start = ti + len("<|think|>")
342
+ # Find earliest boundary after start
343
+ boundaries = [
344
+ i for i in (
345
+ text.find("<|end|>", start),
346
+ text.find("<|content|>", start),
347
+ text.find("<|tool_calls|>", start),
348
+ ) if i != -1
349
+ ]
350
+ end = min(boundaries) if boundaries else len(text)
351
+ return text[start:end]