gary-boon Claude Opus 4.6 (1M context) committed on
Commit
b5e4add
·
1 Parent(s): e375e45

Fix empty text and incorrect is_special for Mistral control tokens

Browse files

mistral-common's decode_token() returns "" for chat-template tokens
(<s>, [INST], [/INST], [SYSTEM_PROMPT], [/SYSTEM_PROMPT], tool markers).
Fall back to the HF tokenizer so every token arrives with a printable
string form. Also widen special_token_ids from {eos, bos, pad, unk} to
include all control-token IDs from mistral-common's Tekkenizer, fixing
is_special for chat-template delimiters.

Applied to both streaming and non-streaming endpoints, plus the
logit-candidate decode helper.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

backend/mistral_tokenizer.py CHANGED
@@ -8,7 +8,7 @@ produce correct token sequences for the model.
8
  """
9
 
10
  import logging
11
- from typing import List, Optional
12
 
13
  logger = logging.getLogger(__name__)
14
 
@@ -118,6 +118,59 @@ class MistralTokenizerWrapper:
118
  result = self.tokenizer.decode([token_id])
119
  return result
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  def create_mistral_tokenizer(model_name: str) -> Optional[MistralTokenizerWrapper]:
123
  """
 
8
  """
9
 
10
  import logging
11
+ from typing import List, Optional, Set
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
118
  result = self.tokenizer.decode([token_id])
119
  return result
120
 
121
+ def get_control_token_ids(self) -> Set[int]:
122
+ """
123
+ Return the full set of control/special token IDs known to the
124
+ underlying Tekkenizer (e.g. ``<s>``, ``</s>``, ``[INST]``, ``[/INST]``,
125
+ ``[SYSTEM_PROMPT]``, tool-call markers, etc.).
126
+
127
+ These IDs are needed to label tokens with an accurate ``is_special``
128
+ flag in the trace response. The HF tokenizer's ``all_special_ids``
129
+ misses Mistral-specific chat-template delimiters, so we source them
130
+ directly from mistral-common.
131
+
132
+ Tries multiple attribute paths for robustness across mistral-common
133
+ versions. Falls back to an empty set (with a warning) if none work —
134
+ callers should still have the HF ``all_special_ids`` as a baseline.
135
+ """
136
+ if not self._available:
137
+ return set()
138
+
139
+ try:
140
+ inner = self.tokenizer.instruct_tokenizer.tokenizer
141
+ except AttributeError:
142
+ logger.warning(
143
+ "MistralTokenizer has no instruct_tokenizer.tokenizer attribute"
144
+ )
145
+ return set()
146
+
147
+ # Preferred path: Tekkenizer reserves ranks [0, num_special_tokens)
148
+ # for control tokens, so we can materialise the full set cheaply.
149
+ num_special = getattr(inner, "num_special_tokens", None)
150
+ if isinstance(num_special, int) and num_special > 0:
151
+ return set(range(num_special))
152
+
153
+ # Fallback: try a couple of commonly-used attribute shapes.
154
+ for attr in ("_special_tokens", "special_tokens"):
155
+ specials = getattr(inner, attr, None)
156
+ if isinstance(specials, dict):
157
+ # dict[str, int] — values are token IDs
158
+ try:
159
+ return {int(v) for v in specials.values()}
160
+ except Exception:
161
+ pass
162
+ if isinstance(specials, (list, tuple, set)):
163
+ try:
164
+ return {int(v) for v in specials}
165
+ except Exception:
166
+ pass
167
+
168
+ logger.warning(
169
+ "Could not determine control token ids from MistralTokenizer; "
170
+ "is_special will be limited to HF tokenizer's all_special_ids"
171
+ )
172
+ return set()
173
+
174
 
175
  def create_mistral_tokenizer(model_name: str) -> Optional[MistralTokenizerWrapper]:
176
  """
backend/model_service.py CHANGED
@@ -1846,8 +1846,19 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1846
  prompt_token_ids = manager.mistral_tokenizer.encode_chat(system_prompt, prompt)
1847
  inputs = {"input_ids": torch.tensor([prompt_token_ids]).to(manager.device)}
1848
  prompt_length = len(prompt_token_ids)
1849
- # Decode tokens using MistralTokenizer for accuracy
1850
- prompt_tokens = [manager.mistral_tokenizer.decode_token(tid) for tid in prompt_token_ids]
 
 
 
 
 
 
 
 
 
 
 
1851
  logger.info(f"Used MistralTokenizer for Devstral: {prompt_length} tokens")
1852
  else:
1853
  # Standard HF tokenization for other models
@@ -2476,12 +2487,23 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
2476
  from .tokenizer_utils import TokenizerMetadata
2477
  token_metadata = TokenizerMetadata(manager.tokenizer)
2478
 
2479
- special_token_ids = {
 
 
 
 
 
 
 
2480
  manager.tokenizer.eos_token_id,
2481
  manager.tokenizer.bos_token_id,
2482
  manager.tokenizer.pad_token_id,
2483
- manager.tokenizer.unk_token_id
2484
- }
 
 
 
 
2485
 
2486
  def build_token_data(token_ids, token_texts, token_type):
2487
  """Build token data with full metadata for hover tooltips"""
@@ -2610,7 +2632,18 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2610
  prompt_token_ids = manager.mistral_tokenizer.encode_chat(system_prompt, prompt)
2611
  inputs = {"input_ids": torch.tensor([prompt_token_ids]).to(manager.device)}
2612
  prompt_length = len(prompt_token_ids)
2613
- prompt_tokens = [manager.mistral_tokenizer.decode_token(tid) for tid in prompt_token_ids]
 
 
 
 
 
 
 
 
 
 
 
2614
  else:
2615
  inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
2616
  prompt_length = inputs["input_ids"].shape[1]
@@ -2859,10 +2892,19 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2859
  top_raw_logits, top_raw_indices = torch.topk(raw_logits, k=min(top_n_display, len(raw_logits)))
2860
 
2861
  # Build raw logits entries (before temperature)
2862
- # Use correct tokenizer for Devstral vs other models
 
 
 
2863
  def decode_token(tid):
2864
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2865
- return manager.mistral_tokenizer.decode_token(tid)
 
 
 
 
 
 
2866
  else:
2867
  return manager.tokenizer.decode([tid], skip_special_tokens=False)
2868
 
@@ -3543,12 +3585,21 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
3543
  from .tokenizer_utils import TokenizerMetadata
3544
  token_metadata_builder = TokenizerMetadata(manager.tokenizer)
3545
 
3546
- special_token_ids_set = {
 
 
 
 
 
3547
  manager.tokenizer.eos_token_id,
3548
  manager.tokenizer.bos_token_id,
3549
  manager.tokenizer.pad_token_id,
3550
- manager.tokenizer.unk_token_id
3551
- }
 
 
 
 
3552
 
3553
  def build_token_data(token_ids, token_texts, token_type):
3554
  multi_split_flags = token_metadata_builder.is_multi_split_identifier(token_ids)
 
1846
  prompt_token_ids = manager.mistral_tokenizer.encode_chat(system_prompt, prompt)
1847
  inputs = {"input_ids": torch.tensor([prompt_token_ids]).to(manager.device)}
1848
  prompt_length = len(prompt_token_ids)
1849
+ # Decode tokens using MistralTokenizer for accuracy. mistral-common's
1850
+ # decode_token() returns "" for some control tokens (<s>, [INST],
1851
+ # [/INST], [SYSTEM_PROMPT], etc.). Fall back to the HF tokenizer so
1852
+ # every token arrives with a printable string form.
1853
+ def _decode_prompt_token(tid: int) -> str:
1854
+ text = manager.mistral_tokenizer.decode_token(tid)
1855
+ if text:
1856
+ return text
1857
+ try:
1858
+ return manager.tokenizer.decode([tid], skip_special_tokens=False) or ""
1859
+ except Exception:
1860
+ return ""
1861
+ prompt_tokens = [_decode_prompt_token(tid) for tid in prompt_token_ids]
1862
  logger.info(f"Used MistralTokenizer for Devstral: {prompt_length} tokens")
1863
  else:
1864
  # Standard HF tokenization for other models
 
2487
  from .tokenizer_utils import TokenizerMetadata
2488
  token_metadata = TokenizerMetadata(manager.tokenizer)
2489
 
2490
+ # Include every id the tokenizer considers a special / added token
2491
+ # (BOS, EOS, PAD, UNK, chat-template delimiters like [INST]/[/INST],
2492
+ # system-prompt markers, tool-call markers, etc.). The HF tokenizer's
2493
+ # all_special_ids misses Mistral-specific chat-template delimiters, so
2494
+ # we also union in the control-token ids from mistral-common when the
2495
+ # Mistral path is active.
2496
+ special_token_ids = set(getattr(manager.tokenizer, "all_special_ids", []) or [])
2497
+ for tok_id in (
2498
  manager.tokenizer.eos_token_id,
2499
  manager.tokenizer.bos_token_id,
2500
  manager.tokenizer.pad_token_id,
2501
+ manager.tokenizer.unk_token_id,
2502
+ ):
2503
+ if tok_id is not None:
2504
+ special_token_ids.add(tok_id)
2505
+ if manager.mistral_tokenizer is not None:
2506
+ special_token_ids |= manager.mistral_tokenizer.get_control_token_ids()
2507
 
2508
  def build_token_data(token_ids, token_texts, token_type):
2509
  """Build token data with full metadata for hover tooltips"""
 
2632
  prompt_token_ids = manager.mistral_tokenizer.encode_chat(system_prompt, prompt)
2633
  inputs = {"input_ids": torch.tensor([prompt_token_ids]).to(manager.device)}
2634
  prompt_length = len(prompt_token_ids)
2635
+ # mistral-common's decode_token() returns "" for control tokens
2636
+ # (<s>, [INST], [/INST], [SYSTEM_PROMPT], etc.). Fall back to
2637
+ # the HF tokenizer so every token has a printable string form.
2638
+ def _decode_prompt_token(tid: int) -> str:
2639
+ text = manager.mistral_tokenizer.decode_token(tid)
2640
+ if text:
2641
+ return text
2642
+ try:
2643
+ return manager.tokenizer.decode([tid], skip_special_tokens=False) or ""
2644
+ except Exception:
2645
+ return ""
2646
+ prompt_tokens = [_decode_prompt_token(tid) for tid in prompt_token_ids]
2647
  else:
2648
  inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
2649
  prompt_length = inputs["input_ids"].shape[1]
 
2892
  top_raw_logits, top_raw_indices = torch.topk(raw_logits, k=min(top_n_display, len(raw_logits)))
2893
 
2894
  # Build raw logits entries (before temperature)
2895
+ # Use correct tokenizer for Devstral vs other models.
2896
+ # On the Mistral path, fall back to the HF tokenizer when
2897
+ # mistral-common returns "" so logit candidates (e.g. EOS,
2898
+ # chat-template ids) always carry a printable string.
2899
  def decode_token(tid):
2900
  if manager.model_id == "devstral-small" and manager.mistral_tokenizer is not None:
2901
+ text = manager.mistral_tokenizer.decode_token(tid)
2902
+ if text:
2903
+ return text
2904
+ try:
2905
+ return manager.tokenizer.decode([tid], skip_special_tokens=False) or ""
2906
+ except Exception:
2907
+ return ""
2908
  else:
2909
  return manager.tokenizer.decode([tid], skip_special_tokens=False)
2910
 
 
3585
  from .tokenizer_utils import TokenizerMetadata
3586
  token_metadata_builder = TokenizerMetadata(manager.tokenizer)
3587
 
3588
+ # Include every id the tokenizer considers a special / added token
3589
+ # (BOS, EOS, PAD, UNK, chat-template delimiters like [INST]/[/INST],
3590
+ # system-prompt markers, tool-call markers, etc.). See the matching
3591
+ # construction in the non-streaming endpoint for rationale.
3592
+ special_token_ids_set = set(getattr(manager.tokenizer, "all_special_ids", []) or [])
3593
+ for tok_id in (
3594
  manager.tokenizer.eos_token_id,
3595
  manager.tokenizer.bos_token_id,
3596
  manager.tokenizer.pad_token_id,
3597
+ manager.tokenizer.unk_token_id,
3598
+ ):
3599
+ if tok_id is not None:
3600
+ special_token_ids_set.add(tok_id)
3601
+ if manager.mistral_tokenizer is not None:
3602
+ special_token_ids_set |= manager.mistral_tokenizer.get_control_token_ids()
3603
 
3604
  def build_token_data(token_ids, token_texts, token_type):
3605
  multi_split_flags = token_metadata_builder.is_multi_split_identifier(token_ids)