Jellyfish042 committed
Commit 01ede16 · Parent: ef158b3
Files changed (2)
  1. core/helpers.py +28 -0
  2. visualization/html_generator.py +7 -6
core/helpers.py CHANGED
@@ -233,6 +233,34 @@ class TokenizerBytesConverter:
 
         return result
 
+    def encode_to_ids_and_bytes(
+        self,
+        text: str,
+        add_special_tokens: bool = False,
+        strip_leading_space: bool = True,
+    ) -> List[tuple]:
+        """
+        Encode text to (token_id, token_bytes) pairs.
+
+        This is useful when the caller needs both the vocab token id and the exact
+        byte sequence used by the tokenizer for alignment/visualization.
+        """
+        token_ids = self._tokenizer.encode(text, add_special_tokens=add_special_tokens)
+
+        result = []
+        for idx, token_id in enumerate(token_ids):
+            token_bytes = self.token_to_bytes(token_id)
+            if token_bytes is None:
+                continue
+
+            # Match encode_to_bytes() behavior for SentencePiece ByteFallback tokenizers.
+            if idx == 0 and self._decoder_type == "sentencepiece" and strip_leading_space and token_bytes and token_bytes[0] == 0x20:
+                token_bytes = token_bytes[1:]
+
+            result.append((token_id, token_bytes))
+
+        return result
+
     def encode_to_flat_bytes(
         self,
         text: str,
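A minimal usage sketch of the new helper (the tokenizer loading, the model name, and the TokenizerBytesConverter constructor call below are assumptions for illustration; they are not part of this commit):

# Sketch only: consuming encode_to_ids_and_bytes() for byte-level alignment.
from transformers import AutoTokenizer
from core.helpers import TokenizerBytesConverter

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")  # illustrative model choice
converter = TokenizerBytesConverter(tokenizer)  # hypothetical constructor signature

for token_id, token_bytes in converter.encode_to_ids_and_bytes("Hello world"):
    # token_id is the vocab id; token_bytes is the exact byte sequence the
    # tokenizer produced for this token.
    print(token_id, bytes(token_bytes))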
visualization/html_generator.py CHANGED
@@ -78,16 +78,17 @@ def get_token_info_for_text(text: str) -> dict:
     # Get Qwen tokens with positions
     qwen_tokens = []
     byte_to_qwen = {}
-    qwen_bytes_list = qwen_tokenizer.encode_to_bytes(text)
+    # Keep both token id (vocab id) and decoded bytes so the tooltip can show true token ids.
+    qwen_id_and_bytes = qwen_tokenizer.encode_to_ids_and_bytes(text)
     byte_pos = 0
-    for idx, token_bytes in enumerate(qwen_bytes_list):
+    for idx, (token_id, token_bytes) in enumerate(qwen_id_and_bytes):
         start = byte_pos
         end = byte_pos + len(token_bytes)
         try:
             token_str = bytes(token_bytes).decode("utf-8")
         except UnicodeDecodeError:
             token_str = repr(bytes(token_bytes))
-        qwen_tokens.append((start, end, token_str))
+        qwen_tokens.append((start, end, token_id, token_str))
         byte_to_qwen[start] = idx
         byte_pos = end
 
@@ -109,7 +110,7 @@ def get_token_info_for_text(text: str) -> dict:
             token_str = token_bytes.decode("utf-8")
         except UnicodeDecodeError:
             token_str = repr(token_bytes)
-        rwkv_tokens.append((start, end, token_str))
+        rwkv_tokens.append((start, end, token_id, token_str))
         byte_to_rwkv[start] = idx
         byte_pos = end
 
@@ -249,9 +250,9 @@ def generate_comparison_html(
 
     def get_tokens_for_range(byte_start, byte_end, token_list):
         result = []
-        for idx, (t_start, t_end, t_str) in enumerate(token_list):
+        for t_start, t_end, token_id, t_str in token_list:
             if t_start < byte_end and t_end > byte_start:
-                result.append((idx, t_str))
+                result.append((token_id, t_str))
         return result
 
     # Build tokens based on common boundaries
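The rewritten get_tokens_for_range treats token spans as half-open byte intervals [t_start, t_end), so `t_start < byte_end and t_end > byte_start` is the standard interval-overlap test over the new 4-tuples. A self-contained sketch with invented sample spans and placeholder ids:

# Standalone sketch of the updated lookup over (start, end, token_id, token_str) tuples.
def get_tokens_for_range(byte_start, byte_end, token_list):
    result = []
    for t_start, t_end, token_id, t_str in token_list:
        if t_start < byte_end and t_end > byte_start:  # half-open interval overlap
            result.append((token_id, t_str))
    return result

# Invented sample spans; ids 101-103 are placeholders, not real vocab ids.
tokens = [(0, 5, 101, "Hello"), (5, 6, 102, " "), (6, 11, 103, "world")]
assert get_tokens_for_range(4, 7, tokens) == [(101, "Hello"), (102, " "), (103, "world")]
assert get_tokens_for_range(0, 3, tokens) == [(101, "Hello")]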