Spaces:

WatNeru
/

LLMView

Paused

App Files Files Community

WatNeru committed on Nov 23, 2025

Commit

5cc4cfa

1 Parent(s): 52c76d4

不可視文字の処理

Browse files

Files changed (2) hide show

package/ai.py +56 -1
package/rust_adapter.py +29 -1

package/ai.py CHANGED Viewed

@@ -152,9 +152,64 @@ class AI:
                 # トークンIDを文字列に変換
                 items: List[Tuple[str, float]] = []
                 for idx, prob in zip(top_indices, top_probs):
                     token_id = idx.item()
-                    token = tokenizer.decode([token_id])
                     prob_value = prob.item()
                     items.append((token, float(prob_value)))

                 # トークンIDを文字列に変換
                 items: List[Tuple[str, float]] = []
+                # Llama 3.2の特殊トークンを定義
+                LLAMA_SPECIAL_TOKENS = [
+                    "<|begin_of_text|>",
+                    "<|end_of_text|>",
+                    "<|eot_id|>",
+                    "<|start_header_id|>",
+                    "<|end_header_id|>",
+                ]
+                def _clean_text(text: str) -> str:
+                    """制御文字・不可視文字・置換文字・特殊トークンを厳密に取り除く（Llama 3.2対応）"""
+                    if not text:
+                        return ""
+                    # Llama 3.2の特殊トークンを除去
+                    for special_token in LLAMA_SPECIAL_TOKENS:
+                        text = text.replace(special_token, "")
+                    # 制御文字（0x00-0x1F、0x7F-0x9F）を除去
+                    # ただし、改行・タブ・復帰は許可
+                    cleaned = []
+                    for ch in text:
+                        code = ord(ch)
+                        # 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
+                        if code in [0x09, 0x0A, 0x0D]:
+                            cleaned.append(ch)
+                        # 通常の印刷可能文字（0x20-0x7E、およびその他のUnicode印刷可能文字）
+                        elif ch.isprintable():
+                            # 置換文字（U+FFFD）を除去
+                            if ch != "\uFFFD":
+                                cleaned.append(ch)
+                        # その他の制御文字や不可視文字は除去
+                    result = "".join(cleaned)
+                    # ゼロ幅文字を除去
+                    result = result.replace("\u200B", "")  # Zero-width space
+                    result = result.replace("\u200C", "")  # Zero-width non-joiner
+                    result = result.replace("\u200D", "")  # Zero-width joiner
+                    result = result.replace("\uFEFF", "")  # Zero-width no-break space
+                    # その他の不可視文字（結合文字など）を除去
+                    result = result.replace("\u200E", "")  # Left-to-right mark
+                    result = result.replace("\u200F", "")  # Right-to-left mark
+                    result = result.replace("\u202A", "")  # Left-to-right embedding
+                    result = result.replace("\u202B", "")  # Right-to-left embedding
+                    result = result.replace("\u202C", "")  # Pop directional formatting
+                    result = result.replace("\u202D", "")  # Left-to-right override
+                    result = result.replace("\u202E", "")  # Right-to-left override
+                    return result.strip()
                 for idx, prob in zip(top_indices, top_probs):
                     token_id = idx.item()
+                    # Llama 3.2対応: skip_special_tokens=Trueで特殊トークンを除外
+                    token = tokenizer.decode([token_id], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+                    token = _clean_text(token)
+                    # 空文字列のトークンは除外
+                    if not token:
+                        continue
                     prob_value = prob.item()
                     items.append((token, float(prob_value)))

package/rust_adapter.py CHANGED Viewed

@@ -36,6 +36,34 @@ class RustAdapter:
         return cls._instance
     # ===== 公開API =====
     def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
         """
         単語ツリーを構築して、完成ピースを dict の配列で返す。
@@ -50,7 +78,7 @@ class RustAdapter:
         )
         # print(f"[RustAdapter] build_word_tree: {pieces}")
         return [
-            {"text": p.get_full_word(), "probability": float(p.probability)}
             for p in pieces
         ]

         return cls._instance
     # ===== 公開API =====
+    def _clean_text(self, text: str) -> str:
+        """制御文字・不可視文字・置換文字を厳密に取り除く（最終出力用）"""
+        if not text:
+            return ""
+        # 制御文字（0x00-0x1F、0x7F-0x9F）を除去
+        # ただし、改行・タブ・復帰は許可
+        cleaned = []
+        for ch in text:
+            code = ord(ch)
+            # 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
+            if code in [0x09, 0x0A, 0x0D]:
+                cleaned.append(ch)
+            # 通常の印刷可能文字（0x20-0x7E、およびその他のUnicode印刷可能文字）
+            elif ch.isprintable():
+                # 置換文字（U+FFFD）を除去
+                if ch != "\uFFFD":
+                    cleaned.append(ch)
+            # その他の制御文字や不可視文字は除去
+        result = "".join(cleaned)
+        # ゼロ幅文字を除去
+        result = result.replace("\u200B", "")  # Zero-width space
+        result = result.replace("\u200C", "")  # Zero-width non-joiner
+        result = result.replace("\u200D", "")  # Zero-width joiner
+        result = result.replace("\uFEFF", "")  # Zero-width no-break space
+        return result
     def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
         """
         単語ツリーを構築して、完成ピースを dict の配列で返す。
         )
         # print(f"[RustAdapter] build_word_tree: {pieces}")
         return [
+            {"text": self._clean_text(p.get_full_word()), "probability": float(p.probability)}
             for p in pieces
         ]