不可視文字の処理
Browse files- package/ai.py +56 -1
- package/rust_adapter.py +29 -1
package/ai.py
CHANGED
|
@@ -152,9 +152,64 @@ class AI:
|
|
| 152 |
|
| 153 |
# トークンIDを文字列に変換
|
| 154 |
items: List[Tuple[str, float]] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
for idx, prob in zip(top_indices, top_probs):
|
| 156 |
token_id = idx.item()
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
prob_value = prob.item()
|
| 159 |
items.append((token, float(prob_value)))
|
| 160 |
|
|
|
|
| 152 |
|
| 153 |
# トークンIDを文字列に変換
|
| 154 |
items: List[Tuple[str, float]] = []
|
| 155 |
+
|
| 156 |
+
# Llama 3.2の特殊トークンを定義
|
| 157 |
+
LLAMA_SPECIAL_TOKENS = [
|
| 158 |
+
"<|begin_of_text|>",
|
| 159 |
+
"<|end_of_text|>",
|
| 160 |
+
"<|eot_id|>",
|
| 161 |
+
"<|start_header_id|>",
|
| 162 |
+
"<|end_header_id|>",
|
| 163 |
+
]
|
| 164 |
+
|
| 165 |
+
def _clean_text(text: str) -> str:
|
| 166 |
+
"""制御文字・不可視文字・置換文字・特殊トークンを厳密に取り除く(Llama 3.2対応)"""
|
| 167 |
+
if not text:
|
| 168 |
+
return ""
|
| 169 |
+
|
| 170 |
+
# Llama 3.2の特殊トークンを除去
|
| 171 |
+
for special_token in LLAMA_SPECIAL_TOKENS:
|
| 172 |
+
text = text.replace(special_token, "")
|
| 173 |
+
|
| 174 |
+
# 制御文字(0x00-0x1F、0x7F-0x9F)を除去
|
| 175 |
+
# ただし、改行・タブ・復帰は許可
|
| 176 |
+
cleaned = []
|
| 177 |
+
for ch in text:
|
| 178 |
+
code = ord(ch)
|
| 179 |
+
# 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
|
| 180 |
+
if code in [0x09, 0x0A, 0x0D]:
|
| 181 |
+
cleaned.append(ch)
|
| 182 |
+
# 通常の印刷可能文字(0x20-0x7E、およびその他のUnicode印刷可能文字)
|
| 183 |
+
elif ch.isprintable():
|
| 184 |
+
# 置換文字(U+FFFD)を除去
|
| 185 |
+
if ch != "\uFFFD":
|
| 186 |
+
cleaned.append(ch)
|
| 187 |
+
# その他の制御文字や不可視文字は除去
|
| 188 |
+
|
| 189 |
+
result = "".join(cleaned)
|
| 190 |
+
# ゼロ幅文字を除去
|
| 191 |
+
result = result.replace("\u200B", "") # Zero-width space
|
| 192 |
+
result = result.replace("\u200C", "") # Zero-width non-joiner
|
| 193 |
+
result = result.replace("\u200D", "") # Zero-width joiner
|
| 194 |
+
result = result.replace("\uFEFF", "") # Zero-width no-break space
|
| 195 |
+
# その他の不可視文字(結合文字など)を除去
|
| 196 |
+
result = result.replace("\u200E", "") # Left-to-right mark
|
| 197 |
+
result = result.replace("\u200F", "") # Right-to-left mark
|
| 198 |
+
result = result.replace("\u202A", "") # Left-to-right embedding
|
| 199 |
+
result = result.replace("\u202B", "") # Right-to-left embedding
|
| 200 |
+
result = result.replace("\u202C", "") # Pop directional formatting
|
| 201 |
+
result = result.replace("\u202D", "") # Left-to-right override
|
| 202 |
+
result = result.replace("\u202E", "") # Right-to-left override
|
| 203 |
+
return result.strip()
|
| 204 |
+
|
| 205 |
for idx, prob in zip(top_indices, top_probs):
|
| 206 |
token_id = idx.item()
|
| 207 |
+
# Llama 3.2対応: skip_special_tokens=Trueで特殊トークンを除外
|
| 208 |
+
token = tokenizer.decode([token_id], skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 209 |
+
token = _clean_text(token)
|
| 210 |
+
# 空文字列のトークンは除外
|
| 211 |
+
if not token:
|
| 212 |
+
continue
|
| 213 |
prob_value = prob.item()
|
| 214 |
items.append((token, float(prob_value)))
|
| 215 |
|
package/rust_adapter.py
CHANGED
|
@@ -36,6 +36,34 @@ class RustAdapter:
|
|
| 36 |
return cls._instance
|
| 37 |
|
| 38 |
# ===== 公開API =====
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
|
| 40 |
"""
|
| 41 |
単語ツリーを構築して、完成ピースを dict の配列で返す。
|
|
@@ -50,7 +78,7 @@ class RustAdapter:
|
|
| 50 |
)
|
| 51 |
# print(f"[RustAdapter] build_word_tree: {pieces}")
|
| 52 |
return [
|
| 53 |
-
{"text": p.get_full_word(), "probability": float(p.probability)}
|
| 54 |
for p in pieces
|
| 55 |
]
|
| 56 |
|
|
|
|
| 36 |
return cls._instance
|
| 37 |
|
| 38 |
# ===== 公開API =====
|
| 39 |
+
def _clean_text(self, text: str) -> str:
|
| 40 |
+
"""制御文字・不可視文字・置換文字を厳密に取り除く(最終出力用)"""
|
| 41 |
+
if not text:
|
| 42 |
+
return ""
|
| 43 |
+
|
| 44 |
+
# 制御文字(0x00-0x1F、0x7F-0x9F)を除去
|
| 45 |
+
# ただし、改行・タブ・復帰は許可
|
| 46 |
+
cleaned = []
|
| 47 |
+
for ch in text:
|
| 48 |
+
code = ord(ch)
|
| 49 |
+
# 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
|
| 50 |
+
if code in [0x09, 0x0A, 0x0D]:
|
| 51 |
+
cleaned.append(ch)
|
| 52 |
+
# 通常の印刷可能文字(0x20-0x7E、およびその他のUnicode印刷可能文字)
|
| 53 |
+
elif ch.isprintable():
|
| 54 |
+
# 置換文字(U+FFFD)を除去
|
| 55 |
+
if ch != "\uFFFD":
|
| 56 |
+
cleaned.append(ch)
|
| 57 |
+
# その他の制御文字や不可視文字は除去
|
| 58 |
+
|
| 59 |
+
result = "".join(cleaned)
|
| 60 |
+
# ゼロ幅文字を除去
|
| 61 |
+
result = result.replace("\u200B", "") # Zero-width space
|
| 62 |
+
result = result.replace("\u200C", "") # Zero-width non-joiner
|
| 63 |
+
result = result.replace("\u200D", "") # Zero-width joiner
|
| 64 |
+
result = result.replace("\uFEFF", "") # Zero-width no-break space
|
| 65 |
+
return result
|
| 66 |
+
|
| 67 |
def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
|
| 68 |
"""
|
| 69 |
単語ツリーを構築して、完成ピースを dict の配列で返す。
|
|
|
|
| 78 |
)
|
| 79 |
# print(f"[RustAdapter] build_word_tree: {pieces}")
|
| 80 |
return [
|
| 81 |
+
{"text": self._clean_text(p.get_full_word()), "probability": float(p.probability)}
|
| 82 |
for p in pieces
|
| 83 |
]
|
| 84 |
|