WatNeru committed on
Commit
5cc4cfa
·
1 Parent(s): 52c76d4

不可視文字の処理

Browse files
Files changed (2) hide show
  1. package/ai.py +56 -1
  2. package/rust_adapter.py +29 -1
package/ai.py CHANGED
@@ -152,9 +152,64 @@ class AI:
152
 
153
  # トークンIDを文字列に変換
154
  items: List[Tuple[str, float]] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  for idx, prob in zip(top_indices, top_probs):
156
  token_id = idx.item()
157
- token = tokenizer.decode([token_id])
 
 
 
 
 
158
  prob_value = prob.item()
159
  items.append((token, float(prob_value)))
160
 
 
152
 
153
  # トークンIDを文字列に変換
154
  items: List[Tuple[str, float]] = []
155
+
156
+ # Llama 3.2の特殊トークンを定義
157
+ LLAMA_SPECIAL_TOKENS = [
158
+ "<|begin_of_text|>",
159
+ "<|end_of_text|>",
160
+ "<|eot_id|>",
161
+ "<|start_header_id|>",
162
+ "<|end_header_id|>",
163
+ ]
164
+
165
+ def _clean_text(text: str) -> str:
166
+ """制御文字・不可視文字・置換文字・特殊トークンを厳密に取り除く(Llama 3.2対応)"""
167
+ if not text:
168
+ return ""
169
+
170
+ # Llama 3.2の特殊トークンを除去
171
+ for special_token in LLAMA_SPECIAL_TOKENS:
172
+ text = text.replace(special_token, "")
173
+
174
+ # 制御文字(0x00-0x1F、0x7F-0x9F)を除去
175
+ # ただし、改行・タブ・復帰は許可
176
+ cleaned = []
177
+ for ch in text:
178
+ code = ord(ch)
179
+ # 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
180
+ if code in [0x09, 0x0A, 0x0D]:
181
+ cleaned.append(ch)
182
+ # 通常の印刷可能文字(0x20-0x7E、およびその他のUnicode印刷可能文字)
183
+ elif ch.isprintable():
184
+ # 置換文字(U+FFFD)を除去
185
+ if ch != "\uFFFD":
186
+ cleaned.append(ch)
187
+ # その他の制御文字や不可視文字は除去
188
+
189
+ result = "".join(cleaned)
190
+ # ゼロ幅文字を除去
191
+ result = result.replace("\u200B", "") # Zero-width space
192
+ result = result.replace("\u200C", "") # Zero-width non-joiner
193
+ result = result.replace("\u200D", "") # Zero-width joiner
194
+ result = result.replace("\uFEFF", "") # Zero-width no-break space
195
+ # その他の不可視文字(結合文字など)を除去
196
+ result = result.replace("\u200E", "") # Left-to-right mark
197
+ result = result.replace("\u200F", "") # Right-to-left mark
198
+ result = result.replace("\u202A", "") # Left-to-right embedding
199
+ result = result.replace("\u202B", "") # Right-to-left embedding
200
+ result = result.replace("\u202C", "") # Pop directional formatting
201
+ result = result.replace("\u202D", "") # Left-to-right override
202
+ result = result.replace("\u202E", "") # Right-to-left override
203
+ return result.strip()
204
+
205
  for idx, prob in zip(top_indices, top_probs):
206
  token_id = idx.item()
207
+ # Llama 3.2対応: skip_special_tokens=Trueで特殊トークンを除外
208
+ token = tokenizer.decode([token_id], skip_special_tokens=True, clean_up_tokenization_spaces=False)
209
+ token = _clean_text(token)
210
+ # 空文字列のトークンは除外
211
+ if not token:
212
+ continue
213
  prob_value = prob.item()
214
  items.append((token, float(prob_value)))
215
 
package/rust_adapter.py CHANGED
@@ -36,6 +36,34 @@ class RustAdapter:
36
  return cls._instance
37
 
38
  # ===== 公開API =====
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
40
  """
41
  単語ツリーを構築して、完成ピースを dict の配列で返す。
@@ -50,7 +78,7 @@ class RustAdapter:
50
  )
51
  # print(f"[RustAdapter] build_word_tree: {pieces}")
52
  return [
53
- {"text": p.get_full_word(), "probability": float(p.probability)}
54
  for p in pieces
55
  ]
56
 
 
36
  return cls._instance
37
 
38
  # ===== 公開API =====
39
+ def _clean_text(self, text: str) -> str:
40
+ """制御文字・不可視文字・置換文字を厳密に取り除く(最終出力用)"""
41
+ if not text:
42
+ return ""
43
+
44
+ # 制御文字(0x00-0x1F、0x7F-0x9F)を除去
45
+ # ただし、改行・タブ・復帰は許可
46
+ cleaned = []
47
+ for ch in text:
48
+ code = ord(ch)
49
+ # 許可する制御文字: 改行(0x0A), タブ(0x09), 復帰(0x0D)
50
+ if code in [0x09, 0x0A, 0x0D]:
51
+ cleaned.append(ch)
52
+ # 通常の印刷可能文字(0x20-0x7E、およびその他のUnicode印刷可能文字)
53
+ elif ch.isprintable():
54
+ # 置換文字(U+FFFD)を除去
55
+ if ch != "\uFFFD":
56
+ cleaned.append(ch)
57
+ # その他の制御文字や不可視文字は除去
58
+
59
+ result = "".join(cleaned)
60
+ # ゼロ幅文字を除去
61
+ result = result.replace("\u200B", "") # Zero-width space
62
+ result = result.replace("\u200C", "") # Zero-width non-joiner
63
+ result = result.replace("\u200D", "") # Zero-width joiner
64
+ result = result.replace("\uFEFF", "") # Zero-width no-break space
65
+ return result
66
+
67
  def build_word_tree(self, prompt_text: str, root_text: str = "", top_k: int = 5, max_depth: int = 10) -> List[Dict[str, Any]]:
68
  """
69
  単語ツリーを構築して、完成ピースを dict の配列で返す。
 
78
  )
79
  # print(f"[RustAdapter] build_word_tree: {pieces}")
80
  return [
81
+ {"text": self._clean_text(p.get_full_word()), "probability": float(p.probability)}
82
  for p in pieces
83
  ]
84