Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
562053c
·
verified ·
1 Parent(s): 37aebb1

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_df_arc.py +15 -0
tokenization_df_arc.py CHANGED
@@ -213,7 +213,22 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
213
  text = self.phrase_helper.merge_phrases(text)
214
  return super().encode(text, *args, **kwargs)
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
 
217
  """
218
  Converts a sequence of tokens (string) in a single string.
219
  """
 
213
  text = self.phrase_helper.merge_phrases(text)
214
  return super().encode(text, *args, **kwargs)
215
 
216
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
    """
    Decode token ids to text by routing through ``convert_tokens_to_string``.

    Overrides the inherited ``decode`` so that the string is always built by
    this class's own ``convert_tokens_to_string``.

    Args:
        token_ids: A single id, a sequence of ids, or an array-like object
            exposing ``tolist()`` (e.g. a NumPy array/scalar or a tensor).
        skip_special_tokens: Forwarded to ``convert_ids_to_tokens``.
        clean_up_tokenization_spaces: Accepted for signature compatibility
            with the parent class; this override does not use it.
        **kwargs: Accepted for signature compatibility; not used.

    Returns:
        The string produced by ``convert_tokens_to_string`` for the tokens
        corresponding to ``token_ids``.
    """
    # Array-like inputs (including integer scalars that are not instances of
    # ``int``) are normalised to plain Python values first, so the
    # single-id check below also catches them.
    if hasattr(token_ids, "tolist"):
        token_ids = token_ids.tolist()

    # A lone id is wrapped so the token conversion always yields a list.
    if isinstance(token_ids, int):
        token_ids = [token_ids]

    tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

    # Build the final string with the class's custom joining logic.
    return self.convert_tokens_to_string(tokens)
229
+
230
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
231
+
232
  """
233
  Converts a sequence of tokens (string) in a single string.
234
  """