Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
8941194
·
verified ·
1 Parent(s): 6443890

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization_df_arc.py +14 -1
tokenization_df_arc.py CHANGED
@@ -218,6 +218,19 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
218
  text = self.normalizer_helper.normalize(text)
219
  text = self.morph_helper.segment_text(text)
220
  text = self.phrase_helper.merge_phrases(text)
221
- return super()._encode_plus(text, *args, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
 
 
218
  text = self.normalizer_helper.normalize(text)
219
  text = self.morph_helper.segment_text(text)
220
  text = self.phrase_helper.merge_phrases(text)
221
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
222
+ """
223
+ Converts a sequence of tokens (strings) into a single string.
224
+ """
225
+ # Join with simple space (since we used Whitespace pre-tokenizer)
226
+ text = " ".join(tokens)
227
+
228
+ # Remove morphological markers (underscores)
229
+ # This strips every underscore in the joined text, not only marker underscores.
230
+ # Note: This is an approximation.
231
+ text = text.replace("_", "")
232
+
233
+ return text
234
+
235
 
236