PyTorch · gpt2
achille-fusco committed · Commit a4b935f · verified · 1 Parent(s): dac8022

Update tokenizer.py

Files changed (1)
  1. tokenizer.py +36 -26
tokenizer.py CHANGED
@@ -122,7 +122,7 @@ class ParadigmFinderSegmenter:
     def segment_with_alignment(self, raw_text: str) -> Tuple[str, List[Optional[int]]]:
         """
         Preprocess + segment; return segmented text and a char map from segmented
-        text back to raw indices (None for inserted spaces).
+        text back to raw indices.
         """
         # 1) Preprocess with alignment
         pre_chars, pre_map = [], []
@@ -260,7 +260,7 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
     slow_tokenizer_class = None

     def __init__(self, *args, **kwargs):
-        # ensure fast tokenizer is loaded directly (no slow->fast conversion)
+
         name_or_path = kwargs.get("name_or_path", None)
         if name_or_path is None and len(args) > 0 and isinstance(args[0], str):
             name_or_path = args[0]
@@ -273,10 +273,6 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):

         super().__init__(*args, **kwargs)

-        # The folder path AutoTokenizer passes becomes available as:
-        # - kwargs.get("name_or_path") on first init
-        # - or self.name_or_path after init
-        # new:
         repo_id_or_path = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
             or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
         revision = kwargs.get("revision", None)
@@ -296,30 +292,44 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
             space_punct=cfg.get("space_punct", True),
         )

-    # ---- main entry point ----
     def __call__(self, text, **kwargs):
-        if isinstance(text, str):
-            seg, _ = self.segmenter.segment_with_alignment(text)
-            return super().__call__(seg, **kwargs)
-        try:
-            items = list(text)
-        except TypeError:
-            # single non-str item (e.g., tuple)
-            s = _coerce_to_str(text)
-            seg, _ = self.segmenter.segment_with_alignment(s)
-            return super().__call__(seg, **kwargs)
-
-        segs = []
-        for t in items:
-            s = _coerce_to_str(t)
-            seg, _ = self.segmenter.segment_with_alignment(s)
-            segs.append(seg)
-        return super().__call__(segs, **kwargs)
-
-    def tokenize(self, text, **kwargs):
+        # 1) fast path: already a plain string
         if isinstance(text, str):
             seg, _ = self.segmenter.segment_with_alignment(text)
+            return super().__call__(seg, **kwargs)
+
+        # 2) dicts: coerce to a single string (don't iterate keys!)
+        if isinstance(text, dict):
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            return super().__call__(seg, **kwargs)
+
+        # 3) sequences (list/tuple/etc.): coerce each element to a string
+        try:
+            items = list(text)
+        except TypeError:
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            return super().__call__(seg, **kwargs)
+
+        segs = []
+        for t in items:
+            s = _coerce_to_str(t)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            segs.append(seg)
+        return super().__call__(segs, **kwargs)
+
+
+    def tokenize(self, text, **kwargs):
+        if isinstance(text, str):
+            seg, _ = self.segmenter.segment_with_alignment(text)  # <-- fix here
+            return super().tokenize(seg, **kwargs)
+
+        if isinstance(text, dict):
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
             return super().tokenize(seg, **kwargs)
+
         try:
             items = list(text)
         except TypeError:
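Why the new dict branch (the "don't iterate keys!" comment above) is needed: iterating a dict in Python yields its keys, so the old code would have fallen through to `list(text)` and silently segmented the key names rather than the values. A two-line illustration of the pitfall:

```python
# list() on a dict yields its keys, not the text values:
assert list({"text": "a test sentence"}) == ["text"]
# so the old path would have tokenized the literal string "text"
```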
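For reference, the three dispatch paths can be exercised end-to-end once the repo loads its custom tokenizer class. A minimal sketch, assuming the model is fetched with `trust_remote_code=True` and writing `<repo-id>` as a placeholder for this model's Hub path (not stated in the diff):

```python
from transformers import AutoTokenizer

# "<repo-id>" is a placeholder; substitute this model's actual Hub path.
tok = AutoTokenizer.from_pretrained("<repo-id>", trust_remote_code=True)

# 1) plain string: segmented once, then encoded by the fast backend
enc = tok("a test sentence")

# 2) dict: coerced to a single string via _coerce_to_str
enc = tok({"text": "a test sentence"})

# 3) list/tuple: each element coerced and segmented independently
enc = tok(["first sentence", "second sentence"])

# tokenize() now mirrors the same str/dict/sequence handling
pieces = tok.tokenize("a test sentence")
```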