Update tokenizer.py
tokenizer.py CHANGED (+36 -26)
@@ -122,7 +122,7 @@ class ParadigmFinderSegmenter:
     def segment_with_alignment(self, raw_text: str) -> Tuple[str, List[Optional[int]]]:
         """
         Preprocess + segment; return segmented text and a char map from segmented
-        text back to raw indices
+        text back to raw indices.
         """
         # 1) Preprocess with alignment
         pre_chars, pre_map = [], []
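The List[Optional[int]] half of the return value is an alignment map: entry i holds the raw-text index that character i of the segmented text came from, with None presumably marking characters the segmenter inserted (such as segmentation spaces). A minimal illustration of that contract, with made-up input and output, since the segmenter's actual rules are not part of this diff:

    # Hypothetical example of the alignment contract, not the real segmenter:
    raw = "abcd"
    seg = "ab cd"                    # a separator was inserted between "ab" and "cd"
    char_map = [0, 1, None, 2, 3]    # seg[i] -> raw index; None = inserted char

    # Recovering the raw span behind the segmented slice seg[3:5] ("cd"):
    hits = [m for m in char_map[3:5] if m is not None]
    assert (hits[0], hits[-1] + 1) == (2, 4)   # raw indices [2, 4)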
@@ -260,7 +260,7 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
     slow_tokenizer_class = None

     def __init__(self, *args, **kwargs):
-
+
         name_or_path = kwargs.get("name_or_path", None)
         if name_or_path is None and len(args) > 0 and isinstance(args[0], str):
             name_or_path = args[0]
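The keyword/positional fallback above covers callers that pass the tokenizer location either way. Restated as a standalone function to make the precedence explicit (a sketch, not code from this file):

    def resolve_name_or_path(args, kwargs):
        # keyword wins; otherwise accept a string first positional argument
        name_or_path = kwargs.get("name_or_path", None)
        if name_or_path is None and len(args) > 0 and isinstance(args[0], str):
            name_or_path = args[0]
        return name_or_path

    assert resolve_name_or_path(("my/tok-dir",), {}) == "my/tok-dir"
    assert resolve_name_or_path((), {"name_or_path": "my/tok-dir"}) == "my/tok-dir"
    assert resolve_name_or_path((), {}) is None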
@@ -273,10 +273,6 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):

         super().__init__(*args, **kwargs)

-        # The folder path AutoTokenizer passes becomes available as:
-        # - kwargs.get("name_or_path") on first init
-        # - or self.name_or_path after init
-        # new:
         repo_id_or_path = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
             or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
         revision = kwargs.get("revision", None)
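With the explanatory comments removed, the surviving expression is the whole story: resolve the config location from the name_or_path kwarg first, then the name_or_path attribute the base class sets, then the directory of tokenizer_file, and finally fall back to the current directory. A hedged loading sketch; it assumes the wrapper is wired up for AutoTokenizer (e.g. through an auto_map entry in tokenizer_config.json, which this diff doesn't show), and the path is a placeholder:

    from transformers import AutoTokenizer

    # Placeholder path; trust_remote_code is needed if the wrapper class
    # ships with the repo rather than with the transformers library.
    tok = AutoTokenizer.from_pretrained("path/to/paradigm-tokenizer",
                                        trust_remote_code=True)
    # The folder passed above is what __init__ reads back as name_or_path
    # to locate the segmenter's config.
    print(tok.name_or_path)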
@@ -296,30 +292,44 @@ class ParadigmTokenizerWrapper(PreTrainedTokenizerFast):
             space_punct=cfg.get("space_punct", True),
         )

-    # ---- main entry point ----
     def __call__(self, text, **kwargs):
-
-        seg, _ = self.segmenter.segment_with_alignment(text)
-        return super().__call__(seg, **kwargs)
-        try:
-            items = list(text)
-        except TypeError:
-            # single non-str item (e.g., tuple)
-            s = _coerce_to_str(text)
-            seg, _ = self.segmenter.segment_with_alignment(s)
-            return super().__call__(seg, **kwargs)
-
-        segs = []
-        for t in items:
-            s = _coerce_to_str(t)
-            seg, _ = self.segmenter.segment_with_alignment(s)
-            segs.append(seg)
-        return super().__call__(segs, **kwargs)
-
-    def tokenize(self, text, **kwargs):
+        # 1) fast path: already a plain string
         if isinstance(text, str):
             seg, _ = self.segmenter.segment_with_alignment(text)
+            return super().__call__(seg, **kwargs)
+
+        # 2) dicts: coerce to a single string (don't iterate keys!)
+        if isinstance(text, dict):
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            return super().__call__(seg, **kwargs)
+
+        # 3) sequences (list/tuple/etc.): coerce each element to a string
+        try:
+            items = list(text)
+        except TypeError:
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            return super().__call__(seg, **kwargs)
+
+        segs = []
+        for t in items:
+            s = _coerce_to_str(t)
+            seg, _ = self.segmenter.segment_with_alignment(s)
+            segs.append(seg)
+        return super().__call__(segs, **kwargs)
+
+
+    def tokenize(self, text, **kwargs):
+        if isinstance(text, str):
+            seg, _ = self.segmenter.segment_with_alignment(text)  # <-- fix here
+            return super().tokenize(seg, **kwargs)
+
+        if isinstance(text, dict):
+            s = _coerce_to_str(text)
+            seg, _ = self.segmenter.segment_with_alignment(s)
             return super().tokenize(seg, **kwargs)
+
         try:
             items = list(text)
         except TypeError:
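The rewritten __call__ fixes the old version's dead code (the unconditional return on its second line made everything after it unreachable) and dispatches on input shape before delegating to PreTrainedTokenizerFast. A usage sketch under the assumption that tok is a loaded ParadigmTokenizerWrapper; exact outputs depend on the vocabulary:

    # 1) plain string: segmented once, encoded once
    enc = tok("an unsegmented sentence")

    # 2) dict: coerced to a single string via _coerce_to_str; without the
    #    isinstance(text, dict) branch, list(text) would have iterated the keys
    enc = tok({"text": "an unsegmented sentence"})

    # 3) sequence: each element segmented independently, encoded as a batch
    batch = tok(["first text", "second text"], padding=True)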
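_coerce_to_str is used throughout but defined outside the hunks shown here. For readers without the full file, a plausible stand-in that matches how the call sites use it; this is an assumption, not the repository's implementation:

    def _coerce_to_str(x) -> str:
        # Hypothetical helper: flatten any supported input to one string.
        if isinstance(x, str):
            return x
        if isinstance(x, dict):
            return " ".join(str(v) for v in x.values())
        if isinstance(x, (list, tuple)):
            return " ".join(_coerce_to_str(t) for t in x)
        return str(x)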