Update tokenizer.py
Browse files- tokenizer.py +18 -1
tokenizer.py
CHANGED
|
@@ -34,9 +34,10 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
|
|
| 34 |
# Resolve the directory where the artifacts live
|
| 35 |
hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
|
| 36 |
or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
|
|
|
|
| 37 |
|
| 38 |
# Load preprocessing flags saved during training
|
| 39 |
-
cfg_path = os.path.join(hf_dir, "preprocess_config.json")
|
| 40 |
if not os.path.exists(cfg_path):
|
| 41 |
raise FileNotFoundError(
|
| 42 |
f"Missing preprocess_config.json in {hf_dir}. "
|
|
@@ -47,6 +48,22 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
|
|
| 47 |
|
| 48 |
self.preprocessor = Preprocessor(**self.pre_cfg)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# --- core segmentation helpers ---
def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
    """Segment a single input string.

    Delegates to ``preprocess_and_segment_with_alignment`` with this
    tokenizer's configured preprocessor; per the annotation it yields the
    segmented text plus an alignment list (entries may be ``None``).
    """
    segmented_with_alignment = preprocess_and_segment_with_alignment(
        text, self.preprocessor
    )
    return segmented_with_alignment
|
|
|
|
| 34 |
# Resolve the directory where the artifacts live
|
| 35 |
hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
|
| 36 |
or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
|
revision = kwargs.get("revision", None)

# Load preprocessing flags saved during training.
# FIX: `revision` is a repo/git revision identifier, not a path segment.
# The previous `os.path.join(hf_dir, "preprocess_config.json", revision)`
# built a nonexistent path like ".../preprocess_config.json/<rev>" and
# raised TypeError when revision was None (its default). The revision is
# kept available for hub-download helpers; it must not enter this join.
cfg_path = os.path.join(hf_dir, "preprocess_config.json")
|
| 41 |
if not os.path.exists(cfg_path):
|
| 42 |
raise FileNotFoundError(
|
| 43 |
f"Missing preprocess_config.json in {hf_dir}. "
|
|
|
|
| 48 |
|
| 49 |
self.preprocessor = Preprocessor(**self.pre_cfg)
|
| 50 |
|
| 51 |
+
'''
|
| 52 |
+
cfg = {"lowercase": True, "space_punct": True}
|
| 53 |
+
ppath = _get_repo_file(repo_id_or_path, "paradigms.json", revision)
|
| 54 |
+
self.paradigms, self.paradigms_meta = _load_paradigms_any(ppath)
|
| 55 |
+
|
| 56 |
+
cpath = _get_repo_file(repo_id_or_path, "preprocess_config.json", revision)
|
| 57 |
+
cfg_path_exists = os.path.exists(cpath) # when local path returned
|
| 58 |
+
with open(cpath, "r", encoding="utf-8") as f:
|
| 59 |
+
cfg.update(json.load(f))
|
| 60 |
+
|
| 61 |
+
self.segmenter = ParadigmFinderSegmenter(
|
| 62 |
+
paradigms=self.paradigms,
|
| 63 |
+
lowercase=cfg.get("lowercase", True),
|
| 64 |
+
space_punct=cfg.get("space_punct", True),
|
| 65 |
+
)'''
|
| 66 |
+
|
| 67 |
# --- core segmentation helpers ---
def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
    """Segment one string via the module-level alignment-aware helper.

    Uses ``self.preprocessor`` (built from the saved preprocess config);
    returns the segmented text and its alignment list, as typed above.
    """
    out = preprocess_and_segment_with_alignment(text, self.preprocessor)
    return out
|