Update tokenizer.py
Browse files- tokenizer.py +18 -1
tokenizer.py
CHANGED
|
@@ -34,9 +34,10 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
|
|
| 34 |
# Resolve the directory where the artifacts live
|
| 35 |
hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
|
| 36 |
or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
|
|
|
|
| 37 |
|
| 38 |
# Load preprocessing flags saved during training
|
| 39 |
-
cfg_path = os.path.join(hf_dir, "preprocess_config.json")
|
| 40 |
if not os.path.exists(cfg_path):
|
| 41 |
raise FileNotFoundError(
|
| 42 |
f"Missing preprocess_config.json in {hf_dir}. "
|
|
@@ -47,6 +48,22 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
|
|
| 47 |
|
| 48 |
self.preprocessor = Preprocessor(**self.pre_cfg)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# --- core segmentation helpers ---
def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
    """Segment a single input string.

    Delegates to ``preprocess_and_segment_with_alignment`` with this
    tokenizer's configured preprocessor; per the annotation it yields the
    segmented text plus an alignment list (entries may be ``None``).
    """
    segmented_with_alignment = preprocess_and_segment_with_alignment(
        text, self.preprocessor
    )
    return segmented_with_alignment
|
|
|
|
| 34 |
# Resolve the directory where the artifacts live
|
| 35 |
hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
|
| 36 |
or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
|
revision = kwargs.get("revision", None)

# Load preprocessing flags saved during training.
# FIX: `revision` is a repo/git revision identifier, not a path segment.
# The previous `os.path.join(hf_dir, "preprocess_config.json", revision)`
# built a nonexistent path like ".../preprocess_config.json/<rev>" and
# raised TypeError when revision was None (its default). The revision is
# kept available for hub-download helpers; it must not enter this join.
cfg_path = os.path.join(hf_dir, "preprocess_config.json")
|
| 41 |
if not os.path.exists(cfg_path):
|
| 42 |
raise FileNotFoundError(
|
| 43 |
f"Missing preprocess_config.json in {hf_dir}. "
|
|
|
|
| 48 |
|
| 49 |
self.preprocessor = Preprocessor(**self.pre_cfg)
|
| 50 |
|
| 51 |
+
'''
|
| 52 |
+
cfg = {"lowercase": True, "space_punct": True}
|
| 53 |
+
ppath = _get_repo_file(repo_id_or_path, "paradigms.json", revision)
|
| 54 |
+
self.paradigms, self.paradigms_meta = _load_paradigms_any(ppath)
|
| 55 |
+
|
| 56 |
+
cpath = _get_repo_file(repo_id_or_path, "preprocess_config.json", revision)
|
| 57 |
+
cfg_path_exists = os.path.exists(cpath) # when local path returned
|
| 58 |
+
with open(cpath, "r", encoding="utf-8") as f:
|
| 59 |
+
cfg.update(json.load(f))
|
| 60 |
+
|
| 61 |
+
self.segmenter = ParadigmFinderSegmenter(
|
| 62 |
+
paradigms=self.paradigms,
|
| 63 |
+
lowercase=cfg.get("lowercase", True),
|
| 64 |
+
space_punct=cfg.get("space_punct", True),
|
| 65 |
+
)'''
|
| 66 |
+
|
| 67 |
# --- core segmentation helpers ---
def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
    """Segment one string via the module-level alignment-aware helper.

    Uses ``self.preprocessor`` (built from the saved preprocess config);
    returns the segmented text and its alignment list, as typed above.
    """
    out = preprocess_and_segment_with_alignment(text, self.preprocessor)
    return out
|