PyTorch
gpt2
achille-fusco committed on
Commit
6966297
·
verified ·
1 Parent(s): 7a6bbf6

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +18 -1
tokenizer.py CHANGED
@@ -34,9 +34,10 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
34
  # Resolve the directory where the artifacts live
35
  hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
36
  or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
 
37
 
38
  # Load preprocessing flags saved during training
39
- cfg_path = os.path.join(hf_dir, "preprocess_config.json")
40
  if not os.path.exists(cfg_path):
41
  raise FileNotFoundError(
42
  f"Missing preprocess_config.json in {hf_dir}. "
@@ -47,6 +48,22 @@ class SyllabicTokenizerWrapper(PreTrainedTokenizerFast):
47
 
48
  self.preprocessor = Preprocessor(**self.pre_cfg)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # --- core segmentation helpers ---
51
  def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
52
  return preprocess_and_segment_with_alignment(text, self.preprocessor)
 
34
  # Resolve the directory where the artifacts live
35
  hf_dir = kwargs.get("name_or_path", getattr(self, "name_or_path", None)) \
36
  or os.path.dirname(getattr(self, "tokenizer_file", "")) or "."
37
+ revision = kwargs.get("revision", None)
38
 
39
  # Load preprocessing flags saved during training
40
+ cfg_path = os.path.join(hf_dir, "preprocess_config.json", revision)
41
  if not os.path.exists(cfg_path):
42
  raise FileNotFoundError(
43
  f"Missing preprocess_config.json in {hf_dir}. "
 
48
 
49
  self.preprocessor = Preprocessor(**self.pre_cfg)
50
 
51
+ '''
52
+ cfg = {"lowercase": True, "space_punct": True}
53
+ ppath = _get_repo_file(repo_id_or_path, "paradigms.json", revision)
54
+ self.paradigms, self.paradigms_meta = _load_paradigms_any(ppath)
55
+
56
+ cpath = _get_repo_file(repo_id_or_path, "preprocess_config.json", revision)
57
+ cfg_path_exists = os.path.exists(cpath) # when local path returned
58
+ with open(cpath, "r", encoding="utf-8") as f:
59
+ cfg.update(json.load(f))
60
+
61
+ self.segmenter = ParadigmFinderSegmenter(
62
+ paradigms=self.paradigms,
63
+ lowercase=cfg.get("lowercase", True),
64
+ space_punct=cfg.get("space_punct", True),
65
+ )'''
66
+
67
  # --- core segmentation helpers ---
68
  def _segment_one(self, text: str) -> Tuple[str, List[Optional[int]]]:
69
  return preprocess_and_segment_with_alignment(text, self.preprocessor)