saracandu
/

stlenc

@@ -2,18 +2,30 @@ import json
 import os
 import torch
 from typing import Any, Dict, List, Optional, Tuple, Union
-from transformers import PreTrainedTokenizer
-from huggingface_hub import hf_hub_download
 class STLTokenizer(PreTrainedTokenizer):
-    def __init__(self, vocab_file="vocab.json", unk_token="unk", pad_token="pad",
-                 bos_token="/s", eos_token="s", model_max_length=512, **kwargs):
         current_dir = os.path.dirname(__file__)
         full_vocab_path = os.path.join(current_dir, vocab_file)
         if not os.path.exists(full_vocab_path):
-            full_vocab_path = hf_hub_download("saracandu/stldec_random", "vocab.json")
         with open(full_vocab_path, "r", encoding="utf-8") as f:
             self.vocab = json.load(f)
@@ -30,36 +42,40 @@ class STLTokenizer(PreTrainedTokenizer):
         )
     @property
-    def vocab_size(self):
         return len(self.vocab)
-    def get_vocab(self):
         return dict(self.vocab)
-    def _tokenize(self, text):
-        # La tua logica di tokenizzazione
         text = f'{self.bos_token} {text} {self.eos_token}'.replace(' ', '@')
-        tokens, i = [], 0
         while i < len(text):
             best_match = None
             for j in range(min(i + 50, len(text)), i, -1):
-                sub = text[i:j]
-                if sub in self.vocab:
-                    best_match = sub
                     break
             if best_match:
-                tokens.append(best_match); i += len(best_match)
             else:
-                tokens.append(self.unk_token); i += 1
         return tokens
-    def _convert_token_to_id(self, token):
         return self.vocab.get(token, self.vocab.get(self.unk_token))
-    def _convert_id_to_token(self, index):
         return self.id_to_token.get(index, self.unk_token)
-    def save_vocabulary(self, save_directory, filename_prefix=None):
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory)
@@ -69,4 +85,9 @@ class STLTokenizer(PreTrainedTokenizer):
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self.vocab, f, indent=2, ensure_ascii=False)
-        return (vocab_file,)

 import os
 import torch
 from typing import Any, Dict, List, Optional, Tuple, Union
+from transformers import PreTrainedTokenizer, AutoTokenizer
 class STLTokenizer(PreTrainedTokenizer):
+    model_type = "stl_encoder"
+    def __init__(
+        self,
+        vocab_file="vocab.json",
+        unk_token="unk",
+        pad_token="pad",
+        bos_token="/s",
+        eos_token="s",
+        model_max_length=512,
+        **kwargs
+    ):
         current_dir = os.path.dirname(__file__)
         full_vocab_path = os.path.join(current_dir, vocab_file)
         if not os.path.exists(full_vocab_path):
+            from huggingface_hub import hf_hub_download
+            try:
+                full_vocab_path = hf_hub_download("saracandu/stlenc", vocab_file)
+            except:
+                full_vocab_path = vocab_file
         with open(full_vocab_path, "r", encoding="utf-8") as f:
             self.vocab = json.load(f)
         )
     @property
+    def vocab_size(self) -> int:
         return len(self.vocab)
+    def get_vocab(self) -> Dict[str, int]:
         return dict(self.vocab)
+    def _tokenize(self, text: str) -> List[str]:
         text = f'{self.bos_token} {text} {self.eos_token}'.replace(' ', '@')
+        tokens = []
+        i = 0
         while i < len(text):
             best_match = None
             for j in range(min(i + 50, len(text)), i, -1):
+                subtoken = text[i:j]
+                if subtoken in self.vocab:
+                    best_match = subtoken
                     break
             if best_match:
+                tokens.append(best_match)
+                i += len(best_match)
             else:
+                tokens.append(self.unk_token)
+                i += 1
         return tokens
+    def _convert_token_to_id(self, token: str) -> int:
         return self.vocab.get(token, self.vocab.get(self.unk_token))
+    def _convert_id_to_token(self, index: int) -> str:
         return self.id_to_token.get(index, self.unk_token)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not os.path.isdir(save_directory):
             os.makedirs(save_directory)
         with open(vocab_file, "w", encoding="utf-8") as f:
             json.dump(self.vocab, f, indent=2, ensure_ascii=False)
+        return (vocab_file,)
+try:
+    AutoTokenizer.register("stl_encoder", STLTokenizer)
+except Exception:
+    pass