lamarr-llm-development
/

elbedding

@@ -103,28 +103,28 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
             ValueError: If repo_id is not provided when model_file_or_name is not a file.
             OSError: If the model file cannot be loaded or downloaded.
         """
-        if not os.path.isfile(model_file_or_name):
-            if repo_id is None:
-                raise ValueError("repo_id must be provided if model_file_or_name is not a local file")
-            try:
-                # List all files in the repo
-                repo_files = list_repo_files(repo_id)
-                # Find the tokenizer model file
-                tokenizer_files = [f for f in repo_files if f.endswith('.model')]
-                if not tokenizer_files:
-                    raise FileNotFoundError(f"No .model file found in repository {repo_id}")
-                # Use the first .model file found
-                model_file = tokenizer_files[0]
-                print(f"Found tokenizer model file: {model_file}")
-                # Download the file
-                model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
-                print(f"Downloaded tokenizer model to: {model_file_or_name}")
-            except Exception as e:
-                raise OSError(f"Failed to download tokenizer model: {str(e)}")
         try:
             return spm.SentencePieceProcessor(model_file=model_file_or_name)
@@ -182,10 +182,10 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
         if config_path is None:
             config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
-        if os.path.isfile(config_path):
-            self.tokenizer_config = self.load_json(Path(config_path))
-        else: # Load from repo
-            self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
     @property
     def vocab_size(self) -> int:

             ValueError: If repo_id is not provided when model_file_or_name is not a file.
             OSError: If the model file cannot be loaded or downloaded.
         """
+        # if not os.path.isfile(model_file_or_name):
+        #     if repo_id is None:
+        #         raise ValueError("repo_id must be provided if model_file_or_name is not a local file")
+        #     try:
+        #         # List all files in the repo
+        #         repo_files = list_repo_files(repo_id)
+        #         # Find the tokenizer model file
+        #         tokenizer_files = [f for f in repo_files if f.endswith('.model')]
+        #         if not tokenizer_files:
+        #             raise FileNotFoundError(f"No .model file found in repository {repo_id}")
+        #         # Use the first .model file found
+        #         model_file = tokenizer_files[0]
+        #         print(f"Found tokenizer model file: {model_file}")
+        #         # Download the file
+        #         model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
+        #         print(f"Downloaded tokenizer model to: {model_file_or_name}")
+        #     except Exception as e:
+        #         raise OSError(f"Failed to download tokenizer model: {str(e)}")
         try:
             return spm.SentencePieceProcessor(model_file=model_file_or_name)
         if config_path is None:
             config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
+        # if os.path.isfile(config_path):
+        self.tokenizer_config = self.load_json(Path(config_path))
+        # else: # Load from repo
+            # self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
     @property
     def vocab_size(self) -> int: