vijusudhi committed on
Commit
db67351
·
verified ·
1 Parent(s): 80e5611

Update gptx_tokenizer.py

Browse files
Files changed (1) hide show
  1. gptx_tokenizer.py +22 -22
gptx_tokenizer.py CHANGED
@@ -103,28 +103,28 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
103
  ValueError: If repo_id is not provided when model_file_or_name is not a file.
104
  OSError: If the model file cannot be loaded or downloaded.
105
  """
106
- # if not os.path.isfile(model_file_or_name):
107
- # if repo_id is None:
108
- # raise ValueError("repo_id must be provided if model_file_or_name is not a local file")
109
 
110
- # try:
111
- # # List all files in the repo
112
- # repo_files = list_repo_files(repo_id)
113
 
114
- # # Find the tokenizer model file
115
- # tokenizer_files = [f for f in repo_files if f.endswith('.model')]
116
- # if not tokenizer_files:
117
- # raise FileNotFoundError(f"No .model file found in repository {repo_id}")
118
 
119
- # # Use the first .model file found
120
- # model_file = tokenizer_files[0]
121
- # print(f"Found tokenizer model file: {model_file}")
122
 
123
- # # Download the file
124
- # model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
125
- # print(f"Downloaded tokenizer model to: {model_file_or_name}")
126
- # except Exception as e:
127
- # raise OSError(f"Failed to download tokenizer model: {str(e)}")
128
 
129
  try:
130
  return spm.SentencePieceProcessor(model_file=model_file_or_name)
@@ -182,10 +182,10 @@ class HFGPTXTokenizer(PreTrainedTokenizer):
182
  if config_path is None:
183
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
184
 
185
- # if os.path.isfile(config_path):
186
- self.tokenizer_config = self.load_json(Path(config_path))
187
- # else: # Load from repo
188
- # self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
189
 
190
  @property
191
  def vocab_size(self) -> int:
 
103
  ValueError: If repo_id is not provided when model_file_or_name is not a file.
104
  OSError: If the model file cannot be loaded or downloaded.
105
  """
106
+ if not os.path.isfile(model_file_or_name):
107
+ if repo_id is None:
108
+ raise ValueError("repo_id must be provided if model_file_or_name is not a local file")
109
 
110
+ try:
111
+ # List all files in the repo
112
+ repo_files = list_repo_files(repo_id)
113
 
114
+ # Find the tokenizer model file
115
+ tokenizer_files = [f for f in repo_files if f.endswith('.model')]
116
+ if not tokenizer_files:
117
+ raise FileNotFoundError(f"No .model file found in repository {repo_id}")
118
 
119
+ # Use the first .model file found
120
+ model_file = tokenizer_files[0]
121
+ print(f"Found tokenizer model file: {model_file}")
122
 
123
+ # Download the file
124
+ model_file_or_name = hf_hub_download(repo_id=repo_id, filename=model_file)
125
+ print(f"Downloaded tokenizer model to: {model_file_or_name}")
126
+ except Exception as e:
127
+ raise OSError(f"Failed to download tokenizer model: {str(e)}")
128
 
129
  try:
130
  return spm.SentencePieceProcessor(model_file=model_file_or_name)
 
182
  if config_path is None:
183
  config_path = str(Path(cp_path) / TOKENIZER_CONFIG_FILE)
184
 
185
+ if os.path.isfile(config_path):
186
+ self.tokenizer_config = self.load_json(Path(config_path))
187
+ else: # Load from repo
188
+ self.tokenizer_config = self.load_json(Path(self.find_tokenizer_config(Path(config_path), repo_id=REPO_ID)))
189
 
190
  @property
191
  def vocab_size(self) -> int: