Update isoformer_tokenizer.py
Browse files
- isoformer_tokenizer.py +6 -1
isoformer_tokenizer.py
CHANGED
|
@@ -38,11 +38,16 @@ class IsoformerTokenizer(PreTrainedTokenizer):
|
|
| 38 |
self,
|
| 39 |
**kwargs
|
| 40 |
):
|
| 41 |
-
|
| 42 |
pretrained_model_path = kwargs.get("pretrained_model_name_or_path", "")
|
|
|
|
|
|
|
| 43 |
dna_vocab_path = os.path.join(pretrained_model_path, "dna_vocab_list.txt")
|
| 44 |
rna_vocab_path = os.path.join(pretrained_model_path, "rna_vocab_list.txt")
|
| 45 |
protein_vocab_path = os.path.join(pretrained_model_path, "protein_vocab_list.txt")
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
dna_hf_tokenizer = EsmTokenizer(dna_vocab_path, model_max_length=196608)
|
| 48 |
dna_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
|
|
|
|
| 38 |
self,
|
| 39 |
**kwargs
|
| 40 |
):
|
| 41 |
+
print(f"\n>>> DEBUG: IsoformerTokenizer __init__ received kwargs: {kwargs}")
|
| 42 |
pretrained_model_path = kwargs.get("pretrained_model_name_or_path", "")
|
| 43 |
+
print(f">>> DEBUG: Determined pretrained_model_path: '{pretrained_model_path}'")
|
| 44 |
+
|
| 45 |
dna_vocab_path = os.path.join(pretrained_model_path, "dna_vocab_list.txt")
|
| 46 |
rna_vocab_path = os.path.join(pretrained_model_path, "rna_vocab_list.txt")
|
| 47 |
protein_vocab_path = os.path.join(pretrained_model_path, "protein_vocab_list.txt")
|
| 48 |
+
print(f">>> DEBUG: dna_vocab_path will be: '{dna_vocab_path}'") # Add this
|
| 49 |
+
print(f">>> DEBUG: Checking if dna_vocab_path exists: {os.path.exists(dna_vocab_path)}")
|
| 50 |
+
print(f">>> DEBUG: Checking if dna_vocab_list.txt exists in CWD: {os.path.exists('dna_vocab_list.txt')}")
|
| 51 |
|
| 52 |
dna_hf_tokenizer = EsmTokenizer(dna_vocab_path, model_max_length=196608)
|
| 53 |
dna_hf_tokenizer.eos_token = None # Stops the tokenizer adding an EOS/SEP token at the end
|