Update tessar_tokenizer.py
tessar_tokenizer.py +22 -1
CHANGED
@@ -8,6 +8,9 @@ from transformers import PreTrainedTokenizerFast
 class TessarTokenizer(PreTrainedTokenizerFast):
     """
     Tessar Tokenizer implementation for Hugging Face Transformers
+
+    This custom tokenizer extends the PreTrainedTokenizerFast with specialized
+    configuration and tokenization methods for the Tessar model.
     """
 
     model_input_names = ['input_ids', 'attention_mask']
@@ -74,6 +77,9 @@ class TessarTokenizer(PreTrainedTokenizerFast):
         Returns:
             tuple: Paths to the saved files
         """
+        # Ensure the save directory exists
+        os.makedirs(save_directory, exist_ok=True)
+
         # Prepare file paths
         vocab_file = os.path.join(
             save_directory,
@@ -161,4 +167,19 @@ def load_tessar_tokenizer(pretrained_model_name_or_path: str):
     Returns:
         TessarTokenizer: Initialized tokenizer
     """
-    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+    return TessarTokenizer.from_pretrained(pretrained_model_name_or_path)
+
+
+# Optionally, add some example usage
+if __name__ == "__main__":
+    # Example of loading a pretrained tokenizer
+    try:
+        tokenizer = load_tessar_tokenizer("SVECTOR-CORPORATION/Tessar-largest")
+        print("Tokenizer loaded successfully!")
+
+        # Basic tokenization example
+        text = "Hello, how are you doing today?"
+        encoded = tokenizer(text, return_tensors="pt")
+        print("Encoded Input:", encoded)
+    except Exception as e:
+        print(f"Error loading tokenizer: {e}")
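
Net effect of the change: the class gains a fuller docstring, saving no longer fails when the target directory does not exist (os.path.join only builds path strings, so without the makedirs guard the subsequent file writes would raise FileNotFoundError on a fresh path), and the module doubles as a runnable smoke test. Below is a minimal round-trip sketch of how the updated file could be exercised; the repo id is taken from the diff itself, while the local path ./tessar-checkpoint and the reliance on the base-class save_pretrained returning the saved file paths (per the "tuple: Paths to the saved files" docstring above) are illustrative assumptions, not part of the commit:

# Round-trip sketch: load from the Hub, save into a fresh directory, reload.
# Assumes tessar_tokenizer.py is on the import path; the directory
# "./tessar-checkpoint" is illustrative and does not need to exist
# beforehand, thanks to the new os.makedirs(..., exist_ok=True) guard.
from tessar_tokenizer import TessarTokenizer

tokenizer = TessarTokenizer.from_pretrained("SVECTOR-CORPORATION/Tessar-largest")

saved_files = tokenizer.save_pretrained("./tessar-checkpoint")
print("Saved:", saved_files)

# Reload from the freshly written directory to confirm the round trip.
reloaded = TessarTokenizer.from_pretrained("./tessar-checkpoint")
print(reloaded("Hello, how are you doing today?")["input_ids"])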