# Spaces: Runtime error
from minbpe import RegexTokenizer

# Script: train a minbpe RegexTokenizer on the contents of a text file,
# round-trip that text through encode/decode, and save the trained model.

# Initialize the tokenizer
tokenizer = RegexTokenizer()

# Read the training text from a file.
# NOTE(review): this is a machine-specific absolute path; open() raises
# FileNotFoundError wherever the file is absent -- confirm the intended location.
file_path = "/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Train the tokenizer: the second argument is the target vocabulary size.
tokenizer.train(text, 256 + 5)  # 256 are the byte tokens, then do 5 merges

# Encode the same text that was used for training and show the token ids.
encoded_text = tokenizer.encode(text)
print("Encoded:", encoded_text)

# Decode the ids back to text so the round trip can be inspected.
decoded_text = tokenizer.decode(encoded_text)
print("Decoded:", decoded_text)

# Save the trained tokenizer model under the file prefix "first".
tokenizer.save("first")  # Writes two files: first.model (for loading) and first.vocab (for viewing)