Spaces:
Runtime error
Runtime error
| import codecs | |
| from SmilesPE.tokenizer import * | |
def load_vocabulary_to_dict(vocabulary_path):
    """Build a token -> ID mapping from a whitespace-delimited vocabulary file.

    The first whitespace-separated field of each line is taken as the token,
    and its ID is the zero-based index of that line in the file.

    Blank or whitespace-only lines are skipped instead of raising
    ``IndexError``. They still count towards the line index, so the IDs of
    the remaining tokens are the same as their line positions.

    If the same token appears on several lines, the last occurrence wins.

    Args:
        vocabulary_path: Path to a UTF-8 encoded vocabulary file.

    Returns:
        dict mapping each token (str) to its line index (int).
    """
    vocab_dict = {}
    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
        for index, line in enumerate(file):
            fields = line.split()
            if not fields:
                # Empty / whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to register.
                continue
            vocab_dict[fields[0]] = index
    return vocab_dict
def smilespe_tokenizer(smiles_string, vocab_dict,
                       spe_vocab_path='chembl_smiles_tokenizer30000.txt'):
    """Tokenize a SMILES string with SmilesPE and map the tokens to IDs.

    Args:
        smiles_string: The SMILES string to tokenize.
        vocab_dict: Mapping from token (str) to integer ID, e.g. the result
            of ``load_vocabulary_to_dict``.
        spe_vocab_path: Path to the SPE vocabulary / merge file handed to
            ``SPE_Tokenizer``. Defaults to the file name that was previously
            hard-coded here.

    Returns:
        A tuple ``(tokenized, token_ids)`` where ``tokenized`` is whatever
        ``SPE_Tokenizer.tokenize`` returned, unchanged, and ``token_ids`` is
        the list of IDs for those tokens that are present in ``vocab_dict``.
        Tokens missing from ``vocab_dict`` are silently dropped, so
        ``token_ids`` can be shorter than the token sequence.
    """
    # Close the vocabulary file as soon as the tokenizer is built; the
    # original left the handle open on every call.
    # NOTE(review): relies on SPE_Tokenizer reading the file inside its
    # constructor - confirm against the installed SmilesPE version.
    with codecs.open(spe_vocab_path, 'r', 'utf-8') as spe_vob:
        spe = SPE_Tokenizer(spe_vob)

    tokenized = spe.tokenize(smiles_string)

    # SmilesPE's README shows tokenize() returning one space-separated
    # string. Iterating that string directly would walk it character by
    # character, so split it into tokens first. A non-string iterable is
    # used as-is.
    if isinstance(tokenized, str):
        tokens = tokenized.split()
    else:
        tokens = list(tokenized)

    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]
    return tokenized, token_ids
| # Load the vocabulary into a dictionary | |
| # vocab_path = 'chembl_smiles_tokenizer30000.txt' | |
| # vocab_dict = load_vocabulary_to_dict(vocab_path) | |
| # # Example usage | |
| # smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1' | |
| # tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict) | |
| # print("Tokens:", tokens) | |
| # print("Token IDs:", token_ids) | |