Spaces:

saicharan2804
/

SmilesPeTokenizer

Runtime error

SmilesPeTokenizer / SmilesPeTokenizer.py

saicharan2804

Added token IDs

6b58f50 about 2 years ago

1.22 kB

	import codecs
	from SmilesPE.tokenizer import *

	def load_vocabulary_to_dict(vocabulary_path):
	    """Build a token -> ID mapping from a UTF-8 vocabulary file.

	    The first whitespace-separated field of each line is taken as the
	    token, and its ID is the zero-based index of that line in the file.

	    Blank or whitespace-only lines are skipped instead of raising
	    ``IndexError``. They still count towards the line index, so the IDs of
	    all other tokens are the same as if the blank lines had been parsed.
	    If a token occurs on several lines, the last occurrence wins.

	    Args:
	        vocabulary_path: Path of the vocabulary file to read.

	    Returns:
	        dict mapping each token (str) to its line index (int).
	    """
	    vocab_dict = {}
	    with codecs.open(vocabulary_path, 'r', 'utf-8') as file:
	        for index, line in enumerate(file):
	            fields = line.split()
	            if not fields:
	                # Nothing to index on an empty line (e.g. a trailing newline).
	                continue
	            vocab_dict[fields[0]] = index
	    return vocab_dict

	def smilespe_tokenizer(smiles_string, vocab_dict,
	                       spe_vocab_path='chembl_smiles_tokenizer30000.txt'):
	    """Tokenize a SMILES string with SmilesPE and map the tokens to IDs.

	    Args:
	        smiles_string: The SMILES string to tokenize.
	        vocab_dict: Mapping from token to integer ID, e.g. as returned by
	            ``load_vocabulary_to_dict``.
	        spe_vocab_path: Path of the SPE vocabulary file handed to
	            ``SPE_Tokenizer``. Defaults to the file name this function has
	            always used.

	    Returns:
	        A ``(tokenized, token_ids)`` tuple. ``tokenized`` is whatever
	        ``SPE_Tokenizer.tokenize`` returned, unchanged. ``token_ids`` holds
	        the IDs of the tokens found in ``vocab_dict``, in order; tokens
	        missing from ``vocab_dict`` are silently left out.
	    """
	    # The vocabulary file is opened in a ``with`` block so the handle is
	    # released on every call; tokenization happens while it is still open
	    # in case the tokenizer reads from it lazily.
	    with codecs.open(spe_vocab_path, 'r', 'utf-8') as spe_vob:
	        spe = SPE_Tokenizer(spe_vob)
	        tokenized = spe.tokenize(smiles_string)

	    # NOTE(review): SmilesPE documents ``tokenize`` as returning a single
	    # space-separated string - confirm for the installed version. Iterating
	    # that string directly would look up individual characters, so split it
	    # into tokens first. Any other iterable is used as-is.
	    if isinstance(tokenized, str):
	        tokens = tokenized.split()
	    else:
	        tokens = list(tokenized)

	    token_ids = [vocab_dict[token] for token in tokens if token in vocab_dict]

	    return tokenized, token_ids

	# Load the vocabulary into a dictionary
	# vocab_path = 'chembl_smiles_tokenizer30000.txt'
	# vocab_dict = load_vocabulary_to_dict(vocab_path)

	# # Example usage
	# smiles_string = 'Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1'
	# tokens, token_ids = smilespe_tokenizer(smiles_string, vocab_dict)
	# print("Tokens:", tokens)
	# print("Token IDs:", token_ids)