# bat-name-parser/utils/string_utils.py
# siva-valyx's picture
# Initial Commit
# c879739 verified
import re
from enum import Enum
from typing import Dict, List, Optional, Tuple
class Entities(Enum):
    """Closed set of labels that can be attached to a span of transaction text.

    Each member's ``value`` is the string label that ends up in the generated
    annotations (callers pass ``member.value`` as the span type).
    """

    PAYER = "PAYER"
    # NOTE: the member name and its value differ here ("PAYER_ACCOUNT", not
    # "PAYER_BANK_ACCOUNT"); the value is what is emitted at runtime.
    PAYER_BANK_ACCOUNT = "PAYER_ACCOUNT"
    VPA = "VPA"
    MESSAGE = "MESSAGE"
    IFSCCODE = "IFSCCODE"
    UTR = "UTR"
    TXNMETHOD = "TXNMETHOD"
    BANK = "BANK"
class StringUtils:
    """String helpers for locating entity values inside a transaction string
    and turning them into ``[start, end, label]`` span records."""

    @classmethod
    def replace_multiple_spaces_with_single_space(cls, text: str) -> str:
        """Collapse every run of whitespace in *text* to one space and trim both ends."""
        return re.sub(r'\s+', ' ', text).strip()

    @classmethod
    def find_word_indices(cls, text: str, word: str) -> Optional[Tuple[int, int]]:
        """Locate the first occurrence of *word* inside *text*.

        *word* is whitespace-normalised before the search; *text* is searched
        exactly as given, so callers should pass already-normalised text.

        Returns:
            ``(start, end)`` where ``end`` is the index of the last matched
            character (inclusive), or ``None`` when the normalised word is
            empty or does not occur in *text*.
        """
        word = cls.replace_multiple_spaces_with_single_space(word)
        if not word:
            # str.find("") reports a match at 0, which would yield the
            # meaningless span (0, -1); treat an empty needle as "not found".
            return None
        start_index = text.find(word)
        if start_index == -1:
            return None  # Word not found
        return start_index, start_index + len(word) - 1

    @classmethod
    def get_spacy_ref_for_word(cls, text: str, word: str, type: str) -> list:
        """Build a ``[start, end, type]`` record for *word* inside *text*.

        ``end`` is exclusive (one past the last matched character), i.e. the
        span is half-open -- presumably to match spaCy's character-offset
        entity format; confirm against the training pipeline.

        The parameter name ``type`` shadows the builtin but is kept because
        callers pass it by keyword.

        Raises:
            ValueError: if *word* is empty or cannot be found in *text*.
        """
        indices = cls.find_word_indices(text, word)
        if indices is None:
            # Previously this surfaced as an opaque "cannot unpack
            # non-iterable NoneType" TypeError; fail with a usable message.
            raise ValueError(
                f"Entity value {word!r} (type {type!r}) was not found in text {text!r}"
            )
        start_index, end_index = indices
        return [start_index, end_index + 1, type]

    @classmethod
    def get_spacy_dataset(cls, transaction: str,
                          entities_name_to_type_map: Dict[str, "Entities"]) -> List[list]:
        """Return span records for the PAYER and UTR entities of a transaction.

        Args:
            transaction: text in which the entity values are searched.
            entities_name_to_type_map: maps each entity value (the literal
                substring) to its ``Entities`` member. Entries whose type is
                not ``PAYER`` or ``UTR`` are ignored.

        Returns:
            One ``[start, end, label]`` list per selected entity, in the
            mapping's iteration order; ``label`` is the member's ``value``.

        Raises:
            ValueError: if a selected entity value is not present in
                *transaction*.
        """
        # Only these entity types are emitted; built once, outside the loop.
        selected_types = (Entities.PAYER, Entities.UTR)
        return [
            cls.get_spacy_ref_for_word(
                text=transaction,
                word=entity_value,
                type=entity_type.value,
            )
            for entity_value, entity_type in entities_name_to_type_map.items()
            if entity_type in selected_types
        ]