import sys
from typing import Optional, Tuple

import regex as re
from tqdm import tqdm

from .indic_num_map import INDIC_NUM_MAP
|
|
|
|
|
# URLs: an optional http(s)/ftp scheme, one or more dot-terminated labels,
# then a run of path/query characters.
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'

# E-mail addresses. The TLD class is `[A-Za-z]` on purpose: inside a character
# class `|` is a literal pipe (not alternation), so writing `[A-Z|a-z]` would
# wrongly accept addresses such as "a@b.co|m".
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

# Numeric spans: percentage ranges and single percentages (optionally prefixed
# with "~"), and two or three digit groups joined by separators such as
# "-", "/", ".", ":" (e.g. dates, times, ratios).
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"

# Hashtag/mention-like tokens: optional alphanumerics, one of "#", "|" or "@",
# then word characters.
# NOTE(review): the "|" inside `[#|@]` is a literal pipe, so tokens such as
# "a|b" also match. Left unchanged here -- confirm whether that is intended.
OTHER_PATTERN = r'[A-Za-z0-9]*[#|@]\w+'
|
|
|
|
|
def normalize_indic_numerals(line: str):
    """
    Replace native-script Indic numerals in a string with their Roman-script form.

    Each character is looked up in ``INDIC_NUM_MAP``; characters that have no
    entry in the map are copied through unchanged.

    Args:
        line (str): an input string that may contain Indic numerals.

    Returns:
        str: the input string with every mapped numeral replaced.
    """
    return "".join(INDIC_NUM_MAP.get(char, char) for char in line)
|
|
|
|
|
def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
    """
    Replace every span of ``text`` that matches one of ``patterns`` with a numbered
    placeholder tag (``<ID1>``, ``<ID2>``, ...) and return the modified text along
    with a mapping of the placeholders to their original value.

    Patterns are applied in the order given. Within one pattern, distinct matches
    are numbered in order of first occurrence, so the result is reproducible
    from run to run. For every wrapped match the mapping also contains spaced
    and alternate-script spellings of the same tag, all pointing at the same
    original value.

    Args:
        text (str): an input string which needs to be wrapped with the placeholders.
        patterns (list): list of patterns to search for in the input string.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text (with runs of
            whitespace collapsed to a single space) and a dictionary mapping
            placeholders to their original values.
    """
    # Alternate spellings of the "ID" tag registered next to "<IDn>".
    # Presumably these are renderings of "ID" in other scripts that may come
    # back in place of the Latin tag -- confirm against the consumer of the map.
    # Built once here instead of once per match.
    indic_failure_cases = ['آی ڈی ', 'ꯑꯥꯏꯗꯤ', 'आईडी', 'आई . डी . ', 'ऐटि', 'آئی ڈی ', 'ᱟᱭᱰᱤ ᱾', 'आयडी', 'ऐडि', 'आइडि']

    serial_no = 1
    placeholder_entity_map = {}

    for pattern in patterns:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # A plain set would make the numbering (and the order of the
        # replacements below) depend on string hash randomization.
        matches = dict.fromkeys(re.findall(pattern, text))

        for match in matches:
            if pattern == URL_PATTERN:
                # Skip URL matches with fewer than 4 characters once dots are
                # removed (e.g. short dotted tokens).
                if len(match.replace(".", "")) < 4:
                    continue

            if pattern == NUMERAL_PATTERN:
                # Skip numeric matches with fewer than 4 characters once
                # spaces, dots and colons are removed.
                compact = match.replace(" ", "").replace(".", "").replace(":", "")
                if len(compact) < 4:
                    continue

            placeholder = "<ID{}>".format(serial_no)
            alternate_placeholder = "< ID{} >".format(serial_no)
            placeholder_entity_map[placeholder] = match
            placeholder_entity_map[alternate_placeholder] = match

            # Register every alternate spelling, with and without padding.
            for i in indic_failure_cases:
                placeholder_entity_map["<{}{}>".format(i, serial_no)] = match
                placeholder_entity_map["< {}{} >".format(i, serial_no)] = match
                placeholder_entity_map["< {} {} >".format(i, serial_no)] = match

            text = text.replace(match, placeholder)
            serial_no += 1

    # Collapse every run of whitespace into a single space.
    text = re.sub(r"\s+", " ", text)

    # Drop a "/" left directly after a placeholder (e.g. a trailing slash the
    # URL pattern did not consume).
    text = text.replace(">/", ">")

    return text, placeholder_entity_map
|
|
|
|
|
def normalize(text: str, patterns: Optional[list] = None) -> Tuple[str, dict]:
    """
    Normalizes and wraps the spans of input string with placeholder tags.

    It first strips leading/trailing newlines and normalizes the Indic numerals
    in the input string to Roman script. It then wraps the spans of the
    resulting text that match the patterns with placeholder tags.

    Args:
        text (str): input string.
        patterns (list, optional): list of patterns to search for in the input
            string. When omitted or ``None``, the e-mail, URL, numeral and
            other patterns defined in this module are used, in that order.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
            placeholders to their original values.
    """
    # Resolve the default per call rather than in the signature, so no single
    # list object is shared (and possibly mutated) across calls.
    if patterns is None:
        patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    text = normalize_indic_numerals(text.strip("\n"))
    return wrap_with_placeholders(text, patterns)
|
|
|