| | from typing import Tuple, List
|
| | import regex as re
|
| | import sys
|
| | from tqdm import tqdm
|
| | from joblib import Parallel, delayed
|
| | from indic_num_map import INDIC_NUM_MAP
|
| |
|
| |
|
# URL-like spans: an optional http/https/ftp scheme, one or more dot-terminated
# host labels, then a run of word/path/query characters.
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'

# E-mail addresses. The final label (TLD) is letters only: inside a character
# class `|` is a literal pipe, not alternation, so it must not appear there.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

# Numeric spans: percentage ranges, single percentages, and digit groups joined
# by separators such as - / . , : ' +. The whole expression sits in a single
# capturing group, so `re.findall` yields the full matched text.
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"

# Tokens containing `#` or `@` followed by word characters (hashtag/mention-like).
# NOTE(review): the `|` in `[#|@]` is a literal pipe, so `|word` also matches;
# confirm whether that is intended before tightening this to `[#@]`.
OTHER_PATTERN = r'[A-Za-z0-9]*[#|@]\w+'
|
| |
|
| |
|
def normalize_indic_numerals(line: str):
    """
    Replace every character of `line` that has an entry in `INDIC_NUM_MAP`
    with its mapped value (native-script numerals to Roman-script numerals,
    per that table); all other characters are kept unchanged.

    Args:
        line (str): the text to normalize.

    Returns:
        str: `line` with each mapped character substituted.
    """
    converted = []
    for ch in line:
        # Characters without a mapping fall through as themselves.
        converted.append(INDIC_NUM_MAP.get(ch, ch))
    return "".join(converted)
|
| |
|
| |
|
def wrap_with_dnt_tag(src: str, tgt: str, pattern: str) -> Tuple[str, str]:
    """
    Wraps all occurrences of a given pattern match that are present in both the source and
    target sentences with do-not-translate tags (`<dnt>` {matched text} `</dnt>`). This is
    useful when some span of the input needs to be forwarded as it is and not translated.

    Matches are wrapped in a single regex substitution pass at the positions where the
    pattern actually matched, so the result does not depend on set iteration order and a
    match that is a substring of another match is never wrapped a second time.

    Args:
        src (str): source sentence.
        tgt (str): target sentence.
        pattern (str): pattern to search for in the source and target sentence.

    Returns:
        Tuple[str, str]: A tuple of the source and target sentences in which every match
            whose text occurs in both sentences is wrapped in `<dnt>` and `</dnt>` tags,
            with runs of whitespace collapsed to a single space.
    """
    # Use the full match text (group 0) instead of `findall`, which returns
    # tuples for patterns that contain more than one capturing group.
    src_matches = {m.group(0) for m in re.finditer(pattern, src)}
    tgt_matches = {m.group(0) for m in re.finditer(pattern, tgt)}

    # Only spans found on both sides are protected; an empty match carries no
    # text to protect and would otherwise insert tags between every character.
    common_matches = (src_matches & tgt_matches) - {""}

    if common_matches:

        def _wrap(m):
            text = m.group(0)
            if text in common_matches:
                return f' <dnt> {text} </dnt> '
            return text

        src = re.sub(pattern, _wrap, src)
        tgt = re.sub(pattern, _wrap, tgt)

    # The padding spaces around the tags may create doubled whitespace.
    src = re.sub(r"\s+", " ", src)
    tgt = re.sub(r"\s+", " ", tgt)

    return src, tgt
|
| |
|
| |
|
def normalize(src_line: str, tgt_line: str, patterns: List[str]) -> Tuple[str, str]:
    """
    Prepare a source/target sentence pair: strip newline characters from both
    ends, pass each sentence through `normalize_indic_numerals`, then apply
    `wrap_with_dnt_tag` once per pattern, in the order given, feeding each
    call the output of the previous one.

    Args:
        src_line (str): source sentence.
        tgt_line (str): target sentence.
        patterns (List[str]): regex patterns whose shared matches get tagged.

    Returns:
        Tuple[str, str]: the processed (source, target) pair.
    """
    pair = (
        normalize_indic_numerals(src_line.strip("\n")),
        normalize_indic_numerals(tgt_line.strip("\n")),
    )
    for regex_pattern in patterns:
        pair = wrap_with_dnt_tag(*pair, regex_pattern)
    return pair
|
| |
|
| |
|
if __name__ == "__main__":
    # Command line: <src_in> <tgt_in> <src_out> <tgt_out>
    src_infname = sys.argv[1]
    tgt_infname = sys.argv[2]
    src_outfname = sys.argv[3]
    tgt_outfname = sys.argv[4]

    # Count source lines for the progress bar. The context manager closes the
    # handle, and the explicit UTF-8 encoding matches the reads below instead
    # of depending on the locale default.
    with open(src_infname, "r", encoding="utf-8") as count_file:
        num_lines = sum(1 for _ in count_file)

    patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    with open(src_infname, "r", encoding="utf-8") as src_infile, \
            open(tgt_infname, "r", encoding="utf-8") as tgt_infile, \
            open(src_outfname, "w", encoding="utf-8") as src_outfile, \
            open(tgt_outfname, "w", encoding="utf-8") as tgt_outfile:

        # Normalize the sentence pairs in parallel; `zip` stops at the shorter
        # of the two input files.
        out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
            delayed(normalize)(src_line, tgt_line, patterns)
            for src_line, tgt_line in tqdm(zip(src_infile, tgt_infile), total=num_lines)
        )

        for src_line, tgt_line in tqdm(out_lines):
            src_outfile.write(src_line + "\n")
            tgt_outfile.write(tgt_line + "\n")
|
| |
|