bomolopuu
/

audio2transcription_bot

Model card Files Files and versions

audio2transcription_bot / utils /text_norm.py

bomolopuu's picture

Upload 120 files

6f03d40 verified over 1 year ago

history blame contribute delete

2.98 kB

	import json
	import re
	import unicodedata

	from utils.norm_config import norm_config


	def text_normalize(
	    text,
	    iso_code="xxx",
	    lower_case=True,
	    remove_numbers=False,
	    remove_brackets=False,
	    rm_extra_spaces=False,
	):
	    """Normalize ``text`` with the rules registered in ``norm_config``.

	    The steps run in this order: Unicode normalization, optional
	    lower-casing, removal of bracketed spans, regex mappings, punctuation
	    replacement, deletion of unwanted characters, optional removal of
	    digit-only words, optional diacritics stripping and optional whitespace
	    collapsing.

	    Args:
	        text: The string to be normalized.
	        iso_code: Key used to look up the rule set in ``norm_config``. When
	            the key is absent the ``"*"`` entry is used, and any field the
	            selected entry lacks is taken from ``"*"`` as well.
	        lower_case: Lower-case the text. Only applied when the selected
	            rule set also has ``lower_case`` enabled.
	        remove_numbers: Remove words that consist only of characters from
	            the rule set's ``digit_set``.
	        remove_brackets: Remove every parenthesized span. Parenthesized
	            spans that contain a digit are removed regardless of this flag.
	        rm_extra_spaces: Collapse whitespace runs into a single space and
	            strip leading/trailing whitespace.

	    Returns:
	        The normalized string.

	    Raises:
	        KeyError: If a required field is missing from both the selected
	            rule set and the ``"*"`` defaults.
	    """
	    defaults = norm_config["*"]
	    # Layer the language entry over the defaults in a fresh dict so that the
	    # shared ``norm_config`` entries are never modified by a call.
	    config = {**defaults, **norm_config.get(iso_code, defaults)}

	    text = unicodedata.normalize(config["unicode_norm"], text)

	    # Convert to lower case (both the rule set and the caller must allow it).
	    if config["lower_case"] and lower_case:
	        text = text.lower()

	    # Always drop bracketed spans that contain a digit; these usually are
	    # references such as "(Sam 23:17)". The ``*`` quantifiers are required so
	    # that brackets of any length are matched.
	    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
	    if remove_brackets:
	        text = re.sub(r"\([^\)]*\)", " ", text)

	    # Apply mappings; keys are regex patterns, values are replacements.
	    for old, new in config["mapping"].items():
	        text = re.sub(old, new, text)

	    # Replace punctuation with a space.
	    punct_pattern = r"[" + config["punc_set"] + r"]"
	    normalized_text = re.sub(punct_pattern, " ", text)

	    # Remove characters in the delete list.
	    delete_pattern = r"[" + config["del_set"] + r"]"
	    normalized_text = re.sub(delete_pattern, "", normalized_text)

	    # Remove words containing only digits. Three cases are handled:
	    #   a) the text starts with a number,
	    #   b) a number sits somewhere in the middle of the text,
	    #   c) the text ends with a number.
	    # Lookarounds check for the surrounding whitespace without consuming it,
	    # so adjacent numbers that share a separating space are all replaced.
	    if remove_numbers:
	        digits_pattern = r"[" + config["digit_set"] + r"]+"
	        complete_digit_pattern = (
	            r"^"
	            + digits_pattern
	            + r"(?=\s)|(?<=\s)"
	            + digits_pattern
	            + r"(?=\s)|(?<=\s)"
	            + digits_pattern
	            + r"$"
	        )
	        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

	    # ``rm_diacritics`` is optional in a rule set; treat a missing key as off.
	    if config.get("rm_diacritics", False):
	        # Imported lazily so the third-party package is only required for
	        # rule sets that actually enable diacritics removal.
	        from unidecode import unidecode

	        normalized_text = unidecode(normalized_text)

	    if rm_extra_spaces:
	        normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

	    return normalized_text