Spaces:

ValadisCERTH
/

NumbersModuleSerco

Sleeping

App Files Files Community

NumbersModuleSerco / helper.py

ValadisCERTH

Update helper.py

eb0f4d2 almost 3 years ago

raw

history blame contribute delete

10.9 kB

	import spacy
	import re
	from word2number import w2n

	# Load the spaCy model once at import time; `nlp` is used by capture_numbers().
	# Try the locally installed model first and download it only when it is
	# missing, so importing this module does not hit the network every time.
	try:
	    nlp = spacy.load("en_core_web_lg")
	except OSError:
	    # spacy.load raises OSError when the model package cannot be found.
	    spacy.cli.download("en_core_web_lg")
	    nlp = spacy.load("en_core_web_lg")


	def capture_numbers(input_sentence):
	    '''
	    Capture the numbers referred to in a sentence, in numeric or free-text form.

	    Three passes are applied in order, and text captured by an earlier pass is
	    removed from the sentence so that a later pass does not report its parts again:

	    1. decimal expressions such as "six point five", "5 point five", "six point 5",
	       "5 point 5". Each one is returned as "<left> <separator> <right> $pattern";
	       the "$pattern" suffix marks the entry for the conversion step.
	    2. multiword free-text numbers such as "thirty eight" or "one million and two".
	    3. every remaining token that spaCy flags as number-like (token.like_num).

	    :param input_sentence: the sentence to analyse (string)
	    :return: a list of the captured number strings without duplicates, in the
	             order decimal expressions, multiword numbers, spaCy tokens;
	             the integer 0 if anything goes wrong (e.g. the input is not a string)
	    '''

	    # A single spelled-out number word; used on both sides of the multiword pattern.
	    number_word = (
	        r"(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve"
	        r"|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty"
	        r"|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand"
	        r"|million|billion|trillion)"
	    )

	    # <digits or words> <separator word> <digits or words>.
	    # The alternation must be a plain `|` (an escaped `\|` would only match a
	    # literal pipe character) and the trailing word group must be repeatable so
	    # that single-word operands like "six" are accepted as well.
	    decimal_pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"

	    # Two or more number words, optionally joined by "and", not followed by "pennies".
	    multiword_pattern = (
	        r"(?<!\w)(?:" + number_word
	        + r"(?:\s(?:and\s)?" + number_word + r")+\s?)+(?!\w*pennies)"
	    )

	    try:
	        # Pass 1: decimal expressions, tagged with $pattern and de-duplicated in order
	        pattern_final_numbers = []
	        for whole, separator, fraction in re.findall(decimal_pattern, input_sentence):
	            tagged = "{} {} {} {}".format(whole, separator, fraction, '$pattern')
	            if tagged not in pattern_final_numbers:
	                pattern_final_numbers.append(tagged)

	        # Remove exactly the spans that were matched; otherwise, for "seven point five",
	        # spaCy would also report "seven" and "five" on their own.
	        input_sentence = re.sub(decimal_pattern, " ", input_sentence)

	        # Pass 2: multiword free-text numbers (the pattern has no capturing groups,
	        # so findall returns the whole matched text)
	        multinumber_final_numbers = list(dict.fromkeys(re.findall(multiword_pattern, input_sentence)))
	        input_sentence = re.sub(multiword_pattern, " ", input_sentence)

	        # Pass 3: whatever number-like tokens remain (ints, floats, "eight", "hundred", ...)
	        doc = nlp(input_sentence)
	        spacy_final_numbers = list(dict.fromkeys(token.text for token in doc if token.like_num))

	        return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers

	    except Exception:
	        # Callers treat a non-list result as a failure of the capture step.
	        return 0


	# Values of the spelled-out number words that are resolved without any library.
	_WORD_VALUES = {
	    'zero': 0,
	    'one': 1,
	    'two': 2,
	    'three': 3,
	    'four': 4,
	    'five': 5,
	    'six': 6,
	    'seven': 7,
	    'eight': 8,
	    'nine': 9,
	    'ten': 10,
	    'eleven': 11,
	    'twelve': 12,
	    'thirteen': 13,
	    'fourteen': 14,
	    'fifteen': 15,
	    'sixteen': 16,
	    'seventeen': 17,
	    'eighteen': 18,
	    'nineteen': 19,
	    'twenty': 20,
	    'thirty': 30,
	    'forty': 40,
	    'fifty': 50,
	    'sixty': 60,
	    'seventy': 70,
	    'eighty': 80,
	    'ninety': 90,
	    'hundred': 100,
	    'thousand': 1000,
	    'million': 1000000,
	    'billion': 1000000000,
	    'trillion': 1000000000000,
	}

	# <digits or words> <separator word> <digits or words>; the separator is not captured.
	# Plain `|` alternation and a repeatable word group are required so that both
	# "6 point 5" and "six point five" (and "bla bla seven point five") match.
	_DECIMAL_EXPRESSION = re.compile(
	    r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
	)


	def _resolve_operand(raw, cast):
	    '''
	    Turn one side of a decimal expression into a number.

	    Returns a pair (value, error): on success error is None, on failure value is
	    None and error is the (0, 'MAGNITUDE', <code>) tuple to hand back to the caller.
	    '''
	    # a single known number word
	    if raw in _WORD_VALUES:
	        return _WORD_VALUES[raw], None

	    # a plain numeric string
	    try:
	        return cast(raw), None
	    except ValueError:
	        pass

	    # free text containing number words, e.g. "bla bla bla seven"
	    try:
	        return w2n.word_to_num(raw), None
	    except Exception:
	        pass

	    # free text containing digits, e.g. "bla bla bla 7": accept exactly one number
	    digit_runs = re.findall(r'\d+', raw)
	    if len(digit_runs) == 1:
	        return int(digit_runs[0]), None
	    if digit_runs:
	        return None, (0, 'MAGNITUDE', 'more_magnitudes')
	    return None, (0, 'MAGNITUDE', 'no_magnitude')


	def _fraction_width(raw, value):
	    '''
	    Number of decimal places the fractional operand occupies.

	    When the text holds a single run of digits, its written length is used so
	    that leading zeros count ("05" means 0.05, not 0.5); otherwise the length of
	    the resolved value is used ("twenty five" -> 25 -> two places).
	    '''
	    digit_runs = re.findall(r'\d+', raw)
	    if len(digit_runs) == 1 and int(digit_runs[0]) == value:
	        return len(digit_runs[0])
	    return len(str(value))


	def numeric_number_dot_freetext(text):
	    '''
	    Convert a decimal expression such as '6 point five', 'six point 5',
	    '6 point 5' or 'six point five' into a float.

	    :param text: string containing "<left> decimal|point|dot|comma <right>"
	    :return: the float value (e.g. 6.5) on success;
	             a tuple (0, 'MAGNITUDE', code) when one side cannot be resolved, with
	             code 'more_magnitudes', 'no_magnitude' or 'unknown_error';
	             the integer 0 when the text does not contain such an expression or an
	             unexpected error occurs
	    '''

	    try:
	        match = _DECIMAL_EXPRESSION.search(text)

	        # If the input text doesn't match the expected pattern, return 0
	        if not match:
	            return 0

	        whole_text = match.group(1)
	        fraction_text = match.group(2)

	        whole_value, error = _resolve_operand(whole_text, float)
	        if error is not None:
	            return error

	        fraction_value, error = _resolve_operand(fraction_text, int)
	        if error is not None:
	            return error

	        try:
	            # Shift the fractional part behind the decimal point and add it to the whole part
	            width = _fraction_width(fraction_text, fraction_value)
	            return float(whole_value) + float(fraction_value) / (10 ** width)
	        except Exception:
	            return (0, 'MAGNITUDE', 'unknown_error')

	    except Exception:
	        return 0


	def convert_into_numeric(num_list):
	    '''
	    Convert the single captured number reference into a numeric form.

	    :param num_list: list of captured number strings; exactly one entry is expected.
	                     An entry ending in "$pattern" is a decimal expression such as
	                     "six point 5 $pattern".
	    :return: {'Number': value} on success, where value is the original string for
	             plain numeric input (e.g. '5'), a float for decimal expressions and
	             the word2number result for free-text numbers;
	             (0, 'MAGNITUDE', 'no_magnitude') for an empty list,
	             (0, 'MAGNITUDE', 'more_magnitudes') for more than one entry,
	             (0, 'MAGNITUDE', 'unknown_error') when the entry cannot be converted
	    '''

	    if not num_list:
	        return (0,'MAGNITUDE','no_magnitude')

	    # Only one number should exist
	    if len(num_list) > 1:
	        return (0,'MAGNITUDE','more_magnitudes')

	    target_num = num_list[0]

	    # case it is already an integer or float: hand it back unchanged
	    try:
	        float(target_num)
	    except (TypeError, ValueError):
	        pass
	    else:
	        return {'Number' : target_num}

	    # everything below works on text only
	    if not isinstance(target_num, str):
	        return (0,'MAGNITUDE','unknown_error')

	    # case of a decimal expression (6 point 5, 6 point five, six point 5, six point five)
	    if "$pattern" in target_num:
	        num = target_num.split("$", 1)[0]
	        num_conversion = numeric_number_dot_freetext(num)

	        # an error code from the conversion is passed on as is, not wrapped as a number
	        if isinstance(num_conversion, tuple):
	            return num_conversion

	        # a successful conversion is always a float (0.0 is a valid result);
	        # the integer 0 signals that the expression could not be parsed
	        if isinstance(num_conversion, float):
	            return {'Number' : num_conversion}

	        return (0,'MAGNITUDE','unknown_error')

	    # case of free-text numbers without patterns (e.g. two, million, twenty three, etc)
	    try:
	        return {'Number' : w2n.word_to_num(target_num)}
	    except Exception:
	        pass

	    # cases like "million and two" or "a million and two": drop any ' a ' reference and
	    # put the word 'one' in front, so that the w2n library can handle them
	    try:
	        new_target_num = "one " + target_num.replace(" a ", " ")
	        return {'Number' : w2n.word_to_num(new_target_num)}
	    except Exception:
	        return (0,'MAGNITUDE','unknown_error')


	def magnitude_binding(input_text):
	    '''
	    Bind together the subcomponents of the magnitude number identification,
	    while also controlling for multiple or zero magnitude references.

	    :param input_text: the sentence in which to look for a magnitude
	    :return: the result of convert_into_numeric when exactly one reference is found;
	             (0, 'MAGNITUDE', 'no_magnitude') when none is found,
	             (0, 'MAGNITUDE', 'more_magnitudes') when several are found,
	             (0, 'MAGNITUDE', 'unknown_error') on any failure.
	             The codes help the caller choose the correct prompt.
	    '''

	    try:
	        # capture the referred magnitudes
	        target_numbers = capture_numbers(input_text)

	        # capture_numbers returns 0 instead of a list when it fails
	        if not isinstance(target_numbers, list):
	            return (0,'MAGNITUDE','unknown_error')

	        # in case of zero references return the appropriate code
	        if len(target_numbers) == 0:
	            return (0,'MAGNITUDE','no_magnitude')

	        # in case of more than one references return the appropriate code
	        if len(target_numbers) > 1:
	            return (0,'MAGNITUDE','more_magnitudes')

	        # we only accept one magnitude reference
	        return convert_into_numeric(target_numbers)

	    # in case of unexpected error return the appropriate code
	    except Exception:
	        return (0,'MAGNITUDE','unknown_error')