Spaces:

ValadisCERTH
/

NumbersModuleSerco

Sleeping

App Files Files Community

ValadisCERTH committed on Mar 30, 2023

Commit

13c4417

1 Parent(s): 3f018ec

Create helper.py

Browse files

Files changed (1) hide show

helper.py +227 -0

helper.py ADDED Viewed

	@@ -0,0 +1,227 @@

+import spacy
+import re
+from word2number import w2n
+# Load the spaCy model. The download is attempted only when the model is not
+# installed yet, so importing this module does not hit the network every time.
+try:
+    nlp = spacy.load("en_core_web_lg")
+except OSError:
+    # spacy.load raises OSError when the model package cannot be found
+    spacy.cli.download("en_core_web_lg")
+    nlp = spacy.load("en_core_web_lg")
+def capture_numbers (input_sentence):
+    '''
+    Capture the numbers referred to in a sentence, in numeric or free-text form.
+
+    Spoken decimals such as "six point five", "5 point five", "six point 5" or
+    "5 point 5" take priority: when at least one is found, only those are
+    returned, each one tagged with a trailing "$pattern" marker so that they
+    can be recognised in a subsequent step. Otherwise every token that spaCy
+    flags as number-like (token.like_num) is returned.
+
+    Parameters:
+        input_sentence: the text to scan.
+
+    Returns:
+        A list of unique strings in order of first appearance, or 0 when no
+        number was found or an error occurred.
+    '''
+    # Words accepted on either side of the separator. Without this check any
+    # phrase such as "the point is" would be reported as a decimal number and
+    # would hide the real numbers of the sentence.
+    number_words = {
+        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
+        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
+        'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty',
+        'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred',
+        'thousand', 'million', 'billion', 'trillion',
+    }
+    def looks_numeric(word):
+        # True for plain digits and for known number words (case-insensitive)
+        return word.isdecimal() or word.lower() in number_words
+    try:
+        # <word> <separator> <word>, e.g. "six point five"
+        pattern1 = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"
+        # add the $pattern string to easily specify them in a subsequent step
+        pattern_numbers = [
+            "{} {} {} {}".format(whole, separator, fraction, '$pattern')
+            for whole, separator, fraction in re.findall(pattern1, input_sentence)
+            if looks_numeric(whole) and looks_numeric(fraction)
+        ]
+        if pattern_numbers:
+            # dict.fromkeys removes duplicates while keeping a stable order
+            return list(dict.fromkeys(pattern_numbers))
+        # No spoken decimal: parse the sentence with spaCy and keep the
+        # number-like tokens (ints, floats and words such as "eight")
+        doc = nlp(input_sentence)
+        numbers = [token.text for token in doc if token.like_num]
+        final_numbers = list(dict.fromkeys(numbers))
+        return final_numbers if final_numbers else 0
+    except Exception:
+        # best effort: callers treat 0 as "nothing captured"
+        return 0
+def numeric_freetext_dot_freetext(text):
+    '''
+    Convert a fully spelled-out decimal such as 'six point five' into a float.
+
+    Both sides of the separator (point, decimal, dot or comma) must consist
+    only of known number words. A side made only of single-digit words is read
+    digit by digit ('two five' -> 25, and 'zero five' -> '05' after the
+    separator); any other side is read as a composed number ('twenty five' ->
+    25, 'two hundred' -> 200).
+
+    Parameters:
+        text: the text holding the spelled-out number.
+
+    Returns:
+        The value as a float, or 0 when the text does not match the pattern,
+        contains a word that is not a number word, or cannot be processed.
+    '''
+    # Define a dictionary to map freetext numbers to numeric values
+    number_map = {
+        'zero': 0,
+        'one': 1,
+        'two': 2,
+        'three': 3,
+        'four': 4,
+        'five': 5,
+        'six': 6,
+        'seven': 7,
+        'eight': 8,
+        'nine': 9,
+        'ten': 10,
+        'eleven': 11,
+        'twelve': 12,
+        'thirteen': 13,
+        'fourteen': 14,
+        'fifteen': 15,
+        'sixteen': 16,
+        'seventeen': 17,
+        'eighteen': 18,
+        'nineteen': 19,
+        'twenty': 20,
+        'thirty': 30,
+        'forty': 40,
+        'fifty': 50,
+        'sixty': 60,
+        'seventy': 70,
+        'eighty': 80,
+        'ninety': 90,
+        'hundred': 100,
+        'thousand': 1000,
+        'million': 1000000,
+        'billion': 1000000000,
+        'trillion': 1000000000000
+    }
+    def digits_of(words):
+        # Return the decimal digits spelled by the words as a string.
+        # A KeyError is raised for any word that is not a number word.
+        values = [number_map[word] for word in words]
+        if all(value < 10 for value in values):
+            # digit-by-digit reading keeps leading zeros ('zero five' -> '05')
+            return "".join(str(value) for value in values)
+        total = 0
+        current = 0
+        for value in values:
+            if value < 100:
+                current += value
+            elif value == 100:
+                # a bare 'hundred' counts as one hundred
+                current = (current or 1) * 100
+            else:
+                total += (current or 1) * value
+                current = 0
+        return str(total + current)
+    try:
+        # Define regular expression to match freetext numbers
+        pattern = re.compile(r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)')
+        match = pattern.search(text)
+        if not match:
+            # the input text doesn't match the regular expression
+            return 0
+        whole_digits = digits_of(match.group(1).lower().split())
+        fraction_digits = digits_of(match.group(3).lower().split())
+        # Build the value from its digits instead of summing powers of 0.1,
+        # which avoids floating point noise such as 6.300000000000001
+        return float("{}.{}".format(int(whole_digits), fraction_digits))
+    except (KeyError, TypeError, ValueError):
+        # unknown word or unusable input: signal failure so the caller can
+        # fall back to another conversion
+        return 0
+def numeric_number_dot_freetext(text):
+    '''
+    Convert mixed spoken decimals such as '6 point five', 'six point 5' or
+    '6 point 5' into a float.
+
+    Each side of the separator (decimal, point, dot or comma) is either a run
+    of digits or one of the words zero..nine (case-insensitive).
+
+    Parameters:
+        text: the text holding the number.
+
+    Returns:
+        The value as a float, or 0 when the text does not match the pattern,
+        a side is neither digits nor a known word, or the input is unusable.
+    '''
+    # Define a dictionary to map words to their digit
+    num_dict = {"zero":"0", "one":"1", "two":"2", "three":"3", "four":"4", "five":"5",
+                "six":"6", "seven":"7", "eight":"8", "nine":"9"}
+    # Regular expression to extract the two sides of the separator
+    pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"
+    def to_digits(token):
+        # Digits are kept as written (leading zeros matter after the
+        # separator); words are mapped, raising KeyError when unknown.
+        # Tokens such as 'nan' or 'inf' are rejected here instead of being
+        # accepted by float().
+        if token.isdecimal():
+            return token
+        return num_dict[token.lower()]
+    try:
+        match = re.search(pattern, text)
+        if not match:
+            # input text doesn't match the expected pattern
+            return 0
+        whole = to_digits(match.group(1))
+        fraction = to_digits(match.group(2))
+        # Join both parts around a decimal point to get the final value
+        return float("{}.{}".format(whole, fraction))
+    except (KeyError, TypeError, ValueError):
+        return 0
+def convert_into_numeric(num_list):
+    '''
+    Convert the single captured number into a numeric form.
+
+    Parameters:
+        num_list: the captured numbers as a list of strings, or a falsy value
+            (0, empty list) when nothing was captured.
+
+    Returns:
+        {'Number': value} with a numeric value when exactly one number was
+        captured and could be converted, otherwise 0. Plain numeric strings
+        are returned as floats.
+    '''
+    # Only one number should exist; nothing captured or several captured
+    # numbers are both reported as failure
+    if not num_list or len(num_list) > 1:
+        return 0
+    target_num = num_list[0]
+    # case it is an integer or float, convert it, otherwise move to following cases
+    try:
+        return {'Number' : float(target_num)}
+    except (TypeError, ValueError):
+        pass
+    # case that it belongs to one of the patterns of freetext number followed
+    # by numeric form etc (all the combinations)
+    if "$pattern" in target_num:
+        # keep only the text before the marker; splitting on the full marker
+        # tolerates other "$" characters in the text
+        num = target_num.split("$pattern")[0]
+        # try at first with that function for the case of six point five
+        num_conversion = numeric_freetext_dot_freetext(num)
+        if not num_conversion:
+            # if not, try all the rest of cases (6 point 5, 6 point five, six point 5)
+            num_conversion = numeric_number_dot_freetext(num)
+        if num_conversion:
+            return {'Number' : num_conversion}
+        # neither conversion worked: report failure explicitly instead of
+        # falling off the end of the function and returning None
+        return 0
+    # freetext numbers without patterns (e.g. two, million, twenty three, etc)
+    try:
+        return {'Number' : w2n.word_to_num(target_num)}
+    except Exception:
+        # if none of the above, error.
+        return 0
+def magnitude_binding(input_text):
+    '''
+    Extract the number mentioned in a text and convert it into a numeric form.
+
+    Parameters:
+        input_text: the text to analyse.
+
+    Returns:
+        Whatever convert_into_numeric returns for the numbers captured by
+        capture_numbers, or 0 when an error is raised along the way.
+    '''
+    try:
+        target_numbers = capture_numbers(input_text)
+        return convert_into_numeric(target_numbers)
+    except Exception:
+        # best effort: any failure is reported as 0, while KeyboardInterrupt
+        # and SystemExit are no longer swallowed
+        return 0