Spaces:

ValadisCERTH
/

NumbersModuleSerco

Sleeping

App Files Files Community

ValadisCERTH committed on Mar 31, 2023

Commit

902d4bf

1 Parent(s): 13c4417

Update helper.py

Browse files

Files changed (1) hide show

helper.py +151 -153

helper.py CHANGED Viewed

@@ -7,14 +7,14 @@ spacy.cli.download("en_core_web_lg")
 nlp = spacy.load("en_core_web_lg")
-def capture_numbers (input_sentence):
   '''
     This is a function to capture cases of referred numbers, either in numeric or free-text form
   '''
   try:
     # Define the regular expression patterns
-    pattern1 = r"\b(\w+)\s+(point|decimal|dot|comma)\s+(\w+)\b"
     # Find all matches in the text
     matches = re.findall(pattern1, input_sentence)
@@ -31,95 +31,61 @@ def capture_numbers (input_sentence):
       input_sentence = input_sentence.replace(elem, " ")
     if pattern_numbers:
         # Remove duplicates with set and convert back to list
-        final_numbers = list(set(pattern_numbers))
-        return final_numbers
     else:
-      # Parse the input sentence with Spacy
-      doc = nlp(input_sentence)
-      # This is to capture all the numbers in int and float form, as well as numbers like eight, two, hunded
-      numbers = [token.text for token in doc if token.like_num]
-      # Remove duplicates with set and convert back to list
-      final_numbers = list(set(numbers))
-      # Print the extracted numbers
-      if final_numbers:
-        return final_numbers
-      else:
-        return 0
-  except:
-    return 0
-def numeric_freetext_dot_freetext(text):
-    '''
-      This is a function to convert cases of 'six point five'
-    '''
-    # Define a dictionary to map freetext numbers to numeric values
-    number_map = {
-        'zero': 0,
-        'one': 1,
-        'two': 2,
-        'three': 3,
-        'four': 4,
-        'five': 5,
-        'six': 6,
-        'seven': 7,
-        'eight': 8,
-        'nine': 9,
-        'ten': 10,
-        'eleven': 11,
-        'twelve': 12,
-        'thirteen': 13,
-        'fourteen': 14,
-        'fifteen': 15,
-        'sixteen': 16,
-        'seventeen': 17,
-        'eighteen': 18,
-        'nineteen': 19,
-        'twenty': 20,
-        'thirty': 30,
-        'forty': 40,
-        'fifty': 50,
-        'sixty': 60,
-        'seventy': 70,
-        'eighty': 80,
-        'ninety': 90,
-        'hundred': 100,
-        'thousand': 1000,
-        'million': 1000000,
-        'billion': 1000000000,
-        'trillion': 1000000000000
-    }
-    try:
-      # Define regular expression to match freetext numbers
-      pattern = re.compile(r'(\w+(?:\s+\w+)*)\s+(point|decimal|dot|comma)\s+(\w+(?:\s+\w+)*)')
-      # Extract freetext number and decimal part from input text
-      match = pattern.search(text)
-      if match:
-          whole_part = match.group(1).lower()
-          decimal_part = match.group(3).lower()
-          # Convert whole and decimal parts to numeric form
-          numeric_whole = sum(number_map[word] * (10 ** (len(whole_part.split()) - i - 1)) for i, word in enumerate(whole_part.split()))
-          numeric_decimal = sum(number_map[word] * (0.1 ** (i + 1)) for i, word in enumerate(decimal_part.split()))
-          return numeric_whole + numeric_decimal
-      # Return None if the input text doesn't match the regular expression
-      return 0
-    except:
-      return 0
 def numeric_number_dot_freetext(text):
@@ -128,100 +94,132 @@ def numeric_number_dot_freetext(text):
   '''
   try:
-      # Define a dictionary to map words to numbers
-      num_dict = {"zero":0, "one":1, "two":2, "three":3, "four":4, "five":5,
-                  "six":6, "seven":7, "eight":8, "nine":9}
-      # Define a regular expression pattern to extract the numeric form and free text form from input text
-      pattern = r"(\d+|\w+)(?:\s+(?:decimal|point|dot|comma)\s+)(\d+|\w+)"
       # Use regular expression to extract the numeric form and free text form from input text
       match = re.search(pattern, text)
       if match:
           num1 = match.group(1)
           num2 = match.group(2)
           # If the numeric form is a word, map it to its numerical value
           if num1 in num_dict:
               num1 = num_dict[num1]
-          # If the free text form is a word, map it to its numerical value
-          if num2 in num_dict:
-              num2 = num_dict[num2]
-          # Convert both parts to float and add them together to get the final decimal value
-          result = float(num1) + float(num2) / (10 ** len(str(num2)))
-          return result
-      else:
-          # If input text doesn't match the expected pattern, return None
-          return 0
-  except:
-    return 0
-def convert_into_numeric(num_list):
-  '''
-  This is a function to convert the identified numbers into a numeric form
-  '''
-  if num_list:
-    # at first we examine how many numbers were captured. Only one number should exist
-    if len(num_list) > 1:
-      return 0
-    else:
-      target_num = num_list[0]
-      # case it is an integer or float, convert it, otherwise move to following cases
-      try:
-        target_num_float = float(target_num)
-        return {'Number' : target_num}
-      except:
-        # case that it belongs to one of the patterns of freetext number followed by numeric form etc (all the combinations)
-        if "$pattern" in target_num:
-          num, _ = target_num.split("$")
-          # try at first with that function for the case of six point five
-          num_conversion = numeric_freetext_dot_freetext(num)
-          if num_conversion:
-            return {'Number' : num_conversion}
-          # if not, try with this function for all the rest of cases (6 point 5, 6 point five, six point 5)
           else:
-            num_conversion = numeric_number_dot_freetext(num)
-            if num_conversion:
-              return {'Number' : num_conversion}
-        # if none of the above has worked, then examine the case of freetext numbers without patterns (e.g. two, million, twenty three, etc)
-        else:
           try:
-            num_conversion = w2n.word_to_num(target_num)
-            return {'Number' : num_conversion}
-          # if none of the above, error.
           except:
-            return 0
-  else:
-    return 0
-def magnitude_binding(input_text):
-  try:
-    target_numbers = capture_numbers(input_text)
-    numeric_target_numbers = convert_into_numeric(target_numbers)
-    return numeric_target_numbers
   except:
     return 0

 nlp = spacy.load("en_core_web_lg")
+def capture_numbers(input_sentence):
   '''
     This is a function to capture cases of referred numbers, either in numeric or free-text form
   '''
   try:
     # Define the regular expression patterns
+    pattern1 = r"(\d+|\w+(?:\s+\w+)*)\s+(decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
     # Find all matches in the text
     matches = re.findall(pattern1, input_sentence)
       input_sentence = input_sentence.replace(elem, " ")
     if pattern_numbers:
         # Remove duplicates with set and convert back to list
+        pattern_final_numbers = list(set(pattern_numbers))
     else:
+        pattern_final_numbers = []
+    # we delete the captured references from the sentence, because if we capture something like seven point five
+    # then spacy will also identify seven and five, which we do not want it to
+    for element in pattern_final_numbers:
+      target_elem = element.replace("$pattern","").strip()
+      if target_elem in input_sentence:
+          input_sentence = input_sentence.replace(target_elem, " ")
+    # This is for cases of thirty eight or one million and two, etc.
+    # Define a regular expression to match multiword free-text numbers
+    pattern2 = r"(?<!\w)(?:(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion)(?:\s(?:and\s)?(?:zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety|hundred|thousand|million|billion|trillion))+\s?)+(?!\w*pennies)"
+    # Find all multiword free-text number matches in the sentence
+    multi_numbers = re.findall(pattern2, input_sentence)
+    if multi_numbers:
+      multinumber_final_numbers = list(set(multi_numbers))
+    else:
+      multinumber_final_numbers = []
+    for elem in multinumber_final_numbers:
+      if elem in input_sentence:
+        input_sentence = input_sentence.replace(elem, " ")
+    # we also delete the captured references from the sentence in this case
+    for element in multinumber_final_numbers:
+      target_elem = element.replace("$pattern","").strip()
+      if target_elem in input_sentence:
+          input_sentence = input_sentence.replace(target_elem, " ")
+    # Parse the input sentence with Spacy
+    doc = nlp(input_sentence)
+    # This is to capture all the numbers in int and float form, as well as numbers like eight, two, hundred
+    s_numbers = [token.text for token in doc if token.like_num]
+    if s_numbers:
+      # Remove duplicates with set and convert back to list
+      spacy_final_numbers = list(set(s_numbers))
+    else:
+      spacy_final_numbers = []
+    # return the extracted numbers
+    return pattern_final_numbers + multinumber_final_numbers + spacy_final_numbers
+  except:
+    return 0
 def numeric_number_dot_freetext(text):
   '''
   try:
+      # Define a dictionary to map words to numbers
+      num_dict = {
+          'zero': 0,
+          'one': 1,
+          'two': 2,
+          'three': 3,
+          'four': 4,
+          'five': 5,
+          'six': 6,
+          'seven': 7,
+          'eight': 8,
+          'nine': 9,
+          'ten': 10,
+          'eleven': 11,
+          'twelve': 12,
+          'thirteen': 13,
+          'fourteen': 14,
+          'fifteen': 15,
+          'sixteen': 16,
+          'seventeen': 17,
+          'eighteen': 18,
+          'nineteen': 19,
+          'twenty': 20,
+          'thirty': 30,
+          'forty': 40,
+          'fifty': 50,
+          'sixty': 60,
+          'seventy': 70,
+          'eighty': 80,
+          'ninety': 90,
+          'hundred': 100,
+          'thousand': 1000,
+          'million': 1000000,
+          'billion': 1000000000,
+          'trillion': 1000000000000
+      }
+      # Define a regular expression pattern to extract the numeric form and free text form from input text
+      pattern = r"(\d+|\w+(?:\s+\w+)*)\s+(?:decimal|point|dot|comma)\s+(\d+|\w+(?:\s+\w+)*)"
       # Use regular expression to extract the numeric form and free text form from input text
       match = re.search(pattern, text)
       if match:
           num1 = match.group(1)
           num2 = match.group(2)
           # If the numeric form is a word, map it to its numerical value
           if num1 in num_dict:
               num1 = num_dict[num1]
+          # if not in the dictionary try also with the w2n library
+          else:
+              # try to convert to float. That means this is a number, otherwise it is a string so continue
+              try:
+                num1 = float(num1)
+              except:
+                # this will handle cases like "bla bla bla seven"
+                try:
+                  num1 = w2n.word_to_num(num1)
+                # this is to handle cases like "bla bla bla 7"
+                except:
+                    try:
+                      # we identify all the numeric references
+                      num_ref1 = [int(ref) for ref in re.findall(r'\d+', num1)]
+                      # if there is exactly one number then we cope with that
+                      if len(num_ref1) == 1:
+                        num1 = num_ref1[0]
+                      # in any other case return an error tuple
+                      elif len(num_ref1) > 1:
+                        return (0,'MAGNITUDE','more_magnitude')
+                      elif len(num_ref1) == 0:
+                        return (0,'MAGNITUDE','no_magnitude')
+                    except:
+                        return (0,'MAGNITUDE','unknown_error')
+          # If the free text form is a word, map it to its numerical value
+          if num2 in num_dict:
+              num2 = num_dict[num2]
           else:
+              try:
+                num2 = int(num2)
+              except:
+                  try:
+                    num2 = w2n.word_to_num(num2)
+                  except:
+                      try:
+                          # we identify all the numeric references
+                          num_ref2 = [int(ref) for ref in re.findall(r'\d+', num2)]
+                          # if there is exactly one number then we cope with that
+                          if len(num_ref2) == 1:
+                            num2 = num_ref2[0]
+                          # in any other case return an error tuple
+                          elif len(num_ref2) > 1:
+                            return (0,'MAGNITUDE','more_magnitude')
+                          elif len(num_ref2) == 0:
+                            return (0,'MAGNITUDE','no_magnitude')
+                      except:
+                          return (0,'MAGNITUDE','unknown_error')
           try:
+            # Convert both parts to float and add them together to get the final decimal value
+            result = float(num1) + float(num2) / (10 ** len(str(num2)))
+            return result
           except:
+            return (0, 'MAGNITUDE', 'unknown_error')
+      else:
+          # If input text doesn't match the expected pattern, return 0
+          return 0
   except:
     return 0