Commit 8f7c535
Parent(s): 44e368b
Update helper.py
helper.py CHANGED
@@ -65,9 +65,6 @@ def find_comptives_straight_patterns(sentence):
         if next_token.text.lower() == "than":
             prev_token = token.nbor(-1)
 
-            # this part checks what comes before more/less. We could make a NOUN mandatory (e.g. magnitude) or even specifically the word "magnitude"
-            # for the moment we have disabled it
-
             if token.text.lower() == 'more':
                 comparatives.append({'comparative': [token.text+" "+next_token.text, '>']})
             elif token.text.lower() == 'less':
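The comment removed above described an optional gate on the token preceding "more"/"less". For reference, a minimal self-contained sketch of the pattern with that gate reinstated; the helper name more_less_than and the require_noun flag are illustrative, and it assumes spaCy with the en_core_web_sm model installed:

import spacy

nlp = spacy.load("en_core_web_sm")

def more_less_than(sentence, require_noun=False):
    doc = nlp(sentence)
    results = []
    # skip the first and last token so nbor(-1)/nbor(1) stay in bounds
    for token in doc[1:-1]:
        next_token = token.nbor(1)
        if next_token.text.lower() == "than" and token.text.lower() in ("more", "less"):
            prev_token = token.nbor(-1)
            # the disabled check: only accept the pattern when a NOUN precedes more/less
            if require_noun and prev_token.pos_ != "NOUN":
                continue
            symbol = '>' if token.text.lower() == "more" else '<'
            results.append({'comparative': [token.text + " " + next_token.text, symbol]})
    return results

print(more_less_than("This quake was two magnitudes more than the last one", require_noun=True))
# [{'comparative': ['more than', '>']}]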
@@ -283,6 +280,7 @@ def identify_bigger_smaller_advanced(sentence):
     return bigger_list + smaller_list
 
 
+
 def find_equal_to_comptives_ngrams(sentence):
     """
     This function takes a sentence as input and returns a reference phrase based on semantic similarity using n-grams.
@@ -347,23 +345,41 @@ def single_verb_comptives(sentence):
 
     # search for all verbs and examine their lemmas against all the synonyms of each of the previous references. Assign a label accordingly
     for token in doc:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # first check for exact 1-1 text matches and 1-1 lemma matches
+        if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
+            bigger_list.append({'comparative': [token.text, ">"]})
+            break
+
+        elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
+            smaller_list.append({'comparative': [token.text, "<"]})
+            break
+
+        elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
+            equal_list.append({'comparative': [token.text, "="]})
+            break
+
+        else:
+
+            # if not, then try with synonyms, for verbs only
+            if token.pos_ == "VERB":
+
+                for lemma in token.lemma_.split('|'):
+                    synsets = wordnet.synsets(lemma, pos='v')
+
+                    for syn in synsets:
+                        if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
+                            bigger_list.append({'comparative': [token.text, ">"]})
+                            break
+
+                        elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
+                            smaller_list.append({'comparative': [token.text, "<"]})
+                            break
+
+                        elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
+                            equal_list.append({'comparative': [token.text, "="]})
+                            break
+
 
     final_list = bigger_list + smaller_list + equal_list
 
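For context on the synonym branch added above: a self-contained sketch of the WordNet lookup it relies on, assuming NLTK with the wordnet corpus downloaded; the toy reference set stands in for the bigger_references_sg list defined elsewhere in helper.py. Note that the committed code reuses the name lemma inside the any(...) generator, shadowing the outer loop variable; the sketch uses a distinct name for clarity.

from nltk.corpus import wordnet  # requires: nltk.download('wordnet')

bigger_references_sg = {"exceed", "surpass"}  # illustrative stand-in

def verb_matches_references(lemma, references):
    # gather every lemma name across the verb's synsets and test set membership
    for syn in wordnet.synsets(lemma, pos='v'):
        if any(name in references for name in syn.lemma_names()):
            return True
    return False

print(verb_matches_references("outstrip", bigger_references_sg))
# True -- "outstrip" shares a synset with "surpass"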
@@ -406,6 +422,7 @@ def cosine_sim(a, b):
     return cosine_similarity(a.reshape(1,-1), b.reshape(1,-1))[0][0]
 
 
+
 # we examine the n-grams in reverse order (largest first) and any time we find a match, we "delete" that match, so that smaller n-grams will not be matched
 # (e.g. "is on a par with" would otherwise also match "on a par with", "par with", etc.)
 
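The cosine_sim context line above is the entire helper; for reference, a runnable usage sketch (the vectors are illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(a, b):
    # reshape the 1-D vectors to (1, n) row matrices, since cosine_similarity expects 2-D input
    return cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]

print(cosine_sim(np.array([1.0, 0.0]), np.array([1.0, 1.0])))
# ~0.7071 -- the cosine of the 45-degree angle between the two vectors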
@@ -433,7 +450,7 @@ def multiword_verb_comptives(sentence):
     matched_ngrams = set()
 
     # Iterate through n-grams of the sentence, starting with the largest n-grams
-    for n in range(5,
+    for n in range(5, 0, -1):
         for i in range(len(tokens)-n+1):
             ngram = ' '.join(tokens[i:i+n])
 
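A minimal sketch of this largest-first scan with the matched-n-gram suppression described earlier; the reference phrase set and helper name are illustrative:

reference_phrases = {"is on a par with", "on a par with", "par with"}

def match_ngrams_largest_first(sentence):
    tokens = sentence.split()
    matched_ngrams = set()
    matches = []
    for n in range(5, 0, -1):  # largest n-grams first
        for i in range(len(tokens) - n + 1):
            ngram = ' '.join(tokens[i:i+n])
            # skip n-grams already contained in a matched longer phrase
            if ngram in reference_phrases and not any(ngram in m for m in matched_ngrams):
                matched_ngrams.add(ngram)
                matches.append(ngram)
    return matches

print(match_ngrams_largest_first("this model is on a par with the baseline"))
# ['is on a par with'] -- 'on a par with' and 'par with' are suppressed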
@@ -522,23 +539,41 @@ def identify_comparatives(sentence):
     # return all the patterns that were captured
     comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb
 
-    # since those different techniques might capture similar patterns, we keep only unique references. More precisely
-
+    # since those different techniques might capture similar patterns, we keep only unique references. More precisely,
+    # we discard any duplicate reference, as well as any reference that exists as a substring of another reference
 
-
-
-
+    # sort the list by length of the comparatives, in ascending order
+    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)
+
+    unique_comparatives = {}
+    for i, item in enumerate(comparatives):
+        comparative = item['comparative'][0]
+        # check if the comparative is already in the dictionary, or is a substring/superstring of an existing comparative
+        is_unique = True
+        for existing_comp in unique_comparatives:
+            if (comparative in existing_comp) or (existing_comp in comparative):
+                is_unique = False
+                break
+        if is_unique:
+            unique_comparatives[comparative] = item
+        elif i == len(comparatives) - 1:
+            # if it's the last item and it's not unique, replace the first overlapping unique item with this item
+            for j, existing_item in enumerate(unique_comparatives.values()):
+                if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
+                    unique_comparatives.pop(list(unique_comparatives.keys())[j])
+                    unique_comparatives[comparative] = item
+                    break
 
     unique_output = list(unique_comparatives.values())
 
     return unique_output
 
 
-
 def comparatives_binding(sentence):
 
     try:
         comparative_symbols = find_comptives_symbols(sentence)
+
         comparative_mentions = identify_comparatives(sentence)
 
         # starting with the symbols, if one was captured
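A compact sketch of the effect of this de-duplication on toy detector output (omitting the last-item replacement branch for brevity); the input list is illustrative:

comparatives = [
    {'comparative': ['bigger', '>']},
    {'comparative': ['bigger than', '>']},
    {'comparative': ['less than', '<']},
]

# sort ascending by phrase length, then keep a phrase only if it is not a
# substring/superstring of one already kept
comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)

unique_comparatives = {}
for item in comparatives:
    comparative = item['comparative'][0]
    if not any(comparative in kept or kept in comparative for kept in unique_comparatives):
        unique_comparatives[comparative] = item

print(list(unique_comparatives.values()))
# [{'comparative': ['bigger', '>']}, {'comparative': ['less than', '<']}]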
|