| import spacy |
| import re |
| import nltk |
| from nltk.corpus import wordnet |
| import numpy as np |
|
|
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
# One-time resource setup. NOTE(review): these downloads run as a side
# effect on every import of this module; consider guarding them behind a
# main() or a cached check.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')


# Fetch the small English spaCy model (no-op if it is already installed).
spacy.cli.download("en_core_web_sm")


# Shared spaCy pipeline used by every detector below. The parser and NER
# components are disabled because only tokenization, POS tags, lemmas and
# vectors are consumed in this file.
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
|
|
|
|
def find_comptives_symbols(sentence):
    """
    Find standalone comparison symbols (<, >, =) in *sentence*.

    A symbol only counts when it is not adjacent to another comparison
    symbol, so compound operators such as "<=", ">=" or "==" are ignored.

    Returns a list with one {'comparative': ['symbol', sym]} entry per
    match, in order of appearance (empty list when none are found).
    """
    # Negative lookbehind/lookahead reject symbols touching other symbols.
    isolated_symbol = r"(?<![<=>])[%s](?![<=>])" % (re.escape("<=>"))
    return [
        {'comparative': ['symbol', symbol]}
        for symbol in re.findall(isolated_symbol, sentence)
    ]
|
|
|
|
def _wordnet_lemma_heads(words):
    """Return the set of leading lemma names of every WordNet synset of *words*."""
    synsets = set()
    for word in words:
        synsets.update(wordnet.synsets(word))
    return {syn.name().split('.')[0] for syn in synsets}


def find_comptives_straight_patterns(sentence):
    """
    Identify explicit comparative mentions in *sentence*.

    Patterns covered:
      * "<NOUN> equal to"          -> '='
      * "more than" / "less than"  -> '>' / '<'
      * comparative adjective/adverb (tag JJR/RBR) + "than" (not followed
        by a NOUN) -> '>' or '<', decided by WordNet relatedness to a
        "bigger" or "smaller" vocabulary.

    Returns a list of {'comparative': [surface_text, symbol]} dicts.
    """
    doc = nlp(sentence)
    comparatives = []

    # Built once per call: the original recomputed these WordNet lookups
    # inside the token loop for every JJR/RBR token.
    bigger_rel_words = _wordnet_lemma_heads(
        ['big', 'large', 'great', 'huge', 'enormous', 'heavy', 'strong',
         'massive', 'immense', 'substantial',
         'bigger', 'larger', 'greater', 'higher', 'taller', 'heavier',
         'stronger'])
    smaller_rel_words = _wordnet_lemma_heads(
        ['small', 'little', 'tiny', 'petite', 'miniature', 'slight',
         'meager', 'inconsequential', 'minor',
         'smaller', 'lesser', 'lower', 'shorter', 'lighter', 'weaker'])

    last_index = len(doc) - 1

    for token in doc:
        word = token.text.lower()

        if word == "equal":
            # Need both a previous and a next token; token.nbor() raises
            # IndexError at the doc boundaries, so guard explicitly.
            if token.i == 0 or token.i == last_index:
                continue
            if token.nbor().text.lower() == "to" and token.nbor(-1).pos_ == "NOUN":
                comparatives.append({'comparative': ["equal to", "="]})

        elif word in ("more", "less"):
            if token.i == last_index:
                continue
            next_token = token.nbor()
            if next_token.text.lower() == "than":
                symbol = '>' if word == 'more' else '<'
                comparatives.append(
                    {'comparative': [token.text + " " + next_token.text, symbol]})

        elif token.tag_ in ("JJR", "RBR"):
            # Requires a "than" after the token AND a token after "than";
            # skip near the end of the doc to avoid IndexError.
            if token.i >= last_index - 1:
                continue
            next_token = token.nbor()
            if next_token.text.lower() == "than" and next_token.nbor().pos_ != "NOUN":
                surface = token.text + " " + next_token.text
                if word in bigger_rel_words:
                    comparatives.append({'comparative': [surface, '>']})
                elif word in smaller_rel_words:
                    comparatives.append({'comparative': [surface, '<']})

    return comparatives
|
|
|
|
| |
|
|
def identify_comparison(sentence):
    """
    Find "<word>er than" phrases (e.g. "better than", "taller than").

    Returns the list of matched phrases, or 0 when none are present.
    """
    er_than_phrases = re.findall(r'\b(\w+er than)\b', sentence)
    return er_than_phrases if er_than_phrases else 0
|
|
|
|
def find_more_than_reference(sentence):
    """
    Find "more <word> than" patterns (e.g. "more advanced than").

    Returns the captured "more <word>" phrases ("than" is not part of the
    capture groups), or 0 when there is no match.
    """
    captured = re.findall(r"(more) (\w+) than", sentence)
    if not captured:
        return 0
    return [' '.join(groups) for groups in captured]
|
|
|
|
def find_less_than_reference(sentence):
    """
    Find "less <word> than" patterns (e.g. "less advanced than").

    Returns the captured "less <word>" phrases ("than" is not part of the
    capture groups), or 0 when there is no match.
    """
    captured = re.findall(r"(less) (\w+) than", sentence)
    if not captured:
        return 0
    return [' '.join(groups) for groups in captured]
|
|
|
|
def is_related_to(word, target_word):
    """
    Check whether *word* and *target_word* share at least one WordNet synset.

    Returns True on any overlap, False otherwise (including for words
    unknown to WordNet, whose synset list is empty).
    """
    shared = set(wordnet.synsets(word)) & set(wordnet.synsets(target_word))
    return bool(shared)
|
|
|
|
def is_related_to_bigger(word):
    """
    Decide whether *word* conveys the concept of "bigger".

    "more" (or any phrase starting with "more ") qualifies immediately;
    otherwise *word* is tested for shared WordNet synsets against a
    vocabulary built from big/bigger seed adjectives.
    """
    lowered = word.lower()
    if lowered == "more" or lowered.startswith("more "):
        return True

    seed_words = ['big', 'large', 'great', 'huge', 'enormous', 'heavy',
                  'strong', 'massive', 'immense', 'substantial',
                  'bigger', 'larger', 'greater', 'higher', 'taller',
                  'heavier', 'stronger']
    vocabulary = set()
    for seed in seed_words:
        vocabulary.update(wordnet.synsets(seed))

    return any(
        is_related_to(word, syn.name().split('.')[0]) for syn in vocabulary
    )
|
|
|
|
def is_related_to_smaller(word):
    """
    Decide whether *word* conveys the concept of "smaller".

    "less" (or any phrase starting with "less ") qualifies immediately;
    otherwise *word* is tested for shared WordNet synsets against a
    vocabulary built from small/smaller seed adjectives.
    """
    lowered = word.lower()
    if lowered == "less" or lowered.startswith("less "):
        return True

    seed_words = ['small', 'little', 'tiny', 'petite', 'miniature',
                  'slight', 'meager', 'inconsequential', 'minor',
                  'smaller', 'lesser', 'lower', 'shorter', 'lighter',
                  'weaker']
    vocabulary = set()
    for seed in seed_words:
        vocabulary.update(wordnet.synsets(seed))

    return any(
        is_related_to(word, syn.name().split('.')[0]) for syn in vocabulary
    )
|
|
|
|
def identify_bigger_smaller_advanced(sentence):
    """
    Complementary detector for "<word>er than", "more <word> than" and
    "less <word> than" phrases, classified via WordNet relatedness.

    Returns a list of {'comparative': [phrase, symbol]} dicts with
    bigger-than ('>') entries first, then smaller-than ('<') entries.
    """
    bigger_hits = []
    smaller_hits = []

    # "<word>er than" phrases: classify only when the direction is
    # unambiguous (related to exactly one of bigger/smaller).
    er_phrases = identify_comparison(sentence)
    if er_phrases:
        for phrase in er_phrases:
            head = phrase.replace("than", "").strip()
            looks_bigger = is_related_to_bigger(head)
            looks_smaller = is_related_to_smaller(head)
            if looks_bigger and not looks_smaller:
                bigger_hits.append({"comparative": [phrase, ">"]})
            elif looks_smaller and not looks_bigger:
                smaller_hits.append({"comparative": [phrase, "<"]})

    # "more <word> than" phrases -> bigger-than candidates.
    more_phrases = find_more_than_reference(sentence)
    if more_phrases:
        for phrase in more_phrases:
            head = phrase.replace("than", "").replace("more", "").strip()
            if is_related_to_bigger(head):
                bigger_hits.append({"comparative": [phrase, ">"]})

    # "less <word> than" phrases -> smaller-than candidates.
    less_phrases = find_less_than_reference(sentence)
    if less_phrases:
        for phrase in less_phrases:
            head = phrase.replace("than", "").replace("less", "").strip()
            if is_related_to_smaller(head):
                smaller_hits.append({"comparative": [phrase, "<"]})

    return bigger_hits + smaller_hits
|
|
|
|
|
|
def find_equal_to_comptives_ngrams(sentence):
    """
    Detect equality phrases ("equal to", "same as", ...) by semantic
    similarity between sentence n-grams (n = 2..4) and a fixed list of
    reference phrases.

    An n-gram is kept as soon as its spaCy similarity to any reference
    phrase reaches the 0.85 threshold.  Returns a list of
    {'comparative': [ngram_text, '=']} entries (empty list when none).
    """
    reference_phrases = ["equal to", "same as", "similar to", "identical to", "equivalent to", "tantamount to", "corresponding to", "comparable to", "akin to", "commensurate with", "in line with", "on a par with" , "indistinguishable from" , "corresponding with", "congruent with"]

    # Minimum similarity an n-gram must reach against any reference.
    similarity_threshold = 0.85

    # Embed the reference phrases once, outside the n-gram loops.
    reference_docs = [nlp(phrase) for phrase in reference_phrases]

    hits = []
    words = sentence.split()

    for n in range(2, 5):
        for gram in nltk.ngrams(words, n):
            gram_text = ' '.join(gram)
            gram_doc = nlp(gram_text)

            for reference_doc in reference_docs:
                if gram_doc.similarity(reference_doc) >= similarity_threshold:
                    hits.append({'comparative': [gram_text, "="]})
                    break  # one hit per n-gram is enough

    return hits
|
|
|
|
|
|
def single_verb_comptives(sentence):
    """
    Identify single-word verbs expressing bigger-than, smaller-than or
    equal-to comparisons (e.g. "surpass" -> '>', "lag" -> '<',
    "match" -> '=').

    Each token is first matched literally (text or lemma) against three
    reference verb lists; verbs that miss are expanded through their
    WordNet verb synsets and matched via synset lemma names.

    Returns a list of {'comparative': [token_text, symbol]} dicts,
    ordered bigger / smaller / equal, or [] when nothing matches.
    """

    # Reference verbs per comparison direction.
    bigger_references_sg = ["surpass", "exceed", "outstrip", "outdo", "outmatch", "outclass", "eclipse", "overshadow", "outrank", "overtake", "top", "beat", "transcend", "dominate", "prevail", "trump", "vanquish", "outperform", "outgun", "outdistance", "outshine"]
    lesser_references_sg = ["lag", "trail", "lose", "underperform", "yield", "surrender", "submit", "succumb", "straggle", "dawdle", "lollygag", "loiter", "delay", "defer", "postpone", "procrastinate", "linger", "hesitate", "prolong", "drag"]
    equal_references_sg = ["match", "equal", "tie", "correspond", "conform", "agree", "harmonize", "coordinate", "comply", "fit", "parallel", "resemble", "mirror", "emulate", "equilibrate", "balance", "counterbalance", "offset", "compensate"]

    doc = nlp(sentence)

    # NOTE(review): these locals shadow the module-level bigger_list /
    # smaller_list / equal_list phrase lists defined further down the file.
    bigger_list = []
    smaller_list = []
    equal_list = []

    for token in doc:

        # Direct hit on token text or lemma stops the scan entirely.
        if token.text in bigger_references_sg or token.lemma_ in bigger_references_sg:
            bigger_list.append({'comparative': [token.text, ">"]})
            break

        elif token.text in lesser_references_sg or token.lemma_ in lesser_references_sg:
            smaller_list.append({'comparative': [token.text, "<"]})
            break

        elif token.text in equal_references_sg or token.lemma_ in equal_references_sg:
            equal_list.append({'comparative': [token.text, "="]})
            break

        else:

            # Fallback: expand verbs through their WordNet synsets and
            # look for a reference verb among the synset lemma names.
            if token.pos_ == "VERB":

                for lemma in token.lemma_.split('|'):
                    synsets = wordnet.synsets(lemma, pos='v')

                    for syn in synsets:
                        # NOTE(review): the generator variable `lemma`
                        # shadows the outer loop's `lemma` here.
                        if any(lemma in bigger_references_sg for lemma in syn.lemma_names()):
                            bigger_list.append({'comparative': [token.text, ">"]})
                            break  # exits the synset loop only

                        elif any(lemma in lesser_references_sg for lemma in syn.lemma_names()):
                            smaller_list.append({'comparative': [token.text, "<"]})
                            break

                        elif any(lemma in equal_references_sg for lemma in syn.lemma_names()):
                            equal_list.append({'comparative': [token.text, "="]})
                            break
                        # NOTE(review): unlike the direct-hit branches above,
                        # a synset match does not stop the token scan, so
                        # multiple entries can accumulate here.

    final_list = bigger_list + smaller_list + equal_list

    if final_list:
        return final_list
    else:
        return []
|
|
|
|
| |
|
|
| |
# Multi-word reference phrases for each comparison direction; consumed by
# multiword_verb_comptives via literal n-gram matching and embedding
# similarity.
bigger_list = ["is a cut above", "is ahead of", "is superior to", "is greater than", "raise the bar", "climb the ladder", "set the standard", "set the pace", "break the mold", "push the envelope", "raise the game", "is a class apart"]
smaller_list = ["fall behind", "is inferior to", "is smaller than", "lag behind", "trail behind", "is second to", "bring up the rear", "lose ground", "bring up the tail end", "fall short", "fall beneath", "fail to measure up", "put off"]
equal_list = ["is in line with", "is equal to", "is on a par with", "is on par with", "is the same as", "is comparable to", "is in sync with", "is in harmony with", "is in step with", "is in tune with", "is in accord with", "is consistent with", "is consonant with", "keep pace with", "keep up with", "is equivalent to", "balance out", "even out"]


# Phrase embeddings (mean of per-token spaCy vectors), computed once at
# import time so the n-gram loops do not re-embed the references.
bigger_embeddings = [np.mean([token.vector for token in nlp(verb)], axis=0) for verb in bigger_list]
smaller_embeddings = [np.mean([token.vector for token in nlp(verb)], axis=0) for verb in smaller_list]
equal_embeddings = [np.mean([token.vector for token in nlp(verb)], axis=0) for verb in equal_list]
|
|
|
|
| |
def check_list(ngram, verb_list):
    """Return True when *ngram* occurs verbatim in *verb_list*, else False."""
    return ngram in verb_list
|
|
|
|
def cosine_sim(a, b):
    """
    Cosine similarity between two 1-D vectors.

    Computed with plain NumPy rather than sklearn's pairwise
    cosine_similarity: this function is called per n-gram inside nested
    loops, and the sklearn call adds reshape/validation overhead for a
    single scalar result.

    Zero vectors yield 0.0, matching sklearn's convention.
    """
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0.0
    return float(np.dot(a, b) / norm_product)
|
|
|
|
|
|
| |
| |
|
|
def multiword_verb_comptives(sentence):
    """
    Identify multi-word verb phrases expressing bigger-than, smaller-than
    or equal-to comparisons.

    Every n-gram of the sentence (n = 5 down to 1) is first matched
    literally against the module-level reference lists (bigger_list,
    smaller_list, equal_list).  N-grams that miss are embedded (mean
    token vector) and compared against the pre-computed reference
    embeddings with cosine similarity; a class is accepted when it
    strictly beats the other two classes, beats the best similarity seen
    so far, and exceeds the 0.9 threshold.

    Returns a list of {'comparative': [ngram, symbol]} dicts, ordered
    bigger / smaller / equal.
    """
    tokens = sentence.split()

    # Best similarity seen so far across all examined n-grams.
    max_sim = 0

    bigger_l = []
    smaller_l = []
    equal_l = []

    # Words already consumed by a matched n-gram; any n-gram overlapping
    # them is skipped.  (BUG FIX: the original tested the whole n-gram
    # string against this word set, which only ever skipped unigrams.)
    matched_words = set()

    for n in range(5, 0, -1):
        for i in range(len(tokens) - n + 1):
            ngram_tokens = tokens[i:i + n]
            ngram = ' '.join(ngram_tokens)

            if any(word in matched_words for word in ngram_tokens):
                continue

            # Literal matches against the reference phrase lists.
            if check_list(ngram, bigger_list):
                matched_words.update(ngram_tokens)
                bigger_l.append({"comparative": [ngram, '>']})

            elif check_list(ngram, smaller_list):
                matched_words.update(ngram_tokens)
                smaller_l.append({"comparative": [ngram, '<']})

            elif check_list(ngram, equal_list):
                matched_words.update(ngram_tokens)
                equal_l.append({"comparative": [ngram, '=']})

            else:
                # Embedding fallback: mean token vector of the n-gram.
                ngram_emb = np.mean([token.vector for token in nlp(ngram)], axis=0)

                max_sim_bigger = max(cosine_sim(ngram_emb, emb) for emb in bigger_embeddings)
                max_sim_smaller = max(cosine_sim(ngram_emb, emb) for emb in smaller_embeddings)
                max_sim_equal = max(cosine_sim(ngram_emb, emb) for emb in equal_embeddings)

                if max_sim_bigger > max_sim_smaller and max_sim_bigger > max_sim_equal and max_sim_bigger > max_sim:
                    max_sim = max_sim_bigger
                    matched_words.update(ngram_tokens)
                    if max_sim > 0.9:
                        bigger_l.append({"comparative": [ngram, '>']})

                elif max_sim_smaller > max_sim_bigger and max_sim_smaller > max_sim_equal and max_sim_smaller > max_sim:
                    max_sim = max_sim_smaller
                    matched_words.update(ngram_tokens)
                    if max_sim > 0.9:
                        smaller_l.append({"comparative": [ngram, '<']})

                elif max_sim_equal > max_sim_bigger and max_sim_equal > max_sim_smaller and max_sim_equal > max_sim:
                    # BUG FIX: the original assigned max_sim_smaller here,
                    # so equal-to n-grams were scored with the wrong value.
                    max_sim = max_sim_equal
                    matched_words.update(ngram_tokens)
                    if max_sim > 0.9:
                        equal_l.append({"comparative": [ngram, '=']})

    return bigger_l + smaller_l + equal_l
|
|
|
|
|
|
def identify_comparatives(sentence):
    """
    Combine every detector in this module (symbol-free patterns, WordNet
    classification, equality n-grams, single- and multi-word verbs) and
    deduplicate overlapping hits.

    Candidates are sorted by surface-text length (shortest first); a
    candidate is kept only if its text neither contains nor is contained
    in an already-kept candidate's text.

    Returns the deduplicated list of {'comparative': [text, symbol]} dicts.
    """

    # Simple spaCy/token patterns ("more than", JJR/RBR + "than", ...).
    straight_comptives = find_comptives_straight_patterns(sentence)

    # "<word>er than" / "more|less <word> than" via WordNet relatedness.
    bigger_smaller_comparatives = identify_bigger_smaller_advanced(sentence)

    # Equality phrases via n-gram embedding similarity.
    equal_to_comparatives = find_equal_to_comptives_ngrams(sentence)

    single_verb = single_verb_comptives(sentence)
    multi_verb = multiword_verb_comptives(sentence)

    comparatives = straight_comptives + bigger_smaller_comparatives + equal_to_comparatives + single_verb + multi_verb

    # Shortest surface text first, so shorter candidates are kept and
    # longer overlapping ones are discarded by the containment check.
    comparatives.sort(key=lambda item: len(item['comparative'][0]), reverse=False)

    unique_comparatives = {}
    for i, item in enumerate(comparatives):
        comparative = item['comparative'][0]

        # Reject the candidate if it overlaps (substring either way) with
        # any already-kept candidate.
        is_unique = True
        for existing_comp in unique_comparatives:
            if (comparative in existing_comp) or (existing_comp in comparative):
                is_unique = False
                break
        if is_unique:
            unique_comparatives[comparative] = item
        elif i == len(comparatives) - 1:
            # NOTE(review): this replacement branch fires only for the very
            # last (longest) candidate — it evicts the first overlapping
            # kept entry and stores the longer text instead.  Verify this
            # asymmetry is intended.
            for j, existing_item in enumerate(unique_comparatives.values()):
                if (existing_item['comparative'][0] in comparative) or (comparative in existing_item['comparative'][0]):
                    unique_comparatives.pop(list(unique_comparatives.keys())[j])
                    unique_comparatives[comparative] = item
                    break

    unique_output = list(unique_comparatives.values())

    return unique_output
|
|
|
|
def comparatives_binding(sentence):
    """
    Top-level entry point: combine symbol detection with textual
    comparative detection and enforce exactly one comparative per sentence.

    Returns:
        * the single-element result list when exactly one comparative
          (symbol OR mention, not both) is found, or
        * an error triple (0, "COMPARATIVES", reason) with reason one of
          "more_comparatives_mentions", "no_comparatives",
          "more_symbol_comparatives" or "unknown_error".
    """
    try:
        comparative_symbols = find_comptives_symbols(sentence)
        comparative_mentions = identify_comparatives(sentence)

        if len(comparative_symbols) == 1:
            # A symbol plus a textual mention is ambiguous.
            if len(comparative_mentions) == 0:
                return comparative_symbols
            return (0, "COMPARATIVES", "more_comparatives_mentions")

        if len(comparative_symbols) == 0:
            if len(comparative_mentions) == 1:
                return comparative_mentions
            if len(comparative_mentions) == 0:
                return (0, "COMPARATIVES", "no_comparatives")
            return (0, "COMPARATIVES", "more_comparatives_mentions")

        # Two or more symbols.
        return (0, "COMPARATIVES", "more_symbol_comparatives")

    except Exception:
        # BUG FIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt.  Detector failures still
        # degrade to an error triple rather than propagating.
        return (0, "COMPARATIVES", "unknown_error")